In [None]:
'''Ensemble methods can give you a boost in accuracy on our dataset.
  1. Bagging ensemble methods such as bagged decision trees, random forest and extra trees.
  2. Boosting ensemble methods such as AdaBoost and stochastic gradient boosting.'''

In [None]:
# Bagging trees

In [4]:
# Bagged Decision Tree Classifier
# Bags 100 decision trees and scores the ensemble with 10-fold cross-validation.

# Load data
import pandas as pd
filename='pima-indians-diabetes.csv'
names=['preg','plas','pres','skin','test','mass','pedi','age','class']
dataset=pd.read_csv(filename,names=names)
array=dataset.values
# Every column except the last is a feature; the last column ('class') is the target.
x=array[:,:-1]
y=array[:,-1]


# Create Model
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
trees=100
decision=DecisionTreeClassifier()
# The base learner is passed positionally on purpose: the keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, and the old keyword
# was later removed. The first positional slot is the base learner in both APIs.
model=BaggingClassifier(decision,n_estimators=trees,random_state=0)

# Evaluate Model
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# No random_state here: the folds are not shuffled, so a seed has no effect on the
# split, and recent scikit-learn raises ValueError when random_state is given
# together with shuffle=False.
fold=KFold(n_splits=10)
accuracy=cross_val_score(model,x,y,cv=fold)
print("Accuracy for bagging Decision Tree Classifier:",accuracy)
print("Mean Accuracy for bagging decision tree classifier",accuracy.mean())

('Accuracy for bagging Decision Tree Classifier:', array([0.62337662, 0.80519481, 0.75324675, 0.62337662, 0.83116883,
       0.83116883, 0.84415584, 0.84415584, 0.71052632, 0.76315789]))
('Mean Accuracy for bagging decision tree classifier', 0.7629528366370472)


In [11]:
# Random Forest Classifier
# Random Forest is an extension of bagged decision trees: the trees are constructed
# in a way that reduces the correlation between the individual classifiers.
import pandas as pd
filename='pima-indians-diabetes.csv'
names=['preg','plas','pres','skin','test','mass','pedi','age','class']
dataset=pd.read_csv(filename,names=names)
array=dataset.values
# Every column except the last is a feature; the last column ('class') is the target.
x=array[:,:-1]
y=array[:,-1]

# Create a pipeline: standardize the features, then fit the forest.
estimators=[]
num_tree=100
max_features=3
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
estimators.append(('StandardScaler',StandardScaler()))
estimators.append(('random_forest',RandomForestClassifier(n_estimators=num_tree,max_features=max_features,random_state=1)))
model=Pipeline(estimators)

# Evaluate a pipeline with 10-fold cross-validation.
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# No random_state here: the folds are not shuffled, so a seed has no effect on the
# split, and recent scikit-learn raises ValueError when random_state is given
# together with shuffle=False.
fold=KFold(n_splits=10)
accuracy=cross_val_score(model,x,y,cv=fold)
print("Accuracy For Random Forest Classifier",accuracy)
print("Mean Accuracy For Random Forest Classifier",accuracy.mean())

('Accuracy For Random Forest Classifier', array([0.68831169, 0.81818182, 0.7012987 , 0.66233766, 0.79220779,
       0.80519481, 0.87012987, 0.87012987, 0.72368421, 0.80263158]))
('Mean Accuracy For Random Forest Classifier', 0.7734107997265892)


In [15]:
# Extra Trees are another modification of bagging, where random trees are constructed
# from samples of the training dataset.

#load data
import pandas as pd
filename='pima-indians-diabetes.csv'
names=['preg','plas','pres','skin','test','mass','pedi','age','class']
dataset=pd.read_csv(filename,names=names)
# Every column except the last is a feature; the last column ('class') is the target.
x=dataset.iloc[:,:-1].values
y=dataset.iloc[:,-1].values


# feature extraction: concatenate 3 PCA components with the 6 best-scoring columns.
features=[]
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import FeatureUnion
features.append(('pca',PCA(n_components=3)))
features.append(('select best features',SelectKBest(k=6)))
feature=FeatureUnion(features)

# Create a Pipeline: the combined features feed the extra-trees ensemble.
num_tree=100
estimators=[]
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier
estimators.append(('features select',feature))
estimators.append(('Extra Tree Classifier',ExtraTreesClassifier(n_estimators=num_tree,random_state=1)))
model=Pipeline(estimators)

# Evaluate a Pipeline with 10-fold cross-validation.
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# No random_state here: the folds are not shuffled, so a seed has no effect on the
# split, and recent scikit-learn raises ValueError when random_state is given
# together with shuffle=False.
fold=KFold(n_splits=10)
accuracy=cross_val_score(model,x,y,cv=fold)
print("Accuracy For Extra Tree",accuracy)
print("Mean Accuracy For Extra Tree",accuracy.mean())

('Accuracy For Extra Tree', array([0.72727273, 0.88311688, 0.68831169, 0.64935065, 0.84415584,
       0.80519481, 0.81818182, 0.83116883, 0.75      , 0.77631579]))
('Mean Accuracy For Extra Tree', 0.7773069036226932)
