In [105]:
from sklearn.datasets import make_moons,load_iris
from sklearn.ensemble import VotingClassifier,RandomForestClassifier,BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [3]:
# Synthetic "moons" dataset: 500 noisy points, seeded so every run yields the same data.
x, y = make_moons(
    n_samples=500,
    noise=0.30,
    random_state=42,
)

In [4]:
# Hold out a test set using the default split ratio; the seed keeps the split stable.
(x_train, x_test,
 y_train, y_test) = train_test_split(x, y, random_state=42)

In [6]:
# Ensemble of three different model families, each seeded for reproducibility.
voting_clf = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression(random_state=42)),
        ('rf', RandomForestClassifier(random_state=42)),
        ('svc', SVC(random_state=42)),
    ]
)

In [10]:
# Fit right away: the following cells read the fitted attributes
# (estimators_, named_estimators_), which do not exist until fit() has run,
# so a fresh "Restart & Run All" would otherwise fail on them.
# fit() returns the estimator itself, so this cell still displays the classifier.
voting_clf.fit(x_train,y_train)

In [15]:
# Fitted sub-estimators as a list; this attribute is only available after fit().
voting_clf.estimators_

[LogisticRegression(random_state=42),
 RandomForestClassifier(random_state=42),
 SVC(random_state=42)]

In [14]:
# Same fitted sub-estimators, keyed by the names given at construction ('lr', 'rf', 'svc').
# Like estimators_, this is only available after fit().
voting_clf.named_estimators_

{'lr': LogisticRegression(random_state=42),
 'rf': RandomForestClassifier(random_state=42),
 'svc': SVC(random_state=42)}

In [13]:
# Train the voting ensemble (and therefore each base estimator) on the training split.
voting_clf.fit(x_train,y_train)

In [16]:
# Report the test-set accuracy of every fitted base estimator as "name = score".
for name, clf in voting_clf.named_estimators_.items():
    print(f"{name} = {clf.score(x_test, y_test)}")

lr = 0.864
rf = 0.896
svc = 0.896


In [36]:
# Ensemble prediction for the first test instance (sliced so the input stays 2-D).
voting_clf.predict(x_test[:1])

array([1], dtype=int64)

In [45]:
# Individual votes of the fitted base estimators for that same first test instance.
[model.predict(x_test[:1]) for model in voting_clf.estimators_]
    

[array([1], dtype=int64), array([1], dtype=int64), array([0], dtype=int64)]

In [46]:
# Accuracy of the voting ensemble on the held-out test set.
voting_clf.score(x_test,y_test)

0.912

In [None]:
#The accuracy of the voting classifier is higher than that of the individual classifiers. By default, voting classifiers
#perform hard voting. For example, the voting classifier predicts class 1 for the first instance of the test set, because
#two of the three classifiers predict this instance to be class 1.


In [47]:
#If all classifiers are able to estimate class probabilities (i.e., if they all have a predict_proba() method), then we can tell
#sklearn to predict the class with the highest class probability, averaged over all the individual classifiers. This is called
#soft voting.


In [48]:
#All we need to do is set the voting classifier's voting hyperparameter to 'soft' and ensure that all classifiers can
#estimate class probabilities.
#SVC doesn't estimate class probabilities by default. This is enabled by setting its probability hyperparameter to True. (This
#will make the SVC class use cross-validation to estimate class probabilities, slowing down training, and it will add a
#predict_proba() method.)

In [49]:
# Switch the ensemble to soft voting; takes effect at the next fit().
# The trailing ';' suppresses the estimator repr that set_params() returns.
voting_clf.set_params(voting='soft');


In [52]:
# Turn on probability estimates for the 'svc' member so it can take part in soft voting.
# The trailing ';' suppresses the estimator repr that set_params() returns.
voting_clf.set_params(svc__probability=True);

In [53]:
# Refit so that the soft-voting and SVC probability settings changed above are applied.
voting_clf.fit(x_train,y_train)

In [56]:
# Test-set accuracy of the refitted (soft-voting) ensemble.
voting_clf.score(x_test,y_test)

0.92

In [57]:
#We reach 92% accuracy using soft voting, compared to 91.2% with hard voting.

# Bagging and pasting

In [58]:
#One way to get a diverse set of classifiers is to use very different training algorithms. Another approach is to use the same
#training algorithm for every predictor but train them on different random subsets of the training set. When sampling is
#performed with replacement it is called bagging, and sampling without replacement is called pasting.

In [60]:
# Bagging ensemble: 500 decision trees, each trained on 100 sampled instances,
# built in parallel on all cores and seeded for reproducibility.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    n_jobs=-1,
    random_state=42,
)

In [61]:
# Train the bagging ensemble on the training split.
bag_clf.fit(x_train,y_train)

In [62]:
#The above code trains an ensemble of 500 decision tree classifiers: each is trained on 100 training instances randomly
#sampled from the training set with replacement (for pasting instead, set bootstrap=False).

# Out of bag evaluation

In [63]:
#With bagging, some training instances may be sampled several times for any given predictor, while others may not be sampled
#at all. By default a BaggingClassifier samples m training instances with replacement (bootstrap=True), where m is the size
#of the training set. With this process, it can be shown mathematically that only about 63% of the training instances
#are sampled on average for each predictor. The remaining 37% of the training instances that are not sampled are called
#out-of-bag (OOB) instances. Note that they are not the same 37% for all predictors.

In [64]:
#A bagging ensemble can be evaluated using OOB instances, without the need for a separate validation set: indeed, if there are
#enough estimators, then each instance in the training set will likely be an OOB instance of several estimators, so these
#estimators can be used to make a fair ensemble prediction for that instance. Once we have a prediction for each instance,
#we can compute the ensemble's prediction accuracy.


In [70]:
# Same kind of bagging ensemble, now with out-of-bag scoring enabled and the
# default per-tree sample size.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)

In [71]:
# Train the OOB-enabled ensemble; the oob_* attributes read below are populated by this fit.
bag_clf.fit(x_train,y_train)

In [72]:
# Accuracy estimated from out-of-bag training instances (no separate validation set used).
bag_clf.oob_score_

0.896

In [74]:
# Predictions of the bagging ensemble on the held-out test set.
y_predict=bag_clf.predict(x_test)

In [77]:
# Test-set accuracy of the bagging ensemble, for comparison with the OOB estimate above.
accuracy_score(y_test,y_predict)

0.92

In [80]:
# Preview only: the full array has one row per training instance (375 x 2),
# so show the first few rows instead of dumping everything.
bag_clf.oob_decision_function_[:5]

array([[0.32352941, 0.67647059],
       [0.3375    , 0.6625    ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 1.        ]])

In [81]:
# Dimensions of the OOB decision function array.
bag_clf.oob_decision_function_.shape

(375, 2)

In [83]:
bag_clf.oob_decision_function_[:3] # OOB class probabilities for the first three training instances

array([[0.32352941, 0.67647059],
       [0.3375    , 0.6625    ],
       [1.        , 0.        ]])

In [84]:
# Preview only: the ensemble holds 500 fitted trees (n_estimators=500), each with
# its own random_state, so list just the first few.
bag_clf.estimators_[:5]

[DecisionTreeClassifier(random_state=1952926171),
 DecisionTreeClassifier(random_state=1761383086),
 DecisionTreeClassifier(random_state=1449071958),
 DecisionTreeClassifier(random_state=1910541088),
 DecisionTreeClassifier(random_state=1341730541)]

In [89]:
# Class probabilities predicted for the test set; preview the first few rows
# rather than printing one row for every test instance.
bag_clf.predict_proba(x_test)[:5]

array([[0.342, 0.658],
       [0.616, 0.384],
       [0.736, 0.264],
       [0.   , 1.   ],
       [0.024, 0.976]])

In [92]:
# OOB class probabilities for the training instances, shown next to the test-set
# probabilities above; preview the first rows only instead of the whole array.
bag_clf.oob_decision_function_[:5]

array([[0.32352941, 0.67647059],
       [0.3375    , 0.6625    ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 1.        ]])

# Random Forests

In [93]:
#A random forest is essentially a bagging ensemble of decision trees with max_samples set to the size of the training set,
#except that each tree considers only a random subset of sqrt(n) of the n features when splitting a node. This introduces
#greater tree diversity, which trades a higher bias for a lower variance.

In [94]:
# Random forest of 500 trees, each limited to 16 leaf nodes, trained in parallel.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
    random_state=42,
)

In [95]:
# Train the random forest on the training split.
rnd_clf.fit(x_train,y_train)

In [98]:
# Random forest predictions on the held-out test set.
y_predict_rf=rnd_clf.predict(x_test)

In [99]:
# Inspect the random forest predictions computed in the previous cell
# (y_predict holds the earlier bagging predictions, not these).
y_predict_rf.shape

(125,)

In [100]:
#The random forest classifier above is roughly equivalent to the BaggingClassifier below.

In [102]:
# Bagging ensemble configured to mimic the random forest: each tree uses a
# sqrt-sized feature subset and at most 16 leaf nodes.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(max_features='sqrt', max_leaf_nodes=16),
    n_estimators=500,
    n_jobs=-1,
    random_state=42,
)

# Feature importance

In [103]:
#sklearn measures a feature's importance by looking at how much the tree nodes that use that feature reduce impurity on
#average across all trees in the forest. More precisely, it is a weighted average, where each node's weight is equal to the
#number of training samples that are associated with it.

In [104]:
#sklearn computes this score automatically for each feature after training, then it scales the results so that the sum of
#all importances is equal to 1.
#We can access the results using the feature_importances_ attribute.

In [106]:
# Load the iris dataset with pandas containers (as_frame=True) so the feature columns carry names.
iris=load_iris(as_frame=True)

In [109]:
# Feature table: one row per flower, one named column per measurement.
iris.data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [112]:
# Integer class label for every row of the feature table.
iris.target

0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: target, Length: 150, dtype: int32

In [113]:
# Fresh random forest (500 trees, seeded) used to rank the iris features.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    random_state=42,
)

In [115]:
# Fit on the complete iris data; only the feature importances are needed, so no split is made.
rnd_clf.fit(iris.data, iris.target)

In [118]:
# Print each feature's importance (rounded to two decimals) followed by its column name.
for score, name in zip(rnd_clf.feature_importances_, iris['data'].columns):
    print(f"{round(score, 2)} {name}")

0.11 sepal length (cm)
0.02 sepal width (cm)
0.44 petal length (cm)
0.42 petal width (cm)


# Boosting

In [119]:
#The general idea of most boosting methods is to train predictors sequentially, each trying to correct its predecessor. The most
#popular are AdaBoost (adaptive boosting) and gradient boosting.

Adaboost

In [120]:
#One way for a new predictor to correct its predecessor is to pay a bit more attention to the training instances that the
#predecessor underfit. This results in new predictors focusing more and more on the hard cases.

Gradient boost

In [121]:
#Just like AdaBoost, this works by sequentially adding predictors to an ensemble, each one correcting its predecessor.
#This method tries to fit the new predictor to the residual errors made by the previous predictor.