In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('gapminder.csv')
df.head()

Unnamed: 0,population,fertility,HIV,CO2,BMI_male,GDP,BMI_female,life,child_mortality,Region
0,34811059.0,2.73,0.1,3.328945,24.5962,12314.0,129.9049,75.3,29.5,Middle East & North Africa
1,19842251.0,6.43,2.0,1.474353,22.25083,7103.0,130.1247,58.3,192.0,Sub-Saharan Africa
2,40381860.0,2.24,0.5,4.78517,27.5017,14646.0,118.8915,75.5,15.4,America
3,2975029.0,1.4,0.1,1.804106,25.35542,7383.0,132.8108,72.5,20.0,Europe & Central Asia
4,21370348.0,1.96,0.1,18.016313,27.56373,41312.0,117.3755,81.5,5.2,East Asia & Pacific


In [3]:
X= df.drop(['Region'],1)
y = df['Region']

In [4]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.model_selection import train_test_split as tts,GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [5]:
X_train,X_test,y_train,y_test = tts(X,y,random_state=42, test_size=0.25)

In [6]:
knn = KNeighborsClassifier()
dte = DecisionTreeClassifier()

In [7]:
knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [8]:
br = BaggingClassifier(base_estimator=dte,
    n_estimators=100,
    bootstrap=True,
    oob_score=True)

In [9]:
br.fit(X_train,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort=False,
                                                        random_state=None,
                                                        splitter='best'),
    

In [10]:
y_pred = br.predict(X_test)

In [11]:
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

                            precision    recall  f1-score   support

                   America       0.67      1.00      0.80         4
       East Asia & Pacific       0.00      0.00      0.00         2
     Europe & Central Asia       0.75      0.86      0.80        14
Middle East & North Africa       0.00      0.00      0.00         4
                South Asia       1.00      1.00      1.00         1
        Sub-Saharan Africa       1.00      1.00      1.00        10

                  accuracy                           0.77        35
                 macro avg       0.57      0.64      0.60        35
              weighted avg       0.69      0.77      0.73        35

0.7714285714285715


In [12]:
rfe = RandomForestClassifier()

In [13]:
rfe

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators='warn',
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [14]:
rfe.fit(X_train,y_train)
y_pred1 = rfe.predict(X_test)
print(classification_report(y_test,y_pred1))
print(accuracy_score(y_test,y_pred1))

                            precision    recall  f1-score   support

                   America       0.57      1.00      0.73         4
       East Asia & Pacific       0.00      0.00      0.00         2
     Europe & Central Asia       0.80      0.86      0.83        14
Middle East & North Africa       0.50      0.25      0.33         4
                South Asia       0.00      0.00      0.00         1
        Sub-Saharan Africa       1.00      1.00      1.00        10

                  accuracy                           0.77        35
                 macro avg       0.48      0.52      0.48        35
              weighted avg       0.73      0.77      0.74        35

0.7714285714285715


  'precision', 'predicted', average, warn_for)


In [15]:
import numpy as np

In [16]:
param = {
    'n_estimators' : np.arange(100,300,50),
    'max_depth' : np.arange(4,12,1),
    'max_features' : np.arange(4,8,1),
#     'criterion': ['gini','entropy'],
#     'min_samples_split': np.arange(0.01,0.05,0.01)
}

In [17]:
rfe_cv = GridSearchCV(estimator=rfe,param_grid=param,cv=3)

In [18]:
rfe_cv.fit(X,y)



GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=10, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid='wa

In [19]:
rfe_cv.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=6, max_features=4, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=150,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [20]:
rfe_cv.best_score_

0.762589928057554

In [21]:
rfe_cv.best_estimator_.feature_importances_

array([0.03667506, 0.21113852, 0.08749559, 0.08715011, 0.12142524,
       0.04263076, 0.12759565, 0.11592961, 0.16995947])

In [22]:
from sklearn.ensemble import VotingClassifier

In [23]:
knn = KNeighborsClassifier()
dte =DecisionTreeClassifier()
rfe= RandomForestClassifier()

In [24]:
vc = VotingClassifier(estimators= [('knn',knn),('dt',dte),('rf',rfe)],
    voting='soft',
    weights=None,
#     n_jobs=None,
#     flatten_transform=True,
                     )

In [25]:
vc.fit(X_train,y_train)



VotingClassifier(estimators=[('knn',
                              KNeighborsClassifier(algorithm='auto',
                                                   leaf_size=30,
                                                   metric='minkowski',
                                                   metric_params=None,
                                                   n_jobs=None, n_neighbors=5,
                                                   p=2, weights='uniform')),
                             ('dt',
                              DecisionTreeClassifier(class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,
                                                     max_features=None,
                                                     max_leaf_nodes=None,
                                                     min_impurity_decrease=0.0,
                                                     min_

In [26]:
y_pred2 = vc.predict(X_test)
print(classification_report(y_test,y_pred2))
print(accuracy_score(y_test,y_pred2))

                            precision    recall  f1-score   support

                   America       0.80      1.00      0.89         4
       East Asia & Pacific       0.50      0.50      0.50         2
     Europe & Central Asia       0.76      0.93      0.84        14
Middle East & North Africa       0.00      0.00      0.00         4
                South Asia       1.00      1.00      1.00         1
        Sub-Saharan Africa       1.00      1.00      1.00        10

                  accuracy                           0.83        35
                 macro avg       0.68      0.74      0.70        35
              weighted avg       0.74      0.83      0.78        35

0.8285714285714286


  'precision', 'predicted', average, warn_for)
