# Classification Modeling

In [1]:
# The long list of packages I need to run all my models
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier

In [2]:
aid = pd.read_csv('./aid_data/combined_data/aid_sums.csv')

In [43]:
aid.head()

Unnamed: 0,country,world_bank_totals,chinese_aid_totals,usaid_aid,hdi_00,hdi_14,pr_score00,cl_score00,fh_status00,pr_score14,...,debt_to_gdp,fh_total00,fh_total14,fh_change,hdi_change,world_bank_pc,chinese_aid_pc,usaid_pc,max_total_aid,chinese_aid_total_max
0,Algeria,438050500.0,567497100.0,247662100.0,0.646,0.749,6.0,5.0,NF,6.0,...,7.673,11.0,11.0,0.0,0.103,11.254086,14.579737,6.362761,567497100.0,1
1,Angola,803087300.0,38249270000.0,2939893000.0,0.394,0.557,6.0,6.0,NF,6.0,...,40.676,12.0,11.0,-1.0,0.163,29.808251,1419.701152,109.120217,38249270000.0,1
2,Benin,1107820000.0,1212040000.0,1785451000.0,0.398,0.505,2.0,2.0,F,2.0,...,30.452,4.0,4.0,0.0,0.107,107.692898,117.824339,173.566471,1785451000.0,0
3,Botswana,385871900.0,2585561000.0,2141447000.0,0.578,0.709,2.0,2.0,F,3.0,...,17.346,4.0,5.0,1.0,0.131,184.750231,1237.931268,1025.295634,2585561000.0,1
4,Burkina Faso,2858288000.0,0.0,2048434000.0,0.286,0.405,4.0,4.0,PF,6.0,...,30.387,8.0,9.0,1.0,0.119,162.531851,0.0,116.480855,2048434000.0,0


In [44]:
print(aid.columns)

Index(['country', 'world_bank_totals', 'chinese_aid_totals', 'usaid_aid',
       'hdi_00', 'hdi_14', 'pr_score00', 'cl_score00', 'fh_status00',
       'pr_score14', 'cl_score14', 'fh_status14', 'cpi_2014', 'population',
       'gdp_per_cap', 'resource_rents', 'debt_to_gdp', 'fh_total00',
       'fh_total14', 'fh_change', 'hdi_change', 'world_bank_pc',
       'chinese_aid_pc', 'usaid_pc', 'max_total_aid', 'chinese_aid_total_max'],
      dtype='object')


In [45]:
aid = aid[['country', 'world_bank_totals', 'chinese_aid_totals', 'usaid_aid', 'hdi_14', 'cpi_2014', 'population', 'gdp_per_cap', 'resource_rents', 'debt_to_gdp', 'hdi_change', 'fh_change', 'chinese_aid_total_max']]

In [46]:
ss = StandardScaler()
X_scaled = ss.fit_transform(aid[['hdi_14', 'cpi_2014', 'population', 'gdp_per_cap', 'resource_rents', 'debt_to_gdp', 'hdi_change', 'fh_change']])

In [47]:
aid.head()

Unnamed: 0,country,world_bank_totals,chinese_aid_totals,usaid_aid,hdi_14,cpi_2014,population,gdp_per_cap,resource_rents,debt_to_gdp,hdi_change,fh_change,chinese_aid_total_max
0,Algeria,438050500.0,567497100.0,247662100.0,0.749,36.0,38923687,4702.0917,24.602722,7.673,0.103,0.0,1
1,Angola,803087300.0,38249270000.0,2939893000.0,0.557,19.0,26941779,3843.198241,23.38193,40.676,0.163,-1.0,1
2,Benin,1107820000.0,1212040000.0,1785451000.0,0.505,39.0,10286842,834.443596,4.872945,30.452,0.107,0.0,0
3,Botswana,385871900.0,2585561000.0,2141447000.0,0.709,63.0,2088614,7864.253281,2.516289,17.346,0.131,1.0,1
4,Burkina Faso,2858288000.0,0.0,2048434000.0,0.405,38.0,17586017,639.708096,16.981603,30.387,0.119,1.0,0


In [48]:
# Assigning my X and y variables
X = aid[['hdi_14', 'cpi_2014', 'population', 'gdp_per_cap', 'resource_rents', 'debt_to_gdp', 'hdi_change', 'fh_change']]
y = aid['chinese_aid_total_max']

In [49]:
# Train test split my data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, random_state=22)

In [50]:
pipe = Pipeline([('rf', RandomForestClassifier())])
pipe_params = {
    'rf__n_estimators' : [10, 100, 1000, 2000],
    'rf__max_features' : ['auto', 'sqrt'],
    'rf__max_depth' : [1, 5, 20, 30],
    'rf__min_samples_split' : [2, 5, 10], 
    'rf__min_samples_leaf' : [1, 2, 4],
    'rf__bootstrap' : [True, False]
    
}

gs = GridSearchCV(pipe, 
                  param_grid=pipe_params) 
# Fit GridSearch to training data.
gs.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('rf',
                                        RandomForestClassifier(bootstrap=True,
                                                               ccp_alpha=0.0,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features='auto',
                                                               max_leaf_nodes=None,
                                                               max_samples=None,
                                                               min_impurity_decrease=0.0,
                                                               min_impurity_split=None,
                                            

In [51]:
# Printing the best score 
print(gs.best_score_)

0.6472727272727272


In [52]:
gs.best_params_

{'rf__bootstrap': True,
 'rf__max_depth': 20,
 'rf__max_features': 'sqrt',
 'rf__min_samples_leaf': 2,
 'rf__min_samples_split': 2,
 'rf__n_estimators': 10}

In [53]:
gs_model = gs.best_estimator_

In [54]:
gs_model.score(X_train, y_train)

0.9411764705882353

In [55]:
gs_model.score(X_test, y_test)

0.6111111111111112

In [80]:
pipe2 = Pipeline([('xt', ExtraTreesClassifier())])
pipe_params2 = {
    'xt__n_estimators': [5, 10, 15], 
    'xt__criterion': ['gini', 'entropy'],  
    'xt__max_depth': [1, 5, 20, 30],
    'xt__bootstrap': [True, False],
}

gs2 = GridSearchCV(pipe2, 
                  param_grid=pipe_params2) 
# Fit GridSearch to training data.
gs2.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('xt',
                                        ExtraTreesClassifier(bootstrap=False,
                                                             ccp_alpha=0.0,
                                                             class_weight=None,
                                                             criterion='gini',
                                                             max_depth=None,
                                                             max_features='auto',
                                                             max_leaf_nodes=None,
                                                             max_samples=None,
                                                             min_impurity_decrease=0.0,
                                                             min_impurity_split=None,
                                                             mi

In [89]:
print(gs2.best_score_)

0.6709090909090909


In [90]:
gs2.best_params_

{'xt__bootstrap': True,
 'xt__criterion': 'gini',
 'xt__max_depth': 5,
 'xt__n_estimators': 15}

In [91]:
gs_model2 = gs2.best_estimator_

In [92]:
gs_model2.score(X_train, y_train)

0.9019607843137255

In [93]:
gs_model2.score(X_test, y_test)

0.6111111111111112

In [86]:
pipe3 = Pipeline([('ada', AdaBoostClassifier())])
pipe_params3 = {
    'ada__n_estimators': [20, 50, 100], 
    'ada__learning_rate': [1, 5, 10],  
}

gs3 = GridSearchCV(pipe3, 
                  param_grid=pipe_params3) 
# Fit GridSearch to training data.
gs3.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('ada',
                                        AdaBoostClassifier(algorithm='SAMME.R',
                                                           base_estimator=None,
                                                           learning_rate=1.0,
                                                           n_estimators=50,
                                                           random_state=None))],
                                verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'ada__learning_rate': [1, 5, 10],
                         'ada__n_estimators': [20, 50, 100]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [87]:
print(gs3.best_score_)

0.5490909090909091


In [88]:
gs3.best_params_

{'ada__learning_rate': 10, 'ada__n_estimators': 20}

In [94]:
gs_model3 = gs3.best_estimator_

In [95]:
gs_model3.score(X_train, y_train)

0.5882352941176471

In [96]:
gs_model3.score(X_test, y_test)

0.6666666666666666

In [97]:
pipe4 = Pipeline([('knn', KNeighborsClassifier())])
pipe_params4 = {
    'knn__n_neighbors': [3, 5, 7, 9], 
    'knn__weights': ['uniform', 'distance'],  
    'knn__leaf_size': [10, 30, 50],
    'knn__n_jobs': [None, 3, 5]
}

gs4 = GridSearchCV(pipe4, 
                  param_grid=pipe_params4) 
# Fit GridSearch to training data.
gs4.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('knn',
                                        KNeighborsClassifier(algorithm='auto',
                                                             leaf_size=30,
                                                             metric='minkowski',
                                                             metric_params=None,
                                                             n_jobs=None,
                                                             n_neighbors=5, p=2,
                                                             weights='uniform'))],
                                verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'knn__leaf_size': [10, 30, 50],
                         'knn__n_jobs': [None, 3, 5],
                         'knn__n_neighbors': [3, 5, 7, 9],
                         'knn__weights': ['uniform', 

In [98]:
print(gs4.best_score_)

0.6690909090909092


In [99]:
gs4.best_params_

{'knn__leaf_size': 10,
 'knn__n_jobs': None,
 'knn__n_neighbors': 3,
 'knn__weights': 'uniform'}

In [100]:
gs_model4 = gs4.best_estimator_

In [101]:
gs_model4.score(X_train, y_train)

0.803921568627451

In [102]:
gs_model4.score(X_test, y_test)

0.7222222222222222