# Classification Modeling

I chose to do classification modeling with my predicted variable as the whether a country had more Chinese aid than United States aid. I used several of my 

In [73]:
# Importing necessary models 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier

In [74]:
# Reading in data 
aid = pd.read_csv('./aid_data/combined_data/aid_sums.csv')

In [75]:
# Printing out my columns 
print(aid.columns)

Index(['country', 'world_bank_totals', 'chinese_aid_totals', 'usaid_aid',
       'hdi_00', 'hdi_14', 'pr_score00', 'cl_score00', 'fh_status00',
       'pr_score14', 'cl_score14', 'fh_status14', 'cpi_2014', 'population',
       'gdp_per_cap14', 'resource_rents', 'gdp_per_cap00', 'debt_to_gdp',
       'fh_change', 'pc_gdp_change', 'hdi_change', 'world_bank_pc',
       'chinese_aid_pc', 'usaid_pc', 'chinese_aid_total_max'],
      dtype='object')


In [76]:
# Dropping some of my columns 
aid = aid[['country', 'world_bank_totals', 'chinese_aid_totals', 'usaid_aid', 'hdi_14', 'cpi_2014', 'population', 'gdp_per_cap00', 'gdp_per_cap14', 'resource_rents', 'debt_to_gdp', 'hdi_change', 'fh_change', 'pc_gdp_change', 'chinese_aid_total_max']]

In [77]:
# Looking at my new data frame 
aid.head()

Unnamed: 0,country,world_bank_totals,chinese_aid_totals,usaid_aid,hdi_14,cpi_2014,population,gdp_per_cap00,gdp_per_cap14,resource_rents,debt_to_gdp,hdi_change,fh_change,pc_gdp_change,chinese_aid_total_max
0,Algeria,438050500.0,588090100.0,247662100.0,0.749,36.0,38923687,1764.888222,4702.0917,24.602722,7.673,0.103,0.0,2937.203478,1
1,Angola,803087300.0,38642330000.0,2939893000.0,0.557,19.0,26941779,556.836318,3843.198241,23.38193,40.676,0.163,-1.0,3286.361923,1
2,Benin,1107820000.0,1155924000.0,1785451000.0,0.505,39.0,10286842,374.192394,834.443596,4.872945,30.452,0.107,0.0,460.251202,0
3,Botswana,385871900.0,2563158000.0,2141447000.0,0.709,63.0,2088614,3522.308678,7864.253281,2.516289,17.346,0.131,1.0,4341.944603,1
4,Burkina Faso,2858288000.0,0.0,2048434000.0,0.405,38.0,17586017,226.475981,639.708096,16.981603,30.387,0.119,1.0,413.232114,0


In [109]:
# Model needs to beat the baseline score of 63.2%
aid['chinese_aid_total_max'].mean()

0.36764705882352944

In [78]:
# Scaling my features and setting my X and y variable 
ss = StandardScaler()
X_scaled = ss.fit_transform(aid[['cpi_2014', 'resource_rents', 'debt_to_gdp', 'hdi_change', 'fh_change', 'pc_gdp_change']])
y = aid['chinese_aid_total_max']

In [81]:
# Train test split my data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, random_state=22)

In [82]:
# Pipeline with Random Forest model 
pipe = Pipeline([('rf', RandomForestClassifier())])
pipe_params = {
    'rf__n_estimators' : [10, 100, 1000, 2000],
    'rf__max_features' : ['auto', 'sqrt'],
    'rf__max_depth' : [1, 5, 20, 30],
    'rf__min_samples_split' : [2, 5, 10], 
    'rf__min_samples_leaf' : [1, 2, 4],
    'rf__bootstrap' : [True, False]
    
}

gs = GridSearchCV(pipe, 
                  param_grid=pipe_params) 
# Fit GridSearch to training data.
gs.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('rf',
                                        RandomForestClassifier(bootstrap=True,
                                                               ccp_alpha=0.0,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features='auto',
                                                               max_leaf_nodes=None,
                                                               max_samples=None,
                                                               min_impurity_decrease=0.0,
                                                               min_impurity_split=None,
                                            

In [83]:
# Printing the best score 
print(gs.best_score_)

0.6472727272727272


In [84]:
# Looking at my best parameters 
gs.best_params_

{'rf__bootstrap': True,
 'rf__max_depth': 1,
 'rf__max_features': 'auto',
 'rf__min_samples_leaf': 4,
 'rf__min_samples_split': 2,
 'rf__n_estimators': 100}

In [85]:
# Setting my best estimator as the model 
gs_model = gs.best_estimator_

In [86]:
# Training score 
gs_model.score(X_train, y_train)

0.6470588235294118

In [87]:
# Testing score, did not beat baseline 
gs_model.score(X_test, y_test)

0.5882352941176471

In [88]:
# Pipeline for Extra Trees Classifier 
pipe2 = Pipeline([('xt', ExtraTreesClassifier())])
pipe_params2 = {
    'xt__n_estimators': [5, 10, 15], 
    'xt__criterion': ['gini', 'entropy'],  
    'xt__max_depth': [1, 5, 20, 30],
    'xt__bootstrap': [True, False],
}

gs2 = GridSearchCV(pipe2, 
                  param_grid=pipe_params2) 
# Fit GridSearch to training data.
gs2.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('xt',
                                        ExtraTreesClassifier(bootstrap=False,
                                                             ccp_alpha=0.0,
                                                             class_weight=None,
                                                             criterion='gini',
                                                             max_depth=None,
                                                             max_features='auto',
                                                             max_leaf_nodes=None,
                                                             max_samples=None,
                                                             min_impurity_decrease=0.0,
                                                             min_impurity_split=None,
                                                             mi

In [89]:
# Looking at my best score 
print(gs2.best_score_)

0.6472727272727272


In [90]:
# Looking at my best parameters 
gs2.best_params_

{'xt__bootstrap': True,
 'xt__criterion': 'gini',
 'xt__max_depth': 1,
 'xt__n_estimators': 5}

In [91]:
# Setting the model to be the best estimator 
gs_model2 = gs2.best_estimator_

In [92]:
# Training score 
gs_model2.score(X_train, y_train)

0.6470588235294118

In [93]:
# Testing score, did not beat baseline 
gs_model2.score(X_test, y_test)

0.5882352941176471

In [94]:
# Pipeline with AdaBoost Classifier 
pipe3 = Pipeline([('ada', AdaBoostClassifier())])
pipe_params3 = {
    'ada__n_estimators': [20, 50, 100], 
    'ada__learning_rate': [1, 5, 10],  
}

gs3 = GridSearchCV(pipe3, 
                  param_grid=pipe_params3) 
# Fit GridSearch to training data.
gs3.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('ada',
                                        AdaBoostClassifier(algorithm='SAMME.R',
                                                           base_estimator=None,
                                                           learning_rate=1.0,
                                                           n_estimators=50,
                                                           random_state=None))],
                                verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'ada__learning_rate': [1, 5, 10],
                         'ada__n_estimators': [20, 50, 100]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [95]:
# Looking at best score 
print(gs3.best_score_)

0.5490909090909091


In [96]:
# Best parameters 
gs3.best_params_

{'ada__learning_rate': 10, 'ada__n_estimators': 20}

In [97]:
# Setting my model as the best estimator 
gs_model3 = gs3.best_estimator_

In [98]:
# Training score 
gs_model3.score(X_train, y_train)

0.43137254901960786

In [99]:
# Testing score, does not beat baseline 
gs_model3.score(X_test, y_test)

0.4117647058823529

In [100]:
# Pipeline with K nearest neighbors 
pipe4 = Pipeline([('knn', KNeighborsClassifier())])
pipe_params4 = {
    'knn__n_neighbors': [3, 5, 7, 9], 
    'knn__weights': ['uniform', 'distance'],  
    'knn__leaf_size': [10, 30, 50],
    'knn__n_jobs': [None, 3, 5]
}

gs4 = GridSearchCV(pipe4, 
                  param_grid=pipe_params4) 
# Fit GridSearch to training data.
gs4.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('knn',
                                        KNeighborsClassifier(algorithm='auto',
                                                             leaf_size=30,
                                                             metric='minkowski',
                                                             metric_params=None,
                                                             n_jobs=None,
                                                             n_neighbors=5, p=2,
                                                             weights='uniform'))],
                                verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'knn__leaf_size': [10, 30, 50],
                         'knn__n_jobs': [None, 3, 5],
                         'knn__n_neighbors': [3, 5, 7, 9],
                         'knn__weights': ['uniform', 

In [101]:
# best score 
print(gs4.best_score_)

0.6236363636363637


In [102]:
# best parameters 
gs4.best_params_

{'knn__leaf_size': 10,
 'knn__n_jobs': None,
 'knn__n_neighbors': 5,
 'knn__weights': 'uniform'}

In [103]:
# Setting the model to be the best estimator 
gs_model4 = gs4.best_estimator_

In [104]:
# Training score 
gs_model4.score(X_train, y_train)

0.7843137254901961

In [105]:
# Testing score barely beat out the training score 
gs_model4.score(X_test, y_test)

0.6470588235294118

### Conclusions 

The classification models were not more successful than the regression models. The data set probably is probably not big enough for this to work well, and I think it is hard to predict whether a country will be a recipient of Chinese aid based on this data. There are other geo-political factors that are likely much more important. 