# Classification Modeling 

I tried classification modeling without a lot of success. 

In [121]:
# Imported libraries for data manipulation, analysis, and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, ExtraTreesRegressor 
%matplotlib inline

In [122]:
# Reading in data
aid = pd.read_csv('./aid_data/combined_data/aid_sums.csv')

In [123]:
# Looking at my data
aid.head()

Unnamed: 0,country,world_bank_totals,chinese_aid_totals,usaid_aid,hdi_00,hdi_14,pr_score00,cl_score00,fh_status00,pr_score14,...,resource_rents,gdp_per_cap00,debt_to_gdp,fh_change,pc_gdp_change,hdi_change,world_bank_pc,chinese_aid_pc,usaid_pc,chinese_aid_total_max
0,Algeria,438050500.0,588090100.0,247662100.0,0.646,0.749,6.0,5.0,NF,6.0,...,24.602722,1764.888222,7.673,0.0,2937.203478,0.103,11.254086,15.108798,6.362761,1
1,Angola,803087300.0,38642330000.0,2939893000.0,0.394,0.557,6.0,6.0,NF,6.0,...,23.38193,556.836318,40.676,-1.0,3286.361923,0.163,29.808251,1434.290317,109.120217,1
2,Benin,1107820000.0,1155924000.0,1785451000.0,0.398,0.505,2.0,2.0,F,2.0,...,4.872945,374.192394,30.452,0.0,460.251202,0.107,107.692898,112.369168,173.566471,0
3,Botswana,385871900.0,2563158000.0,2141447000.0,0.578,0.709,2.0,2.0,F,3.0,...,2.516289,3522.308678,17.346,1.0,4341.944603,0.131,184.750231,1227.205008,1025.295634,1
4,Burkina Faso,2858288000.0,0.0,2048434000.0,0.286,0.405,4.0,4.0,PF,6.0,...,16.981603,226.475981,30.387,1.0,413.232114,0.119,162.531851,0.0,116.480855,0


In [124]:
# Printing out my columns 
print(aid.columns)

Index(['country', 'world_bank_totals', 'chinese_aid_totals', 'usaid_aid',
       'hdi_00', 'hdi_14', 'pr_score00', 'cl_score00', 'fh_status00',
       'pr_score14', 'cl_score14', 'fh_status14', 'cpi_2014', 'population',
       'gdp_per_cap14', 'resource_rents', 'gdp_per_cap00', 'debt_to_gdp',
       'fh_change', 'pc_gdp_change', 'hdi_change', 'world_bank_pc',
       'chinese_aid_pc', 'usaid_pc', 'chinese_aid_total_max'],
      dtype='object')


In [125]:
# Scaling my features, assigning that as my X variable 
ss = StandardScaler()
X = ss.fit_transform(aid[['hdi_change', 'fh_change', 'resource_rents', 'pc_gdp_change', 'cpi_2014', 'hdi_14', 'pr_score14', 'cl_score14', 'gdp_per_cap14']])

In [126]:
# Chinese aid totals is my predicted variable 
y = aid['chinese_aid_pc']

## Linear Regression Model

In [127]:
# Train test split and fitting a linear regression model 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22)
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [128]:
# Training score is not good 
lr.score(X_train, y_train)

0.18013500569330487

In [129]:
# Testing score is even worse
lr.score(X_test, y_test)

-1.6022660340505035

## Extra Trees Model

In [130]:
# Fitting an extra trees model 
xt = ExtraTreesRegressor()
xt.fit(X_train, y_train)

ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=None, max_features='auto', max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=1,
                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                    n_estimators=100, n_jobs=None, oob_score=False,
                    random_state=None, verbose=0, warm_start=False)

In [131]:
# Training score means this is probably overfit
xt.score(X_train, y_train)

1.0

In [132]:
# Model is overfit, and testing score is not good
xt.score(X_test, y_test)

-0.998908941820656

## Bagging Regressor 

In [133]:
# Fitting a Bagging Regressor 
br = BaggingRegressor()
br.fit(X_train, y_train)

BaggingRegressor(base_estimator=None, bootstrap=True, bootstrap_features=False,
                 max_features=1.0, max_samples=1.0, n_estimators=10,
                 n_jobs=None, oob_score=False, random_state=None, verbose=0,
                 warm_start=False)

In [134]:
# Training score
br.score(X_train, y_train)

0.5900194029790065

In [135]:
# Testing score, which is not good 
br.score(X_test, y_test)

-4.208294610147553

## Random Forest Regressor 

In [136]:
# Fitting a Random Forest Model 
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [137]:
# Training score 
rf.score(X_train, y_train)

0.769624289223955

In [138]:
# Testing score, not good 
rf.score(X_test, y_test)

-5.302333752181181

### Regression modeling was unsuccessful. I think this shows that Chinese aid is difficult to predict just based on factors like corruption, HDI, type of government, etc. There are a lot of factors that go into Chinese aid that cannot be explained by data alone. 