# Data analysis, classifying models and predictions

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#from statistics import mean, variance, stdev
#from statsmodels.distributions.empirical_distribution import ECDF
#from math import sqrt

%matplotlib inline
#%config InlineBackend.figure_format = 'svg'

# Lift every pandas display limit by setting each of these options to None,
# so DataFrames shown in the notebook are not truncated.
for _display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(_display_option, None)

# Load the cleaned dataset from the CSV file in the working directory
data = pd.read_csv(filepath_or_buffer='data_clean.csv')

## Dropping data leakage columns and checking correlations with target column 'out_of_school'

In [None]:
# Drop, one at a time and in place, the columns flagged as data leakage
# for the 'out_of_school' target, followed by the 'id' column.
for _column_to_drop in (
    'num_kids_noschool',
    'num_kids_left_school',
    'num_girls_noschool',
    'num_girls_left_school',
    'id',
):
    data.drop(columns=_column_to_drop, inplace=True)

In [None]:
# Display the (rows, columns) shape of the dataset after the column drops
data.shape

In [None]:
#Check correlations with target column 'out_of_school' to select features
# Pairwise correlation matrix over the columns of `data` (pandas default method).
# NOTE(review): with pandas >= 2.0, DataFrame.corr() raises on non-numeric
# columns unless numeric_only=True is passed — confirm every remaining column
# of data_clean.csv is numeric.
corr = data.corr()
#corr

In [None]:
# Absolute correlation of every column with the target, shown in ascending order
corr['out_of_school'].abs().sort_values()

In [None]:
# Absolute correlation of every column with the target variable
cor_target = corr["out_of_school"].abs()

# For each threshold 0.00, 0.01, ..., 0.99 count the columns whose absolute
# correlation is strictly above it.
# NOTE: cor_target still contains the 'out_of_school' entry itself, so that
# entry is included in the counts.
thresholds = [step / 100 for step in range(100)]
y = [len(cor_target[cor_target > threshold]) for threshold in thresholds]

# Draw the threshold-vs-count curve on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(11, 6))
sns.lineplot(x=thresholds, y=y, ax=ax)

ax.set_title('Feature correlation importance', fontsize=20)
ax.set_xlabel('threshold', fontsize=18)
ax.set_ylabel('features amount', fontsize=18)

# Fixed axis ranges: thresholds span 0..1, counts are shown up to 45
ax.set_xlim(0, 1.0)
ax.set_ylim(0, 45)

fig.savefig('output_figures/features_correlation.pdf')

In [None]:
# Absolute correlation of every column with the target variable
cor_target = corr["out_of_school"].abs()
# Keep only the entries whose absolute correlation exceeds 0.1
# (you can play with the threshold value to see how the classifier changes)
relevant_features = cor_target.loc[cor_target.gt(0.1)]
# Show the retained entries in ascending order of correlation
relevant_features.sort_values()

In [None]:
# Feature names: every entry that passed the correlation threshold, except
# the target column itself.
feat_list = [name for name in relevant_features.keys() if name != 'out_of_school']
classes = data['out_of_school']

# Select the features and the target in a single indexing step (target last),
# then replace missing values with 0. This avoids assigning a new column onto
# a slice of `data`, which triggers pandas' SettingWithCopyWarning, and it
# removes the throw-away empty DataFrame that was immediately overwritten.
clf_data = data[feat_list + ['out_of_school']].fillna(0)

In [None]:
# Display the (rows, columns) shape of the classifier dataset
clf_data.shape

In [None]:
# Display the column labels of the classifier dataset
clf_data.keys()

In [None]:
# Save the selected feature names, one per line, with no trailing newline
with open("output_files/important_features_list.txt", "w") as outfile:
    print("\n".join(feat_list), end="", file=outfile)

## Construct Decision Tree Classifier with train/test split dataset

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
#from sklearn.externals.six import StringIO  
from io import StringIO
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus

#y=np.array(clf_data['school_drop_corr']).reshape(-1,1)
#X=np.array(clf_data[feat_list])

# Feature matrix and target vector taken from the classifier dataset
X, y = clf_data[feat_list], clf_data['out_of_school']

# Hold out 25% of the rows for validation; the fixed seed makes the split repeatable
train_X, val_X, train_y, val_y = train_test_split(
    X, y, test_size=0.25, random_state=1
)

# Unconstrained tree: fit() returns the estimator, so build and train in one step
data_model = DecisionTreeClassifier(random_state=1).fit(train_X, train_y)
y_predict = data_model.predict(val_X)

# Report accuracy on the held-out rows as a percentage
val_accuracy = metrics.accuracy_score(val_y, y_predict) * 100
print("Accuracy: %3.1f %%" % val_accuracy)

In [None]:
# Write the fitted tree as Graphviz DOT text into an in-memory buffer
dot_data = StringIO()
export_graphviz(
    data_model,
    out_file=dot_data,
    feature_names=feat_list,
    class_names=['attend', 'drop'],
    filled=True,
    rounded=True,
    special_characters=True,
)

# Build the graph from the DOT text, save it as a PDF and show a PNG inline
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf("output_figures/decision_tree_picaps.pdf")
Image(graph.create_png())

### Tuning the decision tree depth by checking accuracy

In [None]:
# function which calculates accuracy
def get_acc(max_depth, train_X, val_X, train_y, val_y):
    """Return the validation accuracy, in percent, of a depth-limited tree.

    A ``DecisionTreeClassifier`` with the given ``max_depth`` and
    ``random_state=0`` is fitted on ``train_X``/``train_y``; its predictions
    for ``val_X`` are scored against ``val_y`` with ``accuracy_score`` and
    the result is multiplied by 100.
    """
    tree = DecisionTreeClassifier(max_depth=max_depth, random_state=0)
    predictions = tree.fit(train_X, train_y).predict(val_X)
    return metrics.accuracy_score(val_y, predictions) * 100

# Compare validation accuracy for differing values of max_depth (2-10, 15, 50)
for max_depth in [*range(2, 11), 15, 50]:
    my_acc = get_acc(
        max_depth,
        train_X=train_X,
        val_X=val_X,
        train_y=train_y,
        val_y=val_y,
    )
    print("Max depth: %d  \t\t Accuracy:  %3.1f" % (max_depth, my_acc))

In [None]:
# Refit with the tree depth selected from the accuracy comparison (max_depth=4)
data_model = DecisionTreeClassifier(max_depth=4, random_state=1).fit(train_X, train_y)
pred_y = data_model.predict(val_X)

# Write the tuned tree as Graphviz DOT text into an in-memory buffer
dot_data = StringIO()
export_graphviz(
    data_model,
    out_file=dot_data,
    feature_names=feat_list,
    class_names=['attend', 'out'],
    filled=True,
    rounded=True,
    special_characters=True,
)

# Build the graph from the DOT text, save it as a PDF and show a PNG inline
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf("output_figures/decision_tree_tuned_picaps.pdf")
Image(graph.create_png())

### Checking the importance of each feature in the classification process

In [None]:
import eli5
from eli5.sklearn import PermutationImportance
from IPython.display import display

# Permutation importance of each feature for the current model,
# computed on the validation split
perm = PermutationImportance(data_model, random_state=1)
perm.fit(val_X, val_y)

# Render the importance table, labelled with the validation column names
display(eli5.show_weights(perm, feature_names=val_X.columns.to_list()))