Sources:
    
http://dataaspirant.com/2017/02/01/decision-tree-algorithm-python-with-scikit-learn/

http://scikit-learn.org/stable/modules/tree.html

# Decision Trees

### Things to keep in mind with decision trees:
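* Classification trees predict discrete class labels, so a continuous target (such as GPA below) has to be binned into classes first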
* They are prone to overfitting

In [None]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.tree import DecisionTreeClassifier

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # very old scikit-learn releases only ship the legacy module
    from sklearn.cross_validation import train_test_split

### Set input variables

In [2]:
# Input CSVs (relative to the notebook's working directory) and the folder
# where generated artifacts are written.
background = "../../ai4all_data/background.csv"
train = "../../ai4all_data/train.csv"
output_dir = "../output"

# exist_ok=True creates the folder when needed and is a no-op when it is
# already there, without the check-then-create race of os.path.exists().
os.makedirs(output_dir, exist_ok=True)

### Read in data

In [3]:
# Load the full background table; low_memory=False makes pandas read the file
# in one pass instead of inferring dtypes chunk by chunk.
data_frame = pd.read_csv(
    background,
    low_memory=False,
)

In [4]:
# Dimensions of the raw background table as (rows, columns).
data_frame.shape

(4242, 12943)

In [5]:
# Re-read the raw table so this cell always starts from the untouched CSV and
# can be re-run on its own.
data_frame = pd.read_csv(background, low_memory=False)

num_samples = data_frame.shape[0]
# Sanity check: challengeID must be exactly 1..num_samples, in file order.
assert data_frame['challengeID'].tolist() == list(range(1, num_samples + 1))

data_frame = data_frame.set_index('challengeID')

# Recode the literal string 'missing' as -3 so the affected columns can become numeric.
data_frame = data_frame.replace('missing', -3)


def _to_numeric_if_possible(column):
    """Return `column` converted to a numeric dtype, or unchanged if it cannot be parsed.

    Explicit replacement for the deprecated ``pd.to_numeric(..., errors='ignore')``.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


data_frame = data_frame.apply(_to_numeric_if_possible)
# Drop every column that is still non-numeric after the conversion attempt.
data_frame = data_frame.select_dtypes(include=[np.number])

In [6]:
# Training outcomes keyed by challengeID, keeping only rows with a recorded GPA.
outcome = (
    pd.read_csv(train, low_memory=False)
    .set_index('challengeID')
    .dropna(subset=['gpa'])
)
# Number of numeric feature columns that survived the cleaning step.
len(data_frame.columns)

12795

### Pick out the students whose language and literacy skills ('t5c13a'), social science skills ('t5c13b'), math skills ('t5c13c') and GPA are all present.
In other words, we drop every student who has a NaN in any of these values.

In [8]:
# Keep only the students that also appear in the outcome table.
has_outcome = data_frame.index.isin(outcome.index.values)
data_frame = data_frame.loc[has_outcome]

# One frame per skill column, each restricted to rows where that skill is not NaN.
lang_lit = data_frame.loc[data_frame['t5c13a'].notnull()]
science_social = data_frame.loc[data_frame['t5c13b'].notnull()]
math = data_frame.loc[data_frame['t5c13c'].notnull()]

In [9]:
from functools import reduce

# challengeIDs present in all three per-skill frames (pairwise intersection,
# folded left to right).
skill_index_arrays = [
    frame.index.values for frame in (lang_lit, science_social, math)
]
common_indices = reduce(np.intersect1d, skill_index_arrays)

### Pick out students whose skills are non-missing (values greater than 0)

In [10]:
# Feature matrix: the three skill columns for students with no NaN skills ...
skill_columns = ['t5c13a', 't5c13b', 't5c13c']
nonmissing_X = data_frame.loc[common_indices, skill_columns]
# ... further restricted to rows where every skill value is strictly positive.
all_positive = nonmissing_X.gt(0).all(axis=1)
X = nonmissing_X[all_positive]

In [11]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0_level_0,t5c13a,t5c13b,t5c13c
challengeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9,1,2,2
14,3,4,4
16,1,2,2
18,2,2,2
23,2,4,3


### Now pick out the corresponding GPA

In [23]:
# Select the outcome rows by X's index, in X's order. train_test_split pairs
# rows by position, so X and GPA must be row-aligned; filtering with isin()
# would keep outcome's own row order instead.
Y = outcome.loc[X.index]
# Bin GPA into integer class labels: truncate, then add 1. Fractional GPAs are
# rounded up (2.25 -> 3), but a whole-number GPA also moves up one class
# (2.0 -> 3), so this is floor + 1 rather than a strict ceiling.
GPA = Y['gpa'].astype('int')+1

In [16]:
# Preview the first outcome rows selected for the students in X.
Y.head()

Unnamed: 0_level_0,gpa,grit,materialHardship,eviction,layoff,jobTraining
challengeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
9,2.25,4.0,0.181818,0.0,0.0,0.0
14,3.25,2.75,0.272727,0.0,1.0,0.0
16,2.0,3.5,0.090909,0.0,0.0,1.0
18,2.25,3.0,0.0,0.0,1.0,0.0
23,2.5,3.25,0.0,0.0,0.0,0.0


In [24]:
# Hold out 30% of the students for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    GPA,
    test_size=0.3,
    random_state=100,
)

In [25]:
# Shallow Gini-impurity tree: at most 3 levels and at least 5 samples per leaf.
clf_gini = DecisionTreeClassifier(
    criterion="gini",
    max_depth=3,
    min_samples_leaf=5,
    random_state=100,
)
clf_gini.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=100,
            splitter='best')

### TODO: Figure out how to draw graphs

In [50]:
# Disabled sketch for the TODO above: write the fitted tree to 'tree.dot' and
# render it to 'somefile.png' through pydot.
# NOTE(review): presumably needs the pydot package plus a Graphviz install -- confirm before enabling.
# from sklearn import tree
# tree.export_graphviz(clf_gini,out_file='tree.dot')
# import pydot
# (graph,) = pydot.graph_from_dot_file('tree.dot')
# graph.write_png('somefile.png')

In [38]:
# Fraction of held-out students whose GPA class the tree predicts exactly.
pred = clf_gini.predict(X_test)
accuracy = accuracy_score(y_test, pred)
accuracy

0.52036199095022628

### Find root mean squared error (RMSE) of prediction

In [42]:
# Root mean squared error between true and predicted GPA classes, measured in
# class units. Arguments follow scikit-learn's (y_true, y_pred) order.
rmse = np.sqrt(mean_squared_error(y_test.values, pred))
rmse

0.78446454055273618

# TODO: interpreting created trees

# K-Nearest Neighbors Classification
http://scikit-learn.org/stable/modules/neighbors.html

TODO: add visualization for clustering

In [71]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 20-neighbour classifier backed by a ball tree on the training split.
nbrs = KNeighborsClassifier(algorithm='ball_tree', n_neighbors=20)
nbrs.fit(X_train, y_train)
# For every training sample: distances to, and positions of, its nearest neighbours.
distances, indices = nbrs.kneighbors(X_train)

### Show connections between neighbors

In [64]:
# Neighbour connectivity of the training samples, densified for display.
neighbor_graph = nbrs.kneighbors_graph(X_train)
neighbor_graph.toarray()

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  1., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  1., ...,  1.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [72]:
# Fraction of held-out students whose GPA class the KNN model predicts exactly.
pred = nbrs.predict(X_test)
accuracy = accuracy_score(y_test, pred)
accuracy

0.52036199095022628

Exercise: Try changing the n_neighbors variable, see what happens.

In [85]:
# Test accuracy for each candidate n_neighbors. The result array is sized from
# the range itself, so changing the range cannot leave `results` out of sync.
neighbor_counts = range(2, 30)
results = np.zeros(len(neighbor_counts))
for slot, k in enumerate(neighbor_counts):
    # nbrs / pred / accuracy are deliberately re-bound on every pass, as in
    # the original loop; after the loop they describe the last k tried.
    nbrs = KNeighborsClassifier(n_neighbors=k, algorithm='ball_tree').fit(X_train, y_train)
    pred = nbrs.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    results[slot] = accuracy

In [None]:
# Accuracy of each KNN model against its n_neighbors setting. The x values
# start at 2 (the first n_neighbors tried) and are sized from `results`.
fig, ax = plt.subplots()
ax.scatter(list(range(2, 2 + len(results))), results)
ax.set_xlabel('n_neighbors')
ax.set_ylabel('Test accuracy')
ax.set_title('KNN test accuracy vs. number of neighbors')
plt.show()