# In [None]:
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

###System settings: print DataFrames and arrays in full, without truncation
pd.set_option("display.max_rows", None, "display.max_columns", None)
np.set_printoptions(threshold=sys.maxsize)

###Seed shared by the train-test split and the classifier.
###None keeps the runs non-deterministic (the previous behaviour);
###set it to an integer to make the results reproducible.
RANDOM_STATE = None

###Import data set
student_data3 = pd.read_csv('studentInfo.csv')

###Drop rows containing null values from data set
student_data3 = student_data3.dropna()

###Simplify target attribute 1=Pass, 0=Fail
final_result_codes = {'Distinction': 1, 'Pass': 1, 'Withdrawn': 0, 'Fail': 0}

###Simplify protected attribute 1=high_deprivation, 0=low_deprivation
###NOTE(review): '10-20' has no '%' unlike the other bands - presumably this
###matches the spelling in the raw CSV; confirm against the data.
imd_band_codes = dict.fromkeys(['0-10%', '10-20', '20-30%', '30-40%', '40-50%'], 1)
imd_band_codes.update(dict.fromkeys(['50-60%', '60-70%', '70-80%', '80-90%', '90-100%'], 0))

###Map only the two intended columns instead of replacing values across the
###whole frame, so no other column can be altered by accident and the result
###does not rely on pandas silently downcasting object columns to integers.
student_data3 = student_data3.assign(
    final_result=student_data3['final_result'].map(final_result_codes),
    imd_band=student_data3['imd_band'].map(imd_band_codes),
)

###Values missing from the mappings become NaN; fail loudly rather than
###train on a partially encoded target / protected attribute.
unmapped = student_data3[['final_result', 'imd_band']].isna().any()
if unmapped.any():
    raise ValueError(
        'Unexpected values in column(s): %s' % list(unmapped[unmapped].index)
    )

###Explicit integer dtype: get_dummies must leave imd_band as a single numeric
###column and the classifier needs a numeric target.
student_data3 = student_data3.astype({'final_result': int, 'imd_band': int})

###Drop column id_student
student_data3 = student_data3.drop(columns=['id_student'])


###Split dataset and encode data for training and testing
X = student_data3.drop(columns=['final_result'])
y = student_data3['final_result']
X_enc = pd.get_dummies(X)

###Create train-test split while balancing imd_band values
X_train, X_test, y_train, y_test = train_test_split(
    X_enc,
    y,
    test_size=0.2,
    stratify=student_data3['imd_band'],
    random_state=RANDOM_STATE,
)
check_X = X_test['imd_band'].value_counts()
#print(check_X)

###Create new classifier instance
classifier = DecisionTreeClassifier(
    splitter='random',
    max_depth=10,
    max_features='sqrt',
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=RANDOM_STATE,
)

###Train classifier with train split
classifier.fit(X_train, y_train)

###Create a copy of X_test and save it in a new dataframe
df = X_test.copy()

###Check if indices of X_test and y_test match
print(all(X_test.index == y_test.index))


###Create new column in df and save ground truth target values
###(aligned on the original row index, hence done before the index reset)
df['gt_test_values'] = y_test

###Reset indices of rows in dataframe (the old index is kept as column 'index')
df.reset_index(inplace=True)

###Predict with the classifier on the test set and save values in the data frame.
###predict() returns an array in the row order of X_test, which is also the row
###order of df, so column 'pred_values' lines up with the other values
df['pred_values'] = classifier.predict(X_test)

###Evaluate accuracy, precision and recall of classifier (printed as percentages)
accuracy = accuracy_score(y_test, df['pred_values'])
print('Accuracy: %.2f' % (accuracy*100))

precision = precision_score(y_test, df['pred_values'])
print('Precision: %.2f' % (precision*100))

recall = recall_score(y_test, df['pred_values'])
print('Recall: %.2f' % (recall*100))

###Define function which combines imd_band status and final_result of individual on ground truth data
def f1(row):
    """Label a row by deprivation group and ground-truth outcome.

    Reads ``row['imd_band']`` (1 = high deprivation, 0 = low deprivation)
    and ``row['gt_test_values']`` (1 = pass, 0 = fail) and returns one of
    'highdevPass', 'lowdevPass', 'highdevFail' or 'lowdevFail'.
    Any combination not matched explicitly yields 'lowdevFail'.
    """
    band = row['imd_band']
    if band == 1:
        outcome = row['gt_test_values']
        if outcome == 1:
            return 'highdevPass'
        if outcome == 0:
            return 'highdevFail'
        return 'lowdevFail'
    if band == 0 and row['gt_test_values'] == 1:
        return 'lowdevPass'
    # Catch-all, exactly like the final else of an if/elif chain
    return 'lowdevFail'

###Define function which combines imd_band status and final_result of individual on prediction data 
def f2(row):
    """Label a row by deprivation group and predicted outcome.

    Reads ``row['imd_band']`` (1 = high deprivation, 0 = low deprivation)
    and ``row['pred_values']`` (1 = pass, 0 = fail) and returns one of
    'highdevPass', 'lowdevPass', 'highdevFail' or 'lowdevFail'.
    Any combination not matched explicitly yields 'lowdevFail'.
    """
    group = row['imd_band']
    if group == 1:
        predicted = row['pred_values']
        if predicted == 1:
            return 'highdevPass'
        if predicted == 0:
            return 'highdevFail'
        return 'lowdevFail'
    if group == 0 and row['pred_values'] == 1:
        return 'lowdevPass'
    # Catch-all, exactly like the final else of an if/elif chain
    return 'lowdevFail'

###Define function which combines all attributes to give out TP, TN, FP and FN values for high and low deprivation group
def f3(row):
    """Label a row with its confusion-matrix cell per deprivation group.

    Reads ``row['imd_band']`` (1 = high deprivation, 0 = low deprivation),
    ``row['pred_values']`` and ``row['gt_test_values']`` (1 = pass,
    0 = fail) and returns '<group>TP', '<group>FP', '<group>TN' or
    '<group>FN' with <group> being 'highdev' or 'lowdev'.
    Any combination not matched explicitly yields 'lowdevFN'.
    """
    band = row['imd_band']
    if band == 1:
        tp, fp, tn, fn = 'highdevTP', 'highdevFP', 'highdevTN', 'highdevFN'
    elif band == 0:
        tp, fp, tn, fn = 'lowdevTP', 'lowdevFP', 'lowdevTN', 'lowdevFN'
    else:
        return 'lowdevFN'

    predicted = row['pred_values']
    if predicted == 1:
        truth = row['gt_test_values']
        if truth == 1:
            return tp
        if truth == 0:
            return fp
    elif predicted == 0:
        truth = row['gt_test_values']
        if truth == 0:
            return tn
        if truth == 1:
            return fn
    # Catch-all, exactly like the final else of an if/elif chain
    return 'lowdevFN'

###Ground truth per group: apply function f1 row-wise, then count each label
df['gtCompare'] = df.apply(f1, axis='columns')
conf_matrix0 = df.gtCompare.value_counts()

###Predictions per group: apply function f2 row-wise, then count each label
df['predCompare'] = df.apply(f2, axis='columns')
conf_matrix1 = df.predCompare.value_counts()

###TP/FP/TN/FN per group: apply function f3 row-wise, then count each label
df['confusionValues'] = df.apply(f3, axis='columns')
conf_matrix2 = df.confusionValues.value_counts()

###Print the three count tables one after another
print(conf_matrix0, conf_matrix1, conf_matrix2, sep='\n')