# Import our data from a file

We'll use this data to train and then test our machine learning model.

In [53]:
import pandas as pd
# Load the whole dataset; index_col=False tells pandas not to use the first
# CSV column as the row index, so a default integer index is used instead.
data = pd.read_csv(
    "dataset.csv",
    index_col=False,
)

In [54]:
# Preview only the first rows instead of dumping the entire frame.
data.head(10)

Unnamed: 0,University,GPA,Postcode,Tribe,Approval
0,U of Z,4.2,4010,C,0
1,U of X,5.0,4016,D,1
2,U of Y,4.2,4015,D,0
3,U of Z,4.0,4011,C,0
4,U of X,6.5,4000,A,1
5,U of Y,5.0,4006,B,1
6,U of X,5.0,4011,C,1
7,U of Y,5.5,4001,A,1
8,U of X,5.0,4002,A,1
9,U of Z,6.0,4016,D,0


# Split out training vs test data and result labels

In [55]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing. With shuffle=False the rows keep their
# original order, so the test split is the tail of the dataset.
train, test = train_test_split(
    data,
    test_size=0.3,
    shuffle=False,
)

In [56]:
# Preview only the first rows of the training split.
train.head(10)

Unnamed: 0,University,GPA,Postcode,Tribe,Approval
0,U of Z,4.2,4010,C,0
1,U of X,5.0,4016,D,1
2,U of Y,4.2,4015,D,0
3,U of Z,4.0,4011,C,0
4,U of X,6.5,4000,A,1
5,U of Y,5.0,4006,B,1
6,U of X,5.0,4011,C,1
7,U of Y,5.5,4001,A,1
8,U of X,5.0,4002,A,1
9,U of Z,6.0,4016,D,0


In [57]:
# Preview only the first rows of the test split.
test.head(10)

Unnamed: 0,University,GPA,Postcode,Tribe,Approval
35,U of Z,4.3,4002,A,0
36,U of Y,6.0,4015,D,1
37,U of Z,5.0,4001,A,0
38,U of Y,5.5,4012,C,0
39,U of X,6.3,4001,A,1
40,U of Z,6.0,4010,C,0
41,U of X,5.5,4015,D,1
42,U of Z,3.9,4001,A,0
43,U of Z,6.0,4015,D,0
44,U of Z,6.0,4001,A,1


# Encode category values as numbers

This is needed because the machine learning algorithm only deals with numbers. We use a one-hot (binary indicator) encoding — one 0/1 column per category value — instead of a single integer code to avoid introducing an artificial ordering, e.g. the model treating the average of university 1 and university 3 as university 2.

In [58]:
from sklearn.preprocessing import LabelBinarizer
def enc(df, column):
    """One-hot encode ``df[column]`` using the categories of the full dataset.

    The binarizer is fitted on the module-level ``data`` frame rather than on
    ``df`` itself, so every frame passed in (e.g. ``train`` and ``test``)
    yields the same indicator columns in the same order, even when some
    category values do not occur in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the rows to encode; must contain ``column``.
    column : str
        Name of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        One 0/1 indicator column per category value, labelled with the
        category value converted to ``str`` and sharing ``df``'s index.
    """
    lb = LabelBinarizer()
    lb.fit(data[column].unique())
    encoded = lb.transform(df[column])
    if len(lb.classes_) == 2:
        # LabelBinarizer collapses a two-class feature into a single 0/1
        # column (1 meaning classes_[1]). Expand it back to one indicator per
        # class so the data matches the column labels built below.
        encoded = [[1 - flag, flag] for flag in encoded[:, 0]]
    # Use string labels so the concatenated feature frame never mixes column
    # name types (e.g. numeric postcodes next to string names), which recent
    # scikit-learn versions reject.
    labels = [str(value) for value in lb.classes_]
    return pd.DataFrame(encoded, columns=labels, index=df.index)
def build(df):
    """Assemble the model's feature frame from ``df``.

    The columns are laid out left to right as: the encoded University
    columns, the raw ``GPA`` column, the encoded Postcode columns and the
    encoded Tribe columns. The result keeps ``df``'s row index.
    """
    parts = [
        enc(df, 'University'),
        df['GPA'],
        enc(df, 'Postcode'),
        enc(df, 'Tribe'),
    ]
    return pd.concat(parts, axis=1)
# Feature matrices for the training split (X) and the test split (T).
X = build(train)
T = build(test)

In [51]:
# First five rows of the encoded training features.
X.head(n=5)

Unnamed: 0,U of X,U of Y,U of Z,GPA,4000,4001,4002,4005,4006,4010,4011,4012,4015,4016,A,B,C,D
0,0,0,1,4.2,0,0,0,0,0,1,0,0,0,0,0,0,1,0
1,1,0,0,5.0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
2,0,1,0,4.2,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3,0,0,1,4.0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
4,1,0,0,6.5,1,0,0,0,0,0,0,0,0,0,1,0,0,0


In [52]:
# First five rows of the encoded test features.
T.head(n=5)

Unnamed: 0,U of X,U of Y,U of Z,GPA,4000,4001,4002,4005,4006,4010,4011,4012,4015,4016,A,B,C,D
35,0,0,1,4.3,0,0,1,0,0,0,0,0,0,0,1,0,0,0
36,0,1,0,6.0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
37,0,0,1,5.0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
38,0,1,0,5.5,0,0,0,0,0,0,0,1,0,0,0,0,1,0
39,1,0,0,6.3,0,1,0,0,0,0,0,0,0,0,1,0,0,0


# Train and run the model

The output shows our test data, with the actual *Approval* value alongside the model's prediction in column `p`.

In [60]:
from sklearn.ensemble import RandomForestClassifier
# Fit a shallow random forest (fixed seed for repeatable results) on the
# encoded training features, using the Approval column as the label.
clf = RandomForestClassifier(max_depth=2, random_state=0).fit(
    X,
    train['Approval'],
)

# Predict a label for every test row.
p = clf.predict(T)

# Show the test rows with the prediction appended as column `p`.
test.assign(p=p)

Unnamed: 0,University,GPA,Postcode,Tribe,Approval,p
35,U of Z,4.3,4002,A,0,1
36,U of Y,6.0,4015,D,1,1
37,U of Z,5.0,4001,A,0,1
38,U of Y,5.5,4012,C,0,0
39,U of X,6.3,4001,A,1,1
40,U of Z,6.0,4010,C,0,1
41,U of X,5.5,4015,D,1,1
42,U of Z,3.9,4001,A,0,1
43,U of Z,6.0,4015,D,0,0
44,U of Z,6.0,4001,A,1,1


# Accuracy on test data

In [62]:
from sklearn.metrics import accuracy_score
# Fraction of test rows whose predicted label equals the actual Approval value.
accuracy_score(
    y_true=test['Approval'],
    y_pred=p,
)

0.53333333333333333

# Classification report

Summarises how predictions compared with the actual *Approval* values on the test set.

In [63]:
from sklearn.metrics import classification_report
# The report is a multi-line string, so print it to keep its table layout.
print(
    classification_report(
        y_true=test['Approval'],
        y_pred=p,
    )
)

             precision    recall  f1-score   support

          0       1.00      0.36      0.53        11
          1       0.36      1.00      0.53         4

avg / total       0.83      0.53      0.53        15



# Confusion Matrix

Rows are actual labels and columns are predicted labels, so the count of true negatives is at (0,0), false negatives at (1,0), true positives at (1,1) and false positives at (0,1).

In [64]:
from sklearn.metrics import confusion_matrix
# Actual Approval values are passed as y_true, predictions as y_pred.
confusion_matrix(
    y_true=test['Approval'],
    y_pred=p,
)

array([[4, 7],
       [0, 4]])

# Revisiting our training set

We have a clear pattern in *Approval* based on *Tribe*.

In [65]:
# Number of training rows for each (Tribe, Approval) combination.
(
    train
    .groupby(['Tribe', 'Approval'])[['Approval']]
    .count()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Approval
Tribe,Approval,Unnamed: 2_level_1
A,0,2
A,1,13
B,0,1
B,1,1
C,0,7
C,1,3
D,0,4
D,1,4


In [66]:
# Approval rate per Tribe in the training set. The mean of the 0/1 Approval
# column equals sum / count, computed here with a single groupby instead of two.
train.groupby(['Tribe'])[['Approval']].mean()

Unnamed: 0_level_0,Approval
Tribe,Unnamed: 1_level_1
A,0.866667
B,0.5
C,0.3
D,0.5
