# Import our data from a file

We'll use this data to train and then test our machine learning model.

In [429]:
import pandas as pd
data = pd.read_csv("dataset.csv", index_col=False)

In [430]:
data

Unnamed: 0,University,GPA,Postcode,Tribe,Approval
0,U of Z,4.2,4010,C,0
1,U of X,5.0,4016,D,1
2,U of Y,4.2,4015,D,0
3,U of Z,4.0,4011,C,0
4,U of X,6.5,4000,A,1
5,U of Y,5.0,4006,B,1
6,U of X,5.0,4011,C,1
7,U of Y,5.5,4001,A,1
8,U of X,5.0,4002,A,1
9,U of Z,6.0,4016,D,0


# Split out training vs test data and result labels

In [431]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.3, shuffle=False)

In [432]:
train

Unnamed: 0,University,GPA,Postcode,Tribe,Approval
0,U of Z,4.2,4010,C,0
1,U of X,5.0,4016,D,1
2,U of Y,4.2,4015,D,0
3,U of Z,4.0,4011,C,0
4,U of X,6.5,4000,A,1
5,U of Y,5.0,4006,B,1
6,U of X,5.0,4011,C,1
7,U of Y,5.5,4001,A,1
8,U of X,5.0,4002,A,1
9,U of Z,6.0,4016,D,0


In [433]:
test

Unnamed: 0,University,GPA,Postcode,Tribe,Approval
35,U of Z,4.3,4002,A,0
36,U of Y,6.0,4015,D,1
37,U of Z,5.0,4001,A,0
38,U of Y,5.5,4012,C,0
39,U of X,6.3,4001,A,1
40,U of Z,6.0,4010,C,0
41,U of X,5.5,4015,D,1
42,U of Z,3.9,4001,A,0
43,U of Z,6.0,4015,D,0
44,U of Z,6.0,4001,A,1


# Encode category values as numbers

This is needed because the machine learning algorithm only deals with numbers. We use a binary encoding instead of a single integer to avoid artificial ordering being introduced, e.g. the model thinking the average of university 1 and 3 is 2.

In [434]:
from sklearn.preprocessing import LabelBinarizer
def enc(df, column):
    lb = LabelBinarizer()
    lb.fit(data[column].unique())
    return pd.DataFrame(lb.transform(df[column]), columns=lb.classes_, index=df.index)
def build(df):
    return pd.concat([enc(df, 'University'), df['GPA'], enc(df, 'Postcode'), enc(df, 'Tribe')], axis=1)
X, T = build(train), build(test)

In [435]:
X.head()

Unnamed: 0,U of X,U of Y,U of Z,GPA,4000,4001,4002,4005,4006,4010,4011,4012,4015,4016,A,B,C,D
0,0,0,1,4.2,0,0,0,0,0,1,0,0,0,0,0,0,1,0
1,1,0,0,5.0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
2,0,1,0,4.2,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3,0,0,1,4.0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
4,1,0,0,6.5,1,0,0,0,0,0,0,0,0,0,1,0,0,0


In [436]:
T.head()

Unnamed: 0,U of X,U of Y,U of Z,GPA,4000,4001,4002,4005,4006,4010,4011,4012,4015,4016,A,B,C,D
35,0,0,1,4.3,0,0,1,0,0,0,0,0,0,0,1,0,0,0
36,0,1,0,6.0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
37,0,0,1,5.0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
38,0,1,0,5.5,0,0,0,0,0,0,0,1,0,0,0,0,1,0
39,1,0,0,6.3,0,1,0,0,0,0,0,0,0,0,1,0,0,0


# Train and run the model

The output shows our test data including the actual *Repaid* against the prediction.

In [437]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=5, random_state=0)
#from sklearn.neural_network import MLPClassifier
#clf = MLPClassifier(hidden_layer_sizes=(18,18,18), max_iter=2000)
clf.fit(X, train['Approval'])
p = clf.predict(T)
test_with_predictions = test.assign(p = p)

In [438]:
test_with_predictions

Unnamed: 0,University,GPA,Postcode,Tribe,Approval,p
35,U of Z,4.3,4002,A,0,0
36,U of Y,6.0,4015,D,1,1
37,U of Z,5.0,4001,A,0,0
38,U of Y,5.5,4012,C,0,0
39,U of X,6.3,4001,A,1,1
40,U of Z,6.0,4010,C,0,0
41,U of X,5.5,4015,D,1,1
42,U of Z,3.9,4001,A,0,0
43,U of Z,6.0,4015,D,0,0
44,U of Z,6.0,4001,A,1,1


# Accuracy on training data

In [439]:
from sklearn.metrics import accuracy_score
accuracy_score(train['Approval'], clf.predict(X))

0.97142857142857142

# Accuracy on test data

In [440]:
from sklearn.metrics import accuracy_score
accuracy_score(test['Approval'], p)

0.93333333333333335

# Classification report

Summarises how predictions compared with actual *Repaid* on the training set.

In [441]:
from sklearn.metrics import classification_report
print(classification_report(test['Approval'], p))

             precision    recall  f1-score   support

          0       1.00      0.91      0.95        11
          1       0.80      1.00      0.89         4

avg / total       0.95      0.93      0.94        15



# Confusion Matrix

The count of true negatives is (0,0), false negatives is (1,0), true positives is (1,1) and false positives is (0,1).

In [442]:
from sklearn.metrics import confusion_matrix
confusion_matrix(test['Approval'], p)

array([[10,  1],
       [ 0,  4]])

# Reviewing our training set

In [443]:
pd.DataFrame({
    'Count': train.groupby(['Tribe'])['Tribe'].count(),
    'Approvals': train[train['Approval'] == 1].groupby(['Tribe'])['Tribe'].count(),
    'Rate': train.groupby(['Tribe'])['Approval'].sum() / train.groupby(['Tribe'])['Approval'].count()
})

Unnamed: 0_level_0,Approvals,Count,Rate
Tribe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,11,15,0.733333
B,1,2,0.5
C,3,9,0.333333
D,4,9,0.444444


In [444]:
pd.DataFrame({
    'Count': test_with_predictions.groupby(['Tribe'])['Tribe'].count(),
    'Predictions': test_with_predictions[test_with_predictions['p'] == 1].groupby(['Tribe'])['Tribe'].count(),
    'Rate': test_with_predictions.groupby(['Tribe'])['p'].sum() / test_with_predictions.groupby(['Tribe'])['Approval'].count()
})

Unnamed: 0,Count,Predictions,Rate
A,5,2.0,0.4
B,1,1.0,1.0
C,5,,0.0
D,4,2.0,0.5


# Re-running our model without Tribe

In [445]:
def build_without_tribe(df):
    return pd.concat([enc(df, 'University'), df['GPA'], enc(df, 'Postcode')], axis=1)
X, T = build_without_tribe(train), build_without_tribe(test)

In [446]:
X.head()

Unnamed: 0,U of X,U of Y,U of Z,GPA,4000,4001,4002,4005,4006,4010,4011,4012,4015,4016
0,0,0,1,4.2,0,0,0,0,0,1,0,0,0,0
1,1,0,0,5.0,0,0,0,0,0,0,0,0,0,1
2,0,1,0,4.2,0,0,0,0,0,0,0,0,1,0
3,0,0,1,4.0,0,0,0,0,0,0,1,0,0,0
4,1,0,0,6.5,1,0,0,0,0,0,0,0,0,0


In [447]:
T.head()

Unnamed: 0,U of X,U of Y,U of Z,GPA,4000,4001,4002,4005,4006,4010,4011,4012,4015,4016
35,0,0,1,4.3,0,0,1,0,0,0,0,0,0,0
36,0,1,0,6.0,0,0,0,0,0,0,0,0,1,0
37,0,0,1,5.0,0,1,0,0,0,0,0,0,0,0
38,0,1,0,5.5,0,0,0,0,0,0,0,1,0,0
39,1,0,0,6.3,0,1,0,0,0,0,0,0,0,0


In [448]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=5, random_state=0)
clf.fit(X, train['Approval'])
p = clf.predict(T)
test_with_predictions = test.assign(p = p)

In [449]:
test_with_predictions

Unnamed: 0,University,GPA,Postcode,Tribe,Approval,p
35,U of Z,4.3,4002,A,0,0
36,U of Y,6.0,4015,D,1,1
37,U of Z,5.0,4001,A,0,0
38,U of Y,5.5,4012,C,0,0
39,U of X,6.3,4001,A,1,1
40,U of Z,6.0,4010,C,0,0
41,U of X,5.5,4015,D,1,1
42,U of Z,3.9,4001,A,0,0
43,U of Z,6.0,4015,D,0,0
44,U of Z,6.0,4001,A,1,0


In [450]:
from sklearn.metrics import accuracy_score
accuracy_score(train['Approval'], clf.predict(X))

0.97142857142857142

In [451]:
from sklearn.metrics import accuracy_score
accuracy_score(test['Approval'], p)

0.8666666666666667

In [452]:
pd.DataFrame({
    'Count': test_with_predictions.groupby(['Tribe'])['Tribe'].count(),
    'Predictions': test_with_predictions[test_with_predictions['p'] == 1].groupby(['Tribe'])['Tribe'].count(),
    'Rate': test_with_predictions.groupby(['Tribe'])['p'].sum() / test_with_predictions.groupby(['Tribe'])['Approval'].count()
})

Unnamed: 0,Count,Predictions,Rate
A,5,1.0,0.2
B,1,1.0,1.0
C,5,,0.0
D,4,2.0,0.5


In [453]:
train.groupby(['Tribe', 'Postcode'])[['Postcode']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Postcode
Tribe,Postcode,Unnamed: 2_level_1
A,4000,5
A,4001,4
A,4002,6
B,4005,1
B,4006,1
C,4010,3
C,4011,3
C,4012,3
D,4015,4
D,4016,5


# Re-running our model without postcode

In [454]:
def build_without_postcode(df):
    return pd.concat([enc(df, 'University'), df['GPA']], axis=1)
X, T = build_without_postcode(train), build_without_postcode(test)

In [455]:
X.head()

Unnamed: 0,U of X,U of Y,U of Z,GPA
0,0,0,1,4.2
1,1,0,0,5.0
2,0,1,0,4.2
3,0,0,1,4.0
4,1,0,0,6.5


In [456]:
T.head()

Unnamed: 0,U of X,U of Y,U of Z,GPA
35,0,0,1,4.3
36,0,1,0,6.0
37,0,0,1,5.0
38,0,1,0,5.5
39,1,0,0,6.3


In [457]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=5, random_state=0)
clf.fit(X, train['Approval'])
p = clf.predict(T)
test_with_predictions = test.assign(p = p)

In [458]:
test_with_predictions

Unnamed: 0,University,GPA,Postcode,Tribe,Approval,p
35,U of Z,4.3,4002,A,0,0
36,U of Y,6.0,4015,D,1,1
37,U of Z,5.0,4001,A,0,0
38,U of Y,5.5,4012,C,0,1
39,U of X,6.3,4001,A,1,1
40,U of Z,6.0,4010,C,0,0
41,U of X,5.5,4015,D,1,1
42,U of Z,3.9,4001,A,0,0
43,U of Z,6.0,4015,D,0,0
44,U of Z,6.0,4001,A,1,0


In [459]:
from sklearn.metrics import accuracy_score
accuracy_score(train['Approval'], clf.predict(X))

0.94285714285714284

In [460]:
from sklearn.metrics import accuracy_score
accuracy_score(test['Approval'], p)

0.80000000000000004

In [461]:
pd.DataFrame({
    'Count': test_with_predictions.groupby(['Tribe'])['Tribe'].count(),
    'Predictions': test_with_predictions[test_with_predictions['p'] == 1].groupby(['Tribe'])['Tribe'].count(),
    'Rate': test_with_predictions.groupby(['Tribe'])['p'].sum() / test_with_predictions.groupby(['Tribe'])['Approval'].count()
})

Unnamed: 0_level_0,Count,Predictions,Rate
Tribe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,5,1,0.2
B,1,1,1.0
C,5,1,0.2
D,4,2,0.5
