# Import our data from a file

We'll use this data to train and then test our machine learning model.

In [1599]:
import pandas as pd
# Load the raw applications table; index_col=False stops pandas from using
# the first CSV column as the row index.
data = pd.read_csv(
    "dataset.csv",
    index_col=False,
)

In [1600]:
data

Unnamed: 0,University,GPA,Postcode,Tribe,Approval
0,U of Z,4.2,4010,C,0
1,U of X,5.0,4016,D,1
2,U of Y,4.2,4015,D,0
3,U of Z,4.0,4011,C,0
4,U of X,6.5,4000,A,1
5,U of X,6.8,4006,B,1
6,U of X,5.0,4011,C,1
7,U of Y,5.5,4001,A,1
8,U of X,5.0,4002,A,1
9,U of Z,6.0,4016,D,0


# Split out training vs test data

In [1601]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing. shuffle=False keeps the rows in file
# order, so the split is deterministic and the test set is the tail of `data`.
train, test = train_test_split(
    data,
    shuffle=False,
    test_size=0.3,
)

In [1602]:
train

Unnamed: 0,University,GPA,Postcode,Tribe,Approval
0,U of Z,4.2,4010,C,0
1,U of X,5.0,4016,D,1
2,U of Y,4.2,4015,D,0
3,U of Z,4.0,4011,C,0
4,U of X,6.5,4000,A,1
5,U of X,6.8,4006,B,1
6,U of X,5.0,4011,C,1
7,U of Y,5.5,4001,A,1
8,U of X,5.0,4002,A,1
9,U of Z,6.0,4016,D,0


In [1603]:
test

Unnamed: 0,University,GPA,Postcode,Tribe,Approval
35,U of Y,6.2,4001,A,1
36,U of Y,6.0,4015,D,1
37,U of Z,5.0,4001,A,0
38,U of Y,5.5,4012,C,0
39,U of X,6.3,4001,A,1
40,U of Z,6.0,4010,C,0
41,U of X,5.5,4015,D,1
42,U of Z,3.9,4001,A,0
43,U of Z,6.0,4015,D,0
44,U of X,6.0,4001,A,1


# Encode category values as numbers

This is needed because the machine learning algorithm only deals with numbers. We use a one-hot (binary indicator) encoding instead of a single integer to avoid introducing an artificial ordering, e.g. the model thinking the average of university 1 and 3 is 2.

In [1604]:
from sklearn.preprocessing import LabelBinarizer
def enc(df, column):
    """One-hot encode ``df[column]`` as a DataFrame of 0/1 indicator columns.

    The binarizer is fitted on the unique values of the notebook-level
    ``data[column]`` rather than on ``df`` itself, so every call for the same
    column produces the same set of output columns no matter which categories
    happen to occur in ``df`` (e.g. train vs. test).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``; its index is reused for the result.
    column : str
        Name of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        One indicator column per class, labelled with the class value.
        When the column has exactly two classes, LabelBinarizer produces a
        single indicator, which is labelled with the positive (last) class.
    """
    lb = LabelBinarizer()
    lb.fit(data[column].unique())
    encoded = lb.transform(df[column])
    # For a two-class column LabelBinarizer returns one column (1 == classes_[1]),
    # so naming it with both classes would raise a shape-mismatch error.
    if encoded.shape[1] == len(lb.classes_):
        labels = lb.classes_
    else:
        labels = lb.classes_[-1:]
    return pd.DataFrame(encoded, columns=labels, index=df.index)
def build(df):
    """Assemble the full feature matrix for ``df``.

    Columns, in order: encoded University, the raw GPA column, encoded
    Postcode, encoded Tribe. Categorical columns are encoded with ``enc``.
    """
    feature_parts = [
        enc(df, 'University'),
        df['GPA'],
        enc(df, 'Postcode'),
        enc(df, 'Tribe'),
    ]
    return pd.concat(feature_parts, axis=1)
# Feature matrices for the training rows (X) and the held-out test rows (T).
X = build(train)
T = build(test)

In [1605]:
X.head()

Unnamed: 0,U of X,U of Y,U of Z,GPA,4000,4001,4002,4006,4010,4011,4012,4015,4016,A,B,C,D
0,0,0,1,4.2,0,0,0,0,1,0,0,0,0,0,0,1,0
1,1,0,0,5.0,0,0,0,0,0,0,0,0,1,0,0,0,1
2,0,1,0,4.2,0,0,0,0,0,0,0,1,0,0,0,0,1
3,0,0,1,4.0,0,0,0,0,0,1,0,0,0,0,0,1,0
4,1,0,0,6.5,1,0,0,0,0,0,0,0,0,1,0,0,0


In [1606]:
T.head()

Unnamed: 0,U of X,U of Y,U of Z,GPA,4000,4001,4002,4006,4010,4011,4012,4015,4016,A,B,C,D
35,0,1,0,6.2,0,1,0,0,0,0,0,0,0,1,0,0,0
36,0,1,0,6.0,0,0,0,0,0,0,0,1,0,0,0,0,1
37,0,0,1,5.0,0,1,0,0,0,0,0,0,0,1,0,0,0
38,0,1,0,5.5,0,0,0,0,0,0,1,0,0,0,0,1,0
39,1,0,0,6.3,0,1,0,0,0,0,0,0,0,1,0,0,0


# Train and run the model

The output shows our test data, including the actual *Approval* alongside the model's prediction (`p`).

In [1607]:
#from sklearn.ensemble import RandomForestClassifier
#clf = RandomForestClassifier(max_depth=10, random_state=0)
from sklearn.neural_network import MLPClassifier
# Multi-layer perceptron with three hidden layers of 18 units each; the fixed
# random_state keeps the fit repeatable between runs.
clf = MLPClassifier(
    hidden_layer_sizes=(18, 18, 18),
    solver='lbfgs',
    alpha=1e-5,
    random_state=1,
)
clf.fit(X, train['Approval'])

# Predict on the held-out rows and attach the prediction as column `p`.
p = clf.predict(T)
test_with_p = test.assign(p=p)

In [1608]:
test_with_p

Unnamed: 0,University,GPA,Postcode,Tribe,Approval,p
35,U of Y,6.2,4001,A,1,1
36,U of Y,6.0,4015,D,1,1
37,U of Z,5.0,4001,A,0,0
38,U of Y,5.5,4012,C,0,0
39,U of X,6.3,4001,A,1,1
40,U of Z,6.0,4010,C,0,0
41,U of X,5.5,4015,D,1,1
42,U of Z,3.9,4001,A,0,0
43,U of Z,6.0,4015,D,0,0
44,U of X,6.0,4001,A,1,1


# Accuracy

In [1609]:
from sklearn.metrics import accuracy_score
# Fraction of correct predictions on the rows the model was fitted on and on
# the held-out rows, printed as percentages.
train_accuracy = accuracy_score(train['Approval'], clf.predict(X))
test_accuracy = accuracy_score(test['Approval'], p)
print("Accuracy on training data: %.2f%%" % (train_accuracy * 100))
print("Accuracy on test data: %.2f%%" % (test_accuracy * 100))

Accuracy on training data: 100.00%
Accuracy on test data: 93.33%


In [1610]:
# Per tribe: number of actual approvals next to number of predicted approvals.
test_with_p.groupby('Tribe')[['Approval', 'p']].agg('sum')

Unnamed: 0_level_0,Approval,p
Tribe,Unnamed: 1_level_1,Unnamed: 2_level_1
A,3,3
B,0,1
C,0,0
D,2,2


# Problem 1: Accuracy for each tribe

Let's break down the accuracy of the model by tribe:

In [1611]:
# Rows per tribe in the test set.
count = test.groupby('Tribe')['Tribe'].count()

# Rows per tribe where the prediction matches the actual outcome.
is_correct = test_with_p['Approval'] == test_with_p['p']
accurate = test_with_p.loc[is_correct].groupby('Tribe')['Tribe'].count()

# A tribe with no correct rows is absent from `accurate`, giving NaN after the
# division; fillna(0) reports it as 0% accuracy.
pd.DataFrame({'Accuracy (%)': accurate.div(count).mul(100)}).fillna(0)

Unnamed: 0_level_0,Accuracy (%)
Tribe,Unnamed: 1_level_1
A,100.0
B,0.0
C,100.0
D,100.0


It shows that we get our predictions wrong for tribe B - but why is this?

Let's look at how many records were in the training set for each tribe.

In [1612]:
# How many training rows the model saw for each tribe.
train.groupby('Tribe')[['Tribe']].count()

Unnamed: 0_level_0,Tribe
Tribe,Unnamed: 1_level_1
A,15
B,1
C,9
D,10


# Problem 2: Approval rate for each tribe

It turns out our model doesn't approve a single applicant from tribe C!

In [1613]:
# Show the test rows (with predictions) for one tribe.
# NOTE(review): the accompanying text talks about tribe C, but this filter
# selects tribe 'A' — confirm which tribe is meant to be shown.
display(test_with_p.loc[test_with_p['Tribe'] == 'A'])

# Number of test rows per tribe.
display(test_with_p.groupby('Tribe')['p'].count())

Unnamed: 0,University,GPA,Postcode,Tribe,Approval,p
35,U of Y,6.2,4001,A,1,1
37,U of Z,5.0,4001,A,0,0
39,U of X,6.3,4001,A,1,1
42,U of Z,3.9,4001,A,0,0
44,U of X,6.0,4001,A,1,1


Tribe
A    5
B    1
C    5
D    4
Name: p, dtype: int64

In [1614]:
# Predicted approvals per tribe, as a count and as a percentage of that
# tribe's test rows.
test_by_tribe = test_with_p.groupby('Tribe')
approved = test_by_tribe['p'].sum()
count = test_by_tribe['Tribe'].count()
pd.DataFrame({
    'Approvals (predicted)': approved,
    'Count': count,
    'Percent': approved / count * 100,
})

Unnamed: 0_level_0,Approvals (predicted),Count,Percent
Tribe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,3,5,60.0
B,1,1,100.0
C,0,5,0.0
D,2,4,50.0


What do we find when we look at our training data?

In [1615]:
# Actual approvals per tribe in the training data, as a count and as a
# percentage of that tribe's training rows.
train_by_tribe = train.groupby('Tribe')
approved = train_by_tribe['Approval'].sum()
count = train_by_tribe['Tribe'].count()
pd.DataFrame({
    'Approvals (trained)': approved,
    'Count': count,
    'Percent': approved / count * 100,
})

Unnamed: 0_level_0,Approvals (trained),Count,Percent
Tribe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,10,15,66.666667
B,1,1,100.0
C,3,9,33.333333
D,5,10,50.0


# Re-running our model without Tribe

In [1616]:
def build_without_tribe(df):
    """Assemble the feature matrix for ``df`` with the Tribe columns left out.

    Columns, in order: encoded University, the raw GPA column, encoded
    Postcode. Categorical columns are encoded with ``enc``.
    """
    feature_parts = [
        enc(df, 'University'),
        df['GPA'],
        enc(df, 'Postcode'),
    ]
    return pd.concat(feature_parts, axis=1)
# Train/test feature matrices that exclude Tribe.
X2 = build_without_tribe(train)
T2 = build_without_tribe(test)

In [1617]:
X2.head()

Unnamed: 0,U of X,U of Y,U of Z,GPA,4000,4001,4002,4006,4010,4011,4012,4015,4016
0,0,0,1,4.2,0,0,0,0,1,0,0,0,0
1,1,0,0,5.0,0,0,0,0,0,0,0,0,1
2,0,1,0,4.2,0,0,0,0,0,0,0,1,0
3,0,0,1,4.0,0,0,0,0,0,1,0,0,0
4,1,0,0,6.5,1,0,0,0,0,0,0,0,0


In [1618]:
T2.head()

Unnamed: 0,U of X,U of Y,U of Z,GPA,4000,4001,4002,4006,4010,4011,4012,4015,4016
35,0,1,0,6.2,0,1,0,0,0,0,0,0,0
36,0,1,0,6.0,0,0,0,0,0,0,0,1,0
37,0,0,1,5.0,0,1,0,0,0,0,0,0,0
38,0,1,0,5.5,0,0,0,0,0,0,1,0,0
39,1,0,0,6.3,0,1,0,0,0,0,0,0,0


In [1619]:
from sklearn.ensemble import RandomForestClassifier
#clf2 = RandomForestClassifier(max_depth=5, random_state=0)
# Same network configuration as the first model, fitted on the features
# that exclude Tribe.
clf2 = MLPClassifier(
    hidden_layer_sizes=(18, 18, 18),
    solver='lbfgs',
    alpha=1e-5,
    random_state=1,
)
clf2.fit(X2, train['Approval'])

# Predict on the held-out rows and attach the prediction as column `p`.
p2 = clf2.predict(T2)
test_with_p2 = test.assign(p=p2)

In [1620]:
test_with_p2

Unnamed: 0,University,GPA,Postcode,Tribe,Approval,p
35,U of Y,6.2,4001,A,1,1
36,U of Y,6.0,4015,D,1,1
37,U of Z,5.0,4001,A,0,1
38,U of Y,5.5,4012,C,0,0
39,U of X,6.3,4001,A,1,1
40,U of Z,6.0,4010,C,0,0
41,U of X,5.5,4015,D,1,1
42,U of Z,3.9,4001,A,0,0
43,U of Z,6.0,4015,D,0,0
44,U of X,6.0,4001,A,1,1


In [1621]:
# Accuracy of the Tribe-free model on the training rows and on the held-out rows.
train_accuracy2 = accuracy_score(train['Approval'], clf2.predict(X2))
test_accuracy2 = accuracy_score(test['Approval'], p2)
print("Accuracy on training data: %.2f%%" % (train_accuracy2 * 100))
print("Accuracy on test data: %.2f%%" % (test_accuracy2 * 100))

Accuracy on training data: 100.00%
Accuracy on test data: 80.00%


But we still find that people from tribe C are approved far less often than the other tribes:

In [1622]:
# Predicted approvals per tribe for the Tribe-free model, with the rate
# expressed as a fraction of that tribe's test rows.
test2_by_tribe = test_with_p2.groupby('Tribe')
approved = test2_by_tribe['p'].sum()
count = test2_by_tribe['Tribe'].count()
pd.DataFrame({
    'Approved (predicted)': approved,
    'Count': count,
    'Rate': approved / count,
})

Unnamed: 0_level_0,Approved (predicted),Count,Rate
Tribe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,4,5,0.8
B,1,1,1.0
C,1,5,0.2
D,2,4,0.5


We can see why when we look at postcode:

In [1623]:
# Training rows per (Tribe, Postcode) pair, pivoted so postcodes become
# columns; combinations that never occur are shown as 0.
(
    train
    .groupby(['Tribe', 'Postcode'])[['Postcode']]
    .count()
    .unstack()
    .fillna(0)
)

Unnamed: 0_level_0,Postcode,Postcode,Postcode,Postcode,Postcode,Postcode,Postcode,Postcode,Postcode
Postcode,4000,4001,4002,4006,4010,4011,4012,4015,4016
Tribe,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
A,5.0,3.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0
B,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
C,0.0,0.0,0.0,0.0,3.0,3.0,3.0,0.0,0.0
D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,5.0


# Re-running our model without postcode

In [1624]:
def build_without_postcode(df):
    """Assemble the feature matrix for ``df`` from University and GPA only.

    Columns, in order: encoded University (via ``enc``), then the raw GPA
    column. Neither Tribe nor Postcode is included.
    """
    feature_parts = [
        enc(df, 'University'),
        df['GPA'],
    ]
    return pd.concat(feature_parts, axis=1)
# Train/test feature matrices built from University and GPA only.
X3 = build_without_postcode(train)
T3 = build_without_postcode(test)

In [1625]:
X3.head()

Unnamed: 0,U of X,U of Y,U of Z,GPA
0,0,0,1,4.2
1,1,0,0,5.0
2,0,1,0,4.2
3,0,0,1,4.0
4,1,0,0,6.5


In [1626]:
T3.head()

Unnamed: 0,U of X,U of Y,U of Z,GPA
35,0,1,0,6.2
36,0,1,0,6.0
37,0,0,1,5.0
38,0,1,0,5.5
39,1,0,0,6.3


In [1627]:
from sklearn.ensemble import RandomForestClassifier
#clf3 = RandomForestClassifier(max_depth=5, random_state=0)
# Same network configuration again, fitted on University and GPA only.
clf3 = MLPClassifier(
    hidden_layer_sizes=(18, 18, 18),
    solver='lbfgs',
    alpha=1e-5,
    random_state=1,
)
clf3.fit(X3, train['Approval'])

# Predict on the held-out rows and attach the prediction as column `p`.
p3 = clf3.predict(T3)
test_with_p3 = test.assign(p=p3)

In [1628]:
test_with_p3

Unnamed: 0,University,GPA,Postcode,Tribe,Approval,p
35,U of Y,6.2,4001,A,1,1
36,U of Y,6.0,4015,D,1,1
37,U of Z,5.0,4001,A,0,0
38,U of Y,5.5,4012,C,0,1
39,U of X,6.3,4001,A,1,1
40,U of Z,6.0,4010,C,0,0
41,U of X,5.5,4015,D,1,1
42,U of Z,3.9,4001,A,0,0
43,U of Z,6.0,4015,D,0,0
44,U of X,6.0,4001,A,1,1


In [1629]:
# Accuracy of the University+GPA model on the training rows and on the
# held-out rows.
train_accuracy3 = accuracy_score(train['Approval'], clf3.predict(X3))
test_accuracy3 = accuracy_score(test['Approval'], p3)
print("Accuracy on training data: %.2f%%" % (train_accuracy3 * 100))
print("Accuracy on test data: %.2f%%" % (test_accuracy3 * 100))

Accuracy on training data: 97.14%
Accuracy on test data: 86.67%


In [1630]:
# Predicted approvals per tribe for the University+GPA model, with the rate
# expressed as a fraction of that tribe's test rows.
test3_by_tribe = test_with_p3.groupby('Tribe')
approved = test3_by_tribe['p'].sum()
count = test3_by_tribe['Tribe'].count()
pd.DataFrame({
    'Approved (predicted)': approved,
    'Count': count,
    'Rate': approved / count,
})

Unnamed: 0_level_0,Approved (predicted),Count,Rate
Tribe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,3,5,0.6
B,1,1,1.0
C,1,5,0.2
D,2,4,0.5


Using only University and GPA (not Tribe or Postcode) may lead to lower accuracy, potentially quantifying the original bias.