# Import our data from a file

In [5]:
import pandas as pd
data = pd.read_csv("dataset.csv", index_col=False)

In [6]:
data

Unnamed: 0,University,GPA,Postcode,Tribe,Approval
0,U of Z,4.2,4010,C,0
1,U of X,5.0,4016,D,1
2,U of Y,4.2,4015,D,0
3,U of Z,4.0,4011,C,0
4,U of X,6.5,4000,A,1
5,U of X,6.8,4006,B,1
6,U of X,5.0,4011,C,1
7,U of Y,5.5,4001,A,1
8,U of X,5.0,4002,A,1
9,U of Z,6.0,4016,D,0


# Split out training vs test data

In [7]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.3, shuffle=False)

In [8]:
train

Unnamed: 0,University,GPA,Postcode,Tribe,Approval
0,U of Z,4.2,4010,C,0
1,U of X,5.0,4016,D,1
2,U of Y,4.2,4015,D,0
3,U of Z,4.0,4011,C,0
4,U of X,6.5,4000,A,1
5,U of X,6.8,4006,B,1
6,U of X,5.0,4011,C,1
7,U of Y,5.5,4001,A,1
8,U of X,5.0,4002,A,1
9,U of Z,6.0,4016,D,0


In [9]:
test

Unnamed: 0,University,GPA,Postcode,Tribe,Approval
35,U of Y,6.2,4001,A,1
36,U of Y,6.0,4015,D,1
37,U of Z,5.0,4001,A,0
38,U of Y,5.5,4012,C,0
39,U of X,6.3,4001,A,1
40,U of Z,6.0,4010,C,0
41,U of X,5.5,4015,D,1
42,U of Z,3.9,4001,A,0
43,U of Z,6.0,4015,D,0
44,U of X,6.0,4001,A,1


# Encode category values as numbers

This is needed because the machine learning algorithm only deals with numbers. We use a binary encoding instead of a single integer to avoid artificial ordering being introduced, e.g. the model thinking the average of university 1 and 3 is 2.

In [10]:
from sklearn.preprocessing import LabelBinarizer
def enc(df, column):
    lb = LabelBinarizer()
    lb.fit(data[column].unique())
    return pd.DataFrame(lb.transform(df[column]), columns=lb.classes_, index=df.index)
def build(df):
    return pd.concat([enc(df, 'University'), df['GPA'], enc(df, 'Postcode'), enc(df, 'Tribe')], axis=1)
X, T = build(train), build(test)

In [11]:
X.head()

Unnamed: 0,U of X,U of Y,U of Z,GPA,4000,4001,4002,4006,4010,4011,4012,4015,4016,A,B,C,D
0,0,0,1,4.2,0,0,0,0,1,0,0,0,0,0,0,1,0
1,1,0,0,5.0,0,0,0,0,0,0,0,0,1,0,0,0,1
2,0,1,0,4.2,0,0,0,0,0,0,0,1,0,0,0,0,1
3,0,0,1,4.0,0,0,0,0,0,1,0,0,0,0,0,1,0
4,1,0,0,6.5,1,0,0,0,0,0,0,0,0,1,0,0,0


In [12]:
T.head()

Unnamed: 0,U of X,U of Y,U of Z,GPA,4000,4001,4002,4006,4010,4011,4012,4015,4016,A,B,C,D
35,0,1,0,6.2,0,1,0,0,0,0,0,0,0,1,0,0,0
36,0,1,0,6.0,0,0,0,0,0,0,0,1,0,0,0,0,1
37,0,0,1,5.0,0,1,0,0,0,0,0,0,0,1,0,0,0
38,0,1,0,5.5,0,0,0,0,0,0,1,0,0,0,0,1,0
39,1,0,0,6.3,0,1,0,0,0,0,0,0,0,1,0,0,0


# Train and run the model

The output shows our test data including the actual *Repaid* against the prediction.

In [13]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(18,18,18), random_state=1, max_iter=10000)
clf.fit(X, train['Approval'])
p = clf.predict(T)
test_with_p = test.assign(p = p)

In [14]:
test_with_p

Unnamed: 0,University,GPA,Postcode,Tribe,Approval,p
35,U of Y,6.2,4001,A,1,1
36,U of Y,6.0,4015,D,1,1
37,U of Z,5.0,4001,A,0,0
38,U of Y,5.5,4012,C,0,0
39,U of X,6.3,4001,A,1,1
40,U of Z,6.0,4010,C,0,0
41,U of X,5.5,4015,D,1,1
42,U of Z,3.9,4001,A,0,0
43,U of Z,6.0,4015,D,0,0
44,U of X,6.0,4001,A,1,1


# Accuracy

In [15]:
from sklearn.metrics import accuracy_score
print("Accuracy on training data: %.2f%%" % (accuracy_score(train['Approval'], clf.predict(X)) * 100))
print("Accuracy on test data: %.2f%%" % (accuracy_score(test['Approval'], p) * 100))

Accuracy on training data: 100.00%
Accuracy on test data: 93.33%


# Problem 1: Accuracy for each tribe

Let's breakdown accuracy of the model by tribe:

In [17]:
count = test.groupby(['Tribe'])['Tribe'].count()
accurate = test_with_p[test_with_p['Approval'] == test_with_p['p']].groupby(['Tribe'])['Tribe'].count()
pd.DataFrame({'Accuracy (%)': accurate / count * 100}).fillna(0)

Unnamed: 0_level_0,Accuracy (%)
Tribe,Unnamed: 1_level_1
A,100.0
B,0.0
C,100.0
D,100.0


It shows that we get our predictions wrong for tribe B - but why is this?

Let's look at how many records were in the training set for each tribe.

In [18]:
train.groupby(['Tribe'])[['Tribe']].count()

Unnamed: 0_level_0,Tribe
Tribe,Unnamed: 1_level_1
A,15
B,1
C,9
D,10


# Problem 2: Approval rate for each tribe

In turns out our model doesn't approve a single applicant from tribe C!

In [19]:
test_with_p.groupby(['Tribe'])[['p']].sum()

Unnamed: 0_level_0,p
Tribe,Unnamed: 1_level_1
A,3
B,1
C,0
D,2


What do we find when we look at our training data?

In [20]:
approved = train.groupby(['Tribe'])['Approval'].sum()
count = train.groupby(['Tribe'])['Tribe'].count()
pd.DataFrame({'Approvals (trained)': approved, 'Count': count, 'Percent': approved / count * 100})

Unnamed: 0_level_0,Approvals (trained),Count,Percent
Tribe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,10,15,66.666667
B,1,1,100.0
C,3,9,33.333333
D,5,10,50.0


# Re-running our model without Tribe

In [21]:
def build_without_tribe(df):
    return pd.concat([enc(df, 'University'), df['GPA'], enc(df, 'Postcode')], axis=1)
X2, T2 = build_without_tribe(train), build_without_tribe(test)

In [22]:
X2.head()

Unnamed: 0,U of X,U of Y,U of Z,GPA,4000,4001,4002,4006,4010,4011,4012,4015,4016
0,0,0,1,4.2,0,0,0,0,1,0,0,0,0
1,1,0,0,5.0,0,0,0,0,0,0,0,0,1
2,0,1,0,4.2,0,0,0,0,0,0,0,1,0
3,0,0,1,4.0,0,0,0,0,0,1,0,0,0
4,1,0,0,6.5,1,0,0,0,0,0,0,0,0


In [23]:
T2.head()

Unnamed: 0,U of X,U of Y,U of Z,GPA,4000,4001,4002,4006,4010,4011,4012,4015,4016
35,0,1,0,6.2,0,1,0,0,0,0,0,0,0
36,0,1,0,6.0,0,0,0,0,0,0,0,1,0
37,0,0,1,5.0,0,1,0,0,0,0,0,0,0
38,0,1,0,5.5,0,0,0,0,0,0,1,0,0
39,1,0,0,6.3,0,1,0,0,0,0,0,0,0


In [24]:
from sklearn.ensemble import RandomForestClassifier
clf2 = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(13,13,13), random_state=1, max_iter=10000)
clf2.fit(X2, train['Approval'])
p2 = clf2.predict(T2)
test_with_p2 = test.assign(p = p2)

In [25]:
test_with_p2

Unnamed: 0,University,GPA,Postcode,Tribe,Approval,p
35,U of Y,6.2,4001,A,1,1
36,U of Y,6.0,4015,D,1,1
37,U of Z,5.0,4001,A,0,0
38,U of Y,5.5,4012,C,0,0
39,U of X,6.3,4001,A,1,1
40,U of Z,6.0,4010,C,0,0
41,U of X,5.5,4015,D,1,1
42,U of Z,3.9,4001,A,0,0
43,U of Z,6.0,4015,D,0,0
44,U of X,6.0,4001,A,1,1


In [26]:
print("Accuracy on training data: %.2f%%" % (accuracy_score(train['Approval'], clf2.predict(X2)) * 100))
print("Accuracy on test data: %.2f%%" % (accuracy_score(test['Approval'], p2) * 100))

Accuracy on training data: 100.00%
Accuracy on test data: 93.33%


Accuracy is largely preserved.

And we still find that all people from tribe C are not approved:

In [27]:
approved = test_with_p2.groupby(['Tribe'])['p'].sum()
count = test_with_p2.groupby(['Tribe'])['Tribe'].count()
pd.DataFrame({'Approved (predicted)': approved, 'Count': count, 'Percent': approved / count * 100})

Unnamed: 0_level_0,Approved (predicted),Count,Percent
Tribe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,3,5,60.0
B,1,1,100.0
C,0,5,0.0
D,2,4,50.0


We can see why when we look at postcode:

In [28]:
train.groupby(['Tribe', 'Postcode'])[['Postcode']].count().unstack().fillna(0)

Unnamed: 0_level_0,Postcode,Postcode,Postcode,Postcode,Postcode,Postcode,Postcode,Postcode,Postcode
Postcode,4000,4001,4002,4006,4010,4011,4012,4015,4016
Tribe,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
A,5.0,3.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0
B,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
C,0.0,0.0,0.0,0.0,3.0,3.0,3.0,0.0,0.0
D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,5.0


# Re-running our model without postcode

In [29]:
def build_without_postcode(df):
    return pd.concat([enc(df, 'University'), df['GPA']], axis=1)
X3, T3 = build_without_postcode(train), build_without_postcode(test)

In [30]:
X3.head()

Unnamed: 0,U of X,U of Y,U of Z,GPA
0,0,0,1,4.2
1,1,0,0,5.0
2,0,1,0,4.2
3,0,0,1,4.0
4,1,0,0,6.5


In [31]:
T3.head()

Unnamed: 0,U of X,U of Y,U of Z,GPA
35,0,1,0,6.2
36,0,1,0,6.0
37,0,0,1,5.0
38,0,1,0,5.5
39,1,0,0,6.3


In [32]:
from sklearn.ensemble import RandomForestClassifier
clf3 = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(4,4,4), random_state=1, max_iter=10000)
clf3.fit(X3, train['Approval'])
p3 = clf3.predict(T3)
test_with_p3 = test.assign(p = p3)

In [33]:
test_with_p3

Unnamed: 0,University,GPA,Postcode,Tribe,Approval,p
35,U of Y,6.2,4001,A,1,1
36,U of Y,6.0,4015,D,1,1
37,U of Z,5.0,4001,A,0,0
38,U of Y,5.5,4012,C,0,1
39,U of X,6.3,4001,A,1,1
40,U of Z,6.0,4010,C,0,0
41,U of X,5.5,4015,D,1,1
42,U of Z,3.9,4001,A,0,0
43,U of Z,6.0,4015,D,0,0
44,U of X,6.0,4001,A,1,1


In [34]:
print("Accuracy on training data: %.2f%%" % (accuracy_score(train['Approval'], clf3.predict(X3)) * 100))
print("Accuracy on test data: %.2f%%" % (accuracy_score(test['Approval'], p3) * 100))

Accuracy on training data: 97.14%
Accuracy on test data: 86.67%


Surprisingly, our model remains accurate - and tribe C continues to not be approved.

In [35]:
approved = test_with_p3.groupby(['Tribe'])['p'].sum()
count = test_with_p3.groupby(['Tribe'])['Tribe'].count()
pd.DataFrame({'Approvals (predicted)': approved, 'Count': count, 'Percent': approved / count * 100})

Unnamed: 0_level_0,Approvals (predicted),Count,Percent
Tribe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,3,5,60.0
B,1,1,100.0
C,1,5,20.0
D,2,4,50.0


Since our model still relies on University, let's look at predictions against each one:

In [36]:
test_with_p.groupby(['University'])[['p']].sum().fillna(0)

Unnamed: 0_level_0,p
University,Unnamed: 1_level_1
U of X,4
U of Y,2
U of Z,0


Does using University still invite a kind of discrimination given where tribes tend to graduate from?

In [37]:
train.groupby(['Tribe', 'University'])[['University']].count().unstack().fillna(0)

Unnamed: 0_level_0,University,University,University
University,U of X,U of Y,U of Z
Tribe,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
A,6.0,6.0,3.0
B,1.0,0.0,0.0
C,2.0,3.0,4.0
D,3.0,4.0,3.0
