# Import our data from a file

We'll use this data to train and then test our machine learning model.

In [81]:
import pandas as pd
data = pd.read_csv("data.csv", index_col=False)
data

Unnamed: 0,State,Age,Repaid
0,Queensland,25,1
1,Victoria,62,1
2,Western Australia,18,0
3,Queensland,37,1
4,Victoria,48,1
5,Victoria,44,0
6,Western Australia,19,0
7,Queensland,30,1
8,Victoria,40,0
9,Western Australia,60,0


# Split out training vs test data and result labels

In [82]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.3, shuffle=False)

In [83]:
train

Unnamed: 0,State,Age,Repaid
0,Queensland,25,1
1,Victoria,62,1
2,Western Australia,18,0
3,Queensland,37,1
4,Victoria,48,1
5,Victoria,44,0
6,Western Australia,19,0


In [84]:
test

Unnamed: 0,State,Age,Repaid
7,Queensland,30,1
8,Victoria,40,0
9,Western Australia,60,0
10,Tasmania,70,0


# Encode category values as numbers

This is needed because the machine learning algorithm only deals with numbers.

The State in each record is encoded as a 1 in that state's column in a binary matrix.

In [85]:
from sklearn.preprocessing import LabelBinarizer
enc = LabelBinarizer()
enc.fit(['Queensland', 'Victoria', 'Queensland', 'Western Australia', 'South Australia', 'Tasmania'])
train_state_enc = pd.DataFrame(enc.transform(train['State']), columns=enc.classes_)
test_state_enc = pd.DataFrame(enc.transform(test['State']), columns=enc.classes_)
X = train_state_enc.assign(Age = train['Age'])
T = test_state_enc.assign(Age = train['Age'])

In [86]:
X

Unnamed: 0,Queensland,South Australia,Tasmania,Victoria,Western Australia,Age
0,1,0,0,0,0,25
1,0,0,0,1,0,62
2,0,0,0,0,1,18
3,1,0,0,0,0,37
4,0,0,0,1,0,48
5,0,0,0,1,0,44
6,0,0,0,0,1,19


In [87]:
T

Unnamed: 0,Queensland,South Australia,Tasmania,Victoria,Western Australia,Age
0,1,0,0,0,0,25
1,0,0,0,1,0,62
2,0,0,0,0,1,18
3,0,0,1,0,0,37


# Train and run the model

The output shows our test data including the actual *Repaid* against the prediction.

In [88]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X, train['Repaid'])
p = clf.predict(T)
test.assign(p = p)

Unnamed: 0,State,Age,Repaid,p
7,Queensland,30,1,1
8,Victoria,40,0,1
9,Western Australia,60,0,0
10,Tasmania,70,0,1


# Classification report

Summarises how predictions compared with actual *Repaid* on the training set.

In [89]:
from sklearn.metrics import classification_report
print(classification_report(test['Repaid'], p))

             precision    recall  f1-score   support

          0       1.00      0.33      0.50         3
          1       0.33      1.00      0.50         1

avg / total       0.83      0.50      0.50         4



# Confusion Matrix

The count of true negatives is (0,0), false negatives is (1,0), true positives is (1,1) and false positives is (0,1).

In [90]:
from sklearn.metrics import confusion_matrix
confusion_matrix(test['Repaid'], p)

array([[1, 2],
       [0, 1]])

# Revisiting our training set

We have a clear pattern in *Repaid* based on state.

In [91]:
train.groupby(['State', 'Repaid'])[['Repaid']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Repaid
State,Repaid,Unnamed: 2_level_1
Queensland,1,2
Victoria,0,1
Victoria,1,2
Western Australia,0,2


In [92]:
train.groupby(['State'])[['Repaid']].sum() / train.groupby(['State'])[['Repaid']].count()

Unnamed: 0_level_0,Repaid
State,Unnamed: 1_level_1
Queensland,1.0
Victoria,0.666667
Western Australia,0.0
