# Import our data from a file

We'll use this data to train and then test our machine learning model.

In [None]:
import pandas as pd

# index_col=False tells pandas not to use the first CSV column as the row index.
data = pd.read_csv(
    "data.csv",
    index_col=False,
)
data  # last expression in the cell, so the notebook renders the loaded frame

# Split out training vs test data and result labels

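`train` and `test` each hold both the inputs (features) and the results (labels, in the *Repaid* column). Further down, `X` captures the encoded inputs for our training data and `train['Repaid']` supplies its labels.

`T` and `test['Repaid']` are the same, but we use them only for testing the performance of our model.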

In [None]:
from sklearn.model_selection import train_test_split

# shuffle=False keeps the rows in file order: the leading ~70% become the
# training set and the trailing ~30% the test set.
train, test = train_test_split(
    data,
    shuffle=False,
    test_size=0.3,
)

In [None]:
train  # display the training split produced by train_test_split

In [None]:
test  # display the held-out test split produced by train_test_split

# Encode category values as numbers

This is needed because the machine learning algorithm only deals with numbers.

The State in each record is encoded as a 1 in that state's column in a binary matrix.

In [None]:
from sklearn.preprocessing import LabelBinarizer

# Fit the encoder on a fixed list of state names so that the training and test
# matrices share the same columns regardless of which states each split contains.
# NOTE(review): 'Queensland' appears twice in this list; presumably one entry was
# meant to be another state -- confirm against the values in data.csv.
enc = LabelBinarizer().fit(
    ['Queensland', 'Victoria', 'Queensland', 'Western Australia', 'South Australia', 'Tasmania']
)

# One binary row per record, built from the State column of each split.
X, T = (enc.transform(frame['State']) for frame in (train, test))

In [None]:
X  # display the binary-encoded State matrix for the training split

In [None]:
T  # display the binary-encoded State matrix for the test split

# Train and run the model

The output shows our test data including the actual *Repaid* against the prediction.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a shallow forest on the encoded training states against the Repaid labels;
# random_state is pinned so re-running the cell builds the same forest.
clf = RandomForestClassifier(random_state=0, max_depth=2).fit(X, train['Repaid'])

# Predict Repaid for the encoded test states.
p = clf.predict(T)

# assign() returns a copy of `test` with the predictions in a new column `p`;
# `test` itself is left unchanged.
test.assign(p=p)

# Classification report

Summarises how predictions compared with actual *Repaid* on the test set.

In [None]:
from sklearn.metrics import classification_report

# Compare the actual Repaid values of the test split with the predictions `p`.
print(
    classification_report(
        y_true=test['Repaid'],
        y_pred=p,
    )
)

# Confusion Matrix

Rows are the actual values and columns are the predicted values, so the count of true negatives is at (0,0), false negatives at (1,0), true positives at (1,1) and false positives at (0,1).

In [None]:
from sklearn.metrics import confusion_matrix

# Actual Repaid values of the test split versus the predictions `p`.
confusion_matrix(
    y_true=test['Repaid'],
    y_pred=p,
)

# Revisiting our training set

We have a clear pattern in *Repaid* based on state.

In [None]:
# Count the training records for every (State, Repaid) combination.
train.groupby(['State', 'Repaid'])[['Repaid']].count()

In [None]:
# Per-state mean of Repaid in the training split (sum of Repaid / number of
# non-null Repaid values). Assumes Repaid is numeric 0/1 so that this is the
# share of repaid records -- TODO confirm against data.csv.
# The previous form divided by a count over *all* columns; DataFrame division
# aligns on column labels, so every column other than Repaid appeared in the
# output filled with NaN.
train.groupby('State')[['Repaid']].mean()