In [89]:
import random
# pip3 install -U scikit-learn
from sklearn.model_selection import train_test_split as sk_train_test_split
from sklearn.metrics import precision_recall_fscore_support
import numpy as np

In [76]:
def split_data(data, prob):
    """Randomly partition `data` into two lists.

    Every row is placed in the first list with probability `prob` and in the
    second list otherwise, so the resulting sizes are only approximately
    [prob, 1 - prob] of the input. One random number is drawn per row, in
    iteration order.

    Returns a tuple `(first, second)` of lists.
    """
    first, second = [], []
    for item in data:
        if random.random() < prob:
            first.append(item)
        else:
            second.append(item)
    return first, second

In [77]:
def train_test_split(x, y, test_pct):
    """Randomly split paired inputs and targets into train and test sets.

    `x` and `y` are zipped together so each (x, y) pair stays aligned; if they
    differ in length, zip stops at the shorter one. Each pair lands in the
    test set with probability `test_pct`, so the split sizes are random
    rather than exact.

    Returns `(x_train, x_test, y_train, y_test)`, each a tuple. A side of the
    split that received no pairs is returned as empty tuples.
    """
    train, test = split_data(zip(x, y), 1 - test_pct)

    def unzip(pairs):
        # zip(*[]) yields nothing, so unpacking it into two names would raise
        # ValueError; return two empty tuples for an empty split instead.
        return tuple(zip(*pairs)) if pairs else ((), ())

    x_train, y_train = unzip(train)
    x_test, y_test = unzip(test)
    return x_train, x_test, y_train, y_test

In [78]:
# Toy dataset: x values 0..9 paired with y values -10..-1.
X = list(range(10))
y = list(range(-10, 0))

In [79]:
# Seed the RNG first so the random split is reproducible.
random.seed(0)
train_test_split(X, y, test_pct=1/3)

((2, 3, 4, 5, 7, 8, 9), (0, 1, 6), (-8, -7, -6, -5, -3, -2, -1), (-10, -9, -4))

In [80]:
# scikit-learn's splitter for comparison; random_state pins the shuffle so the
# result is repeatable. Returns x_train, x_test, y_train, y_test in that order.
sk_train_test_split(X, y, test_size=1/3, random_state=0, shuffle=True)

[[1, 6, 7, 3, 0, 5], [2, 8, 4, 9], [-9, -4, -3, -7, -10, -5], [-8, -2, -6, -1]]

## Pseudo-code for training
```
model = SomeKindOfModel()
x_train, x_test, y_train, y_test = train_test_split(xs, ys, 0.33)
model.train(x_train, y_train)
performance = model.test(x_test, y_test)
```

In [81]:
def accuracy(tp, fp, fn, tn):
    """Return the share of all predictions that were right.

    Correct predictions are the true positives plus the true negatives; the
    denominator is the sum of all four confusion-matrix counts.
    """
    return (tp + tn) / (tp + fp + fn + tn)

In [82]:
# Keyword arguments make it clear which count is which.
accuracy(tp=70, fp=4930, fn=13930, tn=981070)

0.98114

In [83]:
def precision(tp, fp, fn, tn):
    """Return the share of positive predictions that were actually positive.

    Only `tp` and `fp` enter the calculation; `fn` and `tn` are accepted but
    unused. Raises ZeroDivisionError when there are no positive predictions
    (tp + fp == 0).
    """
    predicted_positive = tp + fp
    return tp / predicted_positive

In [84]:
# Keyword arguments make it clear which count is which.
precision(tp=70, fp=4930, fn=13930, tn=981070)

0.014

In [85]:
def recall(tp, fp, fn, tn):
    """Return the share of actual positives that the model identified.

    Only `tp` and `fn` enter the calculation; `fp` and `tn` are accepted but
    unused. Raises ZeroDivisionError when there are no actual positives
    (tp + fn == 0).
    """
    actual_positive = tp + fn
    return tp / actual_positive

In [86]:
# Keyword arguments make it clear which count is which.
recall(tp=70, fp=4930, fn=13930, tn=981070)

0.005

In [87]:
def f1_score(tp, fp, fn, tn):
    """Return the F1 score: the harmonic mean of precision and recall.

    With no true positives, precision and recall are each either zero or
    undefined, so the harmonic-mean formula would divide by zero. In that
    case 0.0 is returned, which is also what scikit-learn reports by default.
    """
    if tp == 0:
        # Nothing was correctly identified as positive: F1 is 0 by convention.
        return 0.0
    p = precision(tp, fp, fn, tn)
    r = recall(tp, fp, fn, tn)
    return 2 * p * r / (p + r)

In [88]:
# Keyword arguments make it clear which count is which.
f1_score(tp=70, fp=4930, fn=13930, tn=981070)

0.00736842105263158

In [61]:
# Confusion-matrix counts, in the order tp, fp, fn, tn.
tf = [70, 4930, 13930, 981070]
n = sum(tf)

# Build label lists that reproduce those counts.
# NOTE: X holds the *actual* labels and y the *predicted* labels:
#   true positives  -> actual 1, predicted 1
#   false positives -> actual 0, predicted 1
#   false negatives -> actual 1, predicted 0
#   true negatives  -> actual 0, predicted 0
X = [1] * tf[0] + [0] * tf[1] + [1] * tf[2] + [0] * tf[3]
y = [1] * tf[0] + [1] * tf[1] + [0] * tf[2] + [0] * tf[3]

(array([0.986, 0.014]),
 array([0.995, 0.005]),
 array([0.99047956, 0.00736842]),
 array([986000,  14000]))

In [68]:
# precision_recall_fscore_support takes (y_true, y_pred): here X carries the
# actual labels and y the predicted ones. The result is the tuple
# (precision, recall, fscore, support), each an array with one entry per
# class label (0 first, then 1).
m = precision_recall_fscore_support(X, y)
m

(array([0.986, 0.014]),
 array([0.995, 0.005]),
 array([0.99047956, 0.00736842]),
 array([986000,  14000]))

In [93]:
# Stack the four per-class arrays and keep column 1, i.e. the values for the
# positive class (label 1): precision, recall, F-score, support.
np.array(m)[:,1]

array([1.40000000e-02, 5.00000000e-03, 7.36842105e-03, 1.40000000e+04])

## Todos
- What are good values for precision, recall, etc.?
- How to calculate those values with scikit-learn? (`precision_recall_fscore_support`, used above, is one way.)