In [13]:
#from sklearn.mixture import GaussianMixture
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import joblib

from codicem.timings_type import *


In [2]:
# Dataset locations and training-set size, named here so they can be changed
# in one place instead of being buried inside the calls below.
TRAIN_DIR = "/home/braddock/expire/morse/new-train"
TEST_DIR = "/home/braddock/expire/morse/new-test"
NUM_TRAIN_SETS = 200000

# load_timings_set returns two values; only the first is used in this notebook.
# The second is bound to a named throwaway rather than `_`, because rebinding
# `_` in an IPython session interferes with its "last output" shortcut.
train, _unused = load_timings_set(TRAIN_DIR, num_sets=NUM_TRAIN_SETS)
test, _unused = load_timings_set(TEST_DIR)

# Show how many timing sequences ended up in each split.
(len(train), len(test))

(200000, 2000)

In [3]:
# Symbol types handled by each classifier. A symbol type's position in its
# list is the integer class label used for it.
marks_stypes = [DOT, DASH]
spaces_stypes = [SYM_SPACE, CHAR_SPACE, WORD_SPACE]

# Forward maps (symbol type -> label) and inverse maps (label -> symbol type).
label_lookup_marks = dict(zip(marks_stypes, range(len(marks_stypes))))
stype_lookup_marks = dict(enumerate(marks_stypes))
label_lookup_spaces = dict(zip(spaces_stypes, range(len(spaces_stypes))))
stype_lookup_spaces = dict(enumerate(spaces_stypes))

def prep_data(timings_set, is_on):
    """Flatten timing sequences into an array of (duration, label) rows.

    Args:
        timings_set: iterable of sequences of timing objects, each exposing
            ``is_on``, ``duration`` and ``stype`` attributes.
        is_on: only timings whose ``is_on`` equals this value are kept. When
            truthy, labels come from ``label_lookup_marks``; otherwise from
            ``label_lookup_spaces``.

    Returns:
        A 2-D array with one row per kept timing: column 0 is the duration,
        column 1 is the integer label. Both columns live in one array and so
        share a single dtype.

    Raises:
        KeyError: if a kept timing's ``stype`` is missing from the lookup.
    """
    if is_on:
        stype_to_label = label_lookup_marks
    else:
        stype_to_label = label_lookup_spaces

    durations = []
    labels = []
    for sequence in timings_set:
        for timing in sequence:
            if timing.is_on == is_on:
                durations.append(timing.duration)
                labels.append(stype_to_label[timing.stype])

    # Stack as two rows, then transpose so that each timing becomes one row.
    return np.array([durations, labels]).T
    


In [4]:
# Training arrays of (duration, label) rows: marks are the is_on=True timings,
# spaces are the is_on=False timings.
X_marks = prep_data(train, is_on=True)
X_spaces = prep_data(train, is_on=False)

In [5]:
# Peek at the first ten mark durations as a single-column 2-D array.
X_marks[:10, :1]

array([[4.8 ],
       [0.43],
       [0.82],
       [0.82],
       [0.37],
       [0.42],
       [0.46],
       [4.62],
       [1.11],
       [0.74]])

In [6]:
# Fixed seed so that re-running the notebook trains the same forests and
# reproduces the same accuracy figures.
RANDOM_SEED = 0

# One classifier for marks and one for spaces, each with 100 trees.
rf_marks = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)
rf_spaces = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)

In [7]:
# Column 0 (duration) is the only feature and must be passed as a 2-D column.
# Column 1 holds the class labels; they share the array's dtype with the
# durations, so they are cast to int before fitting.
rf_marks.fit(X_marks[:, :1], X_marks[:, 1].astype(int))
rf_spaces.fit(X_spaces[:, :1], X_spaces[:, 1].astype(int))

In [15]:
# Persist both forests in a single file; whoever loads it gets them back as a
# two-element list in the order [marks, spaces].
trained_models = [rf_marks, rf_spaces]
joblib.dump(trained_models, "rf.joblib")

['rf.joblib']

In [8]:
# Held-out arrays of (duration, label) rows, split the same way as the
# training data: is_on=True timings are marks, is_on=False timings are spaces.
test_marks = prep_data(test, is_on=True)
test_spaces = prep_data(test, is_on=False)

In [9]:
# Predict a class label for every held-out timing from its duration alone
# (column 0, kept 2-D by slicing rather than indexing).
predicted_marks = rf_marks.predict(test_marks[:, :1])
predicted_spaces = rf_spaces.predict(test_spaces[:, :1])


In [10]:
# Side-by-side look at the first 20 space predictions and their true labels.
preview = slice(20)
predicted_spaces[preview], test_spaces[preview, 1]

(array([0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]),
 array([0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1.,
        0., 0., 0.]))

In [11]:
# Accuracy = fraction of held-out timings whose predicted label equals the
# true label stored in column 1.
marks_accuracy = (predicted_marks == test_marks[:, 1]).mean()
spaces_accuracy = (predicted_spaces == test_spaces[:, 1]).mean()
print("Marks accuracy:", marks_accuracy, " Spaces accuracy:", spaces_accuracy)

Marks accuracy: 0.9505111606353249  Spaces accuracy: 0.9640883977900553
