In [160]:
#from sklearn.mixture import GaussianMixture
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import joblib
from typing import List, Tuple
from copy import copy, deepcopy
import sys

from codicem.timings_type import *
from codicem.util import *


In [114]:
train, _ = load_timings_set("/home/braddock/expire/morse/new-train", num_sets=200000)
test, _ = load_timings_set("/home/braddock/expire/morse/new-test")
(len(train), len(test))

(200000, 2000)

In [35]:
marks_stypes = [DOT, DASH]
spaces_stypes = [SYM_SPACE, CHAR_SPACE, WORD_SPACE]
label_lookup_marks = {stype: i for i, stype in enumerate(marks_stypes)}
stype_lookup_marks = {i: stype for i, stype in enumerate(marks_stypes)}
label_lookup_spaces = {stype: i for i, stype in enumerate(spaces_stypes)}
stype_lookup_spaces = {i: stype for i, stype in enumerate(spaces_stypes)}

def prep_data(timings_set, is_on):
    label_lookup = label_lookup_marks if is_on else label_lookup_spaces
    timings = [t for timings in timings_set for t in timings if t.is_on == is_on]
    durations = [t.duration for t in timings]
    labels = [label_lookup[t.stype] for t in timings]
    X = np.array(durations).reshape(-1, 1)
    return X, labels
    


In [36]:
X_marks, labels_marks = prep_data(train, True)
X_spaces, labels_spaces = prep_data(train, False)

In [38]:
X_marks[:10]

array([[4.8 ],
       [0.43],
       [0.82],
       [0.82],
       [0.37],
       [0.42],
       [0.46],
       [4.62],
       [1.11],
       [0.74]])

In [39]:
rf_marks = RandomForestClassifier(n_estimators=10)
rf_spaces = RandomForestClassifier(n_estimators=10)

In [40]:
rf_marks.fit(X_marks, labels_marks)
rf_spaces.fit(X_spaces, labels_spaces)

In [15]:
joblib.dump([rf_marks, rf_spaces], "rf.joblib")

['rf.joblib']

In [41]:
test_marks, test_labels_marks = prep_data(test, True)
test_spaces, test_labels_spaces = prep_data(test, False)

In [42]:
predicted_marks = rf_marks.predict(test_marks)
predicted_spaces = rf_spaces.predict(test_spaces)


In [45]:
predicted_spaces[:20], test_labels_spaces[:20]

(array([0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]),
 [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0])

In [157]:
marks_accuracy = np.mean(predicted_marks == test_labels_marks)
spaces_accuracy = np.mean(predicted_spaces == test_labels_spaces)
print("Marks accuracy:", marks_accuracy, " Spaces accuracy:", spaces_accuracy)

Marks accuracy: 0.9505111606353249  Spaces accuracy: 0.9640883977900553


In [11]:
# Random forest with 100 estimators and 200,000 training sets
# Marks accuracy: 0.9505111606353249  Spaces accuracy: 0.9640883977900553
# Random forest with 20 estimators and 200,000 training sets
# Marks accuracy: 0.9505111606353249  Spaces accuracy: 0.9640525220635718

In [190]:
def predicted2timings(timing_set, predicted_marks, predicted_spaces):
    m = predicted_marks.tolist()
    s = predicted_spaces.tolist()
    out_set = []
    for timings in timing_set:
        labeled_timings = []
        assert len(m) and len(s)
        for t in timings:
            if t.is_on:
                stype = stype_lookup_marks[m.pop(0)]
            else:
                stype = stype_lookup_spaces[s.pop(0)]
            timing_copy = deepcopy(t)
            timing_copy.stype = stype
            timing_copy.label = '~'
            labeled_timings.append(timing_copy)
        out_set.append(labeled_timings)
    return out_set
        

def timings2dashdots(timings: List[Timing]) -> Tuple[str, List[Timing]]:
    dd = []
    sym = ''
    new_timings = []
    for t in timings:
        new_timings.append(copy(t))
        new_timings[-1].label = '~'
        if t.stype == DOT:
            sym += '.'
        elif t.stype == DASH:
            sym += '-'
        elif t.stype == WORD_SPACE:
            if len(sym) > 0:
                dd.append(sym)
                try:
                    new_timings[-1].label = dashdot2char(sym) + ' '
                except KeyError as e:
                    new_timings[-1].label = ' '
            dd.append(' ')
        elif t.stype == CHAR_SPACE:
            dd.append(sym)
            sym = ''
        elif t.stype == SYM_SPACE:
            pass
        else:
            raise ValueError(f"Unknown stype {t.stype}")
        
    return dd, new_timings

def dashdots2string(dashdots: List[str]) -> str:
    s = ''
    for dd in dashdots:
        try:
            c = dashdot2char(dd)
        except KeyError as e:
            c = '~'
        s += c
    return s


def score_timing_sets(reference: List[List[Timing]], predicted: List[List[Timing]]):
    if len(reference) != len(predicted):
        raise ValueError("len(reference) != len(predicted)")
    correct = []
    for ref, pred in zip(reference, predicted):
        if ref[-1].label == pred[-1].label:
            correct.append(1)
        else:
            print(f"{timings2dashdots(ref)[0]} vs {timings2dashdots(pred)[0]}")
            correct.append(0)
    return correct 

dd, _ = timings2dashdots(train[10])
print(dd)

#dashdots2string(dd), dd, ''.join([t.label for t in train[10]])

['.----', '.--', '..-']


In [191]:
predicted_timings = predicted2timings(test[:1000], predicted_marks, predicted_spaces)
predicted_timings = [timings2dashdots(t)[1] for t in predicted_timings]
correct = score_timing_sets(test[:1000], predicted_timings)
accuracy = np.mean(correct)
accuracy

['.-.-.-', '...', '.-..', '-.-.'] vs ['.-.-.-', '...', '.-..']
['-....', '--..--', '--..', ' '] vs ['-....', '--..--', '--..']
['-..-', '---', '.--.', '--'] vs ['-..-', '---', '.--.', '--']
['.--.', '--..--', '--.', '.-'] vs ['.--.', '--...-', '--.', '.-']
['.-..', '..', '.', '-'] vs ['.-..', '-.', '-', '-']
['-.-', ' ', '-.--....', '-....-'] vs ['-.-', '-....', '-....-']
[' ', '--.', '-.-.'] vs ['', '.-.-.-.']
['...-', '--..', '----.', '-....'] vs ['...-', '.-..', '----.', '-....']
['---', '-....-', '.....', '-----'] vs ['---', '-....-', '.....', '-----']
['.-..', ' ', '.-..-....-', '-.-.'] vs ['.-..', ' ', '.-..-....-']
['..---', '.--.', ' ', '.--......'] vs ['..---', '.--.', '.....']
[' ', '-.--', '-..-', '--'] vs ['', '-.--']
['.--.-.', ' ', '.--.-..----', '--'] vs ['.-..-.', ' ', '.-..-..-..-', '--']
['--.-', ' ', '--.-.--.-.', '.-..'] vs ['--.-', '.--...', '.-..']
['-...', '....-', '-....-', '-----'] vs ['-...', '....-', '-....-', '-----']
[' ', '---...', ' ', '---....-.-.-'] vs 

0.647

In [192]:
# save_timings(sys.stdout, predicted_timings[2], "PREDICT")
# print("----------------------------")
# save_timings(sys.stdout, test[2], "TEST")

[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]

In [207]:
np.mean(predicted_spaces == 2), np.mean(np.array(labels_spaces, dtype=int) == 2)

(0.016502834182392193, 0.027900560193030888)

In [11]:
marks_sum = 0
spaces_sum = 0
for timings in test:
    mark_durations, mark_labels = prep_data([timings], True)
    space_durations, space_labels = prep_data([timings], False)
    predicted_marks = rf_marks.predict(mark_durations)
    predicted_spaces = rf_spaces.predict(space_durations)
    if predicted_marks[-1] == mark_labels[-1]:
        marks_sum += 1
    if predicted_spaces[-1] == space_labels[-1]:
        spaces_sum += 1
print("Character")