In [19]:
import os
import glob
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
import csv
import pandas as pd
import random
from collections import defaultdict
from joblib import dump, load
import numpy as np

In [20]:
# Folder names under data/ that are scanned for each class; the not-crossing
# folders mirror the crossing ones with a 'not_' prefix.
locations_crossing = ['crossing' + suffix for suffix in ('_wen', '_nick', '')]
locations = ['not_' + folder for folder in locations_crossing]

# Rows collected per CSV file name (filled by the loading cell), one mapping per class.
not_crossing, crossing = defaultdict(list), defaultdict(list)

In [21]:
# Read only the valid data into dict

def _load_valid_rows(location_names, target, min_columns=5 * 2):
    """Append the rows of every wide-enough CSV under data/<location>/ to target.

    Parameters
    ----------
    location_names : iterable of str
        Folder names under ``data/`` that are scanned for ``*.csv`` files.
    target : mapping of str -> list (a ``defaultdict(list)`` here)
        Updated in place. Rows are stored under the CSV's file name without
        its extension, so same-named files from different folders are merged
        into one list.
    min_columns : int
        CSVs with fewer columns are skipped. The default, 5 * 2, corresponds
        to 5 points with 2 columns per point for x & y.
    """
    for location in location_names:
        for filename in glob.glob(os.path.join('data', location, '*.csv')):
            # Take the key from the file name itself rather than by string
            # replacement on the path, so it does not depend on which path
            # separator glob returns.
            name = os.path.splitext(os.path.basename(filename))[0]

            df = pd.read_csv(filename)
            if len(df.columns) < min_columns:
                continue
            for row in df.values:
                target[name].append(row)


# NOTE: re-running this cell appends the same rows again; re-run the cell that
# creates the two dicts first to start from empty mappings.
_load_valid_rows(locations, not_crossing)
_load_valid_rows(locations_crossing, crossing)

In [22]:
# sanity check
# Use .get() instead of indexing: `crossing` is a defaultdict, so indexing a
# key that was never loaded would insert an empty entry and make later
# `key in crossing` checks succeed for it.
crossing.get('01234567891011121314151617', [])[:5]

[]

In [23]:
# Create models

def extract(datum, label):
    """Pair a class label with one sample's feature values.

    Every element of ``datum`` is converted with ``float``; the result is the
    tuple ``(label, features)`` with ``features`` a list in the original order.
    """
    features = list(map(float, datum))
    return label, features

model_scores = []

# Models are written below models/; make sure it exists so dump() cannot fail
# on a missing directory.
os.makedirs('models', exist_ok=True)

# Remove models left over from earlier runs. Models are saved as .joblib and
# the score is part of the file name, so stale files would otherwise pile up
# next to the new ones. The .sav pattern is kept for files written by the
# former pickle-based export.
for pattern in ('models/*.sav', 'models/*.joblib'):
    for filename in glob.glob(pattern):
        os.remove(filename)

for key in not_crossing:
    # Only keys with samples recorded for both classes can be trained.
    if key not in crossing:
        continue

    # Label encoding: 2 = not crossing, 1 = crossing.
    data = [extract(d, 2) for d in not_crossing[key]]
    data += [extract(d, 1) for d in crossing[key]]

    # NOTE: no random seed is set, so the split (and the scores) differ between runs.
    random.shuffle(data)

    X = [d[1] for d in data]
    y = [d[0] for d in data]

    N = len(X)

    # Too few samples for a meaningful 75/25 split.
    if N < 8:
        continue

    split_at = 3 * N // 4
    X_train, X_test = X[:split_at], X[split_at:]
    y_train, y_test = y[:split_at], y[split_at:]

    clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))

    try:
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
    except ValueError as err:
        # Usually this means there is only one classification in training data,
        # caused by the train/test split. Only ValueError is caught so that
        # unrelated failures (I/O errors, interrupts) are no longer hidden.
        print("❌ omitted " + key + ' count: ' + str(N), 'reason: ' + str(err))
        continue

    if score < 0.6:
        print("⚠️ low score " + key + ' count: ' + str(N), 'score: ' + str(score))
    else:
        print('✔️ ' + key + ' score: ' + str(score), 'count: ' + str(N))
        # Only models that reach the 0.6 threshold are persisted.
        dump(clf, 'models/' + key + '_{:.2f}'.format(score) + '.joblib')
    model_scores.append((key, score, N))

In [24]:
# Rank the (key, score, count) entries: best score first, larger sample count
# first among equal scores.
sorted(model_scores, key=lambda entry: entry[1:3], reverse=True)

[]

### The best model: 01234567891011121314151617

In [25]:
model_dict = {}


def model_key_from_path(filename):
    """Return the key encoded in a saved model's file name.

    Models are saved as ``models/<key>_<score>.joblib``; everything before the
    last underscore of the base name is the key. Working on the base name and
    splitting at the underscore avoids depending on the path separator or on
    the exact length of the ``_<score>.joblib`` suffix.
    """
    return os.path.basename(filename).rsplit('_', 1)[0]


for filename in glob.glob('models/*.joblib'):
    name = model_key_from_path(filename)
    print(name)

    # joblib's load() unpickles the file: only load models this notebook wrote.
    model_dict[name] = load(filename)

In [26]:
# Build a labelled 75/25 split for the best model's key.
best_key = "01234567891011121314151617"

# Use the same label encoding as the training cell (2 = not crossing,
# 1 = crossing); with the labels swapped, any score computed against a trained
# model would be inverted. .get() avoids inserting empty entries into the
# defaultdicts when the key was never loaded.
data = [extract(d, 2) for d in not_crossing.get(best_key, [])]
data += [extract(d, 1) for d in crossing.get(best_key, [])]

# NOTE: this is a fresh shuffle, not the split the saved model was trained on,
# so X_test below can contain samples the model has already seen.
random.shuffle(data)

X = [d[1] for d in data]
y = [d[0] for d in data]

N = len(X)

split_at = 3 * N // 4
X_train, X_test = X[:split_at], X[split_at:]
y_train, y_test = y[:split_at], y[split_at:]

# print(model_dict["01234567891011121314151617"].score(X_test, y_test))

In [18]:
# Export one key's samples as labelled train/test CSV files (80/20 split).
# NOTE(review): this key differs from the best-model key used elsewhere in the
# notebook by a missing "3" — confirm this is the intended file.
file = "0124567891011121314151617"

# Both mappings are defaultdicts, so indexing a missing key would silently
# return an empty list and produce empty CSV files. Look the key up without
# inserting it and fail loudly instead.
cross_rows = crossing.get(file, [])
not_cross_rows = not_crossing.get(file, [])
if len(cross_rows) == 0 or len(not_cross_rows) == 0:
    raise ValueError(
        'no data to export for ' + repr(file)
        + ' (crossing rows: ' + str(len(cross_rows))
        + ', not crossing rows: ' + str(len(not_cross_rows)) + ')'
    )

# The label is appended as the last column: 0 = crossing, 1 = not crossing.
# This encoding is specific to the exported files; the SVM cells use 1 and 2.
cross_arr = np.asarray(cross_rows)
cross_arr = [np.append(d, 0) for d in cross_arr]

not_cross_arr = np.asarray(not_cross_rows)
not_cross_arr = [np.append(d, 1) for d in not_cross_arr]

combined_arr = cross_arr + not_cross_arr

# NOTE: no random seed is set, so the exported split differs between runs.
random.shuffle(combined_arr)

N = len(combined_arr)

split_at = 4 * N // 5
train_arr = combined_arr[:split_at]
test_arr = combined_arr[split_at:]

np.savetxt('train.csv', train_arr, delimiter=',')
np.savetxt('test.csv', test_arr, delimiter=',')