In [1]:
import os.path

# combining csv files and writing combined file

In [2]:
csv_files = [
    os.path.join('data', '1-right.csv'),
    os.path.join('data', '2-right.csv'),
    os.path.join('data', '3-right.csv'),
    os.path.join('data', '4-right.csv'),
    os.path.join('data', '5-right.csv'),
    os.path.join('data', '5-right-cam2.csv'),
    os.path.join('data', '4-right-cam2.csv'),
]

In [3]:
with open(csv_files[0], 'r') as f:
    csv_header = f.readline()

In [4]:
content = []

In [5]:
for csv_file in csv_files:
    with open(csv_file, 'r') as f:
        lines = f.readlines()
        if lines[0] != csv_header:
            raise ValueError('csv headers don\'t match')
        else:
            content += lines[1:]

In [6]:
len(content)

7451

In [7]:
content.insert(0, csv_header)

In [8]:
# saving combined csv
with open('combined-data.csv', 'w') as f:
    print(''.join(content), end='', file=f)

# reading data from csv and preparing it

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [10]:
df = pd.read_csv('combined-data.csv')

In [11]:
df

Unnamed: 0,number,v0,v1,v2,v3,v4,v5,v6,v7,v8,...,v53,v54,v55,v56,v57,v58,v59,v60,v61,v62
0,1,0.602748,0.917588,-0.000028,0.693633,0.912995,-0.006857,0.754931,0.900526,-0.040802,...,-0.147942,0.682083,0.750853,-0.199479,0.711456,0.792313,-0.227089,0.731041,0.826979,-0.242656
1,1,0.534882,0.863483,-0.000140,0.595566,0.866910,-0.030099,0.660967,0.806089,-0.048674,...,-0.032465,0.569288,0.576899,-0.092532,0.574232,0.634821,-0.115561,0.572673,0.679820,-0.123432
2,1,0.520680,0.898689,-0.000142,0.585637,0.884776,-0.018797,0.647537,0.798036,-0.030558,...,-0.048226,0.541657,0.600207,-0.096860,0.554513,0.663675,-0.103948,0.547745,0.703274,-0.100962
3,1,0.476776,0.898089,-0.000088,0.544837,0.890374,-0.026918,0.602967,0.797070,-0.037584,...,-0.065043,0.499007,0.660150,-0.107898,0.506651,0.727275,-0.102076,0.491929,0.760151,-0.089867
4,1,0.454493,0.901271,-0.000100,0.515079,0.880060,-0.025587,0.574937,0.795480,-0.050120,...,-0.074114,0.472746,0.650696,-0.114860,0.482902,0.717718,-0.109989,0.468389,0.759947,-0.101114
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7446,4,0.460346,0.801832,-0.000082,0.421153,0.754602,-0.033525,0.402319,0.653197,-0.041127,...,-0.009764,0.533310,0.550837,-0.019729,0.546845,0.515787,-0.026886,0.560670,0.480423,-0.028421
7447,4,0.460667,0.802326,-0.000083,0.420983,0.755234,-0.033787,0.401414,0.653567,-0.041362,...,-0.010277,0.533197,0.550058,-0.020391,0.546836,0.515047,-0.027055,0.560680,0.480198,-0.027919
7448,4,0.463359,0.802403,-0.000084,0.423455,0.755093,-0.035096,0.404398,0.652363,-0.043994,...,-0.015887,0.533725,0.550645,-0.028575,0.547704,0.514707,-0.036801,0.562415,0.477812,-0.038653
7449,4,0.462902,0.803138,-0.000084,0.423486,0.758174,-0.034460,0.403409,0.657140,-0.043354,...,-0.013569,0.533694,0.548648,-0.025409,0.548187,0.513385,-0.033197,0.563307,0.477578,-0.034698


In [12]:
X = df.drop('number', axis=1) # features
y = df['number'] # target value

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=6969)

# train model(s)

In [14]:
from sklearn.pipeline import make_pipeline 
from sklearn.preprocessing import StandardScaler 

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

In [15]:
pipelines = {
    'lr':make_pipeline(StandardScaler(), LogisticRegression()),
    'rc':make_pipeline(StandardScaler(), RidgeClassifier()),
    'rf':make_pipeline(StandardScaler(), RandomForestClassifier()),
    'gb':make_pipeline(StandardScaler(), GradientBoostingClassifier()),
    'knn':make_pipeline(StandardScaler(), KNeighborsClassifier()),
}

In [16]:
import time

In [None]:
fit_models = {}
for algo, pipeline in pipelines.items():
    print(f'training {algo}')
    s_time = time.time()
    model = pipeline.fit(X_train, y_train)
    print(f'finished training {algo} in {time.time() - s_time} s')
    fit_models[algo] = model

training lr


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


finished training lr in 0.9754440784454346 s
training rc
finished training rc in 0.040846824645996094 s
training rf
finished training rf in 4.081534385681152 s
training gb


# evaluating models

In [18]:
from sklearn.metrics import accuracy_score # Accuracy metrics 

In [19]:
for algo, model in fit_models.items():
    yhat = model.predict(X_test)
    print(algo, accuracy_score(y_test, yhat))


lr 0.9953051643192489
rc 0.9899396378269618
rf 0.9966465459423206
gb 0.9926224010731053
knn 0.9919517102615694


# saving model

In [20]:
import pickle

In [21]:
with open('trained-model.pkl', 'wb') as f:
    pickle.dump(fit_models['rf'], f)