In [1]:
import os.path

# combining csv files and writing combined file

In [2]:
csv_files = [
    os.path.join('data', '1-right.csv'),
    os.path.join('data', '2-right.csv'),
    os.path.join('data', '3-right.csv'),
    os.path.join('data', '4-right.csv'),
    os.path.join('data', '5-right.csv'),
]

In [3]:
with open(csv_files[0], 'r') as f:
    csv_header = f.readline()

In [4]:
content = []

In [5]:
for csv_file in csv_files:
    with open(csv_file, 'r') as f:
        lines = f.readlines()
        if lines[0] != csv_header:
            raise ValueError('csv headers don\'t match')
        else:
            content += lines[1:]

In [6]:
len(content)

2294

In [7]:
content.insert(0, csv_header)

In [8]:
# saving combined csv
with open('combined-data.csv', 'w') as f:
    print(''.join(content), end='', file=f)

# reading data from csv and preparing it

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [10]:
df = pd.read_csv('combined-data.csv')

In [11]:
df

Unnamed: 0,number,v0,v1,v2,v3,v4,v5,v6,v7,v8,...,v53,v54,v55,v56,v57,v58,v59,v60,v61,v62
0,1,0.456219,0.873755,-0.000070,0.532996,0.882750,-0.020303,0.594814,0.887066,-0.072079,...,-0.192685,0.412329,0.849084,-0.242480,0.413113,0.895810,-0.251400,0.416987,0.939110,-0.244810
1,1,0.382249,1.022371,0.000041,0.480319,1.008138,-0.054953,0.566399,0.941803,-0.118954,...,-0.232701,0.362973,0.807706,-0.292520,0.383200,0.872722,-0.315798,0.388456,0.921556,-0.331551
2,1,0.355092,1.021233,-0.000152,0.453272,0.974141,-0.006805,0.519954,0.898316,-0.031711,...,-0.123474,0.367822,0.769571,-0.178965,0.389779,0.832463,-0.178868,0.389396,0.883066,-0.164177
3,1,0.317955,0.978340,-0.000045,0.401257,0.952056,-0.010679,0.462353,0.875827,-0.042232,...,-0.101091,0.348230,0.695236,-0.122613,0.357710,0.768464,-0.110804,0.344006,0.803927,-0.101029
4,1,0.309653,0.933218,-0.000035,0.385834,0.897101,-0.022649,0.443898,0.801311,-0.049826,...,-0.081017,0.315975,0.631280,-0.109333,0.324516,0.711649,-0.095311,0.309623,0.755084,-0.079429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2289,5,0.350177,0.760743,-0.000073,0.408041,0.749393,-0.025332,0.455017,0.679040,-0.032530,...,-0.008599,0.322104,0.473742,-0.013582,0.319052,0.427352,-0.024718,0.316667,0.380856,-0.034656
2290,5,0.349107,0.759184,-0.000085,0.407159,0.748954,-0.024824,0.454302,0.678276,-0.030291,...,-0.005590,0.321730,0.472208,-0.011031,0.318479,0.425481,-0.022108,0.315923,0.378818,-0.031650
2291,5,0.348406,0.759560,-0.000079,0.406499,0.749546,-0.025332,0.453935,0.678353,-0.030344,...,-0.004958,0.320784,0.472693,-0.010299,0.317779,0.426051,-0.021836,0.315392,0.379795,-0.032051
2292,5,0.348499,0.759566,-0.000082,0.406132,0.749411,-0.024803,0.453403,0.676937,-0.029542,...,-0.006050,0.320109,0.472807,-0.011715,0.316855,0.425691,-0.023115,0.314076,0.378051,-0.033523


In [12]:
X = df.drop('number', axis=1) # features
y = df['number'] # target value

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=6969)

# train model(s)

In [14]:
from sklearn.pipeline import make_pipeline 
from sklearn.preprocessing import StandardScaler 

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

In [15]:
pipelines = {
    'lr':make_pipeline(StandardScaler(), LogisticRegression()),
    'rc':make_pipeline(StandardScaler(), RidgeClassifier()),
    'rf':make_pipeline(StandardScaler(), RandomForestClassifier()),
    'gb':make_pipeline(StandardScaler(), GradientBoostingClassifier()),
    'knn':make_pipeline(StandardScaler(), KNeighborsClassifier()),
}

In [16]:
import time

In [17]:
fit_models = {}
for algo, pipeline in pipelines.items():
    print(f'training {algo}')
    s_time = time.time()
    model = pipeline.fit(X_train, y_train)
    print(f'finished training {algo} in {time.time() - s_time} ms')
    fit_models[algo] = model

training lr


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


finished training lr in 0.6109356880187988 ms
training rc
finished training rc in 0.026322126388549805 ms
training rf
finished training rf in 0.6469628810882568 ms
training gb
finished training gb in 13.91026520729065 ms
training knn
finished training knn in 0.0044498443603515625 ms


# evaluating models

In [18]:
from sklearn.metrics import accuracy_score # Accuracy metrics 

In [19]:
for algo, model in fit_models.items():
    yhat = model.predict(X_test)
    print(algo, accuracy_score(y_test, yhat))


lr 1.0
rc 0.9934640522875817
rf 0.9934640522875817
gb 0.9934640522875817
knn 0.9934640522875817


# saving model

In [20]:
import pickle

In [21]:
with open('trained-model.pkl', 'wb') as f:
    pickle.dump(fit_models['rf'], f)