In [1]:
import os.path

# combining csv files and writing combined file

In [2]:
# The five input files: data/1-right.csv through data/5-right.csv, in order.
csv_files = [
    os.path.join('data', f'{index}-right.csv')
    for index in range(1, 6)
]

In [3]:
# The first line of the first file is the reference header that the other
# files are compared against when they are combined.
with open(csv_files[0], mode='r') as first_file:
    csv_header = first_file.readline()

In [4]:
# Accumulator for the lines of the combined file; starts out empty.
content = list()

In [5]:
# Append the data rows of every input file to `content`, after checking that
# each file starts with the reference header stored in `csv_header`.
# Raises ValueError if a file is empty or its header differs.
for csv_file in csv_files:
    with open(csv_file, 'r') as f:
        lines = f.readlines()

    # An empty file has no header line to compare; fail with a clear message
    # instead of an IndexError on lines[0].
    if not lines:
        raise ValueError(f"csv file is empty: {csv_file}")

    # Compare without the line terminator so a header-only file that lacks a
    # trailing newline is not reported as a mismatch.
    if lines[0].rstrip('\n') != csv_header.rstrip('\n'):
        raise ValueError(f"csv headers don't match: {csv_file}")

    rows = lines[1:]
    # If the last row has no trailing newline it would be glued to the first
    # row of the next file once all lines are joined; terminate it here.
    if rows and not rows[-1].endswith('\n'):
        rows[-1] += '\n'
    content += rows

In [6]:
# Sanity check: how many data lines were collected (header lines were skipped).
len(content)

1166

In [7]:
# Put the header line first so the combined file starts with it.
# The guard keeps this cell idempotent: re-running it must not insert a
# second header line, which would end up as a bogus data row in the csv.
if not content or content[0] != csv_header:
    content.insert(0, csv_header)

In [8]:
# Save the combined csv: the collected lines are written back-to-back,
# exactly as they were read, with nothing inserted between them.
with open('combined-data.csv', 'w') as f:
    f.writelines(content)

# reading data from csv and preparing it

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [10]:
# Load the combined csv written earlier in this notebook into a DataFrame.
df = pd.read_csv('combined-data.csv')

In [11]:
# Show the loaded frame (the notebook view elides the middle rows and columns).
df

Unnamed: 0,number,v0,v1,v2,v3,v4,v5,v6,v7,v8,...,v53,v54,v55,v56,v57,v58,v59,v60,v61,v62
0,1,0.459556,1.068004,-0.000023,0.404865,0.988291,0.021323,0.386485,0.904044,0.013519,...,-0.092794,0.537547,0.846286,-0.079927,0.523416,0.877155,-0.060446,0.514705,0.915527,-0.059300
1,1,0.514210,1.035730,0.000031,0.446691,0.937000,0.016114,0.420571,0.842098,0.017736,...,-0.035976,0.555677,0.774057,-0.036686,0.539824,0.806357,-0.041776,0.534274,0.844050,-0.050918
2,1,0.631706,0.915632,0.000013,0.559298,0.834830,-0.007613,0.515313,0.744087,-0.017699,...,-0.042729,0.670636,0.670955,-0.074938,0.641264,0.701070,-0.101158,0.626095,0.737828,-0.113989
3,1,0.644954,0.898782,-0.000032,0.572989,0.795587,-0.007245,0.550288,0.704822,-0.009594,...,-0.007517,0.709739,0.665345,-0.030206,0.689190,0.686612,-0.045460,0.676137,0.717414,-0.051320
4,1,0.687299,0.861615,-0.000038,0.620114,0.773832,-0.009120,0.590263,0.693418,-0.017644,...,-0.023523,0.730394,0.639077,-0.048310,0.715270,0.682701,-0.056428,0.709047,0.709845,-0.051895
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1161,5,0.543384,0.697984,-0.000023,0.484875,0.667651,-0.045035,0.435403,0.606591,-0.068764,...,-0.083742,0.643759,0.500102,-0.111502,0.673980,0.477447,-0.138931,0.702728,0.449528,-0.162217
1162,5,0.539552,0.697464,-0.000043,0.479525,0.668351,-0.045204,0.429955,0.611073,-0.068713,...,-0.074027,0.635535,0.498001,-0.101778,0.663637,0.472552,-0.130003,0.690142,0.442413,-0.152455
1163,5,0.540279,0.694836,-0.000019,0.479537,0.666893,-0.042827,0.428672,0.608975,-0.061878,...,-0.057261,0.629776,0.485303,-0.082247,0.656825,0.459002,-0.108845,0.682912,0.428398,-0.129342
1164,5,0.536842,0.692231,-0.000039,0.476965,0.667322,-0.041909,0.425245,0.611663,-0.059864,...,-0.054448,0.621683,0.481183,-0.078393,0.646523,0.454400,-0.103199,0.670804,0.423767,-0.122771


In [12]:
# Target value: the 'number' column. Features: every remaining column.
# NOTE(review): if the csv carries a saved index column (it would load as
# 'Unnamed: 0'), it stays in X as a feature — confirm against df.columns.
y = df['number']
X = df.drop(columns='number')

In [13]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=6969,
)

# train model(s)

In [14]:
from sklearn.pipeline import make_pipeline 
from sklearn.preprocessing import StandardScaler 

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

In [15]:
# Candidate classifiers, each wrapped in a pipeline that standardises the
# features before fitting. The two tree ensembles are stochastic, so they get
# a fixed random_state (the same seed the train/test split uses) to make a
# Restart & Run All reproduce the same fitted models and scores.
pipelines = {
    'lr': make_pipeline(StandardScaler(), LogisticRegression()),
    'rc': make_pipeline(StandardScaler(), RidgeClassifier()),
    'rf': make_pipeline(StandardScaler(), RandomForestClassifier(random_state=6969)),
    'gb': make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=6969)),
    'knn': make_pipeline(StandardScaler(), KNeighborsClassifier()),
}

In [16]:
import time

In [17]:
# Fit every pipeline on the training split and keep the fitted models,
# keyed by the same short name used in `pipelines`.
fit_models = {}
for algo, pipeline in pipelines.items():
    print(f'training {algo}')
    # perf_counter is the high-resolution clock intended for measuring
    # durations; unlike time.time() it is not affected by wall-clock changes.
    s_time = time.perf_counter()
    model = pipeline.fit(X_train, y_train)
    # The clock difference is in seconds, so report it as seconds
    # (the previous message labelled the same value as "ms").
    elapsed = time.perf_counter() - s_time
    print(f'finished training {algo} in {elapsed:.3f} s')
    fit_models[algo] = model

training lr
finished training lr in 0.462099552154541 ms
training rc
finished training rc in 0.03398633003234863 ms
training rf
finished training rf in 0.3872346878051758 ms
training gb
finished training gb in 6.534621953964233 ms
training knn
finished training knn in 0.005452632904052734 ms


# evaluating models

In [18]:
from sklearn.metrics import accuracy_score # Accuracy metrics 

In [19]:
# Score each fitted model on the held-out split and print "<key> <accuracy>".
# Loop variable names are kept as-is because they remain visible to later cells.
for algo, model in fit_models.items():
    yhat = model.predict(X_test)
    accuracy = accuracy_score(y_test, yhat)
    print(algo, accuracy)


lr 0.9957264957264957
rc 1.0
rf 1.0
gb 1.0
knn 0.9957264957264957


# saving model