# Logistic regression

In [1]:
# visualisation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn import preprocessing

# model
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split

# validation
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

### train data

In [2]:
train_labels = pd.read_csv("./data/train_labels.csv", header=None)
print(train_labels.shape)

train_labels.head()

(4363, 1)


Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1


In [3]:
train_data = pd.read_csv("./data/train_data.csv", header=None)
print(train_data.shape)
train_data.head()

(4363, 264)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,254,255,256,257,258,259,260,261,262,263
0,1040.7,2315.6,2839.1,2552.2,2290.4,1913.8,2152.6,1930.3,2079.3,1706.7,...,0.21649,0.36548,0.093584,0.16687,0.083426,0.11809,0.089792,0.074371,0.073162,0.059463
1,2309.4,4780.4,4055.7,3120.5,1979.9,2343.6,2634.2,3208.5,3078.0,3374.7,...,0.10067,0.14739,0.10256,0.21304,0.082041,0.080967,0.07645,0.052523,0.052357,0.055297
2,2331.9,4607.0,4732.3,5007.0,3164.9,3171.9,2915.7,3282.3,2400.0,1895.2,...,0.12676,0.36321,0.1142,0.22378,0.10077,0.18691,0.06727,0.061138,0.085509,0.049422
3,3350.9,6274.4,5037.0,4609.7,3438.8,3925.8,3746.4,3539.4,3053.7,3075.4,...,0.096479,0.2895,0.074124,0.20158,0.049032,0.13021,0.0458,0.080885,0.14891,0.042027
4,2017.6,3351.8,2924.9,2726.3,1979.9,1930.9,2083.4,1889.2,1695.4,1911.7,...,0.13834,0.38266,0.079402,0.063495,0.053717,0.08675,0.06209,0.048999,0.033159,0.070813


In [4]:
# scale
scaled_features = preprocessing.scale(train_data)
pd.DataFrame(scaled_features).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,254,255,256,257,258,259,260,261,262,263
0,-1.571333,-1.423949,-0.935285,-1.169432,-0.772396,-1.258998,-0.811812,-0.985136,-0.581884,-0.756522,...,1.279245,1.266232,-0.158097,0.297566,0.164625,0.499226,0.921775,-0.185416,0.092879,0.006096
1,-0.602171,0.267213,0.055395,-0.711935,-1.073589,-0.876857,-0.373028,0.155161,0.39948,0.963212,...,-0.897103,-0.852736,0.074136,1.103637,0.111916,-0.726099,0.269456,-1.177036,-0.694615,-0.2238
2,-0.584983,0.148239,0.606352,0.806748,0.07589,-0.140405,-0.116555,0.220999,-0.266751,-0.562176,...,-0.406852,1.244176,0.375294,1.291144,0.824687,2.77078,-0.179373,-0.786025,0.560227,-0.548004
3,0.193432,1.292285,0.85447,0.486911,0.341579,0.529897,0.640292,0.450361,0.375602,0.654629,...,-0.975856,0.528008,-0.66158,0.90356,-1.144311,0.899273,-1.229087,0.110236,2.960029,-0.956088
4,-0.825078,-0.712986,-0.865418,-1.029277,-1.073589,-1.243794,-0.87486,-1.021802,-0.95912,-0.545164,...,-0.189255,1.433153,-0.525024,-1.507232,-0.966014,-0.535219,-0.432634,-1.336981,-1.421282,0.632431


In [5]:
X_train, X_test, y_train, y_test = train_test_split(scaled_features, train_labels[0], test_size=0.2)

In [6]:
initialModel = LogisticRegressionCV(solver='lbfgs', max_iter=10**6, cv=5)
initialModel.fit(X_train,y_train)

LogisticRegressionCV(Cs=10, class_weight=None, cv=5, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=1000000,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [8]:
predictions = initialModel.predict(X_test)

In [9]:
predictions.shape

(873,)

In [10]:
accuracy_score(y_test, predictions)

0.6391752577319587

In [11]:
np.unique(predictions)

array([1, 2, 3, 4, 5, 6])

In [12]:
pred_proba = initialModel.predict_proba(X_test)
log_loss(y_true=y_test, y_pred=pred_proba)

1.1873170454160231

### train using full data

In [13]:
mainModel = LogisticRegressionCV(solver='lbfgs', max_iter=10**6, cv=5)
mainModel.fit(scaled_features, train_labels[0])

LogisticRegressionCV(Cs=10, class_weight=None, cv=5, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=1000000,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

# predict on test datasets

In [14]:
test_data = pd.read_csv("./data/test_data.csv", header=None)
scaled_test_data = preprocessing.scale(test_data)

In [15]:
pd.DataFrame(scaled_test_data).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,254,255,256,257,258,259,260,261,262,263
0,0.043826,-0.809767,-0.945635,-0.571495,-0.451566,-0.539191,-0.568266,-0.369424,-0.299865,0.101217,...,0.988079,0.40627,1.051041,0.315653,1.091843,1.080988,0.829564,1.646998,0.159933,1.253805
1,-0.764994,-0.760131,-1.318111,-0.279499,-1.052338,-0.93628,-1.156958,-1.194894,-1.448273,-1.322496,...,-1.008087,-0.546926,0.945286,-0.057584,-0.624147,0.383785,0.286891,1.209392,1.329834,0.436842
2,-1.111097,-0.395324,-0.271093,-0.399065,-0.108006,-0.318027,-0.471264,-0.778783,-0.784574,-0.720827,...,-0.645886,0.708601,-0.063793,-0.748778,-1.639754,-0.541667,-0.921541,-0.184411,-0.864731,-0.883554
3,-0.238746,0.154644,0.249435,-0.011017,0.36387,0.735586,0.085907,0.430105,0.388821,1.164532,...,-0.687927,1.859212,-1.102982,0.289751,-0.835946,0.829969,-2.591491,-0.12366,0.844177,-1.695492
4,-1.17684,-1.313125,-1.168628,-1.241462,-1.013563,-0.95083,-0.793338,-0.848165,-0.870585,-0.55014,...,-0.943363,0.394816,-1.090551,1.65006,-1.533462,-1.704437,-1.215077,-0.021176,-0.107274,-0.871699


In [16]:
predictions = mainModel.predict(scaled_test_data)

In [17]:
np.unique(predictions)

array([1, 2, 3, 4, 5, 6, 7, 8])

In [18]:
template = pd.read_csv("./data/dummy_solution_accuracy.csv")
print(template.shape)
template.head()

(6544, 2)


Unnamed: 0,Sample_id,Sample_label
0,1,1
1,2,1
2,3,1
3,4,1
4,5,1


In [19]:
submission = template.copy()
submission["Sample_label"] = predictions
submission.head()

Unnamed: 0,Sample_id,Sample_label
0,1,3
1,2,3
2,3,1
3,4,1
4,5,1


In [20]:
submission.to_csv("./submission/accuracy_log.csv", index=False)

In [21]:
submission_proba = mainModel.predict_proba(scaled_test_data)

In [22]:
dummy = pd.read_csv("./data/dummy_solution_accuracy.csv")
dummy.drop("Sample_label", inplace=True, axis=1)
print(dummy.shape)
dummy.head()

(6544, 1)


Unnamed: 0,Sample_id
0,1
1,2
2,3
3,4
4,5


In [25]:
submission2 = dummy.copy()
for i in range(10):
    submission2[f'Class_{i+1}'] = submission_proba[:, i]
submission2.head()

Unnamed: 0,Sample_id,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9,Class_10
0,1,0.144119,0.063314,0.436656,0.018117,0.069936,0.112757,0.089207,0.002548,0.035977,0.027369
1,2,0.107816,0.046043,0.368767,0.031754,0.078423,0.136735,0.116891,0.027958,0.048483,0.03713
2,3,0.682844,0.031179,0.001349,0.002312,0.041194,0.044557,0.034815,0.122745,0.018688,0.020317
3,4,0.903234,0.029865,0.002594,8.5e-05,0.029588,0.00101,0.005973,0.001218,0.011349,0.015085
4,5,0.871065,0.00576,0.003067,0.00193,0.032746,0.026981,0.00752,0.017305,0.015517,0.018108


In [26]:
submission2.to_csv("./submission/logloss_log.csv", index=False)