In [9]:
import numpy as np
import pandas as pd

import joblib

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

In [2]:
def scale(x_train, x_test):
    """Standardize two frames with a scaler fitted on the first one only.

    A ``StandardScaler`` is fitted on ``x_train`` and then applied to both
    inputs, so the scaling parameters never see ``x_test``.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Frame the scaler is fitted on.
    x_test : pandas.DataFrame
        Frame transformed with the scaler fitted on ``x_train``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(x_train_scaled, x_test_scaled)``; each keeps the index and the
        column labels of the frame it was computed from.
    """
    fitted = StandardScaler().fit(x_train)

    def _to_frame(frame):
        # Wrap the transformed values back into a DataFrame that carries the
        # input frame's own index and column labels.
        return pd.DataFrame(fitted.transform(frame),
                            index=frame.index,
                            columns=frame.columns)

    return _to_frame(x_train), _to_frame(x_test)

In [12]:
# Load the persisted search object (presumably a fitted grid search for the
# logistic regression — see the file name) and display its best hyper-parameters;
# they are unpacked into the classifier constructor in the next cell.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files from a trusted source.
parms = joblib.load('../Project_data/results/log_gridsearch.joblib')
parms.best_params_

{'C': 1.0, 'penalty': 'l2'}

In [14]:
# Multinomial logistic regression that is re-fitted for every experiment in this
# notebook. The remaining hyper-parameters (C, penalty) come from `parms.best_params_`.
# NOTE: with max_iter=200 the fits on the resampled training sets end with
# "max_iter reached" (see the fit logs further down), i.e. they are not fully
# converged — raise max_iter if convergence matters more than run time.
logreg = LogisticRegression(solver='saga',
                            multi_class='multinomial',
                            verbose=1,
                            max_iter=200,
                            # saga is a stochastic solver; pinning the seed makes
                            # repeated runs of the notebook reproducible.
                            random_state=42,
                            **parms.best_params_)

## 1. 5-class classification

### 1.1. Original set

In [3]:
# Load the pre-processed features (x) and the targets for the 5-class task (y);
# both frames are indexed by building_id.
x = pd.read_csv('../Project_data/processed_data/x_post_preproc.csv', index_col='building_id')
y = pd.read_csv('../Project_data/processed_data/y_post_preproc.csv', index_col='building_id')

In [4]:
# Hold out 20% of the rows for testing (seeded, so the split is repeatable),
# then select the matching target rows by their building_id label.
x_train, x_test = train_test_split(x, random_state=42, test_size=0.2)
y_train = y.loc[x_train.index]
y_test = y.loc[x_test.index]

In [5]:
# Standardize both splits with a scaler fitted on the training split only.
x_train_norm, x_test_norm = scale(x_train, x_test)

In [15]:
# Fit on the scaled training split. `.values.ravel()` flattens the target frame
# (presumably a single column — confirm) into a 1-D label array.
logreg.fit(x_train_norm, y_train.values.ravel())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 108 epochs took 476 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  8.0min finished


LogisticRegression(max_iter=200, multi_class='multinomial', solver='saga',
                   verbose=1)

In [40]:
# Predicted class labels and per-class probabilities for the scaled test split.
prediction = logreg.predict(x_test_norm)
pred_proba = logreg.predict_proba(x_test_norm)

In [42]:
# Column 0 holds the predicted label, the remaining columns the per-class
# probabilities; preview the first rows.
# NOTE(review): the frame gets a default positional index and integer column
# names, so building_id and the class labels are not carried into the output.
preds = pd.DataFrame(np.column_stack((prediction, pred_proba)))
preds.head()

Unnamed: 0,0,1,2,3,4,5
0,Grade_3,0.037906,0.17795,0.405512,0.267498,0.111134
1,Grade_2,0.114363,0.518469,0.203811,0.072554,0.090804
2,Grade_4,0.005979,0.012413,0.092188,0.670673,0.218747
3,Grade_3,0.035994,0.132767,0.459925,0.267166,0.104147
4,Grade_5,0.008275,0.018981,0.057072,0.195429,0.720243


In [44]:
# Persist labels + probabilities for the model trained on the original training set.
preds.to_csv('../Project_data/results/log_preds_orig.csv')

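**TODO:** write a helper function that does not need the classifier as an argument and only saves the predictions and the prediction probabilities — ideally both in the same file. Open question: how is the prediction derived from the probabilities? It does not look like a 0.5 threshold per class: in the preview above, the first row is labelled `Grade_3` although its largest probability is only about 0.41, which is consistent with picking the most probable class. Check this.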

### 1.2 Resampled set:

#### 1.2.1 Over-sampling with SMOTENC

In [45]:
# Replace the training set with the SMOTENC-oversampled one. The CSV's first
# column is read as the index and then discarded in favour of a positional index.
# NOTE(review): x_test / y_test are still the ones produced by the split in
# section 1.1 — presumably these files were resampled from that same training
# split; confirm, otherwise test rows may leak into training.
x_train = pd.read_csv('../Project_data/processed_data/resampling/x_train_smotenc.csv', index_col=0).reset_index(drop=True)
y_train = pd.read_csv('../Project_data/processed_data/resampling/y_train_smotenc.csv', index_col=0).reset_index(drop=True)

  mask |= (ar1 == a)


In [46]:
# Re-fit the scaler on the SMOTENC training set and re-scale the test split
# (x_test still comes from the split made in section 1.1).
x_train_norm, x_test_norm = scale(x_train, x_test)

In [47]:
# Re-fit the same `logreg` instance, now on the scaled SMOTENC training data.
logreg.fit(x_train_norm, y_train.values.ravel())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


max_iter reached after 1368 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 22.9min finished


LogisticRegression(max_iter=200, multi_class='multinomial', solver='saga',
                   verbose=1)

In [50]:
# Labels and per-class probabilities on the test split for the SMOTENC-trained model.
prediction = logreg.predict(x_test_norm)
pred_proba = logreg.predict_proba(x_test_norm)

In [51]:
# Column 0 holds the predicted label, the remaining columns the per-class
# probabilities; preview the first rows.
# NOTE(review): the frame gets a default positional index and integer column
# names, so building_id and the class labels are not carried into the output.
preds = pd.DataFrame(np.column_stack((prediction, pred_proba)))
preds.head()

Unnamed: 0,0,1,2,3,4,5
0,Grade_3,0.033531,0.259614,0.446449,0.183761,0.076646
1,Grade_2,0.136831,0.628951,0.159758,0.033517,0.040943
2,Grade_4,0.001265,0.004286,0.050098,0.622184,0.322166
3,Grade_3,0.014555,0.164016,0.538524,0.224724,0.058181
4,Grade_5,0.009972,0.043496,0.095544,0.285311,0.565677


In [52]:
# Persist labels + probabilities for the model trained on the SMOTENC set.
preds.to_csv('../Project_data/results/log_preds_smotenc.csv')

### 1.2.2 Under-sampling: cleaning oversampled dataset

#### 1.2.2.1 Tomek

In [53]:
# Replace the training set with the SMOTENC set after Tomek cleaning; the index
# stored in the CSV is dropped in favour of a positional one.
# NOTE(review): the test split is still the one from section 1.1 — confirm these
# files were derived from that same training split.
x_train = pd.read_csv('../Project_data/processed_data/resampling/x_train_smotenc_tmk.csv', index_col=0).reset_index(drop=True)
y_train = pd.read_csv('../Project_data/processed_data/resampling/y_train_smotenc_tmk.csv', index_col=0).reset_index(drop=True)

In [54]:
# Re-fit the scaler on the Tomek-cleaned training set and re-scale the test split.
x_train_norm, x_test_norm = scale(x_train, x_test)

In [55]:
# Re-fit the same `logreg` instance on the scaled Tomek-cleaned training data.
logreg.fit(x_train_norm, y_train.values.ravel())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


max_iter reached after 1177 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 19.7min finished


LogisticRegression(max_iter=200, multi_class='multinomial', solver='saga',
                   verbose=1)

In [57]:
# Labels and per-class probabilities on the test split for the Tomek-cleaned model.
prediction = logreg.predict(x_test_norm)
pred_proba = logreg.predict_proba(x_test_norm)

In [58]:
# Column 0 holds the predicted label, the remaining columns the per-class
# probabilities; preview the first rows.
# NOTE(review): the frame gets a default positional index and integer column
# names, so building_id and the class labels are not carried into the output.
preds = pd.DataFrame(np.column_stack((prediction, pred_proba)))
preds.head()

Unnamed: 0,0,1,2,3,4,5
0,Grade_3,0.031675,0.272507,0.445336,0.177777,0.072705
1,Grade_2,0.140317,0.647568,0.149445,0.02843,0.03424
2,Grade_4,0.001132,0.004014,0.046684,0.618539,0.32963
3,Grade_3,0.014885,0.174216,0.548209,0.214466,0.048224
4,Grade_5,0.010062,0.045245,0.0978,0.290711,0.556181


In [59]:
# Persist labels + probabilities for the model trained on the Tomek-cleaned set.
preds.to_csv('../Project_data/results/log_preds_smotenc_tmk.csv')

#### 1.2.2.2 ENN

In [60]:
# Replace the training set with the SMOTENC set after ENN cleaning; the index
# stored in the CSV is dropped in favour of a positional one.
# NOTE(review): the test split is still the one from section 1.1 — confirm these
# files were derived from that same training split.
x_train = pd.read_csv('../Project_data/processed_data/resampling/x_train_smotenc_enn.csv', index_col=0).reset_index(drop=True)
y_train = pd.read_csv('../Project_data/processed_data/resampling/y_train_smotenc_enn.csv', index_col=0).reset_index(drop=True)

In [61]:
# Re-fit the scaler on the ENN-cleaned training set and re-scale the test split.
x_train_norm, x_test_norm = scale(x_train, x_test)

In [62]:
# Re-fit the same `logreg` instance on the scaled ENN-cleaned training data.
logreg.fit(x_train_norm, y_train.values.ravel())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


max_iter reached after 668 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 11.2min finished


LogisticRegression(max_iter=200, multi_class='multinomial', solver='saga',
                   verbose=1)

In [64]:
# Labels and per-class probabilities on the test split for the ENN-cleaned model.
prediction = logreg.predict(x_test_norm)
pred_proba = logreg.predict_proba(x_test_norm)

In [65]:
# Column 0 holds the predicted label, the remaining columns the per-class
# probabilities; preview the first rows.
# NOTE(review): the frame gets a default positional index and integer column
# names, so building_id and the class labels are not carried into the output.
preds = pd.DataFrame(np.column_stack((prediction, pred_proba)))
preds.head()

Unnamed: 0,0,1,2,3,4,5
0,Grade_3,0.036799,0.331258,0.459154,0.13851,0.034279
1,Grade_2,0.188665,0.708398,0.090632,0.006837,0.005468
2,Grade_4,0.001534,0.00568,0.057305,0.64131,0.294172
3,Grade_3,0.020613,0.241167,0.586275,0.141086,0.010859
4,Grade_5,0.013702,0.066389,0.125699,0.342573,0.451637


In [66]:
# Persist labels + probabilities for the model trained on the ENN-cleaned set.
preds.to_csv('../Project_data/results/log_preds_smotenc_enn.csv')

## 2. 3-class classification

In [67]:
# Reload the same feature file as in section 1, this time paired with the
# 3-label target file; both frames are indexed by building_id.
x = pd.read_csv('../Project_data/processed_data/x_post_preproc.csv', index_col='building_id')
y = pd.read_csv('../Project_data/processed_data/y_post_preproc_3lab.csv', index_col='building_id')

In [68]:
# Same 80/20 seeded split as in section 1 (same feature file, same seed);
# the 3-label targets are then selected by building_id label.
x_train, x_test = train_test_split(x, random_state=42, test_size=0.2)
y_train = y.loc[x_train.index]
y_test = y.loc[x_test.index]

In [69]:
# Standardize both splits with a scaler fitted on the training split only.
x_train_norm, x_test_norm = scale(x_train, x_test)

In [70]:
# Re-fit the same `logreg` instance, now against the 3-label targets.
logreg.fit(x_train_norm, y_train.values.ravel())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 66 epochs took 191 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.2min finished


LogisticRegression(max_iter=200, multi_class='multinomial', solver='saga',
                   verbose=1)

In [71]:
# Quick look at the predicted labels of the 3-class model; the result is not
# stored here — the same call is repeated and assigned in the next cell.
logreg.predict(x_test_norm)

array(['G2', 'G1', 'G2', ..., 'G2', 'G2', 'G2'], dtype=object)

In [72]:
# Labels and per-class probabilities on the test split for the 3-class model.
prediction = logreg.predict(x_test_norm)
pred_proba = logreg.predict_proba(x_test_norm)

In [73]:
# Column 0 holds the predicted label, the remaining columns the per-class
# probabilities; preview the first rows.
# NOTE(review): the frame gets a default positional index and integer column
# names, so building_id and the class labels are not carried into the output.
preds = pd.DataFrame(np.column_stack((prediction, pred_proba)))
preds.head()

Unnamed: 0,0,1,2,3
0,G2,0.210791,0.675913,0.113296
1,G1,0.615668,0.277009,0.107323
2,G2,0.025303,0.743056,0.231641
3,G2,0.155374,0.740178,0.104448
4,G3,0.0313,0.255617,0.713083


In [74]:
# Persist labels + probabilities for the 3-class model.
preds.to_csv('../Project_data/results/log_preds_3cls.csv')