In [122]:
import sys
sys.path.insert(0, '../..')
from helper import _process_pred_labels
from evaluate import get_pred_labels

In [123]:
import os
import pandas as pd
import numpy as np

In [139]:
from sklearn.linear_model import LogisticRegression, ElasticNet
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.svm import LinearSVC, SVC
from sklearn.calibration import CalibratedClassifierCV
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [142]:
# Number of base-model prediction sets to stack: the loading cells read
# `oof_{i}/pred_proba.csv` and `sub_{i}/pred_proba.csv` for i in range(N_FILES).
N_FILES = 2

In [125]:
# Label table passed as `y` when fitting the stacker; its columns also name the
# columns of the final prediction frame.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
target = pd.read_pickle('../../data/prepared/target.pkl')

In [148]:
# Build the meta-feature matrix for training: one column block per base model.
# For every model index the out-of-fold probabilities are read, each column is
# prefixed with the model index (so names stay unique after concatenation), and
# values are replaced by their descending rank divided by the number of rows.
# NOTE(review): the stacker is fitted on these rows against `target` — confirm
# the `review_id` order here matches the row order of `target`.
train = pd.concat(
    [
        pd.read_csv(f'oof_{i}/pred_proba.csv', index_col='review_id')
        .add_prefix(f'{i}_')
        .pipe(lambda frame: frame.rank(ascending=False) / len(frame))
        for i in range(N_FILES)
    ],
    axis=1,
)

In [149]:
# Build the meta-feature matrix for the submission set, mirroring the training
# features: read each model's predicted probabilities, prefix the columns with
# the model index, and convert values to descending rank / number of rows.
# Ranks are computed within the submission files only, independently of train.
test = pd.concat(
    [
        pd.read_csv(f'sub_{i}/pred_proba.csv', index_col='review_id')
        .add_prefix(f'{i}_')
        .pipe(lambda frame: frame.rank(ascending=False) / len(frame))
        for i in range(N_FILES)
    ],
    axis=1,
)

In [153]:
# Meta-learner: one logistic regression per target column, tuned with a
# 5-fold grid search scored by sample-averaged F1.
#
# The candidate values for C are sampled at random, so the sampler is seeded:
# without a fixed seed every Restart & Run All would search a different grid
# and report a different best score.
SEED = 42
rng = np.random.default_rng(SEED)

base_model = LogisticRegression()
# Parallelism lives at the per-output level (n_jobs=-1 here), so the grid
# search itself stays single-process (n_jobs=1 below).
model = MultiOutputClassifier(base_model, n_jobs=-1)

param_grid = {'estimator__C': rng.uniform(low=0.1, high=5, size=20),
              'estimator__penalty': ['l2', 'l1'],
              # liblinear supports both the l1 and l2 penalties searched above.
              'estimator__solver': ['liblinear'],
              'estimator__max_iter': [1000]}
# error_score='raise' surfaces a failing fit immediately instead of recording NaN.
gscv = GridSearchCV(estimator=model, param_grid=param_grid, scoring='f1_samples', cv=5, n_jobs=1, verbose=5, error_score='raise')
gscv.fit(train, target)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV 1/5] END estimator__C=2.256614501638116, estimator__max_iter=1000, estimator__penalty=l2, estimator__solver=liblinear;, score=0.834 total time=   1.0s
[CV 2/5] END estimator__C=2.256614501638116, estimator__max_iter=1000, estimator__penalty=l2, estimator__solver=liblinear;, score=0.836 total time=   0.8s
[CV 3/5] END estimator__C=2.256614501638116, estimator__max_iter=1000, estimator__penalty=l2, estimator__solver=liblinear;, score=0.833 total time=   0.8s
[CV 4/5] END estimator__C=2.256614501638116, estimator__max_iter=1000, estimator__penalty=l2, estimator__solver=liblinear;, score=0.837 total time=   0.7s
[CV 5/5] END estimator__C=2.256614501638116, estimator__max_iter=1000, estimator__penalty=l2, estimator__solver=liblinear;, score=0.822 total time=   0.8s
[CV 1/5] END estimator__C=2.256614501638116, estimator__max_iter=1000, estimator__penalty=l1, estimator__solver=liblinear;, score=0.834 total time=   8.1s
[CV 2/5]

GridSearchCV(cv=5, error_score='raise',
             estimator=MultiOutputClassifier(estimator=LogisticRegression(),
                                             n_jobs=-1),
             n_jobs=1,
             param_grid={'estimator__C': array([2.2566145 , 3.61569577, 1.05212709, 3.65597918, 2.16976284,
       2.22134059, 3.45702661, 1.71371274, 2.31302917, 3.20058405,
       0.76539113, 3.24463422, 3.21968374, 3.24683379, 4.86538964,
       4.05316783, 2.95402567, 3.79889132, 2.43023706, 4.61269815]),
                         'estimator__max_iter': [1000],
                         'estimator__penalty': ['l2', 'l1'],
                         'estimator__solver': ['liblinear']},
             scoring='f1_samples', verbose=5)

In [154]:
# Mean cross-validated `f1_samples` score of the best parameter combination found by the search.
gscv.best_score_

0.8331642737291322

In [None]:
# 0.8432528292228764 - LogisticRegression
# 0.8441071697984434 - LinearSVC
# 0.8441432080880193 - LinearSVC

In [156]:
# Score the submission meta-features with the best estimator refitted by the
# grid search: hard label predictions and per-output class probabilities.
pred_labels = gscv.predict(test)
pred_proba = gscv.predict_proba(test)

In [157]:
# `pred_proba` arrives as one probability array per output; take column index 1
# of each and lay them out as one column per output.
proba_matrix = np.transpose(np.stack([per_output[:, 1] for per_output in pred_proba]))
# Project helper from `evaluate` that post-processes the probability matrix.
label_matrix = get_pred_labels(proba_matrix)
# Re-attach the review ids and the target column names.
pred_proba = pd.DataFrame(label_matrix, index=test.index, columns=target.columns)
# Collapse each row into the single `target` value via the project helper.
pred_proba['target'] = pred_proba.apply(_process_pred_labels, axis=1)

In [159]:
# Write only the `target` column (indexed like `test`) to the stacking output file.
pred_proba['target'].to_csv('stacking.csv')