## Imports

In [1]:
import os

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import *

## Config

In [3]:
data_folder = os.path.abspath(os.path.join(os.curdir, os.pardir, 'data')) + os.path.sep
aux_data_folder = os.path.join(data_folder, 'aux') + os.path.sep
preproc_data_folder = os.path.join(data_folder, 'preproc') + os.path.sep
features_data_folder = os.path.join(data_folder, 'features') + os.path.sep

## Read Data

In [4]:
X = load(features_data_folder + 'X_train_avg_fasttext_wiki_embedding.pickle')

In [5]:
y = load(features_data_folder + 'y_train.pickle')

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)

In [7]:
print('X train:', X_train.shape)
print('y train:', y_train.shape)
print('X test: ', X_test.shape)
print('y test: ', y_test.shape)

X train: (343646, 600)
y train: (343646,)
X test:  (60644, 600)
y test:  (60644,)


## Train Model

In [8]:
model = LogisticRegression(random_state=42)

In [10]:
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

## Evaluate Model

In [34]:
y_pred_train = model.predict(X_train)
y_pred_proba_train = model.predict_proba(X_train)[:, 0]

In [35]:
y_pred_test = model.predict(X_test)
y_pred_proba_test = model.predict_proba(X_test)[:, 0]

In [36]:
continuous_metrics = [log_loss, roc_auc_score]

In [37]:
binary_metrics = [accuracy_score, precision_score, recall_score]

In [41]:
for metric in continuous_metrics:
    print('{:20s}: {:10.5f}'.format(metric.__name__, metric(y_train, y_pred_proba_train)))
for metric in binary_metrics:
    print('{:20s}: {:10.5f}'.format(metric.__name__, metric(y_train, y_pred_train)))

log_loss            :    1.05645
roc_auc_score       :    0.27769
accuracy_score      :    0.69822
precision_score     :    0.64254
recall_score        :    0.41158
