In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

In [5]:
data_folder = Path('data')
train_csv = 'train.csv'
test_csv = 'test.csv'

In [9]:
train = pd.read_csv(data_folder.joinpath(train_csv))
test = pd.read_csv(data_folder.joinpath(test_csv))

In [11]:
print('Number of training examples: %s\n Number of testing examples: %s' %(len(train), len(test)))

Number of training examples: 250
 Number of testing examples: 19750


In [16]:
X = train.drop(labels=['id', 'target'], axis=1)
y = train['target']

X_test = test.drop(labels=['id'], axis=1)

In [46]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, roc_curve, auc, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split

Lets scale the data first.

In [30]:
std = StandardScaler()
X_scaled = std.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

In [61]:
random_state = 42
clf = LogisticRegression(random_state=random_state)

param_grid = {
    'C': [0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1, 10],
#     'max_iter': [100, 500],
    'penalty': ['l1', 'l2'],
    'class_weight': ['balanced', None],
}

In [62]:
grid_clf = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='roc_auc', n_jobs=-1, cv=3,
            refit=True, return_train_score=True, verbose=1)
grid_clf = grid_clf.fit(X_scaled, y)

Fitting 3 folds for each of 32 candidates, totalling 96 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed:    9.0s finished


In [63]:
print('Best score: ', grid_clf.best_score_)
print('Best estimator: ', grid_clf.best_estimator_)
print('Best Params: ', grid_clf.best_params_)

best_parameters = grid_clf.best_params_

Best score:  0.6797986997559343
Best estimator:  LogisticRegression(C=0.001, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
Best Params:  {'C': 0.001, 'class_weight': 'balanced', 'penalty': 'l2'}


In [56]:
def scores_table(model, subtitle, X, y):
    scores = ['accuracy', 'roc_auc']
    res = []
    for sc in scores:
        scores = cross_val_score(model, X, y, cv = 4, scoring = sc)
        res.append(scores)
    df = pd.DataFrame(res).T
    df.loc['mean'] = df.mean()
    df.loc['std'] = df.std()
    df= df.rename(columns={0: 'accuracy', 1:'roc_auc'})
    return df

In [57]:
from sklearn.feature_selection import RFE, RFECV

In [71]:
best_parameters = {'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l1'}
log_clf = LogisticRegression(**best_parameters, solver='liblinear')
# selector = RFECV(estimator=log_clf, min_features_to_select=25, step=1)
selector = RFE(estimator=log_clf, n_features_to_select=25, step=1)
selector.fit(X_scaled, y)

RFE(estimator=LogisticRegression(C=0.1, class_weight='balanced', dual=False,
                                 fit_intercept=True, intercept_scaling=1,
                                 l1_ratio=None, max_iter=100,
                                 multi_class='auto', n_jobs=None, penalty='l1',
                                 random_state=None, solver='liblinear',
                                 tol=0.0001, verbose=0, warm_start=False),
    n_features_to_select=25, step=1, verbose=0)

In [72]:
selector.n_features_

25

In [74]:
cross_val = scores_table(selector, 'selector_clf', X, y)
cross_val

Unnamed: 0,accuracy,roc_auc
0,0.68254,0.809463
1,0.650794,0.723785
2,0.741935,0.764706
3,0.5,0.596467
mean,0.643817,0.723605
std,0.089245,0.079412


# Submission

In [77]:
submission = pd.read_csv(data_folder.joinpath('sample_submission.csv'))
submission['target'] = selector.predict_proba(std.transform(X_test))
submission.to_csv('submissions/submission.csv', index=False)