In [1]:
import pandas as pd
from pathlib import Path

In [2]:
data_folder = Path('data')
train_csv = 'train.csv'
test_csv = 'test.csv'

In [3]:
train = pd.read_csv(data_folder.joinpath(train_csv))
test = pd.read_csv(data_folder.joinpath(test_csv))

In [4]:
print('Number of training examples: %s\n Number of testing examples: %s' %(len(train), len(test)))

Number of training examples: 250
 Number of testing examples: 19750


In [5]:
X = train.drop(labels=['id', 'target'], axis=1)
y = train['target']
X_test = test.drop(labels=['id'], axis=1)

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score

Lets scale the data first.

In [7]:
std = StandardScaler()
std = std.fit(pd.concat([X, X_test]))
X_scaled = std.transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

In [23]:
random_state = 42
clf = LogisticRegression(random_state=random_state, solver='liblinear')

param_grid = {
    'C': [0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1, 10],
#     'max_iter': [100, 500],
    'penalty': ['l1', 'l2'],
    'class_weight': ['balanced', None],
    'solver': ['lbfgs', 'liblinear'],
}

In [24]:
grid_clf = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='roc_auc', n_jobs=-1, cv=3,
            refit=True, return_train_score=True, verbose=1)
grid_clf = grid_clf.fit(X_scaled, y)

Fitting 3 folds for each of 64 candidates, totalling 192 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed:    0.9s finished


In [25]:
print('Best score: ', grid_clf.best_score_)
print('Best estimator: ', grid_clf.best_estimator_)
print('Best Params: ', grid_clf.best_params_)

best_parameters = grid_clf.best_params_

Best score:  0.698514006782004
Best estimator:  LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=42, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)
Best Params:  {'C': 0.1, 'class_weight': None, 'penalty': 'l1', 'solver': 'liblinear'}


In [26]:
def scores_table(model, subtitle, X, y):
    scores = ['accuracy', 'roc_auc']
    res = []
    for sc in scores:
        scores = cross_val_score(model, X, y, cv = 4, scoring = sc)
        res.append(scores)
    df = pd.DataFrame(res).T
    df.loc['mean'] = df.mean()
    df.loc['std'] = df.std()
    df= df.rename(columns={0: 'accuracy', 1:'roc_auc'})
    return df

In [27]:
from sklearn.feature_selection import RFE, RFECV

In [28]:
# best_parameters = {'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l1', solver='liblinear'}
log_clf = LogisticRegression(**best_parameters)
# selector = RFECV(estimator=log_clf, min_features_to_select=25, step=1)
selector = RFE(estimator=log_clf, n_features_to_select=25, step=1)
selector.fit(X_scaled, y)

RFE(estimator=LogisticRegression(C=0.1, class_weight=None, dual=False,
                                 fit_intercept=True, intercept_scaling=1,
                                 l1_ratio=None, max_iter=100,
                                 multi_class='auto', n_jobs=None, penalty='l1',
                                 random_state=None, solver='liblinear',
                                 tol=0.0001, verbose=0, warm_start=False),
    n_features_to_select=25, step=1, verbose=0)

In [29]:
selector.n_features_

25

In [30]:
cross_val = scores_table(selector, 'selector_clf', X, y)
cross_val

Unnamed: 0,accuracy,roc_auc
0,0.761905,0.814578
1,0.714286,0.707161
2,0.693548,0.752941
3,0.741935,0.627717
mean,0.727919,0.725599
std,0.026071,0.068165


# Submission

In [31]:
submission = pd.read_csv(data_folder.joinpath('sample_submission.csv'))
submission['target'] = selector.predict_proba(std.transform(X_test))
submission.to_csv('submissions/submission.csv', index=False)