In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

from sklearn import linear_model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import warnings
warnings.filterwarnings('ignore')

np.random.seed(27)

In [2]:
# setting up default plotting parameters
%matplotlib inline

plt.rcParams['figure.figsize'] = [20.0, 7.0]
plt.rcParams.update({'font.size': 22,})

sns.set_palette('viridis')
sns.set_style('white')
sns.set_context('talk', font_scale=0.8)

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

print('Train Shape: ', train.shape)
print('Test Shape: ', test.shape)

train.head()

('Train Shape: ', (250, 302))
('Test Shape: ', (19750, 301))


Unnamed: 0,id,target,0,1,2,3,4,5,6,7,...,290,291,292,293,294,295,296,297,298,299
0,0,1.0,-1.067,-1.114,-0.616,0.376,1.09,0.467,-0.422,0.46,...,0.22,-0.339,0.254,-0.179,0.352,0.125,0.347,0.436,0.958,-0.824
1,1,0.0,-0.831,0.271,1.716,1.096,1.731,-0.197,1.904,-0.265,...,-0.765,-0.735,-1.158,2.554,0.856,-1.506,0.462,-0.029,-1.932,-0.343
2,2,0.0,0.099,1.39,-0.732,-1.065,0.005,-0.081,-1.45,0.317,...,-1.311,0.799,-1.001,1.544,0.575,-0.309,-0.339,-0.148,-0.646,0.725
3,3,1.0,-0.989,-0.916,-1.343,0.145,0.543,0.636,1.127,0.189,...,-1.37,1.093,0.596,-0.589,-0.649,-0.163,-0.958,-1.081,0.805,3.401
4,4,0.0,0.811,-1.509,0.522,-0.36,-0.22,-0.959,0.334,-0.566,...,-0.178,0.718,-1.017,1.249,-0.596,-0.445,1.751,1.442,-0.393,-0.643


In [4]:
# prepare for modeling
X_train = train.drop(['id', 'target'], axis=1)
y_train = train['target']

X_test = test.drop(['id'], axis=1)

# scaling data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [5]:
# define models
ridge = linear_model.Ridge()
lasso = linear_model.Lasso()
elastic = linear_model.ElasticNet()
lasso_lars = linear_model.LassoLars()
bayesian_ridge = linear_model.BayesianRidge()
logistic = linear_model.LogisticRegression(solver='liblinear')
sgd = linear_model.SGDClassifier()

In [6]:
models = [ridge, lasso, elastic, lasso_lars, bayesian_ridge, logistic, sgd]
# function to get cross validation scores
def get_cv_scores(model):
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
    print('CV Mean: ', np.mean(scores))
    print('STD: ', np.std(scores))
    print('\n')

In [7]:
# loop through list of models
for model in models:
    print(model)
    get_cv_scores(model)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)
('CV Mean: ', 0.655320621373253)
('STD: ', 0.08822973705933819)


Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
('CV Mean: ', 0.5)
('STD: ', 0.0)


ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)
('CV Mean: ', 0.5)
('STD: ', 0.0)


LassoLars(alpha=1.0, copy_X=True, eps=2.220446049250313e-16,
     fit_intercept=True, fit_path=True, max_iter=500, normalize=True,
     positive=False, precompute='auto', verbose=False)
('CV Mean: ', 0.5)
('STD: ', 0.0)


BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
       fit_intercept=True

#### Logistic Regression and Grid Search
Grid search is an exhaustive search over specified parameter values.

In [8]:
penalty = ['l1', 'l2']
C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
class_weight = [{1:0.5, 0:0.5}, {1:0.4, 0:0.6}, {1:0.6, 0:0.4}, {1:0.7, 0:0.3}]
solver = ['liblinear', 'saga']

param_grid = dict(penalty=penalty,
                  C=C,
                  class_weight=class_weight,
                  solver=solver)

grid = GridSearchCV(estimator=logistic, param_grid=param_grid, scoring='roc_auc', verbose=1, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)

print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Fitting 3 folds for each of 128 candidates, totalling 384 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 337 tasks      | elapsed:   15.6s
[Parallel(n_jobs=-1)]: Done 377 out of 384 | elapsed:   17.4s remaining:    0.3s


('Best Score: ', 0.7269642972850385)
('Best Params: ', {'penalty': 'l1', 'C': 0.1, 'solver': 'saga', 'class_weight': {0: 0.6, 1: 0.4}})


[Parallel(n_jobs=-1)]: Done 384 out of 384 | elapsed:   17.9s finished


In [9]:
logistic = linear_model.LogisticRegression(C=1, class_weight={1:0.6, 0:0.4}, penalty='l1', solver='liblinear')
get_cv_scores(logistic)

('CV Mean: ', 0.7013431013431013)
('STD: ', 0.06559048891024649)




#### Stochastic Gradient Descent and Random Search
Random search is a random (obviously) search over specified parameter values.

In [10]:
loss = ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
penalty = ['l1', 'l2', 'elasticnet']
alpha = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
learning_rate = ['constant', 'optimal', 'invscaling', 'adaptive']
class_weight = [{1:0.5, 0:0.5}, {1:0.4, 0:0.6}, {1:0.6, 0:0.4}, {1:0.7, 0:0.3}]
eta0 = [1, 10, 100]

param_distributions = dict(loss=loss,
                           penalty=penalty,
                           alpha=alpha,
                           learning_rate=learning_rate,
                           class_weight=class_weight,
                           eta0=eta0)

random = RandomizedSearchCV(estimator=sgd, param_distributions=param_distributions, scoring='roc_auc', verbose=1, n_jobs=-1, n_iter=1000)
random_result = random.fit(X_train, y_train)

print('Best Score: ', random_result.best_score_)
print('Best Params: ', random_result.best_params_)

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 689 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done 1689 tasks      | elapsed:   23.0s


('Best Score: ', 0.7303431607594116)
('Best Params: ', {'loss': 'perceptron', 'eta0': 10, 'learning_rate': 'optimal', 'penalty': 'l1', 'alpha': 0.01, 'class_weight': {0: 0.3, 1: 0.7}})


[Parallel(n_jobs=-1)]: Done 2993 out of 3000 | elapsed:   39.2s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | elapsed:   39.3s finished


In [11]:
sgd = linear_model.SGDClassifier(alpha=0.1,
                                 class_weight={1:0.7, 0:0.3},
                                 eta0=100,
                                 learning_rate='optimal',
                                 loss='log',
                                 penalty='elasticnet')
get_cv_scores(sgd)

('CV Mean: ', 0.7325545325545326)
('STD: ', 0.09729936666075499)




In [12]:
predictions = sgd.fit(X_train, y_train).predict_proba(X_test)
