In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')
from sklearn.decomposition import TruncatedSVD

In [2]:
train_df = pd.read_csv('../../Datasets/train.csv')                          #load the train data
train_df_values = train_df.drop(['id', 'target'], axis = 1)   #retain only required columns for train matrix

In [3]:
X = train_df_values.values                                  #convert the df into array

In [4]:
svd_100 = TruncatedSVD(n_components=100, n_iter=20, random_state=42)  #svd object with 100 components

In [5]:
svd_100.fit(X)                                  

TruncatedSVD(algorithm='randomized', n_components=100, n_iter=20,
             random_state=42, tol=0.0)

In [6]:
print(svd_100.explained_variance_ratio_.sum()) 

0.7803804657811667


We see that 100 columns explain upto 78% of variance. Let's try 150 columns

In [7]:
svd_150 = TruncatedSVD(n_components=150, n_iter=20, random_state=42)
svd_150.fit(X)

TruncatedSVD(algorithm='randomized', n_components=150, n_iter=20,
             random_state=42, tol=0.0)

In [8]:
print(svd_150.explained_variance_ratio_.sum()) 

0.9246093979004004


150 columns gives about 92% variance which seems enough to achieve a parsimonious model

In [9]:
X_reduced = svd_150.transform(X)

X_reduced is the new train matrix with 150 columns which explain 92% of the variance. Lasso regression has been the most successful model yet so we will perform lasso regression using this reduced matrix

In [10]:
test = pd.read_csv('../../Datasets/test.csv')                 #read test data

In [11]:
X_test = test.drop(['id'], axis=1)            #load X_test and y_train 
y_train = train_df['target']

Rest of the code is borrowed from Lasso Regression which can be found in ../3 folder.

In [12]:
random_state = 0

#Hyperparameter = lambda
#Using Logistic Regression with l1 regularisation
logit = LogisticRegression(random_state=random_state)
#Using ROC_AUC score for testing each lambda value
rocauc_score = make_scorer(roc_auc_score) 
#Using GridSearch to search for the best lambda value for the model
parameter_grid = {'class_weight':['balanced'], 'penalty' : ['l1', 'l2'], 'C':[0.0001, 0.0005, 0.001,0.005, 0.01, 0.05, 0.1, 0.5, 1,10, 100, 1000, 1500, 2000, 2500,2600, 2700, 2800, 2900, 3000, 3100, 3200],'max_iter' : [100, 1000, 2000, 5000, 10000] }

#Grid Search
grid = GridSearchCV(estimator=logit,param_grid=parameter_grid,scoring=rocauc_score,verbose=1,cv=20,n_jobs=-1)
grid.fit(X_reduced, y_train)
best_score = grid.best_score_
best_para = grid.best_params_
best_logit = grid.best_estimator_
#roc_auc Score
print("Best Score obtained is: ", best_score, "for the parameters: ", best_para)
#Hyperparameters of the best model
print(best_logit)

Fitting 20 folds for each of 220 candidates, totalling 4400 fits


[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.


Best Score obtained is:  0.7017 for the parameters:  {'C': 1, 'class_weight': 'balanced', 'max_iter': 100, 'penalty': 'l1'}
LogisticRegression(C=1, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)


[Parallel(n_jobs=-1)]: Done 4400 out of 4400 | elapsed:   52.1s finished


In [13]:
#Creating a Logistic Regression Model with l1 Regularisation with the above obtained best hyperparameters
model = LogisticRegression(C=1, class_weight='balanced', dual=False,fit_intercept=True, intercept_scaling=1, max_iter=100,multi_class='warn', n_jobs=None, penalty='l1', random_state=0, solver='liblinear', tol=0.0001, verbose=0, warm_start=False);
model.fit(X_reduced, y_train)

LogisticRegression(C=1, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=0, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
#Train Score on the model
score_train = model.score(X_reduced, y_train)
print("Train Score :", str(score_train*100)+" %")

Train Score : 100.0 %


In [15]:
#do svd with same params on  X_test 
X_test_reduced = svd_150.transform(X_test)

In [16]:
#Generating the predicted values and testing them on Kaggle to get a score
y_pred_logit_lasso = model.predict_proba(X_test_reduced)[:,1]

In [17]:
#convert to csv to test on kaggle
data = {'id':test['id'], 'target':y_pred_logit_lasso}
df = pd.DataFrame(data)
df.to_csv('results.csv', index=False)

Test score on kaggle: 0.724