# Penalized (Lasso) logistic model

Let's illustrate penalized classification using the Wine data set:

https://scikit-learn.org/stable/datasets/toy_dataset.html#wine-dataset

This is a multiclass classification problem.

In [None]:
import sklearn.datasets as datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split as tts

# Load the Wine data set (predictors in 'data', class labels in 'target')
dataset = datasets.load_wine()
X = dataset['data']
y = dataset['target']

# Split data: 30% is held out for testing
x_train, x_test, y_train, y_test = tts(X, y, test_size=0.3, random_state=42)

# Scaled predictors.
# The scaler is fitted on the training set only and then reused on the test
# set, so both sets are standardized with the same means and standard
# deviations. Fitting a second scaler on the test set would put the two sets
# on different scales and would use test-set statistics in preprocessing.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [None]:
import pandas as pd
import numpy as np

# Put predictors and target side by side in one table so that a single
# describe() call summarizes every column.
values_with_target = np.column_stack((dataset['data'], dataset['target']))
column_names = [*dataset['feature_names'], 'target']
data1 = pd.DataFrame(data=values_with_target, columns=column_names)

# Summary statistics of each column
data1.describe()

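Run k-fold cross-validation to tune the penalization strength. Note that `C` is the inverse of the penalization strength, so smaller values of `C` mean stronger shrinkage.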

In [None]:
from sklearn.linear_model import LogisticRegression
from numpy import arange
from sklearn.model_selection import GridSearchCV

# L1-penalized (lasso) logistic regression fitted with the liblinear solver
lasso_logistic_model = LogisticRegression(solver='liblinear', penalty='l1')

# Candidate values of C explored by the grid search
grid = {'C': arange(0.0001, 1, 0.01)}

# 5-fold cross-validation scored by accuracy; with refit=True the best
# configuration is refitted on the whole training set.
search = GridSearchCV(
    estimator=lasso_logistic_model,
    param_grid=grid,
    scoring='accuracy',
    cv=5,
    refit=True,
)
results = search.fit(x_train_scaled, y_train)

print('Config: %s' % (results.best_params_,))

Let's visualize the coefficients of the lasso-logistic model and compare with those of the standard (non-penalized) model

In [None]:
# Coefficients of the lasso-logistic model, refitted with the penalization
# strength selected by the cross-validated grid search (read from the search
# results rather than copied by hand, so it stays in sync with the tuning).
lasso_model = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    C=results.best_params_['C']).fit(x_train_scaled, y_train)
print("Coefficients of the lasso logistic model: \n\n",lasso_model.coef_)

# Coefficients of the traditional (non-penalized) logistic model.
# penalty=None switches the penalty off; the string 'none' was deprecated in
# scikit-learn 1.2 and removed in 1.4, where it raises an error.
logistic_model = LogisticRegression(
    penalty=None).fit(x_train_scaled, y_train)
print("\n\n Coefficients of the traditional logistic model: \n\n",logistic_model.coef_)

Let's compute the predictions for the test set with the lasso-logistic and with the standard model

In [None]:
# Predictions with the lasso-logistic model. The search was run with
# refit=True, so predict() uses the best configuration refitted on the
# whole training set.
predictions_tuned_model = search.predict(x_test_scaled)
print(predictions_tuned_model)

# Predictions with the standard (non-penalized) logistic model.
# LogisticRegression applies an L2 penalty by default, so the penalty has to
# be switched off explicitly for this to be the non-penalized baseline.
logistic_model = LogisticRegression(penalty=None).fit(x_train_scaled, y_train)
logictic_model = logistic_model  # alias kept for the original (misspelled) name
predictions_non_tuned_model = logistic_model.predict(x_test_scaled)
print(predictions_non_tuned_model)


Let's evaluate the performance of each model 

In [None]:
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import accuracy_score as accuracy


# Confusion matrices on the test set (true labels first, predictions second)
lasso_confusion = cm(y_test, predictions_tuned_model)
traditional_confusion = cm(y_test, predictions_non_tuned_model)
print("Confusion matrix for the lasso-logistic model: \n" + str(lasso_confusion))
print("Confusion matrix for the traditional logistic model: \n" + str(traditional_confusion))

# Share of correctly classified test observations for each model
lasso_accuracy = accuracy(y_test, predictions_tuned_model)
traditional_accuracy = accuracy(y_test, predictions_non_tuned_model)
print("Accuracy lasso model: " + str(lasso_accuracy))
print("Accuracy traditional model: " + str(traditional_accuracy))

# Addressing imbalance while tuning a classification model 

In this example we will see how to include the sampling technique SMOTE within the modelling cycle when tuning the lasso-logistic model. For that, we will use the breast cancer data set to see whether we are able to identify the malignant cases.

In [None]:
#!pip install imblearn
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.pipeline import Pipeline
import sklearn.datasets as datasets
from sklearn.model_selection import StratifiedKFold

# Load the breast cancer data set and pull out predictors and labels
dataset = datasets.load_breast_cancer()
X, y = dataset['data'], dataset['target']

# Hold out 30% for testing; stratify=y keeps the class proportions of y
# in both the training and the test part.
X_train, X_test, y_train, y_test = tts(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=11,
)

# Show the description shipped with the data set
print(dataset.DESCR)

In [None]:
# Pipeline: SMOTE oversampling -> standardization -> lasso-logistic classifier.
# Keeping SMOTE inside the pipeline means it is applied within each
# cross-validation training fold rather than to the data as a whole.
# NOTE(review): SMOTE runs on the unscaled predictors here; consider whether
# scaling should come first, since SMOTE relies on nearest neighbours.
lasso_classifier = LogisticRegression(
    random_state=11,
    penalty='l1',
    solver='liblinear',
)
pipeline = imbpipeline(
    steps=[
        ['smote', SMOTE(random_state=11)],
        ['scaler', StandardScaler()],
        ['classifier', lasso_classifier],
    ]
)

# Grid for the shrinkage intensity; the 'classifier__' prefix routes the
# parameter to the pipeline step named 'classifier'.
grid = {'classifier__C': arange(0.0001, 2, 0.01)}

# Shuffled, stratified 5-fold cross-validation with a fixed seed
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=11)

# Tune C by cross-validation, using the AUC as the scoring metric
search = GridSearchCV(
    estimator=pipeline,
    param_grid=grid,
    scoring='roc_auc',
    cv=stratified_kfold,
    refit=True,
)
results = search.fit(X_train, y_train)
print('Config: %s' % (results.best_params_,))

In [None]:
# Score of the refitted best pipeline on the held-out test set; score() uses
# the metric the search was configured with (scoring='roc_auc').
test_score = search.score(X_test, y_test)

# Mean cross-validated score of the best configuration found by the search
cv_score = search.best_score_

print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
