In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
from sklearn.datasets import load_breast_cancer

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
from sklearn.linear_model import LogisticRegressionCV

In [6]:
from sklearn import metrics

In [7]:
import matplotlib.pyplot as plt

In [8]:
import seaborn as sns

In [9]:
sns.set_style('darkgrid')

In [10]:
cancer = load_breast_cancer()

In [11]:
feats = pd.DataFrame(cancer.data, columns=cancer.feature_names)

In [12]:
target = pd.DataFrame(cancer.target, columns=['target'])

In [14]:
feats.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [15]:
target.head()

Unnamed: 0,target
0,0
1,0
2,0
3,0
4,0


In [16]:
test_size = 0.2

In [17]:
random_state = 42

In [18]:
X_train, X_test, y_train, y_test = train_test_split(feats, target, test_size=test_size, random_state=random_state)

In [19]:
print(f'Shape of X_train: {X_train.shape}')
print(f'Shape of y_train: {y_train.shape}')
print(f'Shape of X_test: {X_test.shape}')
print(f'Shape of y_test: {y_test.shape}')

Shape of X_train: (455, 30)
Shape of y_train: (455, 1)
Shape of X_test: (114, 30)
Shape of y_test: (114, 1)


In [20]:
X_train, X_test, y_train, y_test = train_test_split(feats, target, test_size=0.3, random_state=42, stratify=target)

In [21]:
Cs = np.logspace(-4, 4, 10)

In [23]:
model_l1 = LogisticRegressionCV(
    Cs=Cs,
    cv=5,
    penalty='l1',
    solver='liblinear',
    scoring='accuracy',
    max_iter=5000,
    random_state=42
)
model_l1.fit(X_train, y_train.values.ravel())

In [24]:
model_l2 = LogisticRegressionCV(
    Cs=Cs,
    cv=5,
    penalty='l2',
    solver='lbfgs',
    scoring='accuracy',
    max_iter=5000,
    random_state=42
)
model_l2.fit(X_train, y_train.values.ravel())

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [25]:
print("Best C value for L1 model:", model_l1.C_[0])
print("Best C value for L2 model:", model_l2.C_[0])

Best C value for L1 model: 21.54434690031882
Best C value for L2 model: 166.81005372000558


In [26]:
y_pred_l1 = model_l1.predict(X_test)

In [27]:
y_pred_l2 = model_l2.predict(X_test)

In [28]:
print("L1 Model Performance:")
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_l1))
print("Precision:", metrics.precision_score(y_test, y_pred_l1))
print("Recall:", metrics.recall_score(y_test, y_pred_l1))
print("F1-score:", metrics.f1_score(y_test, y_pred_l1))

L1 Model Performance:
Accuracy: 0.9532163742690059
Precision: 0.9380530973451328
Recall: 0.9906542056074766
F1-score: 0.9636363636363636


In [29]:
print("L2 Model Performance:")
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_l2))
print("Precision:", metrics.precision_score(y_test, y_pred_l2))
print("Recall:", metrics.recall_score(y_test, y_pred_l2))
print("F1-score:", metrics.f1_score(y_test, y_pred_l2))

L2 Model Performance:
Accuracy: 0.9532163742690059
Precision: 0.9380530973451328
Recall: 0.9906542056074766
F1-score: 0.9636363636363636


In [30]:
coefs_l1 = pd.Series(model_l1.coef_[0], index=feats.columns)

In [31]:
coefs_l2 = pd.Series(model_l2.coef_[0], index=feats.columns)

In [32]:
print("Number of non-zero coefficients in L1:", np.sum(coefs_l1 != 0))
print("Number of non-zero coefficients in L2:", np.sum(coefs_l2 != 0))

Number of non-zero coefficients in L1: 16
Number of non-zero coefficients in L2: 30


In [33]:
print("Top L1 coefficients:")
print(coefs_l1.sort_values(ascending=False).head(10))

Top L1 coefficients:
mean radius          1.575283
texture error        1.159236
perimeter error      0.917767
worst radius         0.860469
concavity error      0.205689
mean perimeter       0.093852
worst perimeter      0.008161
smoothness error     0.000000
worst concavity      0.000000
worst compactness    0.000000
dtype: float64


In [34]:
print("Top L2 coefficients:")
print(coefs_l2.sort_values(ascending=False).head(10))

Top L2 coefficients:
concavity error            5.229489
compactness error          4.259038
mean radius                4.007644
worst compactness          3.925953
radius error               1.716892
texture error              1.692118
mean compactness           1.294613
perimeter error            0.838178
fractal dimension error    0.768512
symmetry error             0.207255
dtype: float64


In [35]:
l1_coefs_df = pd.DataFrame({
    "Feature": feats.columns,
    "Coefficient": model_l1.coef_[0]
})

In [36]:
print(l1_coefs_df)

                    Feature  Coefficient
0               mean radius     1.575283
1              mean texture    -0.109303
2            mean perimeter     0.093852
3                 mean area    -0.006025
4           mean smoothness     0.000000
5          mean compactness     0.000000
6            mean concavity     0.000000
7       mean concave points   -12.070297
8             mean symmetry     0.000000
9    mean fractal dimension     0.000000
10             radius error     0.000000
11            texture error     1.159236
12          perimeter error     0.917767
13               area error    -0.128671
14         smoothness error     0.000000
15        compactness error     0.000000
16          concavity error     0.205689
17     concave points error     0.000000
18           symmetry error     0.000000
19  fractal dimension error     0.000000
20             worst radius     0.860469
21            worst texture    -0.255427
22          worst perimeter     0.008161
23              

In [37]:
print("Number of zero coefficients in L1 model:", (l1_coefs_df["Coefficient"] == 0).sum())
print("Number of non-zero coefficients in L1 model:", (l1_coefs_df["Coefficient"] != 0).sum())

Number of zero coefficients in L1 model: 14
Number of non-zero coefficients in L1 model: 16


In [38]:
l2_coefs_df = pd.DataFrame({
    "Feature": feats.columns,
    "Coefficient": model_l2.coef_[0]
})

In [39]:
print(l2_coefs_df)

                    Feature  Coefficient
0               mean radius     4.007644
1              mean texture    -0.023562
2            mean perimeter    -0.351233
3                 mean area    -0.000338
4           mean smoothness    -6.065443
5          mean compactness     1.294613
6            mean concavity    -4.740546
7       mean concave points    -8.378391
8             mean symmetry    -5.552176
9    mean fractal dimension    -0.533344
10             radius error     1.716892
11            texture error     1.692118
12          perimeter error     0.838178
13               area error    -0.149231
14         smoothness error    -2.031752
15        compactness error     4.259038
16          concavity error     5.229489
17     concave points error    -1.627871
18           symmetry error     0.207255
19  fractal dimension error     0.768512
20             worst radius    -0.575072
21            worst texture    -0.328383
22          worst perimeter     0.005693
23              