In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import seaborn as sns
import pandas as pd

In [3]:
# Load seaborn's 'titanic' example dataset; later cells select columns from `data`.
data = sns.load_dataset('titanic')
# Preview the first rows to see which columns are available.
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


## Logistic Regression

In [10]:
# Work on a small subset of columns; any row with a missing value is dropped.
selected_columns = ['survived', 'age', 'sex', 'fare']
df = data[selected_columns].dropna()

# Logistic regression needs numeric inputs: encode sex as male -> 1, anything else -> 0.
df['sex'] = df['sex'].eq('male').astype('int64')

# The features stay a 2D frame; the target is a 1D series.
y = df['survived']
X = df.drop(columns='survived')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2
)

# LogisticRegression has a max_iter parameter; fit() returns the fitted estimator.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

y_pred = model.predict(X_test)

# Score the held-out predictions, then report them.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report: ", report)

Accuracy: 0.83
Classification Report:                precision    recall  f1-score   support

           0       0.90      0.84      0.87        93
           1       0.73      0.82      0.77        50

    accuracy                           0.83       143
   macro avg       0.81      0.83      0.82       143
weighted avg       0.84      0.83      0.83       143



## Regularization

In [11]:
# Rebuild the modelling table: the target plus three predictors, rows with missing values dropped.
df = data[['survived', 'age', 'sex', 'fare']].dropna()

# Logistic regression needs numeric inputs: encode sex as male -> 1, anything else -> 0.
df['sex'] = df['sex'].eq('male').astype('int64')

# The features stay a 2D frame; the target is a 1D series.
y = df['survived']
X = df.drop(columns='survived')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2
)

# There is no dedicated function for regularization; it is selected with the 'penalty' argument.
# C is the inverse of the regularization strength:
# a smaller C means stronger regularization, a larger C means weaker regularization.
model_l2 = LogisticRegression(penalty='l2', C=1.0, max_iter=1000).fit(X_train, y_train)

# L1 does not work with every solver, so the solver is specified explicitly.
model_l1 = LogisticRegression(
    penalty='l1', C=1.0, solver='liblinear', max_iter=1000
).fit(X_train, y_train)

# Compare the coefficients, one row per feature and one column per fitted model.
# `model` is the estimator fitted in the previous cell.
fitted_models = {
    'Org_Coef': model,
    'L2_Coef': model_l2,
    'L1_Coef': model_l1,
}
coef_columns = {'Feature': X.columns}
for column_name, estimator in fitted_models.items():
    coef_columns[column_name] = estimator.coef_[0]
coef_df = pd.DataFrame(coef_columns)

print(coef_df)

  Feature  Org_Coef   L2_Coef   L1_Coef
0     age -0.015312 -0.015312 -0.013878
1     sex -2.138830 -2.138830 -2.163571
2    fare  0.012742  0.012742  0.012892


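We can see that the original coefficients are identical to the L2 coefficients. This is because scikit-learn's `LogisticRegression` already applies L2 regularization by default (`penalty='l2'`, `C=1.0`), so the "original" model and the explicit L2 model are the same model. To fit a model with no penalty at all, set `penalty=None` (older scikit-learn versions used the string `penalty='none'`, which was deprecated in version 1.2 and later removed).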