In [18]:
import pandas as pd

df = pd.read_csv("Titanic.csv")

print(df.head())

   PassengerId  Survived  Pclass     Sex   Age  SibSp  Parch     Fare Embarked
0            1         0       3    male  22.0      1      0   7.2500        S
1            2         1       1  female  38.0      1      0  71.2833        C
2            3         1       3  female  26.0      0      0   7.9250        S
3            4         1       1  female  35.0      1      0  53.1000        S
4            5         0       3    male  35.0      0      0   8.0500        S


In [19]:
# Y is our Label, did the person survive or not 
Y = df["Survived"]

print(Y)


0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64


In [None]:
# after some experiments we have to handle the null values under 'Age' and 'Embarked'
# we could drop both features, but this will make us have loss of features and information
# filling the missing data with a specific data is not realistic 
# we could use another Model to try to predict the age but thats overkill, so i will just populate the missing values with the median

df['Age'] = df['Age'].fillna(df['Age'].median())

# filling the missing embarked values with the most present values 
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])



# since we can't use strings, we need to use integers! 
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

# same problem for Embarked
df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# Everything else is a feature 
X = df.drop(columns=['Survived'])

print(X)

     PassengerId  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
0              1       3    0  22.0      1      0   7.2500         0
1              2       1    1  38.0      1      0  71.2833         1
2              3       3    1  26.0      0      0   7.9250         0
3              4       1    1  35.0      1      0  53.1000         0
4              5       3    0  35.0      0      0   8.0500         0
..           ...     ...  ...   ...    ...    ...      ...       ...
886          887       2    0  27.0      0      0  13.0000         0
887          888       1    1  19.0      0      0  30.0000         0
888          889       3    1  28.0      1      2  23.4500         0
889          890       1    0  26.0      0      0  30.0000         1
890          891       3    0  32.0      0      0   7.7500         2

[891 rows x 8 columns]


In [21]:
from sklearn.model_selection import train_test_split

# Dividing the Features and the Labels into Test and train arrays
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=433
)

## After creating the test array and the train array we will use 2 models to try and predict if a specific person could survive the titanic or not
### 1. Logistic Regression Model

In [34]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(
    max_iter=1000,
    C=0.5,
    penalty='l2',
    solver='lbfgs',
    class_weight='balanced'
)

clf.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.5
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [36]:
from sklearn.metrics import f1_score, classification_report

# Train set predictions
y_train_pred = clf.predict(X_train)

# Test set predictions
y_test_pred = clf.predict(X_test)

print("=== Train Set Performance ===")
print(classification_report(y_train, y_train_pred))

print("=== Test Set Performance ===")
print(classification_report(y_test, y_test_pred))



=== Train Set Performance ===
              precision    recall  f1-score   support

           0       0.84      0.81      0.82       434
           1       0.72      0.76      0.74       278

    accuracy                           0.79       712
   macro avg       0.78      0.78      0.78       712
weighted avg       0.79      0.79      0.79       712

=== Test Set Performance ===
              precision    recall  f1-score   support

           0       0.87      0.80      0.83       115
           1       0.68      0.78      0.73        64

    accuracy                           0.79       179
   macro avg       0.78      0.79      0.78       179
weighted avg       0.80      0.79      0.80       179



The current model have only one set of parameters

Next step is to create a Grid contianing diffrent Parameter options and then we will let it train on all of these parameters and choose the best one based on the F1-Score

In [None]:
from sklearn.model_selection import GridSearchCV

# variable for recurring hyper-parameters
hyper_params = {
    'max_iter': [10000, 12000],
    'C': [0.1, 1, 10],
    'class_weight': [None, 'balanced']
}

param_grid = [
    # L2 penalty with solvers that support it
    {'penalty': ['l2'], 'solver': ['lbfgs', 'liblinear'],'C': hyper_params['C'] , 'class_weight': hyper_params['class_weight'], "max_iter": hyper_params["max_iter"] },
    
    # L1 penalty only with liblinear or saga
    {'penalty': ['l1'], 'solver': ['liblinear', 'saga'], 'C': hyper_params['C'],  'class_weight': hyper_params['class_weight'], "max_iter": hyper_params["max_iter"] },
    
    # ElasticNet only with saga
    {'penalty': ['elasticnet'], 'solver': ['saga'], 'l1_ratio': [0.5, 0.7], 'C': hyper_params['C'], 'class_weight': hyper_params['class_weight'] , "max_iter": hyper_params["max_iter"] }
]

grid = GridSearchCV(
    LogisticRegression(),
    param_grid,
    cv=5,          # 5-fold cross-validation
    scoring='f1',  # Use F1-score to pick the best model
    n_jobs=-1,    # Use all CPU cores
    error_score='raise'
)

grid.fit(X_train, y_train)

print("Best parameters found:", grid.best_params_)
print("Best F1 score on training set (CV):", grid.best_score_)

Best parameters found: {'C': 0.1, 'class_weight': 'balanced', 'max_iter': 10000, 'penalty': 'l2', 'solver': 'lbfgs'}
Best F1 score on training set (CV): 0.7376952101577502


After Creating a model using Logistic Regression, we will create another model using perceptron

In [54]:
from sklearn.linear_model import Perceptron
from sklearn.model_selection import GridSearchCV

# Define recurring hyper-parameters
perceptron_hyper_params = {
    'max_iter': [5000, 10000, 15000],        # number of epochs
    'alpha': [0.0001, 0.001, 0.01],        # regularization strength
    'class_weight': [None, 'balanced'],    # handle class imbalance
    'penalty': [None, 'l2', 'l1', 'elasticnet'],  # type of regularization
    'l1_ratio': [0.15, 0.5, 0.7]           # only used if penalty='elasticnet'
}

# Create parameter grid
perceptron_param_grid = [
    # L2, L1, or None penalty
    {'penalty': [None, 'l2', 'l1'], 
     'max_iter': perceptron_hyper_params['max_iter'],
     'alpha': perceptron_hyper_params['alpha'],
     'class_weight': perceptron_hyper_params['class_weight']},
    
    # ElasticNet penalty (requires l1_ratio)
    {'penalty': ['elasticnet'],
     'max_iter': perceptron_hyper_params['max_iter'],
     'alpha': perceptron_hyper_params['alpha'],
     'l1_ratio': perceptron_hyper_params['l1_ratio'],
     'class_weight': perceptron_hyper_params['class_weight']}
]

# GridSearchCV setup
perceptron_grid = GridSearchCV(
    Perceptron(),
    perceptron_param_grid,
    cv=5,
    scoring='f1',   # F1-score as optimization metric
    n_jobs=-1,
    error_score='raise'
)

# Fit on training data
perceptron_grid.fit(X_train, y_train)

# Results
print("Best parameters found:", perceptron_grid.best_params_)
print("Best F1 score on training set (CV):", perceptron_grid.best_score_)


Best parameters found: {'alpha': 0.01, 'class_weight': None, 'max_iter': 5000, 'penalty': 'l2'}
Best F1 score on training set (CV): 0.5645142742111545


In [55]:
from sklearn.metrics import classification_report

# Best model predictions
y_train_pred = perceptron_grid.best_estimator_.predict(X_train)
y_test_pred = perceptron_grid.best_estimator_.predict(X_test)

print("=== Train Set Performance ===")
print(classification_report(y_train, y_train_pred))

print("=== Test Set Performance ===")
print(classification_report(y_test, y_test_pred))


=== Train Set Performance ===
              precision    recall  f1-score   support

           0       0.62      0.98      0.76       434
           1       0.61      0.04      0.07       278

    accuracy                           0.62       712
   macro avg       0.61      0.51      0.42       712
weighted avg       0.61      0.62      0.49       712

=== Test Set Performance ===
              precision    recall  f1-score   support

           0       0.65      0.97      0.78       115
           1       0.50      0.06      0.11        64

    accuracy                           0.64       179
   macro avg       0.57      0.51      0.44       179
weighted avg       0.60      0.64      0.54       179

