# Titanic Model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

if 'Survived' not in df_test.columns:
    df_test['Survived'] = 0

In [104]:
def Preprocessing(df_train , df_test):

  # cleaning data
  df = pd.concat([df_train, df_test], axis=0)
  df = df.drop(['Name', 'Ticket'], axis=1)
  df['Age'] = df['Age'].fillna(df['Age'].mean())
  df['Cabin']=df['Cabin'].fillna('X000')
  df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
  #df['Embarked'] = df['Embarked'].fillna('X')
  df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

  # feature emgineering
  df['cabin_letter']=df['Cabin'].str.extract(r'([a-zA-z]+)',expand=False)
  df['cabin_number']=df['Cabin'].str.extract(r'(\d+)',expand=False)
  df.drop('Cabin',axis=1,inplace=True)

  # convert caegories into numbers
  df = pd.get_dummies(df, columns=['cabin_letter'], prefix=['cabin_letter'])
  df = pd.get_dummies(df, columns=['Sex'], prefix=['Sex'])
  df = pd.get_dummies(df, columns=['Embarked'], prefix=['Embarked'])
  df.drop('cabin_letter_X',axis=1,inplace=True)
  df['cabin_number']=df['cabin_number'].fillna(0)
  df['cabin_number']=pd.to_numeric(df['cabin_number'])

  # split data into train and test data
  df_train = df[:len(df_train)]
  df_test = df[len(df_train):]
  df_test = df_test.drop('Survived', axis=1)




  return df_train , df_test

In [68]:
df=pd.concat([df_train,df_test],axis=0)
df.isna().sum()[df.isna().sum()>0]

Unnamed: 0,0
Age,263
Fare,1
Cabin,1014
Embarked,2


In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     1309 non-null   int64  
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(2), int64(5), object(5)
memory usage: 132.9+ KB


In [70]:
len(df) == len(df_train) + len(df_test)

True

In [94]:
train_df , test_df = Preprocessing(df_train, df_test)
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,cabin_number,cabin_letter_A,cabin_letter_B,...,cabin_letter_D,cabin_letter_E,cabin_letter_F,cabin_letter_G,cabin_letter_T,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,22.0,1,0,7.25,0,False,False,...,False,False,False,False,False,False,True,False,False,True
1,2,1,1,38.0,1,0,71.2833,85,False,False,...,False,False,False,False,False,True,False,True,False,False
2,3,1,3,26.0,0,0,7.925,0,False,False,...,False,False,False,False,False,True,False,False,False,True
3,4,1,1,35.0,1,0,53.1,123,False,False,...,False,False,False,False,False,True,False,False,False,True
4,5,0,3,35.0,0,0,8.05,0,False,False,...,False,False,False,False,False,False,True,False,False,True


In [105]:
train_df.drop('PassengerId',axis=1,inplace=True)

In [106]:
X=train_df.drop('Survived',axis=1)
y=train_df['Survived']

In [107]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_train = np.reshape(y_train,(-1,1))
y_test = np.reshape(y_test,(-1,1))
y_train.shape , y_test.shape

((712, 1), (179, 1))

##Modeling

In [108]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier , AdaBoostClassifier , GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score , precision_score , recall_score , f1_score , confusion_matrix , classification_report

In [109]:
model_1 = LogisticRegression()
model_2 = DecisionTreeClassifier()
model_3 = RandomForestClassifier()
model_4 = AdaBoostClassifier()
model_5 = GradientBoostingClassifier()
model_6 = XGBClassifier()

In [110]:
model_1.fit(X_train , y_train)
y_pred = model_1.predict(X_test)
print(classification_report(y_test , y_pred))
print("precision is : ", precision_score(y_test , y_pred))
print("recall is : ", recall_score(y_test , y_pred))
print("f1 score is : ", f1_score(y_test , y_pred))

              precision    recall  f1-score   support

           0       0.83      0.87      0.85       105
           1       0.80      0.76      0.78        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179

precision is :  0.8
recall is :  0.7567567567567568
f1 score is :  0.7777777777777778


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [111]:
model_2.fit(X_train , y_train)
y_pred = model_2.predict(X_test)
print(classification_report(y_test , y_pred))
print("precision is : ", precision_score(y_test , y_pred))
print("recall is : ", recall_score(y_test , y_pred))
print("f1 score is : ", f1_score(y_test , y_pred))

              precision    recall  f1-score   support

           0       0.82      0.80      0.81       105
           1       0.72      0.74      0.73        74

    accuracy                           0.78       179
   macro avg       0.77      0.77      0.77       179
weighted avg       0.78      0.78      0.78       179

precision is :  0.7236842105263158
recall is :  0.7432432432432432
f1 score is :  0.7333333333333333


In [112]:
model_3.fit(X_train , y_train)
y_pred = model_3.predict(X_test)
print(classification_report(y_test , y_pred))
print("precision is : ", precision_score(y_test , y_pred))
print("recall is : ", recall_score(y_test , y_pred))
print("f1 score is : ", f1_score(y_test , y_pred))

  return fit_method(estimator, *args, **kwargs)


              precision    recall  f1-score   support

           0       0.80      0.86      0.83       105
           1       0.77      0.69      0.73        74

    accuracy                           0.79       179
   macro avg       0.78      0.77      0.78       179
weighted avg       0.79      0.79      0.79       179

precision is :  0.7727272727272727
recall is :  0.6891891891891891
f1 score is :  0.7285714285714285


In [113]:
model_4.fit(X_train , y_train)
y_pred = model_4.predict(X_test)
print(classification_report(y_test , y_pred))
print("precision is : ", precision_score(y_test , y_pred))
print("recall is : ", recall_score(y_test , y_pred))
print("f1 score is : ", f1_score(y_test , y_pred))

              precision    recall  f1-score   support

           0       0.81      0.87      0.83       105
           1       0.79      0.70      0.74        74

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179

precision is :  0.7878787878787878
recall is :  0.7027027027027027
f1 score is :  0.7428571428571429


  y = column_or_1d(y, warn=True)


In [114]:
model_5.fit(X_train , y_train)
y_pred = model_5.predict(X_test)
print(classification_report(y_test , y_pred))
print("precision is : ", precision_score(y_test , y_pred))
print("recall is : ", recall_score(y_test , y_pred))
print("f1 score is : ", f1_score(y_test , y_pred))

  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

           0       0.81      0.87      0.83       105
           1       0.79      0.70      0.74        74

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179

precision is :  0.7878787878787878
recall is :  0.7027027027027027
f1 score is :  0.7428571428571429


In [115]:
model_6.fit(X_train , y_train)
y_pred = model_6.predict(X_test)
print(classification_report(y_test , y_pred))
print("precision is : ", precision_score(y_test , y_pred))
print("recall is : ", recall_score(y_test , y_pred))
print("f1 score is : ", f1_score(y_test , y_pred))

              precision    recall  f1-score   support

           0       0.82      0.83      0.82       105
           1       0.75      0.74      0.75        74

    accuracy                           0.79       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.79      0.79      0.79       179

precision is :  0.7534246575342466
recall is :  0.7432432432432432
f1 score is :  0.7482993197278912


## HyperParametr

### GradientBoosting

In [116]:
from sklearn.preprocessing import StandardScaler

# Create a StandardScaler object
scaler = StandardScaler()

# Fit the scaler to your training data and transform it
X_train_scaled = scaler.fit_transform(X_train)

# Transform your test data using the fitted scaler
X_test_scaled = scaler.transform(X_test)

In [117]:
model_4.fit(X_train_scaled , y_train)
y_pred = model_4.predict(X_test_scaled)
print(classification_report(y_test , y_pred))
print("precision is : ", precision_score(y_test , y_pred))
print("recall is : ", recall_score(y_test , y_pred))
print("f1 score is : ", f1_score(y_test , y_pred))
print("accuracy is : ",accuracy_score(y_test , y_pred))

              precision    recall  f1-score   support

           0       0.81      0.87      0.83       105
           1       0.79      0.70      0.74        74

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179

precision is :  0.7878787878787878
recall is :  0.7027027027027027
f1 score is :  0.7428571428571429
accuracy is :  0.7988826815642458


  y = column_or_1d(y, warn=True)


In [118]:
model_5.fit(X_train_scaled , y_train)
y_pred = model_5.predict(X_test_scaled)
print(classification_report(y_test , y_pred))
print("precision is : ", precision_score(y_test , y_pred))
print("recall is : ", recall_score(y_test , y_pred))
print("f1 score is : ", f1_score(y_test , y_pred))
print("accuracy is : ",accuracy_score(y_test , y_pred))

              precision    recall  f1-score   support

           0       0.81      0.87      0.83       105
           1       0.79      0.70      0.74        74

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179

precision is :  0.7878787878787878
recall is :  0.7027027027027027
f1 score is :  0.7428571428571429
accuracy is :  0.7988826815642458


  y = column_or_1d(y, warn=True)


In [119]:
# random_forest
model_3.fit(X_train_scaled , y_train)
y_pred = model_3.predict(X_test_scaled)
print(classification_report(y_test , y_pred))
print("precision is : ", precision_score(y_test , y_pred))
print("recall is : ", recall_score(y_test , y_pred))
print("f1 score is : ", f1_score(y_test , y_pred))

  return fit_method(estimator, *args, **kwargs)


              precision    recall  f1-score   support

           0       0.81      0.86      0.83       105
           1       0.78      0.72      0.75        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179

precision is :  0.7794117647058824
recall is :  0.7162162162162162
f1 score is :  0.7464788732394366


In [140]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False]
}

random_search = RandomizedSearchCV(
    estimator=model_3,
    param_distributions=param_dist,
    n_iter=50,                  # Number of random combinations to try
    cv=5,                       # 5-fold cross-validation
    scoring='precision',        # Use precision as the metric
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# Fit the search to your data
random_search.fit(X, y)

# Best model
best_rf = random_search.best_estimator_

# Predict
y_pred = best_rf.predict(X_test_scaled)

# Evaluate
print("Best Hyperparameters:\n", random_search.best_params_)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("f1 is: ",f1_score(y_test, y_pred))
print("precision is: ",precision_score(y_test, y_pred))
print("recall is: ",recall_score(y_test, y_pred))

Fitting 5 folds for each of 50 candidates, totalling 250 fits


115 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
115 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
sk

Best Hyperparameters:
 {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 40, 'bootstrap': True}

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.75      0.79       105
           1       0.69      0.80      0.74        74

    accuracy                           0.77       179
   macro avg       0.77      0.77      0.77       179
weighted avg       0.78      0.77      0.77       179

f1 is:  0.7421383647798742
precision is:  0.6941176470588235
recall is:  0.7972972972972973


In [103]:
# submission
pred = random_search.predict(test_df)

submission_cv = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Survived': pred
})

submission_cv.to_csv('submission_cv.csv', index=False)



### Gradient_Boosting

In [135]:
param_grid_gb = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 4, 5],
}

# Create GridSearchCV object
grid_search_gb = GridSearchCV(estimator=model_5, param_grid=param_grid_gb, cv=5, scoring='f1')

# Fit the GridSearchCV object to the data
grid_search_gb.fit(X, y)

# Get the best parameters
best_params_gb = grid_search_gb.best_params_

# Create a new GradientBoostingClassifier model with the best parameters
best_gb_model = GradientBoostingClassifier(**best_params_gb)

# Train the model using the best parameters
best_gb_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred_gb = best_gb_model.predict(X_test_scaled)

  y = column_or_1d(y, warn=True)


In [136]:
precision_score(y_test, y_pred_gb)

0.8181818181818182

In [137]:
f1_score(y_test, y_pred_gb)

0.7714285714285715

In [138]:
accuracy_score(y_test, y_pred_gb)

0.8212290502793296

### LogisticRegression

In [144]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, f1_score

# Define the model
logreg = LogisticRegression(random_state=42)

# Define the hyperparameter space
param_dist = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'class_weight': [None, 'balanced'],
    'max_iter': [100, 200, 500]
}

# Setup the search
random_search = RandomizedSearchCV(
    estimator=model_1,
    param_distributions=param_dist,
    n_iter=30,
    scoring='f1',
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Fit the model
random_search.fit(X_train_scaled, y_train)

# Get best model
best_logreg = random_search.best_estimator_

# Predict
y_pred = best_logreg.predict(X_test_scaled)

# Evaluate
print("Best Hyperparameters:\n", random_search.best_params_)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))


Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best Hyperparameters:
 {'solver': 'liblinear', 'penalty': 'l2', 'max_iter': 200, 'class_weight': 'balanced', 'C': 0.1}

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.79      0.82       105
           1       0.73      0.81      0.77        74

    accuracy                           0.80       179
   macro avg       0.79      0.80      0.80       179
weighted avg       0.80      0.80      0.80       179

F1 Score: 0.7692307692307693


  y = column_or_1d(y, warn=True)


In [145]:
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],  # liblinear supports both l1 and l2
    'class_weight': [None, 'balanced'],
    'max_iter': [100, 200, 500]
}

grid_search = GridSearchCV(
    estimator=model_1,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train_scaled, y_train)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)

# 7. Evaluation
print("\n✅ Best Hyperparameters:", grid_search.best_params_)
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))
print("🎯 F1 Score:", f1_score(y_test, y_pred))

Fitting 5 folds for each of 48 candidates, totalling 240 fits

✅ Best Hyperparameters: {'C': 0.1, 'class_weight': 'balanced', 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.79      0.82       105
           1       0.73      0.81      0.77        74

    accuracy                           0.80       179
   macro avg       0.79      0.80      0.80       179
weighted avg       0.80      0.80      0.80       179

🎯 F1 Score: 0.7692307692307693


  y = column_or_1d(y, warn=True)
