# **Importing Libraries and Downloading Data**

In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

In [22]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [23]:
df_train = pd.read_csv('/kaggle/input/titanic/train.csv')
df_test = pd.read_csv('/kaggle/input/titanic/test.csv')

In [24]:
df_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [25]:
df_test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


# **Handling Nulls**

In [26]:
df_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [27]:
df_test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [28]:
male_age = df_train[df_train['Sex'] == 'male']['Age'].median()
female_age = df_train[df_train['Sex'] == 'female']['Age'].median()

df_train.loc[df_train['Sex'] == 'male', 'Age'] = df_train.loc[df_train['Sex'] == 'male', 'Age'].fillna(male_age)
df_train.loc[df_train['Sex'] == 'female', 'Age'] = df_train.loc[df_train['Sex'] == 'female', 'Age'].fillna(female_age)

df_train.drop('Cabin', axis=1, inplace=True)

df_train['Embarked'].fillna(df_train['Embarked'].mode()[0], inplace=True)

In [29]:
male_age = df_test[df_test['Sex'] == 'male']['Age'].median()
female_age = df_test[df_test['Sex'] == 'female']['Age'].median()

df_test.loc[df_test['Sex'] == 'male', 'Age'] = df_test.loc[df_test['Sex'] == 'male', 'Age'].fillna(male_age)
df_test.loc[df_test['Sex'] == 'female', 'Age'] = df_test.loc[df_test['Sex'] == 'female', 'Age'].fillna(female_age)

df_test.drop('Cabin', axis=1, inplace=True)

df_test['Fare'].fillna(df_test['Fare'].median(), inplace=True)

In [30]:
df_train.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [31]:
df_test.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

# **Drop Unimportant Data**

In [32]:
df_train.drop(columns=['PassengerId','Name', 'Ticket'], inplace=True)
df_test.drop(columns=['PassengerId','Name', 'Ticket'], inplace=True)

# **Encoding**

In [33]:
label_encoder = LabelEncoder()
df_train['Sex'] = label_encoder.fit_transform(df_train['Sex'])

df_train = pd.get_dummies(df_train, columns=['Embarked'], drop_first=True)

df_test['Sex'] = label_encoder.fit_transform(df_test['Sex'])

df_test = pd.get_dummies(df_test, columns=['Embarked'], drop_first=True)

# **Training the model**

In [34]:
df_train

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,0,3,1,22.0,1,0,7.2500,False,True
1,1,1,0,38.0,1,0,71.2833,False,False
2,1,3,0,26.0,0,0,7.9250,False,True
3,1,1,0,35.0,1,0,53.1000,False,True
4,0,3,1,35.0,0,0,8.0500,False,True
...,...,...,...,...,...,...,...,...,...
886,0,2,1,27.0,0,0,13.0000,False,True
887,1,1,0,19.0,0,0,30.0000,False,True
888,0,3,0,27.0,1,2,23.4500,False,True
889,1,1,1,26.0,0,0,30.0000,False,False


In [35]:
X = df_train.drop('Survived', axis=1)
y = df_train['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

xgb_model = XGBClassifier(objective="binary:logistic", n_estimators=10, seed=123)

xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

Accuracy: 0.8156424581005587
              precision    recall  f1-score   support

           0       0.81      0.90      0.85       105
           1       0.84      0.69      0.76        74

    accuracy                           0.82       179
   macro avg       0.82      0.80      0.80       179
weighted avg       0.82      0.82      0.81       179



In [36]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", accuracy_rf)
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.8100558659217877
              precision    recall  f1-score   support

           0       0.82      0.87      0.84       105
           1       0.79      0.73      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179



In [37]:
logreg_model = LogisticRegression(max_iter=1000)

logreg_model.fit(X_train, y_train)

y_pred_logreg = logreg_model.predict(X_test)

accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
print("Logistic Regression Accuracy:", accuracy_logreg)
print(classification_report(y_test, y_pred_logreg))

Logistic Regression Accuracy: 0.8100558659217877
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       105
           1       0.79      0.74      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179



In [None]:
param_dist = {
    'n_estimators': sp_randint(50, 200),  # Number of trees
    'learning_rate': sp_uniform(0.01, 0.3),  # Learning rate
    'max_depth': sp_randint(3, 10),  # Maximum depth of trees
    'min_child_weight': sp_randint(1, 6),  # Minimum sum of instance weight needed in a child
    'gamma': sp_uniform(0, 10),  # Minimum loss reduction required to make a further partition
    'subsample': sp_uniform(0.5, 1),  # Subsample ratio of the training instances
    'colsample_bytree': sp_uniform(0.5, 1),  # Subsample ratio of columns when constructing each tree
    'reg_alpha': sp_uniform(0, 1),  # L1 regularization term on weights
    'reg_lambda': sp_uniform(0, 1)  # L2 regularization term on weights
}


random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=100,
    scoring='accuracy',
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Perform the random search
random_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters found: ", random_search.best_params_)
print("Best accuracy found: ", random_search.best_score_)

# Use the best model for prediction
best_xgb_model = random_search.best_estimator_
y_pred = best_xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy with best model:", accuracy)
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END colsample_bytree=0.8745401188473625, gamma=9.50714306409916, learning_rate=0.22959818254342154, max_depth=7, min_child_weight=5, n_estimators=152, reg_alpha=0.44583275285359114, reg_lambda=0.09997491581800289, subsample=0.9592488919658672; total time=   0.1s
[CV] END colsample_bytree=0.8337086111390218, gamma=1.4286681792194078, learning_rate=0.20526654188465587, max_depth=7, min_child_weight=2, n_estimators=137, reg_alpha=0.8324426408004217, reg_lambda=0.21233911067827616, subsample=0.6818249672071006; total time=   0.1s
[CV] END colsample_bytree=0.6834045098534338, gamma=3.0424224295953772, learning_rate=0.16742692948967136, max_depth=6, min_child_weight=1, n_estimators=98, reg_alpha=0.5247746602583891, reg_lambda=0.3998609717152555, subsample=0.5466656632136154; total time=   0.1s
[CV] END colsample_bytree=1.4737555188414593, gamma=2.3277134043030423, learning_rate=0.03718193035984624, max_depth=8, min_child_wei

In [None]:
# y_pred_test = best_xgb_model.predict(df_test)

# submission_df = pd.DataFrame({'PassengerId': pd.read_csv('test.csv')['PassengerId'], 'Survived': y_pred_test})

# submission_df.to_csv('submission.csv', index=False)