# **Importing Libraries and Downloading Data**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

In [None]:
import os

# Print the full path of every file available under the Kaggle input mount.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Read the Titanic training and test splits into DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
    )
)

In [None]:
# Show the freshly loaded training frame as this cell's output.
df_train

In [None]:
# Show the freshly loaded test frame as this cell's output.
df_test

# **Handling Nulls**

In [None]:
# Number of missing values in each training column, before any imputation.
df_train.isna().sum()

In [None]:
# Number of missing values in each test column, before any imputation.
df_test.isna().sum()

In [None]:
# Impute missing training values.
#
# Age      -> median age of passengers of the same sex.
# Cabin    -> dropped entirely.
# Embarked -> most frequent port of embarkation.
is_male = df_train['Sex'] == 'male'
is_female = df_train['Sex'] == 'female'

male_age = df_train.loc[is_male, 'Age'].median()
female_age = df_train.loc[is_female, 'Age'].median()

df_train.loc[is_male, 'Age'] = df_train.loc[is_male, 'Age'].fillna(male_age)
df_train.loc[is_female, 'Age'] = df_train.loc[is_female, 'Age'].fillna(female_age)

# errors='ignore' keeps the cell re-runnable once Cabin has already been removed.
df_train = df_train.drop(columns='Cabin', errors='ignore')

# Assign the filled column back instead of calling fillna(..., inplace=True) on
# df_train['Embarked']: the in-place call acts on a temporary object and leaves
# df_train unchanged when pandas Copy-on-Write is active.
df_train['Embarked'] = df_train['Embarked'].fillna(df_train['Embarked'].mode()[0])

In [None]:
# Impute missing test values using statistics learned from the TRAINING data,
# so both frames go through the same transformation and nothing is estimated
# from the test set itself.
#
# median() skips NaN, and filling NaNs with a group's median does not move that
# median, so these values are the training medians whether or not df_train['Age']
# has already been imputed.
male_age = df_train.loc[df_train['Sex'] == 'male', 'Age'].median()
female_age = df_train.loc[df_train['Sex'] == 'female', 'Age'].median()

is_male = df_test['Sex'] == 'male'
is_female = df_test['Sex'] == 'female'

df_test.loc[is_male, 'Age'] = df_test.loc[is_male, 'Age'].fillna(male_age)
df_test.loc[is_female, 'Age'] = df_test.loc[is_female, 'Age'].fillna(female_age)

# errors='ignore' keeps the cell re-runnable once Cabin has already been removed.
df_test = df_test.drop(columns='Cabin', errors='ignore')

# Assign the result back rather than using fillna(..., inplace=True) on a column
# selection, which is a no-op on df_test when pandas Copy-on-Write is active.
df_test['Fare'] = df_test['Fare'].fillna(df_train['Fare'].median())

In [None]:
# Re-check missing values per training column after imputation.
df_train.isna().sum()

In [None]:
# Re-check missing values per test column after imputation.
df_test.isna().sum()

# **Drop Unimportant Data**

In [None]:
# Remove the identifier / free-text columns from both frames.
df_train = df_train.drop(columns=['PassengerId', 'Name', 'Ticket'])
df_test = df_test.drop(columns=['PassengerId', 'Name', 'Ticket'])

# **Encoding**

In [None]:
# Encode the categorical features identically for the training and test frames.

# Sex: learn the label mapping on the training data only, then apply that same
# mapping to the test data (transform, not fit_transform) so a code always
# refers to the same category in both frames.
label_encoder = LabelEncoder()
df_train['Sex'] = label_encoder.fit_transform(df_train['Sex'])
df_test['Sex'] = label_encoder.transform(df_test['Sex'])

# Embarked: pin both frames to the category list observed in training before
# one-hot encoding. get_dummies emits one column per declared category, so the
# two frames end up with the same dummy columns, and drop_first removes the same
# reference level, even if a port is absent from one of the frames.
embarked_levels = sorted(df_train['Embarked'].dropna().unique())
df_train['Embarked'] = pd.Categorical(df_train['Embarked'], categories=embarked_levels)
df_test['Embarked'] = pd.Categorical(df_test['Embarked'], categories=embarked_levels)

df_train = pd.get_dummies(df_train, columns=['Embarked'], drop_first=True)
df_test = pd.get_dummies(df_test, columns=['Embarked'], drop_first=True)

# **Training the model**

In [None]:
# Show the training frame as it stands after imputation, column drops and encoding.
df_train

In [None]:
# Separate the target column from the predictor columns.
y = df_train['Survived']
X = df_train.drop(columns=['Survived'])

# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline gradient-boosted classifier with 10 trees.
xgb_model = XGBClassifier(
    objective="binary:logistic",
    n_estimators=10,
    seed=123,
)
xgb_model.fit(X_train, y_train)

# Score the baseline on the held-out rows.
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

In [None]:
# Random forest with 100 trees, fitted on the same training split.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
rf_model.fit(X_train, y_train)

# Score it on the held-out rows.
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

print("Random Forest Accuracy:", accuracy_rf)
print(classification_report(y_test, y_pred_rf))

In [None]:
# Logistic regression, with the solver allowed up to 1000 iterations.
logreg_model = LogisticRegression(max_iter=1000)
logreg_model.fit(X_train, y_train)

# Score it on the held-out rows.
y_pred_logreg = logreg_model.predict(X_test)
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)

print("Logistic Regression Accuracy:", accuracy_logreg)
print(classification_report(y_test, y_pred_logreg))

In [None]:
# Hyper-parameter search space for the XGBoost classifier.
#
# scipy's uniform(loc, scale) samples from [loc, loc + scale] -- the second
# argument is a width, not an upper bound. subsample and colsample_bytree are
# fractions that must stay within (0, 1], so they use uniform(0.5, 0.5) to
# cover [0.5, 1.0]; uniform(0.5, 1) would sample up to 1.5 and produce
# invalid candidates whose fits fail.
param_dist = {
    'n_estimators': sp_randint(50, 200),  # Number of trees
    'learning_rate': sp_uniform(0.01, 0.3),  # Learning rate, sampled from [0.01, 0.31]
    'max_depth': sp_randint(3, 10),  # Maximum depth of trees
    'min_child_weight': sp_randint(1, 6),  # Minimum sum of instance weight needed in a child
    'gamma': sp_uniform(0, 10),  # Minimum loss reduction required to make a further partition
    'subsample': sp_uniform(0.5, 0.5),  # Subsample ratio of the training instances, in [0.5, 1.0]
    'colsample_bytree': sp_uniform(0.5, 0.5),  # Subsample ratio of columns per tree, in [0.5, 1.0]
    'reg_alpha': sp_uniform(0, 1),  # L1 regularization term on weights
    'reg_lambda': sp_uniform(0, 1)  # L2 regularization term on weights
}


# 100 random candidates, each scored by 5-fold cross-validated accuracy on the
# training split; n_jobs=-1 uses all available cores.
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=100,
    scoring='accuracy',
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Perform the random search
random_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters found: ", random_search.best_params_)
print("Best accuracy found: ", random_search.best_score_)

# Use the best model (refit on the full training split) for prediction on the
# held-out rows
best_xgb_model = random_search.best_estimator_
y_pred = best_xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy with best model:", accuracy)
print(classification_report(y_test, y_pred))

In [None]:
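# Submission cell (disabled). Uncomment the three statements below to write
# predictions for the Kaggle test set to submission.csv.
# PassengerId was dropped from df_test earlier, so it is re-read from the
# original test CSV at the same path the data was loaded from.

# y_pred_test = best_xgb_model.predict(df_test)

# submission_df = pd.DataFrame({'PassengerId': pd.read_csv('/kaggle/input/titanic/test.csv')['PassengerId'], 'Survived': y_pred_test})

# submission_df.to_csv('submission.csv', index=False)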