# **Install Required Libraries**

In [1]:
!pip install seaborn scikit-learn xgboost



# **Import Required Libraries**

In [2]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb
import matplotlib.pyplot as plt

# **Load the Titanic Dataset**

In [3]:
# Fetch the Titanic passenger table from seaborn's collection of example datasets
df = sns.load_dataset(name='titanic')

# Preview the first five rows to get a feel for the columns and their values
df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


# **Data Preprocessing & Cleaning**

In [4]:
# Report how many values are missing in each column. The result is printed
# explicitly: a bare expression in the middle of a cell is evaluated and then
# discarded, because only the last expression of a cell is displayed.
print(df.isnull().sum())

# Drop rows that have a missing value in any of these feature columns
# (age, embarked, fare, sex)
df = df.dropna(subset=['age', 'embarked', 'fare', 'sex'])

# Select features and target variable (target = survived)
X = df[['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']]
y = df['survived']

# Convert the categorical columns ('sex' and 'embarked') into dummy/indicator
# variables. drop_first=True removes the first level of each category so the
# remaining indicator columns are not redundant with one another.
X = pd.get_dummies(X, drop_first=True)

# Display the processed feature matrix
X.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_male,embarked_Q,embarked_S
0,3,22.0,1,0,7.25,True,False,True
1,1,38.0,1,0,71.2833,False,False,False
2,3,26.0,0,0,7.925,False,False,True
3,1,35.0,1,0,53.1,False,False,True
4,3,35.0,0,0,8.05,True,False,True


# **Split Data into Train and Test Sets**

In [5]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# partition repeatable from run to run
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=45,
)

# **Train Logistic Regression Model**

In [7]:
# Build the Logistic Regression estimator (max_iter raised to 1000), then fit
# it on the training split
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X=x_train, y=y_train)

In [8]:
# Generate Logistic Regression class predictions for the held-out test rows
lr_y_pred = lr_model.predict(X=x_test)

In [9]:
# Score the Logistic Regression predictions against the true test labels:
# overall accuracy, per-class precision/recall/F1, and the confusion matrix
print('=== Logistic Regression ===')

lr_accuracy = accuracy_score(y_test, lr_y_pred)
print('Accuracy:', lr_accuracy)

lr_report = classification_report(y_test, lr_y_pred)
print('Classification Report:\n', lr_report)

lr_confusion = confusion_matrix(y_test, lr_y_pred)
print('Confusion Matrix:\n', lr_confusion)

=== Logistic Regression ===
Accuracy: 0.8251748251748252
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.85      0.85        82
           1       0.80      0.79      0.79        61

    accuracy                           0.83       143
   macro avg       0.82      0.82      0.82       143
weighted avg       0.82      0.83      0.82       143

Confusion Matrix:
 [[70 12]
 [13 48]]


# **Train Random Forest Classifier Model**

In [10]:
# Build the Random Forest estimator with a fixed seed so its randomized tree
# construction is repeatable, then fit it on the training split
rf_model = RandomForestClassifier(random_state=45)
rf_model.fit(X=x_train, y=y_train)

In [11]:
# Generate Random Forest class predictions for the held-out test rows
rf_y_pred = rf_model.predict(X=x_test)

In [12]:
# Score the Random Forest predictions against the true test labels:
# overall accuracy, per-class precision/recall/F1, and the confusion matrix
print('=== Random Forest Classifier ===')

rf_accuracy = accuracy_score(y_test, rf_y_pred)
print('Accuracy:', rf_accuracy)

rf_report = classification_report(y_test, rf_y_pred)
print('Classification Report:\n', rf_report)

rf_confusion = confusion_matrix(y_test, rf_y_pred)
print('Confusion Matrix:\n', rf_confusion)

=== Random Forest Classifier ===
Accuracy: 0.8041958041958042
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.83      0.83        82
           1       0.77      0.77      0.77        61

    accuracy                           0.80       143
   macro avg       0.80      0.80      0.80       143
weighted avg       0.80      0.80      0.80       143

Confusion Matrix:
 [[68 14]
 [14 47]]


# **Train XGBoost Classifier Model**

In [14]:
# Build the XGBoost classifier (log-loss as the evaluation metric, fixed seed),
# then fit it on the training split
xgb_model = xgb.XGBClassifier(
    eval_metric='logloss',
    random_state=45,
)
xgb_model.fit(X=x_train, y=y_train)

In [15]:
# Generate XGBoost class predictions for the held-out test rows
xgb_y_pred = xgb_model.predict(X=x_test)

In [16]:
# Score the XGBoost predictions against the true test labels:
# overall accuracy, per-class precision/recall/F1, and the confusion matrix
print('=== XGBoost Classifier ===')

xgb_accuracy = accuracy_score(y_test, xgb_y_pred)
print('Accuracy:', xgb_accuracy)

xgb_report = classification_report(y_test, xgb_y_pred)
print('Classification Report:\n', xgb_report)

xgb_confusion = confusion_matrix(y_test, xgb_y_pred)
print('Confusion Matrix:\n', xgb_confusion)

=== XGBoost Classifier ===
Accuracy: 0.7482517482517482
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.78      0.78        82
           1       0.70      0.70      0.70        61

    accuracy                           0.75       143
   macro avg       0.74      0.74      0.74       143
weighted avg       0.75      0.75      0.75       143

Confusion Matrix:
 [[64 18]
 [18 43]]


# **Key Takeaways**

Logistic Regression: Best overall with the highest accuracy (82.5%). Balanced performance in precision and recall for both classes (non-survivors and survivors).

Random Forest: Good performance with an accuracy of 80.4%, slightly behind Logistic Regression, especially for predicting survivors.

XGBoost: Lowest accuracy (74.8%). It struggled most with predicting survivors (precision and recall of 0.70 for that class), showing weaker performance than the other two models.

# **Conclusion**

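On this single 80/20 train/test split with untuned models, Logistic Regression is the most effective model for this task, and Random Forest is a reasonable alternative. Because the test set contains only 143 passengers, this ranking should be confirmed with cross-validation and hyperparameter tuning before drawing firmer conclusions.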