# 🚢 Titanic Survival Prediction — Full Machine Learning Project
Complete EDA + Preprocessing + Feature Engineering + ML Model + Submission

---

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay

# Global plotting defaults: seaborn's "whitegrid" theme, then matplotlib's
# "ggplot" style sheet.
# NOTE(review): the ggplot style is applied last, so where both calls set the
# same rcParams the ggplot values presumably win — confirm this order is intended.
sns.set_theme(style="whitegrid")
plt.style.use("ggplot")

## 📌 Load Dataset

In [None]:
# Read the three CSV files from the working directory, in the same order as
# the names they are bound to.
train_df, test_df, gender_df = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "gender_submission.csv")
)

# Preview the first rows of the training frame (rendered as the cell output).
train_df.head()

## 📊 Exploratory Data Analysis

In [None]:
# Count of passengers per sex, with bars split by the Survived column.
plt.figure(figsize=(8, 5))
gender_ax = sns.countplot(data=train_df, x='Sex', hue='Survived')
gender_ax.set_title("Survival by Gender")
plt.show()

In [None]:
# Count of passengers per ticket class, with bars split by the Survived column.
plt.figure(figsize=(8, 5))
pclass_ax = sns.countplot(data=train_df, x='Pclass', hue='Survived')
pclass_ax.set_title("Survival by Passenger Class")
plt.show()

In [None]:
# Age histogram with a KDE overlay, one colour per value of Survived.
plt.figure(figsize=(8, 5))
age_ax = sns.histplot(x='Age', hue='Survived', kde=True, data=train_df)
age_ax.set_title("Age Distribution by Survival")
plt.show()

## 🧹 Data Preprocessing Pipeline

In [None]:
def _encode_categories(column, codes):
    """Map the labels in ``column`` to integers using the fixed ``codes`` table.

    Args:
        column: pandas Series of category labels.
        codes: dict mapping every allowed label to its integer code.

    Returns:
        An int64 Series with the same index as ``column``.

    Raises:
        ValueError: if ``column`` holds a value (including a missing value)
            that is not a key of ``codes``.
    """
    encoded = column.map(codes)
    unknown = column[encoded.isna()]
    if not unknown.empty:
        labels = sorted({str(label) for label in unknown.unique()})
        raise ValueError(
            f"Unexpected value(s) in column {column.name!r}: {labels}; "
            f"expected one of {sorted(codes)}"
        )
    return encoded.astype("int64")


def preprocess_titanic(df):
    """Return a cleaned, numerically encoded copy of a Titanic passenger frame.

    Steps: fill missing Age/Fare with the frame's own median and missing
    Embarked with the frame's own most frequent value, encode Sex and
    Embarked as integers, add FamilySize = SibSp + Parch + 1, and drop the
    Name, Ticket and Cabin columns when present. The input frame is not
    modified.

    The category codes are fixed (female=0, male=1; C=0, Q=1, S=2) rather
    than fitted per call, so two frames always receive the same code for the
    same label even when one of them lacks a category. These are the codes a
    per-frame LabelEncoder produces when every category is present.

    Args:
        df: DataFrame with at least the columns Age, Fare, Embarked, Sex,
            SibSp and Parch.

    Returns:
        A new DataFrame with the transformations above applied.

    Raises:
        ValueError: if Sex or Embarked contains a value outside the fixed
            code tables (for example Embarked is entirely missing).
    """
    df = df.copy()

    # Missing values
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    # mode() is empty when every Embarked value is missing; skip the fill in
    # that case instead of failing on the [0] lookup.
    embarked_modes = df['Embarked'].mode()
    if not embarked_modes.empty:
        df['Embarked'] = df['Embarked'].fillna(embarked_modes[0])

    # Encoding with fixed tables so train and test frames stay aligned.
    df['Sex'] = _encode_categories(df['Sex'], {'female': 0, 'male': 1})
    df['Embarked'] = _encode_categories(
        df['Embarked'], {'C': 0, 'Q': 1, 'S': 2}
    )

    # Feature Engineering: the passenger plus siblings/spouses and
    # parents/children aboard.
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

    # Drop the free-text columns; errors='ignore' tolerates frames that
    # already lack them.
    df = df.drop(['Name', 'Ticket', 'Cabin'], axis=1, errors='ignore')
    return df

In [None]:
# Run the same cleaning routine over the training frame, then the test frame.
train_processed, test_processed = (
    preprocess_titanic(raw_frame) for raw_frame in (train_df, test_df)
)

# Preview the cleaned training rows (rendered as the cell output).
train_processed.head()

## ✂️ Train / Test Split

In [None]:
# Target is the Survived column; every other processed column is a feature.
# NOTE(review): preprocess_titanic only drops Name/Ticket/Cabin, so if
# train.csv has a PassengerId column it stays in X as a feature — confirm
# that is intended.
y = train_processed["Survived"]
X = train_processed.drop(columns="Survived")

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

## 🤖 Train Machine Learning Model

In [None]:
# Random forest settings: 300 trees, depth capped at 10, fixed seed.
forest_params = {
    "n_estimators": 300,
    "max_depth": 10,
    "random_state": 42,
}
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

# Score the fitted model on the held-out rows.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)

In [None]:
# Confusion matrix of the held-out predictions, drawn with a blue colour map.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap="Blues")
# Title the matrix axes that plot() just drew on.
disp.ax_.set_title("Confusion Matrix")
plt.show()

## ⭐ Feature Importance

In [None]:
def plot_feature_importance(model, X):
    """Draw a horizontal bar chart of the model's feature importances.

    Args:
        model: fitted estimator exposing ``feature_importances_``.
        X: frame whose ``columns`` name the features, in the same order as
            the importance scores.

    The bars are passed to ``barh`` in ascending order of importance, and the
    figure is shown immediately; nothing is returned.
    """
    scores = model.feature_importances_
    names = np.array(X.columns)
    ascending = np.argsort(scores)
    ordered_names = names[ascending]
    ordered_scores = scores[ascending]

    plt.figure(figsize=(10, 6))
    plt.barh(ordered_names, ordered_scores, color='skyblue')
    plt.title("Feature Importance")
    plt.xlabel("Importance Score")
    plt.tight_layout()
    plt.show()


# Chart the importances of the trained model over the feature columns.
plot_feature_importance(model, X)

## 📤 Generate Kaggle Submission

In [None]:
# Predict survival for every row of the processed test frame.
final_predictions = model.predict(test_processed)

# Two-column frame: the original passenger ids plus the predicted label.
submission = test_df[["PassengerId"]].assign(Survived=final_predictions)

# Write without the index column, then preview the first rows.
submission.to_csv("submission.csv", index=False)
submission.head()