In [18]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

import warnings
warnings.filterwarnings("ignore")


# Step 2: Load Datasets (uploaded in Colab)
# Read the training data, the test data and the sample submission in one
# pass; the paths point at the Colab upload directory.
train_df, test_df, submission_format = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/train.csv",
        "/content/test.csv",
        "/content/gender_submission.csv",
    )
)

# Step 3: Handle Missing Values
# Imputation values are computed from the training set only and then applied
# to both frames, so the test set is preprocessed with the same statistics
# the model is trained against.
age_fill = train_df['Age'].median()
embarked_fill = train_df['Embarked'].mode()[0]
fare_fill = train_df['Fare'].median()

# Assign the filled column back instead of calling
# `df[col].fillna(..., inplace=True)`: under pandas Copy-on-Write the chained
# in-place form only modifies a temporary Series and leaves the frame
# unchanged (and warnings are silenced at the top of this file, so that
# failure would go unnoticed).
train_df['Age'] = train_df['Age'].fillna(age_fill)
test_df['Age'] = test_df['Age'].fillna(age_fill)

train_df['Embarked'] = train_df['Embarked'].fillna(embarked_fill)
test_df['Embarked'] = test_df['Embarked'].fillna(embarked_fill)

# Only the test frame's 'Fare' column is imputed.
test_df['Fare'] = test_df['Fare'].fillna(fare_fill)

# Step 4: Drop Irrelevant Columns
# The same columns are removed in place from both the training and the
# test frame.
unused_columns = ['Name', 'Ticket', 'Cabin']
for frame in (train_df, test_df):
    frame.drop(columns=unused_columns, inplace=True)

# Step 5: Encode Categorical Features (Safe Combined Encoding)
# Stack the categorical columns of train and test so each encoder is fitted
# on every category present in either frame.
n_train_rows = len(train_df)
categorical_columns = ['Sex', 'Embarked']
combined = pd.concat([train_df[categorical_columns], test_df[categorical_columns]])

# One encoder per column, kept under its own name.
le_sex = LabelEncoder()
le_embarked = LabelEncoder()

for column, encoder in (('Sex', le_sex), ('Embarked', le_embarked)):
    combined[column] = encoder.fit_transform(combined[column])
    codes = combined[column].to_numpy()
    # The first `n_train_rows` rows of `combined` came from train_df, the
    # remainder from test_df; write the integer codes back positionally.
    train_df[column] = codes[:n_train_rows]
    test_df[column] = codes[n_train_rows:]

# Step 6: Prepare Training Data
# The target is 'Survived'; the features are every remaining column except
# the 'PassengerId' identifier.
y = train_df['Survived']
X = train_df.drop(columns=['Survived', 'PassengerId'])

# Step 7: Split into Train/Validation
# Hold out 20% of the rows for validation, with a fixed seed so the split is
# reproducible.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# Step 8: Train Naive Bayes Model
# `fit` returns the estimator itself, so construction and training chain.
model = GaussianNB().fit(X_train, y_train)

# Step 9: Validate
# Score the held-out split and print the accuracy and per-class report.
y_pred = model.predict(X_val)
validation_accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", validation_accuracy)
validation_report = classification_report(y_val, y_pred)
print(validation_report)

# Step 10: Predict on Test Set
# 'PassengerId' is excluded from the model inputs here, as it was for the
# training features.
X_test = test_df.drop(columns=['PassengerId'])
test_pred = model.predict(X_test)

# Step 11: Create Submission File
# Pair each test passenger's id with its predicted 'Survived' label.
submission = test_df[['PassengerId']].assign(Survived=test_pred)

# Write without the index so the CSV holds only the two submission columns.
submission.to_csv("titanic_naive_bayes_submission.csv", index=False)
print("Submission file created as 'titanic_naive_bayes_submission.csv'")

# Preview the first rows as the cell's displayed result.
submission.head()


Validation Accuracy: 0.776536312849162
              precision    recall  f1-score   support

           0       0.83      0.78      0.80       105
           1       0.71      0.77      0.74        74

    accuracy                           0.78       179
   macro avg       0.77      0.78      0.77       179
weighted avg       0.78      0.78      0.78       179

Submission file created as 'titanic_naive_bayes_submission.csv'


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
