In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import VarianceThreshold

# Load the training and test data
data = pd.read_csv('titanic/train.csv')
test_data = pd.read_csv('titanic/test.csv')

# Fill missing values
data['Age'].fillna(data['Age'].median(), inplace=True)
data['Fare'].fillna(data['Fare'].median(), inplace=True)
data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)
test_data['Age'].fillna(test_data['Age'].median(), inplace=True)
test_data['Fare'].fillna(test_data['Fare'].median(), inplace=True)
test_data['Embarked'].fillna(test_data['Embarked'].mode()[0], inplace=True)

# Feature Engineering
def advanced_feature_engineering(df):
    # Family Size and Alone
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    
    # Interaction Features
    df['Pclass*Age'] = df['Pclass'] * df['Age']
    df['Fare*Pclass'] = df['Fare'] * df['Pclass']
    
    # Binning Age and Fare
    df['AgeBin'] = pd.cut(df['Age'], bins=[0, 16, 32, 48, 64, 80], labels=[1, 2, 3, 4, 5])
    df['FareBin'] = pd.qcut(df['Fare'], 4, labels=[1, 2, 3, 4])
    
    # Title feature
    df['Title'] = df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
    df['Title'] = df['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    df['Title'] = df['Title'].replace(['Mlle', 'Ms'], 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
    
    return df

# Apply advanced feature engineering
data = advanced_feature_engineering(data)
test_data = advanced_feature_engineering(test_data)

# One-hot encoding for categorical variables
data = pd.get_dummies(data, columns=['Sex', 'Embarked', 'Title', 'AgeBin', 'FareBin'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['Sex', 'Embarked', 'Title', 'AgeBin', 'FareBin'], drop_first=True)

# Reindex test_data to match data's columns
test_data = test_data.reindex(columns=data.columns, fill_value=0)

# Drop irrelevant columns
data = data.drop(['Name', 'Ticket', 'Cabin', 'PassengerId'], axis=1)
test_data = test_data.drop(['Name', 'Ticket', 'Cabin', 'PassengerId', 'Survived'], axis=1, errors='ignore')

# Define features and target variable
X = data.drop('Survived', axis=1)
y = data['Survived']

# Scale features
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(test_data)

# Remove low-variance features
selector = VarianceThreshold(threshold=0.01)
X_high_variance = selector.fit_transform(X_scaled)
X_test_high_variance = selector.transform(X_test_scaled)

# Split the data for validation
X_train, X_val, y_train, y_val = train_test_split(X_high_variance, y, test_size=0.2, random_state=42)

# Model 1: Logistic Regression with Cross-Validation
lr_model = LogisticRegression(max_iter=1000, random_state=42)
cv_scores_lr = cross_val_score(lr_model, X_high_variance, y, cv=10, scoring='accuracy')
print("Logistic Regression Cross-validation scores:", cv_scores_lr)
print("Mean CV score for Logistic Regression:", np.mean(cv_scores_lr))

# Model 2: K-Nearest Neighbors with GridSearchCV
param_grid_knn = {'n_neighbors': range(3, 15)}
knn_model = KNeighborsClassifier()
grid_search_knn = GridSearchCV(knn_model, param_grid_knn, cv=10, scoring='accuracy')
grid_search_knn.fit(X_high_variance, y)
best_knn_model = grid_search_knn.best_estimator_
cv_scores_knn = cross_val_score(best_knn_model, X_high_variance, y, cv=10, scoring='accuracy')
print("K-Nearest Neighbors Cross-validation scores:", cv_scores_knn)
print("Mean CV score for K-Nearest Neighbors:", np.mean(cv_scores_knn))

# Choose the best model based on CV score
if np.mean(cv_scores_lr) > np.mean(cv_scores_knn):
    best_model = lr_model
    best_model_name = "Logistic Regression"
    best_model.fit(X_high_variance, y)
else:
    best_model = best_knn_model
    best_model_name = "K-Nearest Neighbors"
    best_model.fit(X_high_variance, y)

print(f"Selected Best Model: {best_model_name}")

# Evaluate on validation data
y_pred_val = best_model.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_pred_val))
print("\nConfusion Matrix:\n", confusion_matrix(y_val, y_pred_val))
print("\nClassification Report:\n", classification_report(y_val, y_pred_val))

# Train on the full dataset and predict on the test set
best_model.fit(X_high_variance, y)
test_predictions = best_model.predict(X_test_high_variance)

# Create a submission file for Kaggle
submission = pd.DataFrame({'PassengerId': pd.read_csv('titanic/test.csv')['PassengerId'], 'Survived': test_predictions})
submission.to_csv('submission_final_simple.csv', index=False)
print("Final submission file created: submission_final_simple.csv")


Logistic Regression Cross-validation scores: [0.8        0.83146067 0.7752809  0.85393258 0.80898876 0.7752809
 0.80898876 0.82022472 0.85393258 0.85393258]
Mean CV score for Logistic Regression: 0.8182022471910113
K-Nearest Neighbors Cross-validation scores: [0.74444444 0.85393258 0.75280899 0.80898876 0.83146067 0.79775281
 0.88764045 0.76404494 0.83146067 0.82022472]
Mean CV score for K-Nearest Neighbors: 0.8092759051186018
Selected Best Model: Logistic Regression
Validation Accuracy: 0.8100558659217877

Confusion Matrix:
 [[89 16]
 [18 56]]

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.85      0.84       105
           1       0.78      0.76      0.77        74

    accuracy                           0.81       179
   macro avg       0.80      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179

Final submission file created: submission_final_simple.csv
