In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import make_scorer, accuracy_score, f1_score, roc_auc_score
from sklearn.utils import shuffle
import warnings

In [3]:
# Suppress warnings
warnings.filterwarnings("ignore")

In [4]:
# Load Kaggle Titanic dataset
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")

print("Train shape:", train_data.shape)
print("Test shape:", test_data.shape)
print("\nTrain columns:", train_data.columns.tolist())
print("\nFirst few rows of train data:")
print(train_data.head())

# Use training data for model comparison
df = train_data.copy()

Train shape: (891, 12)
Test shape: (418, 11)

Train columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

First few rows of train data:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1    

In [5]:
# Map Kaggle column names to match your existing code
print("Before mapping - df columns:", df.columns.tolist())

column_mapping = {
    'Survived': 'survived',
    'Pclass': 'pclass', 
    'Sex': 'sex',
    'Age': 'age',
    'SibSp': 'sibsp',
    'Parch': 'parch',
    'Fare': 'fare',
    'Cabin': 'cabin',
    'Embarked': 'embarked'
}

df = df.rename(columns=column_mapping)

# Keep only the columns your code needs
df = df[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']]

print("After mapping - df columns:", df.columns.tolist())
print("Data shape after filtering:", df.shape)
print("\nFirst few rows:")
print(df.head())

Before mapping - df columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
After mapping - df columns: ['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
Data shape after filtering: (891, 8)

First few rows:
   survived  pclass     sex   age  sibsp  parch     fare embarked
0         0       3    male  22.0      1      0   7.2500        S
1         1       1  female  38.0      1      0  71.2833        C
2         1       3  female  26.0      0      0   7.9250        S
3         1       1  female  35.0      1      0  53.1000        S
4         0       3    male  35.0      0      0   8.0500        S


In [6]:
# Drop rows where target is missing
# Would be good to count how many rows will be dropped
df = df.dropna(subset=["survived"])

In [7]:
# Define features and target
target = "survived"
X = df.drop(columns=[target])
y = df[target]

In [8]:
# Define numeric and categorical features
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

In [9]:
print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)
print("All features:", X.columns.tolist())

Numeric features: ['pclass', 'age', 'sibsp', 'parch', 'fare']
Categorical features: ['sex', 'embarked']
All features: ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']


In [10]:
# Drop high-NA or irrelevant columns
drop_cols = ['deck', 'embark_town', 'alive', 'who']
for col in drop_cols:
    if col in X.columns:
        X = X.drop(columns=col)
        if col in categorical_features:
            categorical_features.remove(col)

In [11]:
# Recalculate feature lists
numeric_features = [col for col in numeric_features if col in X.columns]
categorical_features = [col for col in categorical_features if col in X.columns]

In [12]:
# Typically count all the N/A's before you do the preprocessing pipeline, and put a note here to say "I replaced x values with the mean and y categorical values"
# For example, if you need to replace half the values, that's not good

In [13]:
# Preprocessing pipeline
# For handling the errors because it doesn't do well with N/A or if its not imputed
# If it's missing a value in the numeric one, then take the mean and put it in the missing value
numeric_transformer = make_pipeline(SimpleImputer(strategy="mean"), StandardScaler())
# If you're missing a value, pick the most frequent one
categorical_transformer = make_pipeline(SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown='ignore'))

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [14]:
# Models for classification
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.dummy import DummyClassifier

In [15]:
models = [
    ("Dummy", DummyClassifier()),
    ("LogisticRegression", LogisticRegression(max_iter=1000)),
    ("KNN", KNeighborsClassifier()),
    ("DecisionTree", DecisionTreeClassifier()),
    ("RandomForest", RandomForestClassifier()),
    ("GradientBoosting", GradientBoostingClassifier()),
    ("SVM", SVC(probability=True)),
    ("NaiveBayes", GaussianNB()),
    ("MLP", MLPClassifier(max_iter=1000)),
]

In [16]:
# Scoring
scoring = {
    "Accuracy": "accuracy",
    "F1": "f1",
    "ROC-AUC": "roc_auc",
}

In [17]:
# Evaluate models
results = []

In [18]:
for name, model in models:
    print(f"Evaluating {name}...")
    pipeline = make_pipeline(preprocessor, model)
    row = {"Model": name}
    for metric_name, scorer in scoring.items():
        score = cross_val_score(pipeline, X, y, cv=5, scoring=scorer)
        row[metric_name] = np.mean(score)
    results.append(row)

Evaluating Dummy...
Evaluating LogisticRegression...
Evaluating KNN...
Evaluating DecisionTree...
Evaluating RandomForest...
Evaluating GradientBoosting...
Evaluating SVM...
Evaluating NaiveBayes...
Evaluating MLP...


In [19]:
# Results DataFrame
df_results = pd.DataFrame(results).sort_values(by="Accuracy", ascending=False)
print("\nModel Performance on Titanic Dataset:\n")
print(df_results)


Model Performance on Titanic Dataset:

                Model  Accuracy        F1   ROC-AUC
6                 SVM  0.828284  0.759580  0.850844
5    GradientBoosting  0.822685  0.750883  0.869452
4        RandomForest  0.812598  0.744683  0.854959
2                 KNN  0.810345  0.740644  0.844005
8                 MLP  0.800289  0.742901  0.852824
7          NaiveBayes  0.789028  0.724459  0.829227
1  LogisticRegression  0.786768  0.714376  0.849016
3        DecisionTree  0.769952  0.691982  0.752903
0               Dummy  0.616163  0.000000  0.500000


In [20]:
# F1 is precision, harmonic mean
# Accuracy how close to the bullseye
# ROC is area under the curve
# The top five are very close. Can do the below steps for all 5 of those models TODO

In [21]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [22]:
# -----------------------------------------
# Step 1: Reload Titanic dataset for tuning (using Kaggle data)
# -----------------------------------------
# Use the same Kaggle dataset that was loaded earlier
df = train_data.copy()

# Apply the same column mapping as before
column_mapping = {
    'Survived': 'survived',
    'Pclass': 'pclass', 
    'Sex': 'sex',
    'Age': 'age',
    'SibSp': 'sibsp',
    'Parch': 'parch',
    'Fare': 'fare',
    'Cabin': 'cabin',
    'Embarked': 'embarked'
}

df = df.rename(columns=column_mapping)

# Keep only the columns needed for analysis (same as before)
df = df[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']]

# Drop rows where target is missing (though Kaggle train data shouldn't have missing targets)
df = df.dropna(subset=["survived"])

In [23]:
# -----------------------------------------
# Step 2: Feature Engineering Function
# -----------------------------------------
def feature_engineering(df):
    df = df.copy()
    
    # Feature: family size
    df['family_size'] = df['sibsp'] + df['parch'] + 1
    df['is_alone'] = (df['family_size'] == 1).astype(int)

    # Fill missing values
    df['age'] = df['age'].fillna(df['age'].median())
    df['fare'] = df['fare'].fillna(df['fare'].median())
    df['embarked'] = df['embarked'].fillna(df['embarked'].mode()[0])

    # Drop irrelevant columns
    df = df.drop(columns=['class', 'who', 'deck', 'embark_town', 'alive', 'adult_male'], errors='ignore')

    return df

In [24]:
# -----------------------------------------
# Step 3: Apply Feature Engineering
# -----------------------------------------
df_fe = feature_engineering(df)
X = df_fe.drop(columns=["survived"])
y = df_fe["survived"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
print("Feature engineering results:")
print("X shape:", X.shape)
print("y shape:", y.shape)
print("Features created:", X.columns.tolist())
print("Missing values in X:", X.isnull().sum().sum())

Feature engineering results:
X shape: (891, 10)
y shape: (891,)
Features created: ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'alone', 'family_size', 'is_alone']
Missing values in X: 0


In [26]:
# -----------------------------------------
# Step 4: Preprocessing Pipelines
# -----------------------------------------
numeric_features = ['age', 'fare', 'family_size']
categorical_features = ['sex', 'embarked', 'is_alone']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

In [27]:
print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)
print("All features:", X.columns.tolist())

Numeric features: ['age', 'fare', 'family_size']
Categorical features: ['sex', 'embarked', 'is_alone']
All features: ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'alone', 'family_size', 'is_alone']


In [54]:
# ----------------------------
# Step 5: SVM with Grid Search
# ----------------------------

# TODO DO STEP 5 FOR EACH OF THE TOP 5

from sklearn.svm import SVC

svm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', SVC(random_state=42, probability=True))  # probability=True for ROC-AUC
])

# SVM parameter grid
param_grid = {
    'clf__C': [0.1, 1, 10, 100],
    'clf__kernel': ['rbf', 'linear'],
    'clf__gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1]
}

print("\nTuning SVM (Best Performer) with GridSearchCV...")
svm_search = GridSearchCV(svm_pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
svm_search.fit(X_train, y_train)



Tuning SVM (Best Performer) with GridSearchCV...
Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [66]:
# -----------------------------------------
# Step 5B: EXPANDED Grid Search with tqdm Progress Bar
# -----------------------------------------
from tqdm import tqdm
import time
from sklearn.model_selection import ParameterGrid, cross_val_score
from sklearn.svm import SVC

print("🔍 EXPANDED Grid Search with tqdm Progress Bar...")

# EXPANDED parameter grid
expanded_param_grid = {
    'clf__C': [0.01, 0.1, 1, 10, 100, 1000],
    'clf__kernel': ['rbf', 'linear', 'poly'],
    'clf__gamma': ['scale', 'auto', 0.0001, 0.001, 0.01, 0.1, 1, 10],
    'clf__degree': [2, 3, 4]
}

# Get all parameter combinations
param_combinations = list(ParameterGrid(expanded_param_grid))
total_combinations = len(param_combinations)

print(f"Testing {total_combinations} parameter combinations...")
print("🚀 Starting grid search with beautiful tqdm progress bar...\n")

# Track results
best_score = 0
best_params = None
all_results = []

start_time = time.time()

# 🎯 Here's the tqdm magic!
for params in tqdm(param_combinations, 
                   desc="🔍 Grid Search Progress", 
                   unit="combo",
                   colour="green",
                   bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]"):
    
    try:
        # Create model with current parameters
        current_model = Pipeline([
            ('preprocessor', preprocessor),
            ('clf', SVC(random_state=42, probability=True, **params))
        ])
        
        # Cross-validate with current parameters
        scores = cross_val_score(current_model, X_train, y_train, 
                               cv=5, scoring='accuracy', n_jobs=1)
        mean_score = scores.mean()
        
        # Store results
        all_results.append({
            'params': params,
            'mean_test_score': mean_score,
            'std_test_score': scores.std()
        })
        
        # Update best if this is better
        if mean_score > best_score:
            best_score = mean_score
            best_params = params
            
            # Update tqdm description with new best score
            tqdm.write(f"🎯 New best score: {best_score:.4f} with {params}")
    
    except Exception as e:
        tqdm.write(f"⚠️ Error with {params}: {str(e)}")
        continue

end_time = time.time()

print(f"\n✅ Grid search completed in {end_time - start_time:.1f} seconds!")
print(f"🏆 Best parameters: {best_params}")
print(f"🏆 Best CV score: {best_score:.4f}")

# Create the best model
best_model = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', SVC(random_state=42, probability=True, **best_params))
])
best_model.fit(X_train, y_train)

# Create a simple results object to match GridSearchCV interface
class TqdmGridSearchResults:
    def __init__(self, best_params, best_score, best_estimator, cv_results):
        self.best_params_ = best_params
        self.best_score_ = best_score
        self.best_estimator_ = best_estimator
        self.cv_results_ = cv_results
    
    def predict(self, X):
        return self.best_estimator_.predict(X)

# Store results in GridSearchCV-like object
svm_expanded_search = TqdmGridSearchResults(best_params, best_score, best_model, all_results)

print(f"\n📊 Tested {len(all_results)} parameter combinations successfully")
print(f"⏱️ Average time per combination: {(end_time - start_time) / len(all_results):.2f} seconds")

🔍 EXPANDED Grid Search with tqdm Progress Bar...
Testing 432 parameter combinations...
🚀 Starting grid search with beautiful tqdm progress bar...



🔍 Grid Search Progress:  11%|[32m████████████████▎                                                                                                                                  [0m| 48/432 [00:00<00:00, 478.47combo/s][0m

⚠️ Error with {'clf__C': 0.01, 'clf__degree': 2, 'clf__gamma': 'scale', 'clf__kernel': 'rbf'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 0.01, 'clf__degree': 2, 'clf__gamma': 'scale', 'clf__kernel': 'linear'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 0.01, 'clf__degree': 2, 'clf__gamma': 'scale', 'clf__kernel': 'poly'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 0.01, 'clf__degree': 2, 'clf__gamma': 'auto', 'clf__kernel': 'rbf'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 0.01, 'clf__degree': 2, 'clf__gamma': 'auto', 'clf__kernel': 'linear'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 0.01, 'clf__degree': 2, 'clf__gamma': 'auto', 'clf__kernel': 'poly'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 0.01, 'clf__degree': 2, 'clf__gamma': 0.0001, 

                                                                                                                                                                                                                      7combo/s][0m

⚠️ Error with {'clf__C': 0.1, 'clf__degree': 2, 'clf__gamma': 1, 'clf__kernel': 'rbf'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 0.1, 'clf__degree': 2, 'clf__gamma': 1, 'clf__kernel': 'linear'}: SVC.__init__() got an unexpected keyword argument 'clf__C'


                                                                                                                                                                                                                      2combo/s][0m

⚠️ Error with {'clf__C': 0.1, 'clf__degree': 2, 'clf__gamma': 1, 'clf__kernel': 'poly'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 0.1, 'clf__degree': 2, 'clf__gamma': 10, 'clf__kernel': 'rbf'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 0.1, 'clf__degree': 2, 'clf__gamma': 10, 'clf__kernel': 'linear'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 0.1, 'clf__degree': 2, 'clf__gamma': 10, 'clf__kernel': 'poly'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 0.1, 'clf__degree': 3, 'clf__gamma': 'scale', 'clf__kernel': 'rbf'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 0.1, 'clf__degree': 3, 'clf__gamma': 'scale', 'clf__kernel': 'linear'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 0.1, 'clf__degree': 3, 'clf__gamma': 'scale', 'clf__kernel': 'poly'}: 

🔍 Grid Search Progress:  46%|[32m██████████████████████████████████████████████████████████████████▉                                                                               [0m| 198/432 [00:00<00:00, 289.49combo/s][0m

⚠️ Error with {'clf__C': 1, 'clf__degree': 2, 'clf__gamma': 0.01, 'clf__kernel': 'poly'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 1, 'clf__degree': 2, 'clf__gamma': 0.1, 'clf__kernel': 'rbf'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 1, 'clf__degree': 2, 'clf__gamma': 0.1, 'clf__kernel': 'linear'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 1, 'clf__degree': 2, 'clf__gamma': 0.1, 'clf__kernel': 'poly'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 1, 'clf__degree': 2, 'clf__gamma': 1, 'clf__kernel': 'rbf'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 1, 'clf__degree': 2, 'clf__gamma': 1, 'clf__kernel': 'linear'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 1, 'clf__degree': 2, 'clf__gamma': 1, 'clf__kernel': 'poly'}: SVC.__init__() got an unex

                                                                                                                                                                                                                      7combo/s][0m

⚠️ Error with {'clf__C': 10, 'clf__degree': 3, 'clf__gamma': 'scale', 'clf__kernel': 'linear'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 10, 'clf__degree': 3, 'clf__gamma': 'scale', 'clf__kernel': 'poly'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 10, 'clf__degree': 3, 'clf__gamma': 'auto', 'clf__kernel': 'rbf'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 10, 'clf__degree': 3, 'clf__gamma': 'auto', 'clf__kernel': 'linear'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 10, 'clf__degree': 3, 'clf__gamma': 'auto', 'clf__kernel': 'poly'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 10, 'clf__degree': 3, 'clf__gamma': 0.0001, 'clf__kernel': 'rbf'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 10, 'clf__degree': 3, 'clf__gamma': 0.0001, 'clf__kernel': 

🔍 Grid Search Progress: 100%|[32m██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████[0m| 432/432 [00:00<00:00, 449.69combo/s][0m


⚠️ Error with {'clf__C': 1000, 'clf__degree': 3, 'clf__gamma': 10, 'clf__kernel': 'poly'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 1000, 'clf__degree': 4, 'clf__gamma': 'scale', 'clf__kernel': 'rbf'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 1000, 'clf__degree': 4, 'clf__gamma': 'scale', 'clf__kernel': 'linear'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 1000, 'clf__degree': 4, 'clf__gamma': 'scale', 'clf__kernel': 'poly'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 1000, 'clf__degree': 4, 'clf__gamma': 'auto', 'clf__kernel': 'rbf'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 1000, 'clf__degree': 4, 'clf__gamma': 'auto', 'clf__kernel': 'linear'}: SVC.__init__() got an unexpected keyword argument 'clf__C'
⚠️ Error with {'clf__C': 1000, 'clf__degree': 4, 'clf__gamma': 'auto', 'clf

TypeError: sklearn.svm._classes.SVC() argument after ** must be a mapping, not NoneType

In [None]:
# -----------------------------------------
# Step 6: Evaluation
# -----------------------------------------
print(f"\nBest SVM Parameters: {svm_search.best_params_}")
print(f"Best CV Score: {svm_search.best_score_:.4f}")

# Test performance
best_model = svm_search.best_estimator_
y_pred_svm = svm_search.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm)

print(f"\nSVM Test Performance:")
print(f"Accuracy: {accuracy_svm:.4f}")
print(f"F1 Score: {f1_svm:.4f}")
print(f"\nClassification Report:")
print(classification_report(y_test, y_pred_svm))

In [None]:
# Compare original vs expanded results
print("\n" + "="*50)
print("GRID SEARCH COMPARISON")
print("="*50)

print(f"\nORIGINAL Grid Search:")
print(f"Best params: {svm_search.best_params_}")
print(f"Best CV score: {svm_search.best_score_:.4f}")

print(f"\nEXPANDED Grid Search:")
print(f"Best params: {svm_expanded_search.best_params_}")
print(f"Best CV score: {svm_expanded_search.best_score_:.4f}")

# Test the expanded model
best_expanded_model = svm_expanded_search.best_estimator_
y_pred_expanded = svm_expanded_search.predict(X_test)
accuracy_expanded = accuracy_score(y_test, y_pred_expanded)

print(f"\nTest Performance Comparison:")
print(f"Original SVM:  {accuracy_svm:.4f}")
print(f"Expanded SVM:  {accuracy_expanded:.4f}")
print(f"Improvement:   {accuracy_expanded - accuracy_svm:+.4f}")

In [None]:
# Create Kaggle submission
print("Creating submission file...")

# Process test data  
test_df = test_data.rename(columns=column_mapping)
test_df_fe = feature_engineering(test_df)
test_features = test_df_fe[['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'alone', 'family_size', 'is_alone']]

# Predictions
final_predictions = best_model.predict(test_features)

# Submission file
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'], 
    'Survived': final_predictions
})

submission.to_csv('titanic_submission.csv', index=False)
print(f"✅ Submission ready! Predicted survival rate: {final_predictions.mean():.1%}")

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# -----------------------------
# Step 1: Reload and prepare data (same for both models)
# -----------------------------
import seaborn as sns

df = sns.load_dataset("titanic").drop(columns=["deck", "embark_town", "alive", "who"], errors="ignore")
df = df.dropna(subset=["survived"])

def feature_engineering(df):
    df = df.copy()
    df['family_size'] = df['sibsp'] + df['parch'] + 1
    df['is_alone'] = (df['family_size'] == 1).astype(int)
    df['age'] = df['age'].fillna(df['age'].median())
    df['fare'] = df['fare'].fillna(df['fare'].median())
    df['embarked'] = df['embarked'].fillna(df['embarked'].mode()[0])
    df = df.drop(columns=['class', 'who', 'deck', 'embark_town', 'alive', 'adult_male'], errors='ignore')
    return df

df_fe = feature_engineering(df)
X = df_fe.drop(columns=["survived"])
y = df_fe["survived"]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# -----------------------------
# Step 2: Create Preprocessing Pipeline
# -----------------------------
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

numeric_features = ['age', 'fare', 'family_size']
categorical_features = ['sex', 'embarked', 'is_alone']

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# -----------------------------
# Step 3: Default Gradient Boosting
# -----------------------------
from sklearn.ensemble import GradientBoostingClassifier

default_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', GradientBoostingClassifier(random_state=42))
])

default_pipeline.fit(X_train, y_train)
y_pred_default = default_pipeline.predict(X_test)

# -----------------------------
# Step 4: Tuned Gradient Boosting
# -----------------------------
from sklearn.model_selection import GridSearchCV

tuned_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', GradientBoostingClassifier(random_state=42))
])

param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__learning_rate': [0.05, 0.1, 0.2],
    'clf__max_depth': [3, 5],
    'clf__min_samples_split': [2, 5],
    'clf__min_samples_leaf': [1, 3],
}

grid = GridSearchCV(tuned_pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1, verbose=0)
grid.fit(X_train, y_train)
y_pred_tuned = grid.predict(X_test)

# -----------------------------
# Step 5: Plot Confusion Matrices
# -----------------------------
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

cm_default = confusion_matrix(y_test, y_pred_default)
disp_default = ConfusionMatrixDisplay(cm_default, display_labels=["Did Not Survive", "Survived"])
disp_default.plot(ax=axes[0], values_format='d')
axes[0].set_title("Default Gradient Boosting")

cm_tuned = confusion_matrix(y_test, y_pred_tuned)
disp_tuned = ConfusionMatrixDisplay(cm_tuned, display_labels=["Did Not Survive", "Survived"])
disp_tuned.plot(ax=axes[1], values_format='d')
axes[1].set_title("Tuned Gradient Boosting")

plt.tight_layout()
plt.show()
