In [1]:
# Method 1: Using seaborn (easiest - Titanic is built-in)
import seaborn as sns
import pandas as pd

# Load the Titanic dataset
titanic = sns.load_dataset('titanic')

# Save it locally for future use
titanic.to_csv('../Data/titanic.csv', index=False)

print("✅ Titanic dataset downloaded!")
print(f"📊 Shape: {titanic.shape}")
print(f"🎯 Target variable: 'survived' (0=died, 1=survived)")
print("\n📋 First few rows:")
titanic.head()

✅ Titanic dataset downloaded!
📊 Shape: (891, 15)
🎯 Target variable: 'survived' (0=died, 1=survived)

📋 First few rows:


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [2]:
# Check missing values
print("Missing Values Analysis:")
print(titanic.isnull().sum())
print(f"\nMissing percentages:")
print((titanic.isnull().sum() / len(titanic) * 100).round(2))

Missing Values Analysis:
survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

Missing percentages:
survived        0.00
pclass          0.00
sex             0.00
age            19.87
sibsp           0.00
parch           0.00
fare            0.00
embarked        0.22
class           0.00
who             0.00
adult_male      0.00
deck           77.22
embark_town     0.22
alive           0.00
alone           0.00
dtype: float64


In [3]:
# Step 1: DROP the deck column (77% missing - too unreliable)
print("🗑️ Dropping 'deck' column (77% missing)")
titanic_clean = titanic.drop('deck', axis=1)
print(f"Columns before: {titanic.shape[1]} → After: {titanic_clean.shape[1]}")

🗑️ Dropping 'deck' column (77% missing)
Columns before: 15 → After: 14


In [4]:
# Step 2: FILL embarked (only 2 missing - use most common)
print("\n🚢 Handling missing embarkation ports:")
print("Embarked value counts:")
print(titanic_clean['embarked'].value_counts())

# Fill with most common port (Southampton = 'S')
mode_embarked = titanic_clean['embarked'].mode()[0]
titanic_clean['embarked'].fillna(mode_embarked, inplace=True)
titanic_clean['embark_town'].fillna('Southampton', inplace=True)

print(f"✅ Filled 2 missing embarked values with '{mode_embarked}'")


🚢 Handling missing embarkation ports:
Embarked value counts:
embarked
S    644
C    168
Q     77
Name: count, dtype: int64
✅ Filled 2 missing embarked values with 'S'


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_clean['embarked'].fillna(mode_embarked, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_clean['embark_town'].fillna('Southampton', inplace=True)


In [5]:
# Step 3: IMPUTE age (19% missing - use smart strategy)
print("\n👶 Handling missing ages:")
print("Age by passenger class and sex:")
age_by_groups = titanic_clean.groupby(['pclass', 'sex'])['age'].median()
print(age_by_groups)

# Fill missing ages based on passenger class and sex
def fill_age(row):
    if pd.isna(row['age']):
        return age_by_groups[row['pclass'], row['sex']]
    return row['age']

titanic_clean['age'] = titanic_clean.apply(fill_age, axis=1)
print(f"✅ Filled {177} missing ages using class-sex median")


👶 Handling missing ages:
Age by passenger class and sex:
pclass  sex   
1       female    35.0
        male      40.0
2       female    28.0
        male      30.0
3       female    21.5
        male      25.0
Name: age, dtype: float64
✅ Filled 177 missing ages using class-sex median


In [6]:
# Step 4: VERIFY our cleaning worked
print("\n🔍 VERIFICATION - Missing values after cleaning:")
missing_after = titanic_clean.isnull().sum()
print(missing_after[missing_after > 0])  # Only show columns with missing values

if missing_after.sum() == 0:
    print("🎉 SUCCESS! No missing values remaining!")
else:
    print("⚠️ Still have missing values to handle")
    
print(f"\n📊 Final dataset shape: {titanic_clean.shape}")


🔍 VERIFICATION - Missing values after cleaning:
Series([], dtype: int64)
🎉 SUCCESS! No missing values remaining!

📊 Final dataset shape: (891, 14)


In [7]:
# Step 5: QUICK PREVIEW of our cleaned data
print("\n📋 Cleaned dataset preview:")
print("Age column now complete:")
print(f"Age range: {titanic_clean['age'].min():.1f} - {titanic_clean['age'].max():.1f}")
print(f"Average age: {titanic_clean['age'].mean():.1f}")

print("\nFirst 5 rows of cleaned data:")
titanic_clean.head()


📋 Cleaned dataset preview:
Age column now complete:
Age range: 0.4 - 80.0
Average age: 29.1

First 5 rows of cleaned data:


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,Southampton,no,True


In [8]:
# Step 6: CATEGORICAL ENCODING - Convert text to numbers
print("🔤 Converting categorical variables to numerical...")

# Create a copy to work with
titanic_ml = titanic_clean.copy()

# Binary encoding for sex (0=female, 1=male)
titanic_ml['sex_male'] = (titanic_ml['sex'] == 'male').astype(int)

# One-hot encoding for embarked ports
titanic_ml = pd.get_dummies(titanic_ml, columns=['embarked'], prefix='port')

print("✅ Categorical encoding complete!")
print(f"New columns created: {[col for col in titanic_ml.columns if col.startswith('port_') or col == 'sex_male']}")

🔤 Converting categorical variables to numerical...
✅ Categorical encoding complete!
New columns created: ['sex_male', 'port_C', 'port_Q', 'port_S']


In [9]:
# Step 7: FEATURE ENGINEERING - Create meaningful new features
print("\n🔧 Creating new features...")

# Family size and relationships
titanic_ml['family_size'] = titanic_ml['sibsp'] + titanic_ml['parch'] + 1
titanic_ml['is_alone'] = (titanic_ml['family_size'] == 1).astype(int)

# Age groups (more interpretable than raw age)
titanic_ml['age_group'] = pd.cut(titanic_ml['age'], 
                                bins=[0, 12, 18, 35, 60, 100], 
                                labels=['Child', 'Teen', 'Young_Adult', 'Adult', 'Senior'])

# Convert age groups to dummy variables
titanic_ml = pd.get_dummies(titanic_ml, columns=['age_group'], prefix='age')

# Fare categories (economic status indicator)
titanic_ml['fare_category'] = pd.cut(titanic_ml['fare'], 
                                    bins=[0, 7.91, 14.45, 31.0, 512.33], 
                                    labels=['Low', 'Medium', 'High', 'Very_High'])

titanic_ml = pd.get_dummies(titanic_ml, columns=['fare_category'], prefix='fare')

print("✅ New features created:")
print(f"• Family size: {titanic_ml['family_size'].min()}-{titanic_ml['family_size'].max()}")
print(f"• Age groups: {[col for col in titanic_ml.columns if col.startswith('age_')]}")
print(f"• Fare categories: {[col for col in titanic_ml.columns if col.startswith('fare_')]}")


🔧 Creating new features...
✅ New features created:
• Family size: 1-11
• Age groups: ['age_Child', 'age_Teen', 'age_Young_Adult', 'age_Adult', 'age_Senior']
• Fare categories: ['fare_Low', 'fare_Medium', 'fare_High', 'fare_Very_High']


In [10]:
# Step 8: SELECT FINAL FEATURES for modeling
print("\n🎯 Selecting features for logistic regression...")

# Features to use in our model
features_to_use = [
    # Demographics
    'pclass', 'sex_male', 'age',
    # Family
    'family_size', 'is_alone',
    # Economic
    'fare',
    # Embarkation ports
    'port_C', 'port_Q', 'port_S',
    # Age groups
    'age_Child', 'age_Teen', 'age_Young_Adult', 'age_Adult', 'age_Senior',
    # Fare categories  
    'fare_Low', 'fare_Medium', 'fare_High', 'fare_Very_High'
]

# Create final dataset
X = titanic_ml[features_to_use]
y = titanic_ml['survived']

print(f"✅ Final feature set:")
print(f"• Features (X): {X.shape}")
print(f"• Target (y): {y.shape}")
print(f"• Feature names: {list(X.columns)}")


🎯 Selecting features for logistic regression...
✅ Final feature set:
• Features (X): (891, 18)
• Target (y): (891,)
• Feature names: ['pclass', 'sex_male', 'age', 'family_size', 'is_alone', 'fare', 'port_C', 'port_Q', 'port_S', 'age_Child', 'age_Teen', 'age_Young_Adult', 'age_Adult', 'age_Senior', 'fare_Low', 'fare_Medium', 'fare_High', 'fare_Very_High']


In [11]:
# Step 9: SPLIT DATA for training and testing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("📊 Splitting data into train and test sets...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"✅ Data split complete:")
print(f"• Training set: {X_train.shape[0]} passengers")
print(f"• Test set: {X_test.shape[0]} passengers")
print(f"• Training survival rate: {y_train.mean():.1%}")
print(f"• Test survival rate: {y_test.mean():.1%}")

📊 Splitting data into train and test sets...
✅ Data split complete:
• Training set: 712 passengers
• Test set: 179 passengers
• Training survival rate: 38.3%
• Test survival rate: 38.5%


In [12]:
# Step 10: CREATE and TRAIN the logistic regression model
print("\n🤖 Creating logistic regression model...")

# Create the model
model = LogisticRegression(random_state=42, max_iter=1000)

# Train the model
print("🏋️ Training the model...")
model.fit(X_train, y_train)

print("✅ Model training complete!")
print(f"• Model type: {type(model).__name__}")
print(f"• Features used: {len(X_train.columns)}")
print(f"• Training samples: {len(X_train)}")


🤖 Creating logistic regression model...
🏋️ Training the model...
✅ Model training complete!
• Model type: LogisticRegression
• Features used: 18
• Training samples: 712


In [13]:
# Step 11: MAKE PREDICTIONS and EVALUATE performance
print("\n🔮 Making predictions...")

# Predictions on test set
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]  # Probability of survival

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

print(f"✅ Predictions complete!")
print(f"🎯 Model Accuracy: {accuracy:.1%}")
print(f"📊 Detailed Results:")
print(classification_report(y_test, y_pred, target_names=['Died', 'Survived']))


🔮 Making predictions...
✅ Predictions complete!
🎯 Model Accuracy: 81.0%
📊 Detailed Results:
              precision    recall  f1-score   support

        Died       0.83      0.87      0.85       110
    Survived       0.78      0.71      0.74        69

    accuracy                           0.81       179
   macro avg       0.80      0.79      0.80       179
weighted avg       0.81      0.81      0.81       179



In [14]:
# Step 12: CREATE ADVANCED FEATURES
print("🔧 Creating advanced features...")

# Title extraction from names (social status indicator)
titanic_ml['title'] = titanic_clean['who'].copy()  # Use existing 'who' column
titanic_ml = pd.get_dummies(titanic_ml, columns=['title'], prefix='title')

# Fare per person (family economics)
titanic_ml['fare_per_person'] = titanic_ml['fare'] / titanic_ml['family_size']

# Age-Class interaction (young first class vs old third class)
titanic_ml['age_pclass_interaction'] = titanic_ml['age'] * titanic_ml['pclass']

# Family survival strategy
titanic_ml['family_survival_strategy'] = (
    (titanic_ml['family_size'] > 1) & (titanic_ml['age'] < 18)
).astype(int)

print("✅ Advanced features created!")

🔧 Creating advanced features...
✅ Advanced features created!


In [15]:
# Step 13: HYPERPARAMETER OPTIMIZATION
from sklearn.model_selection import GridSearchCV

print("🔍 Finding optimal hyperparameters...")

# Define parameter grid
param_grid = {
    'C': [0.1, 1.0, 10.0, 100.0],  # Regularization strength
    'penalty': ['l1', 'l2', 'elasticnet'],  # Regularization type
    'solver': ['liblinear', 'saga'],  # Optimization algorithm
    'max_iter': [1000, 2000]
}

# Grid search with cross-validation
grid_search = GridSearchCV(
    LogisticRegression(random_state=42),
    param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='accuracy',
    n_jobs=-1
)

grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

print(f"✅ Best parameters: {grid_search.best_params_}")
print(f"✅ Best CV score: {grid_search.best_score_:.3f}")

🔍 Finding optimal hyperparameters...
✅ Best parameters: {'C': 1.0, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'liblinear'}
✅ Best CV score: 0.796


80 fits failed out of a total of 240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Abdul Salam M\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Abdul Salam M\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1363, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Abdul Salam M\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1210, in fit
    s

In [16]:
# Step 14: FEATURE SELECTION
from sklearn.feature_selection import SelectKBest, chi2, RFE

print("🎯 Selecting most important features...")

# Method A: Statistical feature selection
selector = SelectKBest(score_func=chi2, k=10)  # Top 10 features
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

selected_features = X_train.columns[selector.get_support()]
print(f"Selected features: {list(selected_features)}")

# Method B: Recursive Feature Elimination
rfe = RFE(LogisticRegression(random_state=42), n_features_to_select=12)
rfe.fit(X_train, y_train)
rfe_features = X_train.columns[rfe.support_]
print(f"RFE selected features: {list(rfe_features)}")

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


🎯 Selecting most important features...
Selected features: ['pclass', 'sex_male', 'age', 'is_alone', 'fare', 'port_C', 'port_S', 'age_Child', 'fare_Low', 'fare_Very_High']


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


RFE selected features: ['pclass', 'sex_male', 'family_size', 'is_alone', 'port_Q', 'age_Child', 'age_Adult', 'age_Senior', 'fare_Low', 'fare_Medium', 'fare_High', 'fare_Very_High']


In [17]:
# Step 15: ROBUST EVALUATION
from sklearn.model_selection import cross_val_score, StratifiedKFold

print("📊 Cross-validation evaluation...")

# Stratified K-Fold (maintains class balance)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Test multiple models
models = {
    'Basic': LogisticRegression(random_state=42),
    'Tuned': best_model,
    'L1_Regularized': LogisticRegression(penalty='l1', solver='liblinear', C=0.1, random_state=42),
    'L2_Regularized': LogisticRegression(penalty='l2', C=0.1, random_state=42)
}

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
    print(f"{name:15}: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

📊 Cross-validation evaluation...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to sca

Basic          : 0.803 (+/- 0.048)
Tuned          : 0.798 (+/- 0.044)
L1_Regularized : 0.782 (+/- 0.044)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to sca

L2_Regularized : 0.796 (+/- 0.037)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
# Step 16: ADDRESS CLASS IMBALANCE
from sklearn.utils.class_weight import compute_class_weight

print("⚖️ Handling class imbalance...")

# Calculate class weights
class_weights = compute_class_weight(
    'balanced', 
    classes=np.unique(y_train), 
    y=y_train
)
class_weight_dict = dict(zip(np.unique(y_train), class_weights))

# Model with balanced class weights
balanced_model = LogisticRegression(
    class_weight='balanced',
    random_state=42,
    max_iter=1000
)

balanced_model.fit(X_train, y_train)
balanced_pred = balanced_model.predict(X_test)
balanced_accuracy = accuracy_score(y_test, balanced_pred)

print(f"Balanced model accuracy: {balanced_accuracy:.3f}")
print(f"Class weights: {class_weight_dict}")

⚖️ Handling class imbalance...


NameError: name 'np' is not defined

In [19]:
# Step 17: POLYNOMIAL FEATURES (interaction terms)
from sklearn.preprocessing import PolynomialFeatures

print("🔄 Creating polynomial features...")

# Create interaction terms (degree=2)
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Train with polynomial features
poly_model = LogisticRegression(C=0.1, max_iter=2000, random_state=42)
poly_model.fit(X_train_poly, y_train)
poly_pred = poly_model.predict(X_test_poly)
poly_accuracy = accuracy_score(y_test, poly_pred)

print(f"Polynomial features accuracy: {poly_accuracy:.3f}")
print(f"Feature count: {X_train.shape[1]} → {X_train_poly.shape[1]}")

🔄 Creating polynomial features...
Polynomial features accuracy: 0.793
Feature count: 18 → 171


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=2000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
# First, fix the import issue
import numpy as np

# Step 18: CLEAN MODEL COMPARISON (no errors!)
print("🧹 Clean model comparison...")

# Simple, working models
models_clean = {
    'Baseline': LogisticRegression(random_state=42, max_iter=2000),
    'Regularized_L1': LogisticRegression(penalty='l1', solver='liblinear', C=1.0, random_state=42),
    'Regularized_L2': LogisticRegression(penalty='l2', C=1.0, max_iter=2000, random_state=42),
    'Balanced_Classes': LogisticRegression(class_weight='balanced', max_iter=2000, random_state=42)
}

print("Model Performance Comparison:")
print("-" * 50)

for name, model in models_clean.items():
    # Train and test
    model.fit(X_train, y_train)
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    
    print(f"{name:<20}: Train={train_score:.3f}, Test={test_score:.3f}")

🧹 Clean model comparison...
Model Performance Comparison:
--------------------------------------------------
Baseline            : Train=0.819, Test=0.810
Regularized_L1      : Train=0.812, Test=0.810
Regularized_L2      : Train=0.819, Test=0.810
Balanced_Classes    : Train=0.802, Test=0.799


In [21]:
# Step 19: FEATURE IMPORTANCE ANALYSIS
print("\n🔍 What features matter most?")

# Use the best performing model
best_simple_model = LogisticRegression(max_iter=2000, random_state=42)
best_simple_model.fit(X_train, y_train)

# Get feature importance (coefficients)
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'coefficient': best_simple_model.coef_[0],
    'abs_coefficient': np.abs(best_simple_model.coef_[0])
}).sort_values('abs_coefficient', ascending=False)

print("Top 10 Most Important Features:")
print(feature_importance.head(10))


🔍 What features matter most?
Top 10 Most Important Features:
           feature  coefficient  abs_coefficient
1         sex_male    -2.547501         2.547501
0           pclass    -1.039991         1.039991
9        age_Child     0.971110         0.971110
17  fare_Very_High     0.645187         0.645187
15     fare_Medium     0.600081         0.600081
16       fare_High     0.558584         0.558584
4         is_alone    -0.501588         0.501588
10        age_Teen    -0.407292         0.407292
3      family_size    -0.400520         0.400520
14        fare_Low     0.353460         0.353460


In [22]:
# Step 20: FINAL MODEL EVALUATION
print("\n🏆 Final Results Summary:")

final_model = LogisticRegression(max_iter=2000, random_state=42)
final_model.fit(X_train, y_train)
final_predictions = final_model.predict(X_test)
final_accuracy = accuracy_score(y_test, final_predictions)

print(f"✅ Final Model Accuracy: {final_accuracy:.1%}")
print(f"📊 Improvement from start: {final_accuracy:.1%} vs 81.0%")

# Classification report (clean)
print("\nDetailed Performance:")
print(classification_report(y_test, final_predictions, target_names=['Died', 'Survived']))


🏆 Final Results Summary:
✅ Final Model Accuracy: 81.0%
📊 Improvement from start: 81.0% vs 81.0%

Detailed Performance:
              precision    recall  f1-score   support

        Died       0.83      0.87      0.85       110
    Survived       0.78      0.71      0.74        69

    accuracy                           0.81       179
   macro avg       0.80      0.79      0.80       179
weighted avg       0.81      0.81      0.81       179

