In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import joblib

def prepare_and_train():
    # Load the Titanic dataset from URL (for deployment compatibility)
    url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
    df = pd.read_csv(url)

    # Advanced Feature Engineering
    # 1. Handle missing values more intelligently
    df['Age'].fillna(df.groupby(['Pclass', 'Sex'])['Age'].transform('median'), inplace=True)
    df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
    df['Fare'].fillna(df.groupby('Pclass')['Fare'].transform('median'), inplace=True)

    # 2. Create more sophisticated features
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # Extract and categorize titles
    df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col',
                                       'Don', 'Dr', 'Major', 'Rev', 'Sir',
                                       'Jonkheer', 'Dona'], 'Rare')
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')

    # 3. Create age and fare bins with better boundaries
    df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 12, 18, 35, 60, 100],
                           labels=['Child', 'Teen', 'Adult', 'Middle', 'Senior'])
    df['FareGroup'] = pd.qcut(df['Fare'], q=4, labels=['Low', 'Medium', 'High', 'Very High'])

    # 4. Create interaction features
    df['Age_Class'] = df['Age'] * df['Pclass']
    df['Fare_Per_Person'] = df['Fare'] / df['FamilySize']
    df['Title_Pclass'] = df['Title'].astype(str) + '_' + df['Pclass'].astype(str)

    # 5. Create deck feature from cabin
    df['Deck'] = df['Cabin'].str[0]
    df['Deck'].fillna('Unknown', inplace=True)
    df['HasCabin'] = (~df['Cabin'].isna()).astype(int)

    # 6. Create ticket features
    df['TicketPrefix'] = df['Ticket'].str.extract('([A-Za-z]+)', expand=False)
    df['TicketPrefix'].fillna('None', inplace=True)

    # Select enhanced features
    features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
                'FamilySize', 'IsAlone', 'Title', 'AgeGroup', 'FareGroup',
                'Age_Class', 'Fare_Per_Person', 'Title_Pclass', 'Deck', 'HasCabin',
                'TicketPrefix']

    X = df[features].copy()
    y = df['Survived']

    # Encode categorical variables
    le_dict = {}
    categorical_cols = ['Sex', 'Embarked', 'Title', 'AgeGroup', 'FareGroup',
                       'Title_Pclass', 'Deck', 'TicketPrefix']

    for col in categorical_cols:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))
        le_dict[col] = le

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=42, stratify=y)

    # Train optimized Random Forest model
    rf_model = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        min_samples_split=4,
        min_samples_leaf=2,
        max_features='sqrt',
        bootstrap=True,
        random_state=42,
        class_weight='balanced'
    )

    rf_model.fit(X_train, y_train)

    # Evaluate model
    train_accuracy = rf_model.score(X_train, y_train)
    test_accuracy = rf_model.score(X_test, y_test)

    # Cross-validation score
    cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5)

    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print(f"Cross-validation Score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': features,
        'importance': rf_model.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\nTop 10 Most Important Features:")
    print(feature_importance.head(10))

    # Save model and encoders
    joblib.dump(rf_model, 'titanic_model.pkl')
    joblib.dump(le_dict, 'label_encoders.pkl')
    joblib.dump(features, 'feature_names.pkl')

    return rf_model, le_dict, features

if __name__ == "__main__":
    model, encoders, features = prepare_and_train()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df.groupby(['Pclass', 'Sex'])['Age'].transform('median'), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work becau

Training Accuracy: 0.9242
Test Accuracy: 0.7989
Cross-validation Score: 0.8343 (+/- 0.0439)

Top 10 Most Important Features:
            feature  importance
1               Sex    0.191599
14     Title_Pclass    0.102313
13  Fare_Per_Person    0.101066
5              Fare    0.100971
12        Age_Class    0.088468
9             Title    0.085827
2               Age    0.069218
0            Pclass    0.039306
15             Deck    0.037348
7        FamilySize    0.030057
