In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import StandardScaler

scalar = StandardScaler()

In [2]:
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

train_df_2 = pd.read_csv('../data/train.csv')
test_df_2 = pd.read_csv('../data/test.csv')

print("Train DataFrame Shape:", train_df.shape)
print("Test DataFrame Shape:", test_df.shape)


Train DataFrame Shape: (891, 12)
Test DataFrame Shape: (418, 11)


In [46]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [3]:
train_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [11]:
test_df.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [3]:
def preprocess_data_improved(train_df, test_df):
    
    # Age: Use mean
    train_mean_age = train_df['Age'].mean()
    # Embarked: Use mode
    train_mode_embarked = train_df['Embarked'].mode()[0]
    # Fare: Use median for robustness against outliers (from train_df or test_df as needed)
    test_median_fare = test_df['Fare'].median() # Only need to calculate for test since train has 0 nulls

    # Impute Embarked (using train mode for both)
    train_df['Embarked'].fillna(train_mode_embarked, inplace=True)
    test_df['Embarked'].fillna(train_mode_embarked, inplace=True)

    # Impute Fare (using test median for test set)
    test_df['Fare'].fillna(test_median_fare, inplace=True)

    # Impute Age (using train mean for both)
    train_df['Age'].fillna(train_mean_age, inplace=True)
    test_df['Age'].fillna(train_mean_age, inplace=True)
    
    # features that require standardization (mainly numerical)
    numerical_features = ['Age', 'Fare', 'SibSp', 'Parch']
    
    # Fitting scalar only on training data
    # The scalar learns the parameters (mean and std) from training data
    scalar.fit(train_df[numerical_features])
    
    # Transforming both training and test data
    # Using the learned mean/std dev from the training set to transform both sets
    train_df[numerical_features] = scalar.transform(train_df[numerical_features])
    test_df[numerical_features] = scalar.transform(test_df[numerical_features])
    
    # While training I stumbled on Error: ValueError: could not convert string to float: 'A/5 21171'(Ticket), Fix:
    # Simpler approach: Drop Ticket in used loop
    # using label encoder could be helpful if using for research purposes

    for df in [train_df, test_df]:
        df.drop('Ticket', axis=1, inplace=True) 
    # Cabin/Deck Feature
        df['Deck'] = df['Cabin'].str[0].fillna('M')
        df.drop('Cabin', axis=1, inplace=True)
        
    # Title Feature (using robust RAW string regex extraction)
        df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False) # <--- FIXED LINE
    
    # Simplify/Group Titles
        df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona', 'Mlle', 'Ms', 'Mme'], 
                                      ['Rare', 'Rare', 'Rare', 'Rare', 'Rare', 'Rare', 'Rare', 'Rare', 'Rare', 'Rare', 'Rare', 'Miss', 'Miss', 'Mrs'])
    
    # Drop the original Name column
        df.drop('Name', axis=1, inplace=True)
        
    #one-hot encode for all fields
    categorical_cols = ['Sex', 'Embarked', 'Deck', 'Title']
    train_df = pd.get_dummies(train_df, columns=categorical_cols, drop_first=True)
    test_df = pd.get_dummies(test_df, columns=categorical_cols, drop_first=True)
        
    return train_df, test_df

train_df, test_df = preprocess_data_improved(train_df, test_df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Embarked'].fillna(train_mode_embarked, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['Embarked'].fillna(train_mode_embarked, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object

In [13]:
print(train_df.isnull().sum())
print(test_df.isnull().sum())

PassengerId    0
Survived       0
Pclass         0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
Deck           0
Sex_male       0
Sex_female     0
Title          0
dtype: int64
PassengerId    0
Pclass         0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
Deck           0
Sex_male       0
Sex_female     0
Title          0
dtype: int64


In [55]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,...,Deck_D,Deck_E,Deck_F,Deck_G,Deck_M,Deck_T,Title_Miss,Title_Mr,Title_Mrs,Title_Rare
0,1,0,3,-0.592481,0.432793,-0.473674,-0.502445,True,False,True,...,False,False,False,False,True,False,False,True,False,False
1,2,1,1,0.638789,0.432793,-0.473674,0.786845,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2,3,1,3,-0.284663,-0.474545,-0.473674,-0.488854,False,False,True,...,False,False,False,False,True,False,True,False,False,False
3,4,1,1,0.407926,0.432793,-0.473674,0.42073,False,False,True,...,False,False,False,False,False,False,False,False,True,False
4,5,0,3,0.407926,-0.474545,-0.473674,-0.486337,True,False,True,...,False,False,False,False,True,False,False,True,False,False


In [4]:
# separating features(X) and target variable(y)
X_train = train_df.drop('Survived', axis=1)
y_train = train_df['Survived']

# The test set is already a feature matrix (X_test)
X_test = test_df.copy()

In [5]:
train_cols = X_train.columns

for col in set(train_cols) - set(X_test.columns):
    X_test[col] = 0
    
X_test = X_test[train_cols]

In [11]:
# Model Selection and Training
from sklearn.ensemble import RandomForestClassifier

# Model initialization
# n_estimators is the number of trees in the forest
model = RandomForestClassifier(n_estimators=100, max_depth=15, min_samples_leaf=2, random_state=42)

model.fit(X_train, y_train)

predictions = pd.Series(model.predict(X_test))

In [76]:
def get_user_input_and_predict(model, original_train_df, preprocess_func):
    """
    Dynamically takes passenger details from the user and makes a survival prediction.
    """
    print("\n--- Enter Passenger Details for Prediction ---")

    # Simple input for required features (unchanged)
    pclass = int(input("1. Pclass (1, 2, or 3): "))
    name = input("2. Name (e.g., Smith, Mr. John): ")
    sex = input("3. Sex (male or female): ")
    age = float(input("4. Age: "))
    sibsp = int(input("5. SibSp (Number of siblings/spouses): "))
    parch = int(input("6. Parch (Number of parents/children): "))
    ticket = input("7. Ticket (e.g., A/5 21171): ")
    fare = float(input("8. Fare: "))
    cabin = input("9. Cabin (e.g., C123 or leave blank for unknown): ") or None
    embarked = input("10. Embarked (S, C, or Q): ")
    
    # 1. Create a DataFrame from user input (unchanged)
    new_passenger_data = pd.DataFrame([{
        'PassengerId': 0, 
        'Pclass': pclass,
        'Name': name,
        'Sex': sex,
        'Age': age,
        'SibSp': sibsp,
        'Parch': parch,
        'Ticket': ticket,
        'Fare': fare,
        'Cabin': cabin,
        'Embarked': embarked
    }])
    
    # 2. Apply the full preprocessing pipeline (unchanged)
    processed_train, processed_new_data = preprocess_func(original_train_df.copy(), new_passenger_data.copy())
    
    # --- Prepare for Prediction (FIXED SECTION) ---
    
    # FIX: Only exclude 'Survived' from the training columns.
    # The error indicates that 'PassengerId' MUST be included in the prediction data.
    X_train_cols = [col for col in processed_train.columns if col != 'Survived']
    
    # Ensure the new data has the same columns in the same order
    for col in X_train_cols:
        if col not in processed_new_data.columns:
            processed_new_data[col] = 0
            
    # Final align and select only the required features
    X_predict = processed_new_data[X_train_cols]

    # 3. Get the prediction (unchanged)
    prediction = model.predict(X_predict)[0]
    
    # 4. Display the result (unchanged)
    survival_status = "Survived" if prediction == 1 else "Did Not Survive"
    
    print("\n--- Prediction Result ---")
    print(f"The model predicts the passenger: **{survival_status}** (Code: {prediction})")

In [77]:
get_user_input_and_predict(model, train_df_2, preprocess_data_improved)


--- Enter Passenger Details for Prediction ---

--- Prediction Result ---
The model predicts the passenger: **Survived** (Code: 1)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Embarked'].fillna(train_mode_embarked, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['Embarked'].fillna(train_mode_embarked, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object

In [12]:
from sklearn.metrics import accuracy_score

y_train_pred = model.predict(X_train)

# Calculate the Accuracy Score
# Compare the model's predictions (y_train_pred) against the true labels (y_train)
train_accuracy = accuracy_score(y_train, y_train_pred)

print(f"Model Training Accuracy: {train_accuracy * 100:.2f}%")

Model Training Accuracy: 92.26%
