# Titanic. Random Forest predictor

In [1]:
import numpy as np
import pandas as pd
import os

data_train_path = 'input/train.csv'
data_test_path = 'input/test.csv'

# Load train data
train_df = pd.read_csv(data_train_path)
test_df = pd.read_csv(data_test_path)

train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# Clean data helper function
from sklearn.preprocessing import MinMaxScaler

def clean_data(df_in):
    df = df_in.copy()
    # --- 1. Handle Missing Values ---
    df['Age'].fillna(df['Age'].median(), inplace=True)
    df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
    df['Fare'].fillna(df['Fare'].median(), inplace=True)

    # --- 2. Create Dummy Variables for Categorical Features ---
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df['Pclass'] = df['Pclass'].astype('category')
    df_dummies = pd.get_dummies(df[['Embarked', 'Pclass']], drop_first=True, dtype=int)
    df = pd.concat([df, df_dummies], axis=1)
    df.drop(columns=['Embarked', 'Pclass'], inplace=True)

    # --- 3. Normalize Numerical Columns ---
    numerical_cols = ['Age', 'Fare', 'SibSp', 'Parch']
    scaler = MinMaxScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

    # --- 4. Final Data Preparation ---
    df.drop(columns=['Ticket', 'Cabin', 'Name'], inplace=True)
    return df

In [3]:
train_data_cleaned = clean_data(train_df)
test_data_cleaned = clean_data(test_df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we

In [4]:
# Use RandomForest to do predictions
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

X = train_data_cleaned.drop(columns=['Survived', 'PassengerId'])
y = train_data_cleaned['Survived']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle potential missing columns in test set after cleaning
X_test = test_data_cleaned.drop(columns=['PassengerId'])
# Align columns of test set with training set
X_test_aligned = X_test.reindex(columns = X_train.columns, fill_value=0)

# Initialize the RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Run predictions
y_pred_val = rf_classifier.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred_val)
print(f"\nModel Accuracy with Random Forest: {accuracy:.4f}")


Model Accuracy with Random Forest: 0.8156


In [5]:
# Predict on the test data
y_pred_test = rf_classifier.predict(X_test_aligned)

# Create a submission DataFrame
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': y_pred_test
})

# Save the submission file
submission.to_csv('submission.csv', index=False)

print('Submission file created successfully!')

Submission file created successfully!
