In [21]:
import pandas as pd
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold

# Load train and test data
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")

test_df.drop('Name', axis=1, inplace=True)
test_df.drop('Ticket', axis=1, inplace=True)
test_df.drop('Embarked', axis=1, inplace=True)
train_df.drop('Name', axis=1, inplace=True)
train_df.drop('PassengerId', axis=1, inplace=True)
train_df.drop('Ticket', axis=1, inplace=True)
train_df.drop('Embarked', axis=1, inplace=True)

def add_cabin_info_columns(df2):
    df_copy = df2.copy()  # Create a copy to avoid modifying the original DataFrame

    def extract_cabin_info(cabin):
        if pd.isna(cabin) or cabin == '':
            return 0, 'N'  # No cabin info
        cabins = cabin.split()
        count = len(cabins)
        first_cabin = cabins[0]  # First cabin assignment
        match = re.match(r'([A-G])\d+', first_cabin)  # Only A-G allowed
        if match:
            letter = match.groups()[0]
            return count, letter
        return count, 'N'

    # Apply transformation and expand into new columns
    df_copy[['cabin_count', 'cabin_letter']] = df_copy['Cabin'].apply(
        lambda x: pd.Series(extract_cabin_info(x))
    )

    # Drop original 'Cabin' column
    df_copy.drop('Cabin', axis=1, inplace=True)

    # Encode `cabin_letter` as numerical values
    cabin_mapping = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'N': 7}
    df_copy['cabin_letter'] = df_copy['cabin_letter'].map(cabin_mapping).astype(int)

    return df_copy

# Preprocess test set: Encode categorical variables
train_df['Sex'] = train_df['Sex'].map({'male': 0, 'female': 1})
#train_df['Embarked'] = train_df['Embarked'].map({'C': 1, 'Q': 2, 'S': 3})

test_df['Sex'] = test_df['Sex'].map({'male': 0, 'female': 1})
#test_df['Embarked'] = test_df['Embarked'].map({'C': 1, 'Q': 2, 'S': 3})

# **Step 1: Preprocess Test Data**
test_processed = test_df.copy()
train_processed = train_df.copy()

train_processed['was_originally_null'] = train_processed['Age'].isnull().astype(int)
test_processed['was_originally_null'] = test_processed['Age'].isnull().astype(int)
age_medians = train_processed.groupby(['Pclass', 'Sex'])['Age'].median()
train_processed['Age'] = train_processed.apply(
    lambda row: age_medians[row['Pclass'], row['Sex']] if pd.isna(row['Age']) else row['Age'], axis=1
)
test_processed['Age'] = test_processed.apply(
    lambda row: age_medians[row['Pclass'], row['Sex']] if pd.isna(row['Age']) else row['Age'], axis=1
)



# Add Cabin information
test_processed = add_cabin_info_columns(test_processed) 
train_processed = add_cabin_info_columns(train_processed) 

# **Step 2: Split Test Data**
train_with_age = train_processed[train_processed['was_originally_null']==0].drop(columns=['cabin_count', 'cabin_letter', 'was_originally_null'])
train_without_age = train_processed[train_processed['was_originally_null']==1].drop(columns=['was_originally_null'])

test_with_age = test_processed[test_processed['was_originally_null']==0].drop(columns=['cabin_count', 'cabin_letter', 'was_originally_null'])
test_without_age = test_processed[test_processed['was_originally_null']==1].drop(columns=['was_originally_null'])


# Align features with trained models
features_case_with_age = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Age']
features_case_no_age = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Age', 'cabin_count', 'cabin_letter']

X_train_with_age = train_with_age[features_case_with_age]
y_train_with_age = train_with_age['Survived']
X_train_without_age = train_without_age[features_case_no_age]
y_train_without_age = train_without_age['Survived']


X_test_with_age = test_with_age[features_case_with_age]
X_test_without_age = test_without_age[features_case_no_age]

# **Step 3: Train Models**
rf_with_age = RandomForestClassifier(random_state=42)
rf_with_age.fit(X_train_with_age, y_train_with_age) 


rf_without_age = RandomForestClassifier(random_state=42)
rf_without_age.fit(X_train_without_age, y_train_without_age)


# **Make Predictions**
X_test_with_age['Fare'].fillna(X_test_with_age['Fare'].mode()[0], inplace=True)
pred_with_age = rf_with_age.predict(X_test_with_age)
pred_without_age = rf_without_age.predict(X_test_without_age)


# Assign predictions
test_with_age['Survived'] = pred_with_age
test_without_age['Survived'] = pred_without_age

# Combine both predictions into final DataFrame
final_predictions = pd.concat([test_with_age[['PassengerId', 'Survived']], 
                               test_without_age[['PassengerId', 'Survived']]])

final_predictions = pd.concat([test_with_age[['PassengerId', 'Survived']], 
                               test_without_age[['PassengerId', 'Survived']]])

# Ensure the predictions are sorted correctly
final_predictions.sort_values(by='PassengerId', inplace=True)

# **Step 5: Save Predictions**
final_predictions.to_csv('submission.csv', index=False)

print("Predictions saved to submission.csv")


Predictions saved to submission.csv


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test_with_age['Fare'].fillna(X_test_with_age['Fare'].mode()[0], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_with_age['Fare'].fillna(X_test_with_age['Fare'].mode()[0], inplace=True)
