# Step 1: Load the Data

In [282]:
import pandas as pd

# Read the labelled training set and the unlabelled test set from the
# local `data/` directory.
train = pd.read_csv(filepath_or_buffer='data/train.csv')
test = pd.read_csv(filepath_or_buffer='data/test.csv')

# Step 2: Explore the Data

In [283]:
# Text overview of the training frame. Each section is preceded by a dashed
# rule; the last two sections also carry a heading line.
rule = '-' * 50
print(train.shape, test.shape)
for heading, report in (
    (None, train.head()),
    (None, train.describe()),
    (None, train.dtypes),
    ('Checking for missing values:', train.isnull().sum()),
    ('Checking for duplicates:', train.duplicated().sum()),
):
    print(rule)
    if heading is not None:
        print(heading)
    print(report)

(8693, 14) (4277, 13)
--------------------------------------------------
  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0001_01     Europa     False  B/0/P  TRAPPIST-1e  39.0  False   
1     0002_01      Earth     False  F/0/S  TRAPPIST-1e  24.0  False   
2     0003_01     Europa     False  A/0/S  TRAPPIST-1e  58.0   True   
3     0003_02     Europa     False  A/0/S  TRAPPIST-1e  33.0  False   
4     0004_01      Earth     False  F/1/S  TRAPPIST-1e  16.0  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck               Name  \
0          0.0        0.0           0.0     0.0     0.0    Maham Ofracculy   
1        109.0        9.0          25.0   549.0    44.0       Juanna Vines   
2         43.0     3576.0           0.0  6715.0    49.0      Altark Susent   
3          0.0     1283.0         371.0  3329.0   193.0       Solam Susent   
4        303.0       70.0         151.0   565.0     2.0  Willy Santantines   

   Transported  
0        False

# Step 3: Data Preprocessing
## Separate features from target

In [284]:
# The label column becomes the target; every other column stays a feature.
y_train = train['Transported']
X_train = train.drop(columns=['Transported'])

## Combine train and test datasets to preprocess together

In [285]:
# Stack the train features on top of the test rows. The mapping keys become
# the outer index level, which is used later to split the frame apart again.
combined = pd.concat({'train': X_train, 'test': test})

## Split columns with multiple information
Then drop the original columns as they are no longer needed.

In [286]:
# PassengerId is split on '_' into two parts and Cabin on '/' into three
# parts; the pieces are stored as separate columns.
passenger_id_parts = combined['PassengerId'].str.split('_', expand=True)
cabin_parts = combined['Cabin'].str.split('/', expand=True)

combined[['Group', 'NumberInGroup']] = passenger_id_parts
combined[['Deck', 'Num', 'Side']] = cabin_parts

# The compound source columns are dropped once they have been split.
combined = combined.drop(columns=['PassengerId', 'Cabin'])

## Drop likely useless columns

In [287]:
# HomePlanet is treated as uninformative and removed from the feature frame.
combined = combined.drop(columns=['HomePlanet'])

## Generate new column "TotalSpending"

In [288]:
# TotalSpending is the sum of the five amenity charges, added left to right.
# A missing value in any of the five columns makes the total NaN for that row.
spending_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
first_amenity, *other_amenities = spending_columns
combined['TotalSpending'] = sum(
    (combined[amenity] for amenity in other_amenities),
    combined[first_amenity],
)

         CryoSleep  Destination   Age    VIP  RoomService  FoodCourt  \
train 7       True  TRAPPIST-1e  28.0  False          0.0        0.0   
      9       True  55 Cancri e  14.0  False          0.0        0.0   
      10      True  TRAPPIST-1e  34.0  False          0.0        0.0   
      18      True  TRAPPIST-1e  45.0  False          0.0        0.0   
      21      True  TRAPPIST-1e   1.0  False          0.0        0.0   

          ShoppingMall  Spa  VRDeck                Name Group NumberInGroup  \
train 7            0.0  0.0     NaN  Candra Jacostaffey  0006            02   
      9            0.0  0.0     0.0      Erraiam Flatic  0008            01   
      10           NaN  0.0     0.0      Altardr Flatic  0008            02   
      18           0.0  0.0     0.0          Alus Upead  0016            01   
      21           0.0  0.0     0.0   Almary Brantuarez  0020            01   

         Deck Num Side  TotalSpending checking  
train 7     G   0    S            NaN    Fa

## Handling missing values

In [289]:
# --- Categorical columns: missing values become the class 'Unknown' ---------

# Amenity charge columns; any positive amount is used as evidence that the
# passenger was not in cryosleep.
spending_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Fill CryoSleep with 'False' where it is missing and any spending is detected.
# `NaN > 0` evaluates to False, so missing amounts never count as spending.
has_spending = combined[spending_columns].gt(0).any(axis=1)
combined.loc[combined['CryoSleep'].isna() & has_spending, 'CryoSleep'] = 'False'

# Fill BEFORE casting to str: astype(str) turns NaN into the literal string
# 'nan', after which fillna() has nothing left to fill and the 'Unknown'
# class would never be created.
for column in ['CryoSleep', 'VIP']:
    combined[column] = combined[column].fillna('Unknown').astype(str)

for column in ['Deck', 'Num', 'Side', 'Destination', 'Name']:
    combined[column] = combined[column].fillna('Unknown')

# --- Numerical columns: flag, then impute with the median -------------------

numeric_columns = ['Age', *spending_columns, 'TotalSpending']

# Flag imputed values (1 = value was missing) before the NaNs are overwritten.
for column in numeric_columns:
    combined[f'{column}_imputed'] = combined[column].isnull().astype(int)

# Fill missing values with each column's median, computed over `combined`.
for column in numeric_columns:
    combined[column] = combined[column].fillna(combined[column].median())

# Print the remaining missing-value count for every column.
print(combined.isnull().sum())

CryoSleep                0
Destination              0
Age                      0
VIP                      0
RoomService              0
FoodCourt                0
ShoppingMall             0
Spa                      0
VRDeck                   0
Name                     0
Group                    0
NumberInGroup            0
Deck                     0
Num                      0
Side                     0
TotalSpending            0
checking                 0
Age_imputed              0
RoomService_imputed      0
FoodCourt_imputed        0
ShoppingMall_imputed     0
Spa_imputed              0
VRDeck_imputed           0
TotalSpending_imputed    0
dtype: int64


## Handling features based on their types

In [290]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Column groups routed to each transformer. Columns of `combined` that appear
# in neither list are not referenced by the ColumnTransformer below.
numerical_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalSpending']
categorical_features = ['CryoSleep', 'Deck', 'Num', 'Side', 'Destination', 'VIP', 'Name']

# Numerical features are standardized; categorical features are one-hot
# encoded, with handle_unknown='ignore' set for categories unseen during fit.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Apply each transformer to its own column group ('num' first, then 'cat').
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# TODO: consider whether interactions between features are worth including
# (e.g. Age x VIP, or TotalSpending x HomePlanet), either via
# PolynomialFeatures or as hand-built columns.
# NOTE(review): HomePlanet is dropped from `combined` in an earlier cell, so
# it would have to be kept for the second interaction to be possible.

## Split combined dataset back into train and test sets

In [291]:
# Select each partition of `combined` by its outer index key.
X_train, X_test = (combined.xs(partition) for partition in ('train', 'test'))

# Step 4: Train a Model
We will train on the entire training set without holding out a validation split, since the model will be validated by submitting its test-set predictions to Kaggle.

In [292]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Chain preprocessing and a random forest (fixed seed) into one estimator,
# then fit it on the training features and labels.
classifier = RandomForestClassifier(random_state=42)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])
pipeline.fit(X_train, y_train)

# Step 5: Evaluate the Model
The best way to evaluate the model is by submitting the predictions to the competition.

Let's study the importance of each feature used

In [293]:
# Pair every transformed feature name with the forest's importance score and
# print them from most to least important.
feature_names = preprocessor.get_feature_names_out()
importances = pipeline.named_steps['classifier'].feature_importances_
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)
print(feature_importance_df)

                           Feature    Importance
6               num__TotalSpending  7.008176e-02
4                         num__Spa  5.006032e-02
1                 num__RoomService  4.816888e-02
5                      num__VRDeck  4.563948e-02
8              cat__CryoSleep_True  4.307603e-02
...                            ...           ...
7616     cat__Name_Muonea Geakerat  1.276240e-09
9783   cat__Name_Unukath Proorbeng  1.050611e-09
4338     cat__Name_Dyonon Sloweber  5.378093e-10
9331     cat__Name_Taraid Aillyber  1.000234e-10
10044    cat__Name_Weid Dishocatal  0.000000e+00

[10321 rows x 2 columns]


# Step 6: Prepare Submission File

In [294]:
# Predict on the test features and write a two-column CSV: the PassengerId
# values from the original test frame alongside the predicted label.
test_predictions = pipeline.predict(X_test)
submission = test[['PassengerId']].assign(Transported=test_predictions)
submission.to_csv('submission.csv', index=False)