# Step 1: Load the Data

In [7]:
import pandas as pd

# Read the labelled training split and the unlabelled test split from the
# local data/ directory.
train, test = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

# Step 2: Explore the Data

In [8]:
# Visual separator printed between the individual summaries.
rule = '-'*50

# Row/column counts of both splits first, then a few quick views of the training frame.
print(train.shape, test.shape)
for summary in (train.head(), train.describe(), train.dtypes):
    print(rule)
    print(summary)

# Per-column count of missing values.
print(rule)
print('Checking for missing values:')
print(train.isnull().sum())

# Number of fully duplicated rows.
print(rule)
print('Checking for duplicates:')
print(train.duplicated().sum())

(8693, 14) (4277, 13)
--------------------------------------------------
  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0001_01     Europa     False  B/0/P  TRAPPIST-1e  39.0  False   
1     0002_01      Earth     False  F/0/S  TRAPPIST-1e  24.0  False   
2     0003_01     Europa     False  A/0/S  TRAPPIST-1e  58.0   True   
3     0003_02     Europa     False  A/0/S  TRAPPIST-1e  33.0  False   
4     0004_01      Earth     False  F/1/S  TRAPPIST-1e  16.0  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck               Name  \
0          0.0        0.0           0.0     0.0     0.0    Maham Ofracculy   
1        109.0        9.0          25.0   549.0    44.0       Juanna Vines   
2         43.0     3576.0           0.0  6715.0    49.0      Altark Susent   
3          0.0     1283.0         371.0  3329.0   193.0       Solam Susent   
4        303.0       70.0         151.0   565.0     2.0  Willy Santantines   

   Transported  
0        False

# Step 3: Preprocess the Data

## Separate training features from target

In [9]:
# The label column becomes the target; every other column stays in the feature frame.
y_train = train.loc[:, 'Transported']
X_train = train.drop(columns='Transported')

## Feature Engineering
Combine the train and test datasets so that feature engineering and one-hot encoding produce the same set of feature columns for both.

In [10]:
# Stack the training features on top of the test rows. The keys add an outer
# index level ('train' / 'test') so the two parts can be separated again later.
frames = [X_train, test]
frame_keys = ['train', 'test']
combined = pd.concat(frames, keys=frame_keys)

### Split columns with multiple information
Then drop the original columns as they are no longer needed.

In [11]:
# PassengerId is split on '_' into a group id and a position within the group;
# Cabin is split on '/' into deck, cabin number and side.
id_parts = combined['PassengerId'].str.split('_', expand=True)
cabin_parts = combined['Cabin'].str.split('/', expand=True)

combined[['Group', 'NumberInGroup']] = id_parts
combined[['Deck', 'Num', 'Side']] = cabin_parts

# NOTE(review): the split pieces are still strings, so the later get_dummies
# call one-hot encodes every distinct value (e.g. one column per 'Num').
# Confirm that this is intended rather than a numeric conversion.

# The compound source columns are redundant once their parts exist.
combined = combined.drop(columns=['PassengerId', 'Cabin'])

### Drop likely useless columns
I also tried removing the Name column, but that reduced the accuracy. I believe passengers with the same surname have a high chance of sharing the same target result, since they are likely family and were probably together at the moment of the accident.

In [12]:
# Columns judged unhelpful for the model; Name is deliberately kept.
unused_columns = ['HomePlanet', 'Destination', 'Age', 'VIP']
combined = combined.drop(columns=unused_columns)

### Handle Categorical Columns

In [13]:
# One-hot encode the remaining non-numeric columns (CryoSleep, Name, and the
# pieces split out of PassengerId and Cabin); numeric columns pass through unchanged.
combined = pd.get_dummies(data=combined)

# Preview the first five encoded rows.
print(combined.head(n=5))

         RoomService  FoodCourt  ShoppingMall     Spa  VRDeck  \
train 0          0.0        0.0           0.0     0.0     0.0   
      1        109.0        9.0          25.0   549.0    44.0   
      2         43.0     3576.0           0.0  6715.0    49.0   
      3          0.0     1283.0         371.0  3329.0   193.0   
      4        303.0       70.0         151.0   565.0     2.0   

         CryoSleep_False  CryoSleep_True  Name_Aard Backo  Name_Aard Curle  \
train 0             True           False            False            False   
      1             True           False            False            False   
      2             True           False            False            False   
      3             True           False            False            False   
      4             True           False            False            False   

         Name_Aard Kraie  ...  Num_992  Num_993  Num_994  Num_995  Num_996  \
train 0            False  ...    False    False    False    Fa

### Split combined dataset back into train and test sets

In [14]:
# Select each part by its outer index key; the key level is dropped, leaving
# the original row index of each split.
X_train = combined.loc['train']
X_test = combined.loc['test']

### Fix missing values

In [15]:
# Replace every remaining missing value with 0 in both feature frames.
X_train, X_test = (frame.fillna(0) for frame in (X_train, X_test))

# Step 4: Train a Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Hold out 20% of the labelled rows for validation; the fixed random_state
# keeps the split reproducible between runs.
split_parts = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
X_train_split, X_val_split, y_train_split, y_val_split = split_parts

# Fit a logistic regression classifier, allowing up to 1000 solver iterations.
model = LogisticRegression(max_iter=1000).fit(X_train_split, y_train_split)


# Step 5: Evaluate the Model

In [None]:
# Score the fitted model on the rows it was trained on and on the held-out
# validation rows; a large gap between the two suggests overfitting.
y_train_pred, y_val_pred = (
    model.predict(features) for features in (X_train_split, X_val_split)
)

# Accuracy = fraction of predictions that match the true labels.
val_accuracy = accuracy_score(y_val_split, y_val_pred)
train_accuracy = accuracy_score(y_train_split, y_train_pred)

print(f'Training Accuracy: {train_accuracy:.4f}')
print(f'Validation Accuracy: {val_accuracy:.4f}')

Best validation accuracy so far: 0.7867

# Step 6: Prepare Submission File

In [None]:
# Predict the target for every test row.
test_predictions = model.predict(X_test)

# Pair each original PassengerId with its prediction and write the result
# without the index column.
submission = test[['PassengerId']].assign(Transported=test_predictions)
submission.to_csv('submission.csv', index=False)