# Step 1: Load the Data

In [57]:
import pandas as pd

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Step 2: Explore the Data

In [58]:
print(train.shape, test.shape)
print('-'*50)
print(train.head())
print('-'*50)
print(train.describe())
print('-'*50)
print(train.dtypes)
print('-'*50)
print('Checking for missing values:')
print(train.isnull().sum())
print('-'*50)
print('Checking for duplicates:')
print(train.duplicated().sum())

(8693, 14) (4277, 13)
--------------------------------------------------
  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0001_01     Europa     False  B/0/P  TRAPPIST-1e  39.0  False   
1     0002_01      Earth     False  F/0/S  TRAPPIST-1e  24.0  False   
2     0003_01     Europa     False  A/0/S  TRAPPIST-1e  58.0   True   
3     0003_02     Europa     False  A/0/S  TRAPPIST-1e  33.0  False   
4     0004_01      Earth     False  F/1/S  TRAPPIST-1e  16.0  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck               Name  \
0          0.0        0.0           0.0     0.0     0.0    Maham Ofracculy   
1        109.0        9.0          25.0   549.0    44.0       Juanna Vines   
2         43.0     3576.0           0.0  6715.0    49.0      Altark Susent   
3          0.0     1283.0         371.0  3329.0   193.0       Solam Susent   
4        303.0       70.0         151.0   565.0     2.0  Willy Santantines   

   Transported  
0        False

# Step 3: Preprocess the Data

In [59]:
# Separate training features and target
X_train = train.drop('Transported', axis=1)
y_train = train['Transported']

# Combine train and test datasets to ensure all possible features are present when training model.
combined = pd.concat([X_train, test], keys=['train', 'test'])

# Split PassengerId into Group and NumberInGroup
combined[['Group', 'NumberInGroup']] = combined['PassengerId'].str.split('_', expand=True)

# Drop the original PassengerId column as it's no longer needed
combined.drop(columns=['PassengerId'], inplace=True)

# Split Cabin into Deck, Num, and Side
combined[['Deck', 'Num', 'Side']] = combined['Cabin'].str.split('/', expand=True)

# Drop the original Cabin column as it's no longer needed
combined.drop(columns=['Cabin'], inplace=True)

# Drop unlikely useless columns. I also tried removing the Name column, but it reduced the accuracy, I believe that same surnames have high chances of the sharing the same target result since they are likely family and had high chances of being together at the moment of the accident.
combined.drop(columns=['HomePlanet', 'Destination', 'Age', 'VIP'], inplace=True)

# Apply pd.get_dummies to the combined dataset
combined = pd.get_dummies(combined)

# Split combined dataset back into train and test sets
X_train = combined.xs('train')
X_test = combined.xs('test')

# Fill missing values with 0 (or another strategy)
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

# Step 4: Train a Model

In [60]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Split the data into training and validation sets
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train a logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_split, y_train_split)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Step 5: Evaluate the Model

In [61]:
# Predict on the training set
y_train_pred = model.predict(X_train_split)

# Predict on the validation set
y_val_pred = model.predict(X_val_split)

# Evaluate the model
train_accuracy = accuracy_score(y_train_split, y_train_pred)
val_accuracy = accuracy_score(y_val_split, y_val_pred)

print(f'Training Accuracy: {train_accuracy:.4f}')
print(f'Validation Accuracy: {val_accuracy:.4f}')

Training Accuracy: 0.8394
Validation Accuracy: 0.7867


Best validation accuracy so far: 0.7867

# Step 6: Prepare Submission File

In [62]:
test_predictions = model.predict(X_test)
submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Transported': test_predictions})
submission.to_csv('submission.csv', index=False)