# Import the libraries

In [53]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load Dataset

In [54]:
df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Data Exploration

In [55]:
df.tail(2)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False
8692,9280_02,Europa,False,E/608/S,TRAPPIST-1e,44.0,False,126.0,4688.0,0.0,0.0,12.0,Propsh Hontichre,True


In [56]:
df.head(2)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True


In [57]:
print(f"Number of Rows:{df.shape[0]},Number of Columns:{df.shape[1]}")
print(df.shape)

Number of Rows:8693,Number of Columns:14
(8693, 14)


In [58]:
print("-- Attributes in Data --")
for cols in df.columns:
    print(cols)

-- Attributes in Data --
PassengerId
HomePlanet
CryoSleep
Cabin
Destination
Age
VIP
RoomService
FoodCourt
ShoppingMall
Spa
VRDeck
Name
Transported


In [59]:
print("-- Number of instances in Data --")
print(df.count())

-- Number of instances in Data --
PassengerId     8693
HomePlanet      8492
CryoSleep       8476
Cabin           8494
Destination     8511
Age             8514
VIP             8490
RoomService     8512
FoodCourt       8510
ShoppingMall    8485
Spa             8510
VRDeck          8505
Name            8493
Transported     8693
dtype: int64


In [60]:
print("-- Number of Unique Values in Data --")
print(df.nunique())

-- Number of Unique Values in Data --
PassengerId     8693
HomePlanet         3
CryoSleep          2
Cabin           6560
Destination        3
Age               80
VIP                2
RoomService     1273
FoodCourt       1507
ShoppingMall    1115
Spa             1327
VRDeck          1306
Name            8473
Transported        2
dtype: int64


In [61]:
print("-- Number of Null Values in Data --")
print(df.isnull().sum())

-- Number of Null Values in Data --
PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64


In [62]:
print("-- Details of Data --")
df.describe()

-- Details of Data --


Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [63]:
print("-- Insights of Data --")
df.info()

-- Insights of Data --
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


# Data Pre-Processing

In [64]:
print("-- Number of Null Values in Data --")
print(df.isnull().sum())

-- Number of Null Values in Data --
PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64


In [65]:
def Handle_Numerical_Missing_values(df, strategy="mean"):
    numerical_columns = df.select_dtypes(include=["float64", "int64"]).columns
    for col in numerical_columns:
        if df[col].isnull().sum() > 0:
            if strategy == "mean":
                fill_value = df[col].mean() 
            elif strategy == "median":
                fill_value = df[col].median()  
            else:
                raise ValueError("Invalid strategy. Choose from 'mean', 'median'.")
            df[col].fillna(fill_value, inplace=True)
    
    print(f"Numerical missing values handled with {strategy}!")

In [66]:
Handle_Numerical_Missing_values(df, strategy="mean")


Numerical missing values handled with mean!


In [67]:
print("-- check Number of Null Values in Data --")
print(df.isnull().sum().sum())

-- check Number of Null Values in Data --
1202


In [68]:
df.head(3)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False


# Split data

In [69]:
train_df = df.drop(columns=['Name'])
test_df = test_df.drop(columns=['Name'])

In [70]:
X_train = train_df.drop(columns=['Transported', 'PassengerId'])
y_train = train_df['Transported']
X_test = test_df.drop(columns=['PassengerId'])


In [71]:
X_train = pd.get_dummies(X_train, columns=['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Cabin'], drop_first=True)
X_test = pd.get_dummies(X_test, columns=['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Cabin'], drop_first=True)

In [72]:
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

In [73]:
X_train.fillna(-1, inplace=True)
X_test.fillna(-1, inplace=True)


In [74]:
y_train = y_train.astype(int)


In [75]:
print("Data Types of X_train:")
print(X_train.dtypes)

Data Types of X_train:
Age             float64
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
                 ...   
Cabin_T/0/P        bool
Cabin_T/1/P        bool
Cabin_T/2/P        bool
Cabin_T/2/S        bool
Cabin_T/3/P        bool
Length: 6571, dtype: object


In [76]:

print("\nMissing Values in X_train and y_train:")
print(X_train.isnull().sum())
print(y_train.isnull().sum())


Missing Values in X_train and y_train:
Age             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
               ..
Cabin_T/0/P     0
Cabin_T/1/P     0
Cabin_T/2/P     0
Cabin_T/2/S     0
Cabin_T/3/P     0
Length: 6571, dtype: int64
0


In [77]:
print(y_train.unique())

[0 1]


# RandomForestClassifier

In [78]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)


In [79]:
y_pred = model.predict(X_train)

In [80]:
print("Model Evaluation on Training Set")
print(classification_report(y_train, y_pred))
print(f"Accuracy: {accuracy_score(y_train, y_pred) * 100:.0f}%")

Model Evaluation on Training Set
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4315
           1       1.00      1.00      1.00      4378

    accuracy                           1.00      8693
   macro avg       1.00      1.00      1.00      8693
weighted avg       1.00      1.00      1.00      8693

Accuracy: 100%


# testing the model

In [81]:
test_predictions = model.predict(X_test)

# create submission file

In [82]:
submission = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Transported': test_predictions})
submission['Transported'] = submission['Transported'].astype(bool)

In [83]:

submission.to_csv('submission.csv', index=False)

print("Submission file created: submission.csv")

Submission file created: submission.csv
