In [124]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [125]:
# Read train.csv (path is relative to the working directory) into `df`.
df = pd.read_csv(filepath_or_buffer="train.csv")

In [126]:
# Peek at the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [127]:
# Count missing values per column (isna is the canonical name of isnull).
df.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [128]:
# Cabin values look like "B/0/P"; break them on "/" into three new columns.
cabin_parts = df['Cabin'].str.split('/', expand=True)
df[['Deck', 'CabinNum', 'Side']] = cabin_parts

In [129]:
# Confirm that Deck, CabinNum and Side were appended by the Cabin split.
df.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,CabinNum,Side
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1,S


In [130]:
# Cabin has been split into Deck/CabinNum/Side, so the raw column is redundant.
df = df.drop(columns='Cabin')

In [131]:
# Step 1: Impute missing HomePlanet values with the most frequent planet.
# The result is assigned back rather than calling fillna(..., inplace=True)
# on the column selection: that chained in-place form raises a FutureWarning
# on pandas 2.x and leaves `df` unchanged once Copy-on-Write is active, which
# would silently turn every missing planet into an all-zero one-hot row.
df['HomePlanet'] = df['HomePlanet'].fillna(df['HomePlanet'].mode()[0])

# Step 2: One-hot encode into HomePlanet_<name> indicator columns.
homeplanet_encoded = pd.get_dummies(df['HomePlanet'], prefix='HomePlanet')

# Step 3: Append the indicator columns to the frame.
df = pd.concat([df, homeplanet_encoded], axis=1)

# Step 4: Drop the original categorical column now that it is encoded.
df = df.drop(columns=['HomePlanet'])

In [132]:
# Check the HomePlanet_* indicator columns that replaced HomePlanet.
df.head(n=5)

Unnamed: 0,PassengerId,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,CabinNum,Side,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars
0,0001_01,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P,False,True,False
1,0002_01,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S,True,False,False
2,0003_01,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0,S,False,True,False
3,0003_02,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0,S,False,True,False
4,0004_01,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1,S,True,False,False


In [133]:
# Name is treated as a high-cardinality / non-informative column: remove it.
df = df.drop(columns=['Name'])

# Collapse the five amenity bills into a single total per passenger.
# sum(axis=1) skips NaN, so a missing bill contributes 0 to the total.
spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df['TotalSpend'] = df[spend_columns].sum(axis=1)


In [134]:
# TotalSpend now summarises these, so remove the individual spending columns.
individual_spend = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df = df.drop(columns=individual_spend)

In [135]:
# Inspect the frame now that only TotalSpend remains of the spending columns.
df.head(n=5)

Unnamed: 0,PassengerId,CryoSleep,Destination,Age,VIP,Transported,Deck,CabinNum,Side,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,TotalSpend
0,0001_01,False,TRAPPIST-1e,39.0,False,False,B,0,P,False,True,False,0.0
1,0002_01,False,TRAPPIST-1e,24.0,False,True,F,0,S,True,False,False,736.0
2,0003_01,False,TRAPPIST-1e,58.0,True,False,A,0,S,False,True,False,10383.0
3,0003_02,False,TRAPPIST-1e,33.0,False,False,A,0,S,False,True,False,5176.0
4,0004_01,False,TRAPPIST-1e,16.0,False,True,F,1,S,True,False,False,1091.0


In [136]:
# Map booleans to integers across the whole frame: True -> 1, False -> 0.
bool_to_int = {True: 1, False: 0}
df.replace(to_replace=bool_to_int, inplace=True)

In [137]:
# Look at ten rows to verify the True/False values are now numeric.
df.head(n=10)

Unnamed: 0,PassengerId,CryoSleep,Destination,Age,VIP,Transported,Deck,CabinNum,Side,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,TotalSpend
0,0001_01,0.0,TRAPPIST-1e,39.0,0.0,0,B,0,P,0,1,0,0.0
1,0002_01,0.0,TRAPPIST-1e,24.0,0.0,1,F,0,S,1,0,0,736.0
2,0003_01,0.0,TRAPPIST-1e,58.0,1.0,0,A,0,S,0,1,0,10383.0
3,0003_02,0.0,TRAPPIST-1e,33.0,0.0,0,A,0,S,0,1,0,5176.0
4,0004_01,0.0,TRAPPIST-1e,16.0,0.0,1,F,1,S,1,0,0,1091.0
5,0005_01,0.0,PSO J318.5-22,44.0,0.0,1,F,0,P,1,0,0,774.0
6,0006_01,0.0,TRAPPIST-1e,26.0,0.0,1,F,2,S,1,0,0,1584.0
7,0006_02,1.0,TRAPPIST-1e,28.0,0.0,1,G,0,S,1,0,0,0.0
8,0007_01,0.0,TRAPPIST-1e,35.0,0.0,1,F,3,S,1,0,0,1018.0
9,0008_01,1.0,55 Cancri e,14.0,0.0,1,B,1,P,0,1,0,0.0


In [138]:
# Remove the VIP flag from the feature set.
df = df.drop(columns='VIP')

In [139]:
# Confirm that VIP is gone.
df.head(n=5)

Unnamed: 0,PassengerId,CryoSleep,Destination,Age,Transported,Deck,CabinNum,Side,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,TotalSpend
0,0001_01,0.0,TRAPPIST-1e,39.0,0,B,0,P,0,1,0,0.0
1,0002_01,0.0,TRAPPIST-1e,24.0,1,F,0,S,1,0,0,736.0
2,0003_01,0.0,TRAPPIST-1e,58.0,0,A,0,S,0,1,0,10383.0
3,0003_02,0.0,TRAPPIST-1e,33.0,0,A,0,S,0,1,0,5176.0
4,0004_01,0.0,TRAPPIST-1e,16.0,1,F,1,S,1,0,0,1091.0


In [140]:
# One-hot encode Deck but keep only the A, B and F indicators.
# Selecting by name raises KeyError if any of the three is absent.
kept_decks = ['Deck_A', 'Deck_B', 'Deck_F']
deck_encoded = pd.get_dummies(df['Deck'], prefix='Deck')[kept_decks]

# Swap the categorical Deck column for its indicator columns.
df = pd.concat([df, deck_encoded], axis=1).drop(columns=['Deck'])

# The new indicator columns may be boolean; normalise them to 1/0.
df.replace({True: 1, False: 0}, inplace=True)


In [141]:
# Check the Deck_A / Deck_B / Deck_F indicator columns.
df.head(n=5)

Unnamed: 0,PassengerId,CryoSleep,Destination,Age,Transported,CabinNum,Side,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,TotalSpend,Deck_A,Deck_B,Deck_F
0,0001_01,0.0,TRAPPIST-1e,39.0,0,0,P,0,1,0,0.0,0,1,0
1,0002_01,0.0,TRAPPIST-1e,24.0,1,0,S,1,0,0,736.0,0,0,1
2,0003_01,0.0,TRAPPIST-1e,58.0,0,0,S,0,1,0,10383.0,1,0,0
3,0003_02,0.0,TRAPPIST-1e,33.0,0,0,S,0,1,0,5176.0,1,0,0
4,0004_01,0.0,TRAPPIST-1e,16.0,1,1,S,1,0,0,1091.0,0,0,1


In [142]:
# Discard the Deck indicators together with the other cabin-derived columns.
spatial_columns = ['Deck_A', 'Deck_B', 'Deck_F', 'CabinNum', 'Side']
df = df.drop(columns=spatial_columns)

In [143]:
# Confirm that no cabin-derived columns remain.
df.head(n=5)

Unnamed: 0,PassengerId,CryoSleep,Destination,Age,Transported,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,TotalSpend
0,0001_01,0.0,TRAPPIST-1e,39.0,0,0,1,0,0.0
1,0002_01,0.0,TRAPPIST-1e,24.0,1,1,0,0,736.0
2,0003_01,0.0,TRAPPIST-1e,58.0,0,0,1,0,10383.0
3,0003_02,0.0,TRAPPIST-1e,33.0,0,0,1,0,5176.0
4,0004_01,0.0,TRAPPIST-1e,16.0,1,1,0,0,1091.0


In [144]:
# List the distinct Destination values (NaN is reported as its own entry).
destination_values = df['Destination'].unique()
print(destination_values)

['TRAPPIST-1e' 'PSO J318.5-22' '55 Cancri e' nan]


In [145]:
# One-hot encode Destination into Destination_<name> indicator columns.
# Missing destinations are not imputed first, so with get_dummies' default
# settings those rows end up with every indicator unset.
destination_encoded = pd.get_dummies(df['Destination'], prefix='Destination')

# Attach the indicators and remove the original categorical column.
df = pd.concat([df, destination_encoded], axis=1).drop(columns=['Destination'])

In [146]:
# Check the Destination_* indicator columns that replaced Destination.
df.head(n=5)

Unnamed: 0,PassengerId,CryoSleep,Age,Transported,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,TotalSpend,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,0001_01,0.0,39.0,0,0,1,0,0.0,False,False,True
1,0002_01,0.0,24.0,1,1,0,0,736.0,False,False,True
2,0003_01,0.0,58.0,0,0,1,0,10383.0,False,False,True
3,0003_02,0.0,33.0,0,0,1,0,5176.0,False,False,True
4,0004_01,0.0,16.0,1,1,0,0,1091.0,False,False,True


In [147]:
# Convert the boolean Destination_* indicators to integers: True -> 1, False -> 0.
bool_to_int = {True: 1, False: 0}
df.replace(to_replace=bool_to_int, inplace=True)

In [148]:
# Inspect the first fifteen rows of the fully encoded frame.
df.head(n=15)

Unnamed: 0,PassengerId,CryoSleep,Age,Transported,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,TotalSpend,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,0001_01,0.0,39.0,0,0,1,0,0.0,0,0,1
1,0002_01,0.0,24.0,1,1,0,0,736.0,0,0,1
2,0003_01,0.0,58.0,0,0,1,0,10383.0,0,0,1
3,0003_02,0.0,33.0,0,0,1,0,5176.0,0,0,1
4,0004_01,0.0,16.0,1,1,0,0,1091.0,0,0,1
5,0005_01,0.0,44.0,1,1,0,0,774.0,0,1,0
6,0006_01,0.0,26.0,1,1,0,0,1584.0,0,0,1
7,0006_02,1.0,28.0,1,1,0,0,0.0,0,0,1
8,0007_01,0.0,35.0,1,1,0,0,1018.0,0,0,1
9,0008_01,1.0,14.0,1,0,1,0,0.0,1,0,0


In [149]:
# Re-count missing values per column to see which ones still need imputing.
df.isna().sum()

PassengerId                    0
CryoSleep                    217
Age                          179
Transported                    0
HomePlanet_Earth               0
HomePlanet_Europa              0
HomePlanet_Mars                0
TotalSpend                     0
Destination_55 Cancri e        0
Destination_PSO J318.5-22      0
Destination_TRAPPIST-1e        0
dtype: int64

In [150]:
# Impute CryoSleep with its most frequent value.
# The filled column is assigned back rather than using
# df[col].fillna(..., inplace=True): that chained in-place call raises a
# FutureWarning on pandas 2.x and does not modify `df` under Copy-on-Write,
# which would leave the NaNs in place for the model.
cryosleep_mode = df['CryoSleep'].mode()[0]
df['CryoSleep'] = df['CryoSleep'].fillna(cryosleep_mode)

# Impute Age with the median age.
age_median = df['Age'].median()
df['Age'] = df['Age'].fillna(age_median)


In [151]:
# Verify that imputation left no missing values in any column.
df.isna().sum()

PassengerId                  0
CryoSleep                    0
Age                          0
Transported                  0
HomePlanet_Earth             0
HomePlanet_Europa            0
HomePlanet_Mars              0
TotalSpend                   0
Destination_55 Cancri e      0
Destination_PSO J318.5-22    0
Destination_TRAPPIST-1e      0
dtype: int64

In [152]:
# Print column dtypes and non-null counts to confirm every feature is numeric
# before modelling (PassengerId is still an object column at this point).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PassengerId                8693 non-null   object 
 1   CryoSleep                  8693 non-null   float64
 2   Age                        8693 non-null   float64
 3   Transported                8693 non-null   int64  
 4   HomePlanet_Earth           8693 non-null   int64  
 5   HomePlanet_Europa          8693 non-null   int64  
 6   HomePlanet_Mars            8693 non-null   int64  
 7   TotalSpend                 8693 non-null   float64
 8   Destination_55 Cancri e    8693 non-null   int64  
 9   Destination_PSO J318.5-22  8693 non-null   int64  
 10  Destination_TRAPPIST-1e    8693 non-null   int64  
dtypes: float64(3), int64(7), object(1)
memory usage: 747.2+ KB


In [153]:
# PassengerId is the only remaining non-numeric column; remove it before modelling.
df = df.drop(columns='PassengerId')

In [154]:
# Separate the label vector (y) from the feature matrix (X).
target_column = 'Transported'
y = df[target_column]
X = df.drop(columns=[target_column])

In [155]:
# Final look at the modelling frame (Transported is still present in df).
df.head(n=5)

Unnamed: 0,CryoSleep,Age,Transported,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,TotalSpend,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,0.0,39.0,0,0,1,0,0.0,0,0,1
1,0.0,24.0,1,1,0,0,736.0,0,0,1
2,0.0,58.0,0,0,1,0,10383.0,0,0,1
3,0.0,33.0,0,0,1,0,5176.0,0,0,1
4,0.0,16.0,1,1,0,0,1091.0,0,0,1


In [158]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks
# and batch shuffling are repeatable from one run to the next.
tf.keras.utils.set_random_seed(42)

# Separate features and target
X = df.drop(columns=["Transported"])
y = df["Transported"]

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# validation rows influence the mean/variance used to transform the training
# rows (data leakage) and make the validation score optimistic.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to the validation rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)

# Full feature matrix scaled with the train-only statistics; kept so the
# `X_scaled` name remains available after this cell.
X_scaled = scaler.transform(X)

# Define model: two hidden ReLU layers with batch-norm/dropout regularisation
# and a single sigmoid unit for the binary Transported label.
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    BatchNormalization(),
    Dropout(0.4),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

# Compile
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train, reporting loss/accuracy on the held-out validation split each epoch.
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_val, y_val))


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define base model
rf = RandomForestClassifier(random_state=42)

# Define the parameter grid (3 * 4 * 3 * 3 * 2 = 216 candidate settings)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize GridSearch: 5-fold CV over every candidate, using all CPU cores
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
    scoring='accuracy'  # or 'f1' if your classes are imbalanced
)

# Fit to training data
grid_search.fit(X_train, y_train)

# Best parameters and model
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

# Evaluate on the held-out split. The notebook's train_test_split names it
# X_val / y_val; the previous X_test / y_test were never defined and raised
# NameError.
y_pred = best_model.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))
print("\nClassification Report:\n", classification_report(y_val, y_pred))


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Random forest with a fixed, hand-entered set of hyperparameters.
rf_best = RandomForestClassifier(
    bootstrap=True,
    max_depth=10,
    min_samples_leaf=4,
    min_samples_split=2,
    n_estimators=200,
    random_state=42
)

# Train the model
rf_best.fit(X_train, y_train)

# Evaluate on the held-out split. The notebook's train_test_split names it
# X_val / y_val; the previous X_test / y_test were never defined and raised
# NameError.
y_pred = rf_best.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))
print("\nClassification Report:\n", classification_report(y_val, y_pred))
