In [218]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline  

## our Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

## Model Evaulations
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV , GridSearchCV
from sklearn.metrics import confusion_matrix , classification_report
from sklearn.metrics import precision_score , recall_score , f1_score, accuracy_score
from sklearn.metrics import RocCurveDisplay

In [219]:
train_Data=pd.read_csv("Data Files/train.csv")
len(train_Data)

8693

In [220]:
test_Data=pd.read_csv("Data Files/test.csv")
len(test_Data)

4277

In [221]:
train_Data


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [222]:
train_Data.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [223]:
train_Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


## Data Preprocessing 

## First Cleaning The Useless Data 

In [224]:
TrainData=train_Data.drop("Name",axis=1)
TestData=test_Data.drop("Name",axis=1)

In [233]:
# print all numerical columns and categorical columns
def print_numerical_and_categorical_columns(DF):
    # Numerical columns
    numerical_cols = DF.select_dtypes(include=['number']).columns.tolist()
    print("Numerical Columns:")
    print(numerical_cols)

    # Categorical columns
    categorical_cols = DF.select_dtypes(include=['object', 'category']).columns.tolist()
    print("\nCategorical Columns:")
    print(categorical_cols)
    
print_numerical_and_categorical_columns(TrainData)

Numerical Columns:
['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

Categorical Columns:
['PassengerId', 'HomePlanet', 'Cabin', 'Destination']


In [235]:
categorical_columns = TrainData.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_columns

['PassengerId', 'HomePlanet', 'Cabin', 'Destination']

In [236]:
numerical_columns = TrainData.columns[TrainData.dtypes == 'float64']
numerical_columns

Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='object')

## Second Handling The Null Data

## Categorical Columns

In [237]:
#Most common values in each column
modes = TrainData[categorical_columns].mode().iloc[0]

# Fill in missing values with the most common ones
TrainData[categorical_columns] = TrainData[categorical_columns].fillna(modes)
TestData[categorical_columns] = TestData[categorical_columns].fillna(modes)

## Numerical Columns

In [238]:
# Fill in missing values with the average value of each column
TrainData[numerical_columns] = TrainData[numerical_columns].fillna(TrainData[numerical_columns].mean())
TestData[numerical_columns] = TestData[numerical_columns].fillna(TestData[numerical_columns].mean())

In [239]:
Train_Data.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [240]:
Test_Data.isna().sum()

PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64

In [135]:
from sklearn.preprocessing import StandardScaler

# replace all true and false with 0 and 1

# Train_Data.loc[:, 'VIP'] = Train_Data['VIP'].replace(to_replace=[False, True], value=[0, 1])
# Train_Data.loc[:, 'CryoSleep'] = Train_Data['CryoSleep'].replace(to_replace=[False, True], value=[0, 1])
# Test_Data.loc[:, 'Transported'] = Train_Data['Transported'].replace(to_replace=[False, True], value=[0, 1])

# change categorical columns into numerical between -1 , 1 (scaler)

scaler = StandardScaler()
# Train_Data.loc[: ,'PassengerId'] = scaler.fit_transform(df[['PassengerId']])
# Train_Data.loc[: ,'Age'] = scaler.fit_transform(Train_Data[['Age']])
# Train_Data.loc[:,'RoomService'] = scaler.fit_transform(Train_Data[['RoomService']])
# Train_Data.loc[:,'FoodCourt'] = scaler.fit_transform(Train_Data[['FoodCourt']])
# Train_Data.loc[:,'ShoppingMall'] = scaler.fit_transform(Train_Data[['ShoppingMall']])
# Train_Data.loc[:,'Spa'] = scaler.fit_transform(Train_Data[['Spa']])
# Train_Data.loc[:,'VRDeck'] = scaler.fit_transform(Train_Data[['VRDeck']])
# Train_Data




In [94]:
from sklearn.preprocessing import LabelEncoder
columns_to_encode = ['HomePlanet', 'Cabin', 'Destination', 'Name']

le = LabelEncoder()
for column in columns_to_encode:
    Train_Data.loc[:,column] = le.fit_transform(Train_Data[column])

In [106]:
Train_Data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,1,0,149,2,0.709437,0,-0.340590,-0.287314,-0.290817,-0.276663,-0.269023,5252,False
1,0002_01,0,0,2184,2,-0.336717,0,-0.175364,-0.281669,-0.248968,0.211505,-0.230194,4502,True
2,0003_01,1,0,1,2,2.034566,1,-0.275409,1.955616,-0.290817,5.694289,-0.225782,457,False
3,0003_02,1,0,1,2,0.290975,0,-0.340590,0.517406,0.330225,2.683471,-0.098708,7149,False
4,0004_01,0,0,2186,2,-0.894666,0,0.118709,-0.243409,-0.038048,0.225732,-0.267258,8319,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,1,0,146,0,0.848924,1,-0.340590,3.989682,-0.290817,1.184286,-0.203720,3524,False
8689,9278_01,0,1,5280,1,-0.755179,0,-0.340590,-0.287314,-0.290817,-0.276663,-0.269023,4780,False
8690,9279_01,0,0,5285,2,-0.197230,0,-0.340590,-0.287314,2.842851,-0.275774,-0.269023,3002,True
8691,9280_01,1,0,2131,0,0.221232,0,-0.340590,0.370637,-0.290817,0.037223,2.585740,1596,False


In [101]:
len(Train_Data)

8693

In [102]:
len(Train_Data.columns)

14

In [107]:
X=Train_Data.drop("Transported",axis=1)
y=Train_Data["Transported"]

In [108]:
y

0       False
1        True
2       False
3       False
4        True
        ...  
8688    False
8689    False
8690     True
8691    False
8692     True
Name: Transported, Length: 8693, dtype: bool

In [109]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [110]:
len(X_train)

6954

In [112]:
def show_scores(model):
    train_preds = model.predict(X_train)
    val_preds = model.predict(X_val)
    scores = {"Training F1-score": f1_score(y_train, train_preds),
              "Valid F1-score": f1_score(y_val, val_preds),
              "Training Precision": precision_score(y_train, train_preds),
              "Valid Precision": precision_score(y_val, val_preds),
              "Training Recall": recall_score(y_train, train_preds),
              "Valid Recall": recall_score(y_val, val_preds)}
    return scores

In [113]:
model=RandomForestClassifier()
model.fit(X_train, y_train)

In [114]:
show_scores(model)

{'Training F1-score': 1.0,
 'Valid F1-score': 0.7888762769580023,
 'Training Precision': 1.0,
 'Valid Precision': 0.7861990950226244,
 'Training Recall': 1.0,
 'Valid Recall': 0.7915717539863326}

In [153]:
rf_grid = {
    "n_estimators": np.arange(50, 200, 50),
    "max_depth": [None, 5, 10, 15],
    "min_samples_split": np.arange(2, 10, 2),
    "min_samples_leaf": np.arange(1, 10, 2),
    "max_features": [None, 'sqrt', 'log2']  # Replace 'auto' with None
}

np.random.seed(42)

rs_rf = RandomizedSearchCV(
    RandomForestClassifier(),
    param_distributions=rf_grid,
    cv=5,
    n_iter=20
)

rs_rf.fit(X_train, y_train);

In [154]:
rs_rf.best_params_

{'n_estimators': 150,
 'min_samples_split': 4,
 'min_samples_leaf': 7,
 'max_features': 'sqrt',
 'max_depth': None}

In [155]:
ideal_model = RandomForestClassifier(n_estimators=150,
                                    min_samples_split=4,
                                    min_samples_leaf=7,
                                    max_features = "sqrt",
                                    max_depth=None)

ideal_model.fit(X_train, y_train)

In [156]:
show_scores(ideal_model)

{'Training F1-score': 0.8739622641509434,
 'Valid F1-score': 0.8094885100074131,
 'Training Precision': 0.8766086298258895,
 'Valid Precision': 0.8029411764705883,
 'Training Recall': 0.871331828442438,
 'Valid Recall': 0.8161434977578476}

In [157]:
df_test=pd.read_csv("Data Files/test.csv")
df_test.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [159]:
# Replace all True and False with 0 and 1
df_test['VIP'] = df_test['VIP'].replace({False: 0, True: 1})
df_test['CryoSleep'] = df_test['CryoSleep'].replace({False: 0, True: 1})

# Change categorical columns into numerical values between -1 and 1 using StandardScaler
scaler = StandardScaler()

# Scale the specified columns
columns_to_scale = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df_test[columns_to_scale] = scaler.fit_transform(df_test[columns_to_scale])


In [160]:
 from sklearn.preprocessing import StandardScaler

if df_test.isnull().sum().sum() :
    df_test = df_test.dropna()

# replace all true and false with 0 and 1

df_test.loc[:, 'VIP'] = df_test['VIP'].replace(to_replace=[False, True], value=[0, 1])
df_test.loc[:, 'CryoSleep'] = df_test['CryoSleep'].replace(to_replace=[False, True], value=[0, 1])

In [161]:
# change categorical columns into numerical between -1 , 1 (scaler)

# scaler = StandardScaler()

# # scaler = StandardScaler()

# # Replace the 'df' references with 'df_test'
# df_test.loc[:, 'PassengerId'] = scaler.transform(df_test[['PassengerId']])
# df_test.loc[:, 'Age'] = scaler.transform(df_test[['Age']])
# df_test.loc[:, 'RoomService'] = scaler.transform(df_test[['RoomService']])
# df_test.loc[:, 'FoodCourt'] = scaler.transform(df_test[['FoodCourt']])
# df_test.loc[:, 'ShoppingMall'] = scaler.transform(df_test[['ShoppingMall']])
# df_test.loc[:, 'Spa'] = scaler.transform(df_test[['Spa']])
# df_test.loc[:, 'VRDeck'] = scaler.transform(df_test[['VRDeck']])
# df_test


scaler = StandardScaler()
scaler.fit(df[['PassengerId', 'Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']])

# Replace the 'df' references with 'df_test'
df_test[['PassengerId', 'Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']] = scaler.transform(df_test[['PassengerId', 'Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']])
from sklearn.preprocessing import LabelEncoder
columns_to_encode = ['HomePlanet', 'Cabin', 'Destination', 'Name']

le = LabelEncoder()
for column in columns_to_encode:
    df_test.loc[:,column] = le.fit_transform(df_test[column])


In [162]:
print(set(X_train.columns)-set(df_test.columns))

set()


In [163]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)
y_preds=ideal_model.predict(X_test)
accuracy_score(y_test,y_preds)

0.834510595358224

In [164]:
df_preds = pd.DataFrame()
df_preds["PassengerId"] = X_test["PassengerId"]
df_preds["Transported"] = y_preds

In [165]:
df_preds['Transported'] = df_preds['Transported'].replace(to_replace=[0 , 1], value=['False','True'])

In [168]:
df_preds

Unnamed: 0,PassengerId,Transported
8441,9014_01,True
8058,8615_02,True
320,0358_01,False
2548,2732_02,True
8027,8594_01,False
...,...,...
8078,8637_02,False
3794,4047_01,True
154,0177_01,False
8263,8825_01,True


In [170]:
df_preds.to_csv("Final_submission.csv", index = False)