In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's train and test CSV files, then display the training frame.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/spaceship-titanic/train.csv',
        '/kaggle/input/spaceship-titanic/test.csv',
    )
)
train

In [None]:
# Print a summary of the training frame: each column's dtype and non-null count.
train.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) for the training
# frame; with default arguments pandas reports the numeric columns.
train.describe()

In [None]:
# Percentage of missing values in each column of the training frame.
train.isna().sum().mul(100).div(len(train))

In [None]:
# Age distribution split by outcome, plus the overall distribution.
# `Transported == 0` selects the passengers who were NOT transported, so the
# first panel carries that title (the two outcome titles used to be swapped).
fig, axes = plt.subplots(1, 3, figsize=(10,4))
age_panels = [
    (train[train['Transported'] == 0]['Age'], 'b', 'Not Transported'),
    (train[train['Transported'] == 1]['Age'], 'r', 'Transported'),
    (train['Age'], 'black', 'Total'),
]
for ax, (ages, color, title) in zip(axes, age_panels):
    sns.histplot(x=ages, ax=ax, color=color, bins=20)
    ax.set_xlabel('Age')
    ax.set_ylabel('Number')
    ax.set_title(title)

In [None]:
# Work on copies so the frames loaded from CSV stay untouched.
train_copy = train.copy()
test_copy = test.copy()

# Derive two features from the cabin string: its last character ('Embarked')
# and its first character ('Deck'). The vectorised .str accessor keeps missing
# cabins as NaN and replaces the per-row chained assignment
# (frame[col][i] = ...), which triggers pandas' chained-assignment warnings
# and does not write through to the frame under Copy-on-Write.
for frame in (train_copy, test_copy):
    frame['Embarked'] = frame['Cabin'].str[-1]
    frame['Deck'] = frame['Cabin'].str[0]

# Restrict 'Embarked' to the two expected values; anything else becomes NaN.
bins=['P', 'S']
train_copy['Embarked'] = pd.Categorical(train_copy['Embarked'], categories=bins, ordered=True)
test_copy['Embarked'] = pd.Categorical(test_copy['Embarked'], categories=bins, ordered=True)

# 'Embarked' distribution split by outcome, plus the overall distribution.
# `Transported == 0` selects the passengers who were NOT transported, so the
# first panel carries that title (the two outcome titles used to be swapped).
fig, axes = plt.subplots(1, 3, figsize=(10,4))
embarked_panels = [
    (train_copy[train_copy['Transported'] == 0]['Embarked'], 'b', 'Not Transported'),
    (train_copy[train_copy['Transported'] == 1]['Embarked'], 'r', 'Transported'),
    (train_copy['Embarked'], 'black', 'Total'),
]
for ax, (values, color, title) in zip(axes, embarked_panels):
    sns.histplot(x=values, ax=ax, color=color, bins=len(bins))
    ax.set_xlabel('Embarked')
    ax.set_ylabel('Number')
    ax.set_title(title)

In [None]:
# Restrict 'Deck' to the known deck letters; anything else becomes NaN.
fig, axes = plt.subplots(1, 3, figsize=(10,4))
bins=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T']
train_copy['Deck'] = pd.Categorical(train_copy['Deck'], categories=bins, ordered=True)
test_copy['Deck'] = pd.Categorical(test_copy['Deck'], categories=bins, ordered=True)

# 'Deck' distribution split by outcome, plus the overall distribution.
# `Transported == 0` selects the passengers who were NOT transported, so the
# first panel carries that title (the two outcome titles used to be swapped).
deck_panels = [
    (train_copy[train_copy['Transported'] == 0]['Deck'], 'b', 'Not Transported'),
    (train_copy[train_copy['Transported'] == 1]['Deck'], 'r', 'Transported'),
    (train_copy['Deck'], 'black', 'Total'),
]
for ax, (values, color, title) in zip(axes, deck_panels):
    sns.histplot(x=values, ax=ax, color=color, bins=len(bins))
    ax.set_xlabel('Deck')
    ax.set_ylabel('Number')
    ax.set_title(title)

In [None]:
# Total spend per passenger: the five amenity columns added left to right.
# Plain `+` is used, so a missing value in any one of them leaves the total
# missing as well.
spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for frame in (train_copy, test_copy):
    total = frame[spend_columns[0]]
    for column in spend_columns[1:]:
        total = total + frame[column]
    frame['Money Spent'] = total

In [None]:
# Build the modelling frames by removing the columns that are not used as
# features. drop() returns a new frame, so train_copy / test_copy are unchanged.
dropped_columns = ['Cabin', 'PassengerId', 'Name', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

train_copy2 = train_copy.drop(columns=dropped_columns)
test_copy2 = test_copy.drop(columns=dropped_columns)

In [None]:
# Numeric code for each deck letter. 'C' maps to 3: the table previously gave
# both 'B' and 'C' the value 2, which merged two decks and left 3 unused.
deck_mapping = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'T': 8}

# 'Deck' was converted to a pandas Categorical in an earlier cell. Mapping a
# categorical through a one-to-one table yields another categorical, and a
# later fillna(0) would then fail because 0 is not one of its categories.
# Casting to object first makes map() return plain numbers, with NaN for rows
# that have no deck.
train_copy2['Deck'] = train_copy2['Deck'].astype(object).map(deck_mapping)
test_copy2['Deck'] = test_copy2['Deck'].astype(object).map(deck_mapping)

In [None]:
# Rows without a deck code receive the sentinel value 0.
for frame in (train_copy2, test_copy2):
    frame['Deck'] = frame['Deck'].fillna(0)

In [None]:
# Numeric code (1, 2, 3) for each destination name; names absent from the
# table, including missing values, come out of map() as NaN.
destination = {
    name: code
    for code, name in enumerate(['TRAPPIST-1e', 'PSO J318.5-22', '55 Cancri e'], start=1)
}

for frame in (train_copy2, test_copy2):
    frame['Destination'] = frame['Destination'].map(destination)

In [None]:
# Rows without a destination code receive the sentinel value 0.
for frame in (train_copy2, test_copy2):
    frame['Destination'] = frame['Destination'].fillna(0)

In [None]:
# Numeric code for each home planet; rows whose planet is missing or not in
# the table end up as 0.
planet = {'Europa': 1, 'Earth': 2, 'Mars': 3}

for frame in (train_copy2, test_copy2):
    planet_codes = frame['HomePlanet'].map(planet)
    frame['HomePlanet'] = planet_codes.fillna(0)

In [None]:
# Encode 'Embarked' with a fixed table: 'P' -> 1, 'S' -> 2, missing -> 0.
# A fixed table gives the train and test frames identical codes no matter
# which values each one happens to contain. Refitting a LabelEncoder on each
# frame separately tied the codes (and the "3 means missing" replacement) to
# the set of classes present in that particular frame.
embarked_mapping = {'P': 1, 'S': 2}

# 'Embarked' is a pandas Categorical at this point; cast to object so map()
# returns plain numbers (NaN for missing) that fillna(0) can complete.
for frame in (train_copy2, test_copy2):
    embarked_codes = frame['Embarked'].astype(object).map(embarked_mapping)
    frame['Embarked'] = embarked_codes.fillna(0).astype(int)

In [None]:
# Fill missing ages with the mean age of the same frame.
# NOTE(review): the test frame is imputed with its own mean rather than the
# training mean — confirm this is intended.
mean_train, mean_test = (frame['Age'].mean() for frame in (train_copy2, test_copy2))

train_copy2['Age'] = train_copy2['Age'].fillna(mean_train)
test_copy2['Age'] = test_copy2['Age'].fillna(mean_test)

In [None]:
# Fill missing 'Money Spent' totals with the median of the same frame.
# NOTE(review): the test frame is imputed with its own median rather than the
# training median — confirm this is intended.
median_money_train, median_money_test = (
    frame['Money Spent'].median() for frame in (train_copy2, test_copy2)
)

train_copy2['Money Spent'] = train_copy2['Money Spent'].fillna(median_money_train)
test_copy2['Money Spent'] = test_copy2['Money Spent'].fillna(median_money_test)

In [None]:
# Fill missing CryoSleep / VIP flags with the most frequent value of the same
# frame. The filled column is assigned back explicitly: calling
# fillna(inplace=True) on `frame[col]` modifies an intermediate object, which
# pandas warns about and which leaves the frame unchanged under Copy-on-Write
# (the later astype(int) would then fail on the remaining NaN values).
majority_value_train = train_copy2['CryoSleep'].mode()[0]
train_copy2['CryoSleep'] = train_copy2['CryoSleep'].fillna(majority_value_train)
majority_value2_train = train_copy2['VIP'].mode()[0]
train_copy2['VIP'] = train_copy2['VIP'].fillna(majority_value2_train)

majority_value_test = test_copy2['CryoSleep'].mode()[0]
test_copy2['CryoSleep'] = test_copy2['CryoSleep'].fillna(majority_value_test)
majority_value2_test = test_copy2['VIP'].mode()[0]
test_copy2['VIP'] = test_copy2['VIP'].fillna(majority_value2_test)

In [None]:
# Cast every feature column listed below to int in both frames, then do the
# same for the training target.
data = [train_copy2, test_copy2]
int_columns = ['CryoSleep', 'VIP', 'Embarked', 'HomePlanet', 'Destination', 'Age', 'Deck']

for frame in data:
    for column in int_columns:
        frame[column] = frame[column].astype(int)

train_copy2['Transported'] = train_copy2['Transported'].astype(int)

In [None]:
# Separate the training target from the predictors; the test frame has no
# target column to remove and is used as-is (X_test is the same object).
y_train = train_copy2['Transported']
X_train = train_copy2.drop(columns='Transported')
X_test = test_copy2

In [None]:
# Z-score every training feature column and wrap the result in a DataFrame
# that reuses X_train's column names.
# NOTE(review): the model cell fits on X_train, not df_normalized, and X_test
# is never passed through this scaler — confirm whether the scaled frame is
# meant to be used.
scaler = StandardScaler()
normalized_columns = scaler.fit(X_train).transform(X_train)
df_normalized = pd.DataFrame(data=normalized_columns, columns=X_train.columns)

In [None]:
# Fit a 100-tree random forest. random_state is fixed so the fitted model,
# its predictions and the resulting submission are reproducible across runs.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train, y_train)

y_prediction = random_forest.predict(X_test)

# Accuracy in percent (2 decimals), measured on the same rows the model was
# fit on — an optimistic figure, not a held-out estimate. Left as the cell's
# last expression so the value is displayed.
acc_random_forest = round(random_forest.score(X_train, y_train) * 100, 2)
acc_random_forest

In [None]:
# Pair each test PassengerId with its predicted label (cast back to bool),
# print the table and write it to the Kaggle working directory without the
# index column.
submission = {
    "PassengerId": test['PassengerId'],
    "Transported": y_prediction.astype(bool),
}
submisson_df = pd.DataFrame(data=submission)

print(submisson_df)
submisson_df.to_csv(path_or_buf='/kaggle/working/sub.csv', index=False)