In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


In [None]:
# Read the training CSV into `db` and display summary statistics.
db = pd.read_csv(filepath_or_buffer='data/train.csv')
db.describe()

In [None]:
# Preview the first five rows to see the available columns and value formats.
db.head(n=5)

In [None]:
# Number of missing entries per column (isna is the canonical name of isnull).
db.isna().sum()

In [None]:
# Drop the Name and Cabin columns; neither is used as a model feature.
# Rebinding `db` to the returned frame (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second execution is a no-op
# rather than a KeyError on the already-removed columns.
db = db.drop(columns=['Name','Cabin'], errors='ignore')
db.head()

In [None]:
# Replace missing values in the numerical columns with each column's mean.
columns = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall','Spa','VRDeck']
# Assign the filled result back to `db`. Calling fillna(..., inplace=True) on
# the `db[column]` selection is chained assignment: pandas 2.x warns about it,
# and with Copy-on-Write the selection is a temporary, so `db` itself would be
# left unchanged. DataFrame.fillna with a Series fills each column using the
# value stored under that column's label.
db[columns] = db[columns].fillna(db[columns].mean())
# Confirm the numerical columns no longer report missing values.
db.isnull().sum()

In [None]:
# Replace missing values in the categorical columns with each column's mode.
columns = ['HomePlanet','CryoSleep','Destination','VIP']
for column in columns:
    # mode() can return several values when there is a tie; [0] takes the first.
    most_frequent = db[column].mode()[0]
    # Assign back rather than using inplace=True on the `db[column]` selection:
    # the in-place form is chained assignment and does not update `db` under
    # pandas Copy-on-Write.
    db[column] = db[column].fillna(most_frequent)
# Confirm the categorical columns no longer report missing values.
db.isnull().sum()

## Decision Tree Classifier

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Separate the target column from the feature columns.
y = db['Transported']
X = db.drop(columns='Transported')

# One-hot encode the listed categorical columns; every other column is kept as is.
categorical_columns = ['HomePlanet','CryoSleep','Destination','VIP']
X = pd.get_dummies(X, columns=categorical_columns)
X


In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Fit a decision tree on the training split.
# random_state is fixed because the classifier permutes features at every split,
# so equally good splits can be chosen differently from run to run; without a
# seed the fitted tree (and the score below) is not reproducible.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

In [None]:
# Mean accuracy of the fitted model on the held-out split.
model.score(X=X_test, y=y_test)

In [None]:
from sklearn import tree

# Plot the fitted decision tree.
plt.figure(figsize=(20, 10))  # Large canvas for better readability
tree.plot_tree(
    model,
    filled=True,
    # Pass a plain list: some scikit-learn releases validate this parameter
    # strictly and reject a pandas Index.
    feature_names=list(X.columns),
    # Labels are matched to the classes in ascending order
    # (presumably False -> 'No', True -> 'Yes'; confirm against model.classes_).
    class_names=['No', 'Yes'],
    rounded=True,
    fontsize=12,
)
plt.show()

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest on the same training split.
# NOTE: this rebinds `model`, so the decision tree fitted earlier is no longer
# reachable under that name; the cells below score and predict with the forest.
# random_state is fixed because bootstrap sampling and per-split feature
# selection are random; without a seed the score changes on every run.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [None]:
# Mean accuracy of the current `model` on the held-out split.
model.score(X=X_test, y=y_test)


## Submission

In [None]:
# Read the test CSV that predictions will be generated for.
test_db = pd.read_csv(filepath_or_buffer='data/test.csv')

In [None]:
# Apply to the test data the same preprocessing that was done on the training data.
test_db = test_db.drop(columns=['Name','Cabin'])

# Fill the gaps in `test_db` itself (the previous version filled `db`, leaving
# the test frame's missing values untouched). The fill values are taken from the
# training frame `db`, so the test rows are imputed with the same statistics the
# model was trained on. Results are assigned back instead of using
# inplace=True on a column selection, which does not update the frame under
# pandas Copy-on-Write.
columns = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall','Spa','VRDeck']
for column in columns:
    test_db[column] = test_db[column].fillna(db[column].mean())
columns = ['HomePlanet','CryoSleep','Destination','VIP']
for column in columns:
    test_db[column] = test_db[column].fillna(db[column].mode()[0])

test_db = pd.get_dummies(test_db, columns=['HomePlanet','CryoSleep','Destination','VIP'])

# Align with the training feature matrix `X`: same columns in the same order.
# A dummy column that never occurs in the test data is added as all zeros, and
# a category unseen during training is dropped, so predict() receives exactly
# the features the model was fitted on.
test_db = test_db.reindex(columns=X.columns, fill_value=0)


In [None]:
# Predict the target for every row of the preprocessed test frame.
y_pred = model.predict(X=test_db)

In [None]:
# Pair each test PassengerId with its prediction, write the result to
# submission.csv without the index column, and preview the first rows.
submission = pd.DataFrame(
    {
        'PassengerId': test_db['PassengerId'],
        'Transported': y_pred,
    }
)
submission.to_csv('submission.csv', index=False)
submission.head()
