In [127]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, classification_report

In [99]:
# Read the competition training split and strip the identifier-style columns
# (PassengerId, Name, Cabin) that the rest of the notebook does not use.
train_csv = '../input/spaceship-titanic/train.csv'
df = pd.read_csv(train_csv).drop(columns=["PassengerId", "Name", "Cabin"])
# Remove, in place, every row that still has a missing value in any column.
df.dropna(inplace=True)
df.tail()

In [100]:
# The five amenity spend columns that make up a passenger's total spend.
spend_cols = ['VRDeck', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa']

# Mean-impute missing spend values.  The result is assigned back instead of
# calling df[col].fillna(..., inplace=True): that form mutates a temporary
# column object, triggers a chained-assignment warning on recent pandas, and
# stops updating `df` once Copy-on-Write is enabled.
df[spend_cols] = df[spend_cols].fillna(df[spend_cols].mean())

# Aggregate spend feature: each column is floor-divided by 4, then the
# columns are summed row-wise and stored as integers.
# 'Spa' was imputed with the other columns but left out of the original sum;
# it is included here so TotalCost covers every spend column.
df["TotalCost"] = (df[spend_cols] // 4).sum(axis=1).astype('int')


In [101]:
def encode_total_cost(x, max_cost=None):
    """Bucket a raw TotalCost value into an ordinal code from 0 to 3.

    Parameters
    ----------
    x : number
        Raw total-cost value for one row.
    max_cost : number, optional
        Largest raw total cost, used to place the bucket boundaries at
        ``max_cost / 4`` and ``max_cost / 2``.  When omitted it is read from
        the notebook-level ``df['TotalCost']`` column, as before.  Callers
        that map this function over a whole column should compute the
        maximum once and pass it in, which avoids rescanning the column for
        every row.

    Returns
    -------
    int
        0 if ``x == 0``; 1 if ``0 < x <= max_cost / 4``;
        2 if ``max_cost / 4 < x <= max_cost / 2``; 3 for anything else
        (this includes values above ``max_cost / 2`` and negative values).
    """
    if max_cost is None:
        # Backward-compatible default: fall back to the global frame.
        max_cost = df['TotalCost'].max()

    if x == 0:
        return 0

    quarter = max_cost / 4
    half = max_cost / 2
    if 0 < x <= quarter:
        return 1
    if quarter < x <= half:
        return 2
    return 3
# Swap each raw total for its spend bucket, then preview the last rows.
cost_bucket = df["TotalCost"].apply(encode_total_cost)
df["TotalCost"] = cost_bucket
df.tail()

In [102]:
# Count the missing values remaining in each column.
df.isna().sum()

In [103]:
# Mean-impute Age.  Assigned back rather than using
# df['Age'].fillna(..., inplace=True), which is chained assignment: it acts
# on a temporary column object and does not update `df` under Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].mean())

# Cast the flag columns (presumably True/False in the CSV) to integers.
for flag_col in ("Transported", "CryoSleep", "VIP"):
    df[flag_col] = df[flag_col].astype(int)

# Integer codes for the two categorical columns.
category_codes = {
    'HomePlanet': {'Europa': 0, 'Earth': 1, 'Mars': 2},
    'Destination': {'TRAPPIST-1e': 0, '55 Cancri e': 1, 'PSO J318.5-22': 2},
}
# replace() on a string column is not guaranteed to return an integer dtype
# (the implicit downcast is deprecated in recent pandas), so the cast is
# explicit.  It also fails loudly if a category outside the mapping is left.
df = df.replace(category_codes).astype({col: int for col in category_codes})

In [104]:
# Preview the last rows of the frame after the conversions in the previous cell.
df.tail()

In [105]:
# DATA ANALYSIS AND VISUALIZATION
# Summary statistics of the prepared frame.
df.describe()

In [106]:
# Passenger counts per home planet, split by destination.
# `x` is passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so the positional form collides with data=df and raises a TypeError there.
sns.countplot(x='HomePlanet', hue='Destination', data=df)

In [107]:
# (rows, columns) of the prepared frame.
df.shape

In [108]:
# FIND CORRELATION by MAKING A HEAT MAP
# Pairwise column correlations, drawn as an annotated heat map on a
# 10x10-inch figure (one decimal place per cell, blue colour scale).
plt.figure(figsize=(10, 10))
correlation = df.corr()
sns.heatmap(
    data=correlation,
    annot=True,
    fmt='.1f',
    annot_kws={'size': 8},
    cmap='Blues',
    cbar=True,
)

In [109]:
# DATA PREPROCESSING
# Features: every column except the target and the raw spend columns
# (the bucketed TotalCost column stays in as their summary).
excluded_cols = ['Transported', "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
X = df.drop(columns=excluded_cols)
# Target: the integer-coded Transported column.
Y = df['Transported']
# Class balance of the target.
Y.value_counts()

In [110]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)
print(X.shape, X_train.shape, X_test.shape)


In [139]:
# Logistic regression with C=0.01 (a smaller C means stronger regularisation).
# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression(C=0.01).fit(X_train, Y_train)
model


In [140]:
# accuracy on training data
# Predict labels for the rows the model was fitted on and show them.
X_train_prediction = model.predict(X=X_train)
print(X_train_prediction)


In [141]:
# Fraction of training rows whose predicted label matches the true label.
training_data_accuracy = accuracy_score(
    y_true=Y_train,
    y_pred=X_train_prediction,
)
print('Accuracy score of training data : ', training_data_accuracy)

In [142]:
# accuracy on test data
# Predict labels for the held-out rows and show them.
X_test_prediction = model.predict(X=X_test)
print(X_test_prediction)


In [115]:
# Fraction of held-out rows whose predicted label matches the true label.
test_data_accuracy = accuracy_score(
    y_true=Y_test,
    y_pred=X_test_prediction,
)
print('Accuracy score of test data : ', test_data_accuracy)

In [118]:
# Linear-kernel support vector classifier trained on the same split.
classifier = svm.SVC(kernel='linear')
classifier.fit(X_train, Y_train)

# accuracy_score's signature is (y_true, y_pred).  The original call passed
# the predictions first; accuracy is symmetric so the number is unchanged,
# but the order now matches the API and the logistic-regression cells.

# accuracy score on the training data
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy score of the training data : ', training_data_accuracy)

# accuracy score on the test data
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy score of the test data : ', test_data_accuracy)


In [131]:
# NOTE(review): this import would normally live in the import cell at the top
# of the notebook; it is kept here so this cell runs on its own.
import tensorflow as tf

# Seed TensorFlow's global generator so weight initialisation does not
# change from one kernel restart to the next.
tf.random.set_seed(2)

n_features = X_train.shape[-1]

# Fully connected binary classifier: five 10-unit ReLU hidden layers followed
# by a single sigmoid output unit.  The input width is declared with an
# explicit Input object rather than the `input_shape=` layer argument, which
# newer Keras releases warn against.
model = tf.keras.Sequential(
    [tf.keras.Input(shape=(n_features,))]
    + [tf.keras.layers.Dense(10, activation='relu') for _ in range(5)]
    + [tf.keras.layers.Dense(1, activation='sigmoid')]
)
model.summary()

In [132]:
# Adam optimiser with its hyper-parameters spelled out.
adam = tf.keras.optimizers.Adam(
    learning_rate=0.001,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-07,
    amsgrad=False,
    name='Adam',
)
model.compile(
    optimizer=adam,
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC()],
)

# Multiply the learning rate by 0.1 once validation accuracy has gone two
# epochs without improving by at least 1e-4.
lr_on_plateau = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_accuracy',
    factor=0.1,
    patience=2,
    mode='auto',
    min_delta=0.0001,
    cooldown=0,
    min_lr=0,
)
callbacks = [lr_on_plateau]

# Train for 10 epochs in batches of 16; the held-out split is used as the
# validation data after every epoch.
history = model.fit(
    X_train,
    Y_train,
    validation_data=(X_test, Y_test),
    epochs=10,
    batch_size=16,
    callbacks=callbacks,
)

In [137]:
# Network outputs for the held-out rows; the first column is read below
# (assumes predict() returns a 2-D NumPy array, as Keras models do).
probabilities = model.predict(X_test)
labels = Y_test
# Label 1 when the output is strictly above 0.5, else 0, kept as a plain
# Python list of ints.
y_pred = (probabilities[:, 0] > 0.5).astype(int).tolist()

In [138]:
# Confusion matrix of the thresholded predictions against the true labels.
cm = confusion_matrix(labels, y_pred)
cm_plot_labels = ['not transported', 'transported']

# Accuracy = diagonal of the 2x2 matrix divided by the number of test rows.
correct = cm[0][0] + cm[1][1]
print("The val accuracy of the test model is " + str(correct / Y_test.shape[0]))
print("\n")

# plot() returns the display object, so `disp` ends up as the drawn display.
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=cm_plot_labels,
).plot()
plt.show()

In [126]:
# Write whatever `model` currently refers to (on a top-to-bottom run, the
# Keras network built above) to "spaceship.h5" in the working directory.
model.save("spaceship.h5")