# Analysis of Spaceship Titanic Data

In [1]:
from pyspark.sql import SparkSession
import seaborn as sns
import matplotlib.pyplot as plt
# Start (or reuse) the Spark session that is used to load the raw CSV.
spark = (
    SparkSession.builder
    .appName("Spaceship_Transportation_Analytics")
    .getOrCreate()
)
# Turn on eager evaluation for Spark DataFrames in the REPL/notebook.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
spark

ModuleNotFoundError: No module named 'pyspark'

In [None]:
# Read the training CSV through Spark: first row is the header, column types
# are inferred from the data.
df = spark.read.csv(
    "spaceship/train.csv",
    header=True,
    inferSchema=True,
)

In [None]:
import pyspark.pandas as ps
import numpy as np
import pandas as pd
# Convert the Spark frame to pandas, index it by passenger id and discard the
# "Name" column in a single chain.
psdf = df.toPandas().set_index("PassengerId").drop(columns="Name")
psdf.head(5)

In [None]:
# Count missing values per column.
# DataFrame.sum() is called directly: np.sum(DataFrame) forwards axis=None,
# which recent pandas deprecates in favour of a whole-frame scalar reduction.
psdf.isna().sum()

In [None]:
# Give passengers without a recorded home planet an explicit placeholder label.
psdf["HomePlanet"] = psdf["HomePlanet"].fillna("UnkownPlanet")
# Remaining missing values in the column.
psdf["HomePlanet"].isna().sum()

In [None]:
# Treat a missing CryoSleep flag as False.
# The filled column is assigned back instead of calling fillna(inplace=True)
# on the selected column: under pandas Copy-on-Write the in-place call only
# modifies a temporary object and leaves psdf unchanged.
psdf["CryoSleep"] = psdf["CryoSleep"].fillna(False)
# Remaining missing values in the column.
psdf["CryoSleep"].isna().sum()

In [None]:
# Stacked horizontal bars: passenger counts per Transported value, split by
# CryoSleep.
pd.crosstab(
    index=psdf["Transported"],
    columns=psdf["CryoSleep"],
).plot(kind="barh", stacked=True)
plt.show()

In [None]:
# Reduce each cabin string to its first character; passengers with no cabin
# value get "H". The isinstance guard also covers NaN, which is truthy and not
# subscriptable, so a float NaN no longer raises TypeError.
psdf["Cabin"] = psdf["Cabin"].map(
    lambda cabin: cabin[0] if isinstance(cabin, str) and cabin else "H"
)
# Fill missing destinations with the most frequent destination. The result is
# assigned back because fillna(inplace=True) on a selected column does not
# update psdf under pandas Copy-on-Write.
psdf["Destination"] = psdf["Destination"].fillna(psdf["Destination"].mode()[0])

In [None]:
# Share of each Transported value within every destination (rows sum to 1).
temp = pd.crosstab(index=psdf["Destination"], columns=psdf["Transported"])
temp.div(temp.sum(axis=1), axis=0).plot(kind="bar", stacked=True)
plt.show()

In [None]:
# Share of each Transported value within every cabin letter (rows sum to 1).
temp = pd.crosstab(index=psdf["Cabin"], columns=psdf["Transported"])
temp.div(temp.sum(axis=1), axis=0).plot(kind="bar", stacked=True)
plt.show()

In [None]:
# Share of each destination within every cabin letter (rows sum to 1).
temp = pd.crosstab(index=psdf["Cabin"], columns=psdf["Destination"])
temp.div(temp.sum(axis=1), axis=0).plot(kind="bar", stacked=True)
plt.show()

In [None]:
# Cabin counts with two-level columns; after the swap the outer column level is
# Destination and the inner one is Transported.
temp = (
    pd.crosstab(index=psdf["Cabin"], columns=[psdf["Transported"], psdf["Destination"]])
    .swaplevel(axis=1)
    .sort_index(axis=1)
)
# Divide every column by the total of its destination (outer level) per cabin.
# The totals are computed by grouping the transposed frame, because
# DataFrame.groupby(axis=1) is deprecated in pandas 2.1 and removed afterwards.
temp = temp.divide(temp.T.groupby(level=0).sum().T, level=0)
# One horizontal bar chart per (Destination, Transported) column.
temp.plot(kind='barh',subplots=True,layout=(3,2),figsize=(9,9),sharey=True)
plt.show()

### It can be inferred from the graph above that Cabins B and C are more dangerous regardless of destination, and that people going to Trappist are less likely to select Cabins A, B and C; i.e., Trappist-bound passengers who don't select Cabin A, B or C have a lower chance of being teleported

In [None]:
# Histogram of passenger ages with a kernel-density curve overlaid.
sns.histplot(psdf["Age"],kde=True)
plt.show()

In [None]:
# Fill missing ages with the median age.
# The filled column is assigned back instead of calling fillna(inplace=True)
# on the selected column, which does not update psdf under pandas
# Copy-on-Write.
psdf["Age"] = psdf["Age"].fillna(psdf["Age"].median())
# Remaining missing values in the column.
psdf["Age"].isna().sum()

In [None]:
# Age histogram restricted to passengers whose Transported value is True.
sns.histplot(psdf.loc[psdf["Transported"] == True, "Age"])
plt.show()

In [None]:
# Age histogram restricted to passengers whose Transported value is False.
sns.histplot(psdf.loc[psdf["Transported"] == False, "Age"])
plt.show()

In [None]:
# Box plot of Age for each Transported value on a single 7x6 inch axes.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(7, 6))
sns.boxplot(data=psdf, x="Transported", y="Age", ax=ax)
plt.show()

##### From the above it can be seen that small kids have a high chance of teleportation

In [None]:
# Treat a missing VIP flag as False. The filled column is assigned back
# instead of calling fillna(inplace=True) on the selected column, which does
# not update psdf under pandas Copy-on-Write.
psdf["VIP"] = psdf["VIP"].fillna(False)
# Share of VIP / non-VIP passengers within each Transported value.
temp = pd.crosstab(index=psdf["Transported"],columns=psdf["VIP"])
temp.div(np.sum(temp,axis=1),axis=0).plot.bar(stacked=True)
plt.show()

### There doesn't seem to be much relation between the Transported and VIP features

In [None]:
# Average spend per amenity for VIP passengers (sum over the non-missing count).
temp = psdf.loc[
    psdf["VIP"] == True,
    ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"],
]
temp.sum() / temp.count()

In [None]:
# Average spend per amenity for non-VIP passengers (sum over the non-missing
# count).
temp = psdf.loc[
    psdf["VIP"] == False,
    ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"],
]
temp.sum() / temp.count()

#### VIP people are much more likely to spend on RoomService, FoodCourt, ShoppingMall, Spa and VRDeck than normal people

In [None]:
# Annotated correlation heatmap over every non-object column, with the colour
# scale pinned to the full [-1, 1] range.
sns.heatmap(
    psdf.select_dtypes(exclude="object").corr(),
    vmin=-1,
    vmax=1,
    annot=True,
)
plt.show()

#### Peop

In [None]:
# Pairwise spend plots coloured by outcome. Rows at or above the 99th
# percentile of any spend column are removed first so extreme values do not
# dominate the axes.
temp = psdf[
    ["ShoppingMall", "Spa", "VRDeck", "FoodCourt", "Transported", "RoomService"]
]
temp = temp.loc[
    temp["ShoppingMall"].lt(temp["ShoppingMall"].quantile(0.99))
    & temp["Spa"].lt(temp["Spa"].quantile(0.99))
    & temp["VRDeck"].lt(temp["VRDeck"].quantile(0.99))
    & temp["FoodCourt"].lt(temp["FoodCourt"].quantile(0.99))
    & temp["RoomService"].lt(temp["RoomService"].quantile(0.99))
]
sns.pairplot(temp, hue="Transported", diag_kind="kde")
plt.show()

#### From the graph above we can infer that people who spend more money on Spa, RoomService and VRDeck are likely to be transported, and that people who spend money on Spa and VRDeck are less likely to spend in ShoppingMall and RoomService

# Applying ML models

In [None]:
# Load the modelling data: `train` from train.csv and `test` from test.csv.
# Reading train.csv into both frames would make `test` a copy of `train`.
# NOTE(review): assumes test.csv sits next to train.csv — confirm the path.
train = pd.read_csv('./Dataset/spaceship-titanic/train.csv')
test = pd.read_csv('./Dataset/spaceship-titanic/test.csv')

In [None]:
# Strategy handed to SimpleImputer when filling the numeric columns.
STRATEGY = 'median'
# Name of the label column that is split off from the features.
TARGET = 'Transported'

In [None]:
# Numeric columns whose missing values are filled with the STRATEGY statistic.
imputer_cols = ["Age", "FoodCourt", "ShoppingMall", "Spa", "VRDeck" ,"RoomService"]
# NOTE(review): SimpleImputer is not imported in any visible cell — confirm
# that `from sklearn.impute import SimpleImputer` runs before this cell.
imputer = SimpleImputer(strategy=STRATEGY )
# Fit on train only, then apply the same fitted statistics to both frames.
imputer.fit(train[imputer_cols])
train[imputer_cols] = imputer.transform(train[imputer_cols])
test[imputer_cols] = imputer.transform(test[imputer_cols])
# Missing home planets become their own 'Z' category. The filled column is
# assigned back instead of calling fillna(inplace=True) on the selected
# column, which does not update the frame under pandas Copy-on-Write.
train["HomePlanet"] = train["HomePlanet"].fillna('Z')
test["HomePlanet"] = test["HomePlanet"].fillna('Z')

In [None]:
# Categorical columns that are converted to integer codes.
# NOTE(review): "Cabin" is encoded here but dropped again in the next cell.
label_cols = ["HomePlanet", "CryoSleep","Cabin", "Destination" ,"VIP"]
def label_encoder(train,test,columns):
    """Integer-encode categorical columns with one shared mapping per column.

    Each column is cast to ``str`` and its labels are numbered by their
    position in the sorted set of labels found in ``train``. The same
    label -> code mapping is then applied to ``test``, so a given label has
    the same code in both frames. Fitting a separate encoder per frame would
    number the labels independently and make the two encodings incompatible.

    Args:
        train: DataFrame that defines the label vocabulary; modified in place.
        test: DataFrame encoded with the vocabulary of ``train``; modified in
            place.
        columns: Names of the columns to encode; each must exist in both
            frames.

    Returns:
        The same ``train`` and ``test`` objects, with every listed column
        replaced by ``int64`` codes. Labels that occur only in ``test`` are
        encoded as ``-1``.
    """
    for col in columns:
        train_labels = train[col].astype(str)
        test_labels = test[col].astype(str)
        # Sorted-position numbering of the train labels defines the mapping.
        code_of = {
            label: code
            for code, label in enumerate(sorted(train_labels.unique()))
        }
        train[col] = train_labels.map(code_of).astype("int64")
        # Labels missing from the mapping come back as NaN; mark them with -1.
        test[col] = test_labels.map(code_of).fillna(-1).astype("int64")
    return train, test

# Replace the categorical columns listed in label_cols with integer codes in
# both frames.
train ,test = label_encoder(train,test ,label_cols)

In [None]:
# Remove the "Name" and "Cabin" columns from both frames.
train.drop(columns=["Name", "Cabin"], inplace=True)
test.drop(columns=["Name", "Cabin"], inplace=True)
# Separate the label column from the features.
# NOTE(review): X appears to still contain the PassengerId identifier column —
# confirm it is meant to be a model input.
X = train.drop(columns=TARGET)
y = train[TARGET]
# Hold out 33% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=20,
)

In [None]:
# Logistic-regression classifier using the liblinear solver and a fixed seed.
# NOTE(review): LogisticRegression is not imported in any visible cell —
# confirm the scikit-learn import runs before this cell.
lr = LogisticRegression(solver='liblinear', random_state=0)

In [None]:
# Fit the classifier on the training portion of the split.
lr.fit(X_train, y_train)

In [None]:
# Class labels known to the fitted model.
lr.classes_

In [None]:
# Fitted intercept term.
lr.intercept_

In [None]:
# Fitted coefficients, one per feature column of X_train.
lr.coef_

In [None]:
# Per-class probability estimates for the held-out rows.
lr.predict_proba(X_test)

In [None]:
# Predicted class for each held-out row.
lr.predict(X_test)

In [None]:
# Mean accuracy on the held-out split.
lr.score(X_test, y_test)

In [None]:
# Confusion matrix of actual vs. predicted labels on the held-out split.
# NOTE(review): confusion_matrix is not imported in any visible cell — confirm
# the scikit-learn import runs before this cell.
confusion_matrix(y_test, lr.predict(X_test))

In [None]:
# Mean accuracy on the training split, for comparison with the held-out score.
lr.score(X_train, y_train)

In [None]:
# Draw the held-out confusion matrix as an annotated 2x2 image.
cm = confusion_matrix(y_test, lr.predict(X_test))

fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(cm)
ax.grid(False)
# Columns are predictions, rows are the actual labels.
ax.xaxis.set(ticks=(0, 1), ticklabels=('Predicted 0s', 'Predicted 1s'))
ax.yaxis.set(ticks=(0, 1), ticklabels=('Actual 0s', 'Actual 1s'))
# Descending y-limits put row 0 at the top of the image.
ax.set_ylim(1.5, -0.5)
# Write each cell's count at its centre, visiting the cells row by row.
for i, j in [(row, col) for row in range(2) for col in range(2)]:
    ax.text(j, i, cm[i, j], ha='center', va='center', color='red')
plt.show()

In [None]:
# Print the text classification report for the held-out predictions.
# NOTE(review): classification_report is not imported in any visible cell —
# confirm the scikit-learn import runs before this cell.
print(classification_report(y_test, lr.predict(X_test)))