<a href="https://www.kaggle.com/code/drewftw260/spaceship-titanic-analysis?scriptVersionId=173753046" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
#!/usr/bin/env python
# coding: utf-8

# In[1]:


## Spaceship Titanic - In this competition your task is to predict whether a passenger was 
## transported to an alternate dimension during the Spaceship Titanic's collision with the spacetime anomaly. 
## To help you make these predictions,
## you're given a set of personal records recovered from the ship's damaged computer system.
## Importing Libraries

import pandas as pd
import matplotlib as plt
import numpy as np


# In[2]:


# Load the competition training set from the working directory
sttrn = pd.read_csv('train.csv')


# In[3]:


# Preview the first five rows
sttrn.head(5)


# The CSV loads correctly and all of the information is visible.

# In[4]:


# (rows, columns) of the dataframe
sttrn.shape


# This dataframe consists of 8693 rows and 14 columns.

# In[5]:


# Summary statistics for the dataframe
sttrn.describe()


# A lot of the information is either missing or not numeric.

# In[6]:


# Column dtypes and non-null counts
sttrn.info()


# The non-null counts do not match the total number of entries in this dataframe.

# In[7]:


# Number of missing values in each column
sttrn.isna().sum()


# There are a lot of missing values.

# # -Data Cleaning-

# In[8]:


## Remove the columns that are not used for the prediction
sttrn = sttrn.drop(columns=['PassengerId', 'Name'])


# In[9]:


sttrn.head(50)


# In[10]:


sttrn.tail(50)


# In[11]:


## Missing spending amounts and ages are replaced with 0
numeric_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Age']
for column in numeric_columns:
    sttrn[column] = sttrn[column].fillna(0)


# In[12]:


sttrn.head(10)


# In[13]:


## With the gaps filled, the float columns can be stored as integers
for column in numeric_columns:
    sttrn[column] = sttrn[column].astype('int64')


# In[14]:


sttrn.head()


# In[15]:


## Discard the rows that are missing any of the remaining fields
required_columns = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
sttrn.dropna(subset=required_columns, inplace=True)


# In[16]:


sttrn.shape


# In[17]:


sttrn.info()


# ## - Mapping Values -

# In[18]:


sttrn['HomePlanet'].value_counts()


# In[19]:


sttrn['Destination'].value_counts()


# In[20]:


# Integer code for each home planet
home_planet_codes = {'Europa': 1, 'Earth': 2, 'Mars': 3}
sttrn['HomePlanet'] = sttrn['HomePlanet'].map(home_planet_codes)


# In[21]:


# Integer code for each destination
destination_codes = {'TRAPPIST-1e': 1, '55 Cancri e': 2, 'PSO J378.5-22': 3}
sttrn['Destination'] = sttrn['Destination'].map(destination_codes)


# In[22]:


sttrn.head(10)


# In[23]:


## Split the '/'-separated Cabin string into its three parts
c_c = sttrn['Cabin'].str.split('/', expand=True)
c_c.columns = ['Deck', 'Cabin_Number', 'Cabin_Position']
c_c


# In[24]:


# Attach the three cabin parts next to the existing columns
sttrn = pd.concat([sttrn, c_c], axis=1)


# In[25]:


sttrn.head()


# In[26]:


# The raw Cabin string is no longer needed
sttrn = sttrn.drop(columns='Cabin')


# In[27]:


sttrn.head()


# In[28]:


# One indicator column per deck / cabin position; the first level of each is
# dropped and acts as the baseline
decks = pd.get_dummies(sttrn['Deck'], drop_first=True)
cabin_positions = pd.get_dummies(sttrn['Cabin_Position'], drop_first=True)


# In[29]:


sttrn = pd.concat([sttrn, decks, cabin_positions], axis=1)


# In[30]:


# The indicator columns replace the original categorical ones
sttrn = sttrn.drop(columns=['Deck', 'Cabin_Position'])


# In[31]:


sttrn.head()


# ## Save Clean Dataset

# In[32]:


# Persist the cleaned frame. index=False keeps the row index out of the file;
# otherwise read_csv brings it back as a spurious 'Unnamed: 0' column.
sttrn.to_csv('sttrn_cln', index=False)


# In[33]:


# Reload the cleaned data as the working copy for the modelling steps
sttrn2 = pd.read_csv('sttrn_cln')


# ## Notes so Far 

# HomePlanet: Europa = 1, Earth = 2, Mars = 3
# Destination: TRAPPIST-1e = 1, 55 Cancri e = 2, PSO J378.5-22 = 3

# ## Visualization 

# In[34]:


## Share of transported / not-transported passengers for each HomePlanet code
sttrn2.groupby('HomePlanet')['Transported'].value_counts().plot(kind='pie', autopct='%.2f')


# ## Feature Engineering 

# In[35]:


## Remove incomplete rows first so the boolean columns can be cast safely
sttrn2.dropna(subset=['CryoSleep', 'Transported', 'Destination'], inplace=True)


# In[36]:


## Encode the boolean CryoSleep and Transported columns as 1 / 0.
## astype yields a real integer dtype; replace({True: 1, False: 0}) only gets
## there through pandas' deprecated automatic downcasting.
sttrn2['CryoSleep'] = sttrn2['CryoSleep'].astype('int64')
sttrn2['Transported'] = sttrn2['Transported'].astype('int64')


# In[37]:


sttrn2.head(10)


# In[38]:


## Drop any 'Unnamed...' index column left over from the CSV round trip
sttrn2 = sttrn2.loc[:, ~sttrn2.columns.str.contains('^Unnamed')]


# In[39]:


sttrn2.head()


# In[40]:


## Helper that splits the rows into a training part and a test part
from sklearn.model_selection import train_test_split


# In[41]:


## Features are every column except the target; the target is Transported
x = sttrn2.drop(columns=['Transported'])
y = sttrn2['Transported']


# In[42]:


## Hold out half of the rows for evaluation; the seed is fixed for repeatability
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.5,
    random_state=1,
)


# In[43]:


## Decision tree classifier from scikit-learn
from sklearn.tree import DecisionTreeClassifier


# In[44]:


## Untrained classifier with default settings
clf = DecisionTreeClassifier()


# In[45]:


## Fit on the training half (fit() trains the estimator in place)
clf.fit(x_train, y_train)


# In[46]:


## Predicted labels for the held-out half
y_pred = clf.predict(x_test)


# In[47]:


## Evaluation helpers
from sklearn import metrics


# In[48]:


## Fraction of held-out rows that were predicted correctly
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


# In[49]:


## Logistic regression classifier from scikit-learn
from sklearn.linear_model import LogisticRegression


# In[50]:


## Untrained model with default settings
model = LogisticRegression()


# In[51]:


## Fit on the training half (fit() returns the fitted estimator itself)
model = model.fit(x_train, y_train)


# In[52]:


## Mean accuracy on the held-out half
model.score(X=x_test, y=y_test)


# In[53]:


## Import libraries
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold


# In[54]:


## Candidate hyper-parameters for LogisticRegression
solvers = ['newton-cg', 'lbfgs','liblinear']
penalty = ['l2']
c_values = [100,10,1.0,0.1,0.01]


# In[55]:


## make_blobs is no longer called (see the next cell); the import is kept so
## that anything else relying on the name keeps working.
from sklearn.datasets import make_blobs


# In[56]:


## The search must run on the passenger data. Previously x and y were
## overwritten here with random make_blobs samples, so the reported "best"
## parameters described synthetic clusters rather than this dataset.


# In[57]:


##Defining grid search
grid = dict(solver=solvers, penalty=penalty, C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=1, cv=cv, scoring='accuracy', error_score=0)
## Tune on the training half only so x_test / y_test stay unseen
grid_result = grid_search.fit(x_train, y_train)


# In[58]:


##Results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    # Report the parameter set of this row, not the whole list of candidates
    print('%f(%f) with: %r' % (mean, stdev, param))


# ## -Importing and Processing Data-

# In[59]:


# Load the competition test set from the working directory
sttst = pd.read_csv("test.csv")


# In[60]:


sttst.head(5)


# In[61]:


## Keep the passenger ids now; they are needed for the submission file and
## the processed feature frame no longer contains a PassengerId column
pID = sttst.PassengerId.values
len(sttst)


# In[62]:


## Creating a function
def process(space):
    """Turn a raw passenger frame into the all-integer feature frame for prediction.

    The input frame is left untouched; a new frame is returned.

    Steps:
      * drop ``PassengerId`` and ``Name``;
      * encode ``HomePlanet`` and ``Destination`` as integer codes
        (0 = missing or unknown value);
      * split ``Cabin`` on ``/`` into deck, number and position, keep the
        number as ``Cabin_Number`` and one-hot encode the deck (columns
        ``B``..``G``, ``T``) and the position (column ``S``);
      * replace missing values with 0 and cast every column to ``int64``.

    The indicator columns are always ``B, C, D, E, F, G, T, S`` in that order,
    even when a level does not occur in ``space``. Deck or position values
    outside that set (and missing cabins) produce all-zero indicator rows.

    Parameters
    ----------
    space : pandas.DataFrame
        Raw records with the columns PassengerId, Name, HomePlanet, CryoSleep,
        Cabin, Destination, Age, VIP, RoomService, FoodCourt, ShoppingMall,
        Spa and VRDeck.

    Returns
    -------
    pandas.DataFrame
        The remaining input columns (original order), followed by
        ``Cabin_Number`` and the indicator columns.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If a cabin number is present but not numeric.
    """
    home_planet_codes = {'Europa': 1, 'Earth': 2, 'Mars': 3}
    destination_codes = {'TRAPPIST-1e': 1, '55 Cancri e': 2, 'PSO J378.5-22': 3}
    deck_columns = ['B', 'C', 'D', 'E', 'F', 'G', 'T']
    position_columns = ['S']
    zero_filled_columns = [
        'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Age',
        'HomePlanet', 'CryoSleep', 'Destination', 'VIP',
    ]

    # drop() without inplace returns a new frame, so the caller's data is
    # never modified by the assignments below
    space = space.drop(['PassengerId', 'Name'], axis=1)

    # Map Data (values outside the dictionaries become NaN and then 0)
    space['HomePlanet'] = space['HomePlanet'].map(home_planet_codes)
    space['Destination'] = space['Destination'].map(destination_codes)

    # Split the cabin string; reindex guarantees three columns even when no
    # row carries a complete deck/number/position value
    cabin = space['Cabin'].str.split('/', expand=True).reindex(columns=range(3))
    cabin.columns = ['Deck', 'Cabin_Number', 'Cabin_Position']
    space = space.drop('Cabin', axis=1)

    # Cabin numbers arrive as strings; store them as integers (0 = missing)
    space['Cabin_Number'] = pd.to_numeric(cabin['Cabin_Number']).fillna(0).astype('int64')

    # Filling missing values with 0 and changing float/bool values to int
    for column in zero_filled_columns:
        space[column] = space[column].fillna(0).astype('int64')

    # Fixed indicator schema: absent levels become all-zero columns instead of
    # raising KeyError further down the pipeline
    decks = (
        pd.get_dummies(cabin['Deck'])
        .reindex(columns=deck_columns, fill_value=0)
        .astype('int64')
    )
    cabin_positions = (
        pd.get_dummies(cabin['Cabin_Position'])
        .reindex(columns=position_columns, fill_value=0)
        .astype('int64')
    )

    return pd.concat([space, decks, cabin_positions], axis=1)
    
    
    


# In[63]:


## Build the model features for the test passengers
sttst = process(sttst)


# In[64]:


sttst.head()


# In[65]:


sttst.shape


# In[66]:


sttst.info()


# In[67]:


sttst.isnull().sum()


# In[68]:


## Predict Test Dataset
## The model was trained on a 1/0 target, so predict() returns integers.
## The competition file uses True/False in the Transported column, so the
## predictions are converted back to booleans before they are written.
y_prediction = model.predict(sttst).astype(bool)


# In[69]:


## One row per test passenger: id plus predicted label
submission = pd.DataFrame({'PassengerId': pID,
                           'Transported': y_prediction})


# In[70]:


submission.head(20)


# In[71]:


submission.shape


# In[72]:


## Write the submission without the dataframe index
submission.to_csv("Spaceship_Titanic_submission.csv", index=False)


# In[ ]:





# In[ ]: