<a href="https://colab.research.google.com/github/anjalidabare/Spaceship_Titanic_KaggleCompetition/blob/main/Spaceship_Titanic_Kaggle_Competition_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Notes

*   The data set is balanced with respect to the target variable.
*   Many variables have missing values.
*   'PassengerId' is an identifier variable, so it needs no pre-processing.
*   'Name' should be dropped due to its high cardinality.
*   The 'Cabin' variable should be split on '/' into its components before use.
*   The target variable is binary, so binary classification models should be used.








In [None]:
# Install the Kaggle CLI; %pip (rather than !pip) installs into the environment
# of the running kernel.
%pip install kaggle

In [None]:
# -p keeps this cell re-runnable: plain mkdir errors out when ~/.kaggle already exists
!mkdir -p ~/.kaggle

In [117]:
# Copy the Kaggle API token into ~/.kaggle; kaggle.json must already be present
# in the working directory (it is not created anywhere in this notebook).
!cp kaggle.json ~/.kaggle/kaggle.json

In [118]:
# Make the token file readable and writable by its owner only
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
#!kaggle competitions download -c spaceship-titanic

In [None]:
# Extract the competition archive from the working directory.
# NOTE(review): the download cell above is commented out, so the archive has to
# be supplied manually; the Kaggle CLI presumably names it after the competition
# slug (spaceship-titanic.zip, with a hyphen) — confirm the file name used here.
!unzip spaceship_titanic.zip

In [None]:
# NOTE(review): this installs the unpinned master branch of pandas-profiling, so
# the installed code can differ between runs — consider pinning a released version.
!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip

In [7]:
import pandas as pd
from pandas_profiling import ProfileReport

In [41]:
df=pd.read_csv('train.csv')

In [None]:
df.info()

In [10]:
profile = ProfileReport(df, title='Spaceship Titanic')

In [None]:
profile.to_notebook_iframe()

In [None]:
df.columns

In [26]:
# Helper that breaks a Cabin value into its '/'-separated parts
def split_function(x):
  """Split the string form of ``x`` on '/'.

  Returns the list of parts when there are at least three of them; otherwise
  (including NaN, whose string form 'nan' contains no '/') returns
  ["Missing", "Missing", "Missing"].
  """
  parts = str(x).split('/')
  return parts if len(parts) >= 3 else ["Missing", "Missing", "Missing"]


In [47]:
#Pre-processing function
def pre_processing(df, age_fill=None):
  """Clean a passenger frame in place so it is ready for one-hot encoding.

  The frame is modified in place and nothing is returned:

  * missing 'HomePlanet', 'CryoSleep', 'Destination' and 'VIP' values become
    the string 'Missing';
  * 'Cabin' is split with ``split_function`` into new 'Deck' (first part) and
    'Side' (third part) columns, then dropped;
  * missing 'Age' values are filled with ``age_fill``;
  * missing spending values ('RoomService', 'FoodCourt', 'ShoppingMall',
    'Spa', 'VRDeck') are filled with 0;
  * 'Name' is dropped.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame containing the columns listed above.
  age_fill : float, optional
      Value used for missing ages. Defaults to the mean of ``df['Age']``;
      pass the training-set mean to impute unseen data consistently.
  """
  # Assign the filled column back rather than calling
  # df[col].fillna(..., inplace=True): the in-place form operates on an
  # intermediate Series, is deprecated in pandas 2.x and leaves df unchanged
  # once Copy-on-Write is enabled.
  for column in ('HomePlanet', 'CryoSleep', 'Destination', 'VIP'):
    df[column] = df[column].fillna('Missing')

  # Split each cabin value once and keep only the deck and side parts; the
  # parts are held in a local Series, so no temporary column is needed.
  cabin_parts = df['Cabin'].apply(split_function)
  df['Deck'] = cabin_parts.apply(lambda parts: parts[0])
  df['Side'] = cabin_parts.apply(lambda parts: parts[2])

  # Missing ages: use the caller-supplied value, else this frame's mean age
  if age_fill is None:
    age_fill = df['Age'].mean()
  df['Age'] = df['Age'].fillna(age_fill)

  # Missing monetary values are treated as "nothing spent"
  for column in ('RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'):
    df[column] = df[column].fillna(0)

  # The raw cabin string and the passenger name are not used as features
  df.drop(columns=['Cabin', 'Name'], inplace=True)


In [48]:
#creating analytical base table
abt=df.copy()

In [49]:
pre_processing(abt)

In [None]:
abt.info()

# Modelling

*   Create feature and target variables
*   Train/test split
*   One-hot encoding for categorical variables
*   Create model pipelines





In [52]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [55]:
# Feature matrix: every column except the target and the passenger identifier,
# with the categorical columns expanded into indicator (one-hot) columns
X = pd.get_dummies(abt.drop(columns=['Transported', 'PassengerId']))
# Target vector
y = abt['Transported']

In [56]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [73]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

In [74]:
# Model pipelines: every candidate classifier is preceded by its own StandardScaler

pipelines = {
    key: make_pipeline(StandardScaler(), classifier)
    for key, classifier in (
        ('lr', LogisticRegression(random_state=123)),
        ('kn', KNeighborsClassifier()),
        ('svm', LinearSVC(random_state=123)),
        ('rf', RandomForestClassifier(random_state=123)),
        ('gb', GradientBoostingClassifier(random_state=123)),
    )
}

In [79]:
# Hyper-parameter grids, keyed like `pipelines`. Parameter names follow the
# "<pipeline step name>__<parameter>" pattern.

grid = {
    'lr': {
        'logisticregression__C': [0.5, 0.1, 0.15],
        'logisticregression__max_iter': [50, 100, 150, 200],
    },
    'kn': {
        'kneighborsclassifier__n_neighbors': [5, 10, 15],
        'kneighborsclassifier__n_jobs': [-1],
    },
    'svm': {
        'linearsvc__C': [1.0, 1.5, 2.0],
        'linearsvc__max_iter': [1000, 2000, 3000, 4000, 5000],
    },
    'rf': {
        'randomforestclassifier__n_estimators': [100, 200, 300],
        'randomforestclassifier__n_jobs': [-1],
    },
    'gb': {
        'gradientboostingclassifier__n_estimators': [100, 200, 300],
    },
}

In [None]:
# Tune every pipeline with a 10-fold cross-validated grid search and keep the
# fitted search objects, keyed by algorithm name
fitted_models = {}
for algo, pipeline in pipelines.items():
  model = GridSearchCV(estimator=pipeline, param_grid=grid[algo], n_jobs=-1, cv=10)
  # fit() returns the fitted search object itself, so it can be stored directly
  fitted_models[algo] = model.fit(X_train, y_train)

# Model Evaluation

In [83]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [86]:
# Score each tuned model on the held-out split
for algo, model in fitted_models.items():
  yhat = model.predict(X_test)
  # Evaluate the three metrics on the same predictions, in reporting order
  accuracy, precision, recall = (
      metric(y_test, yhat)
      for metric in (accuracy_score, precision_score, recall_score)
  )
  print(f'Metric for {algo}: accuracy- {accuracy}, precision- {precision}, recall- {recall}')



Metric for lr: accuracy- 0.8056354226566993, precision- 0.7956989247311828, recall- 0.8333333333333334
Metric for kn: accuracy- 0.7768832662449684, precision- 0.8246753246753247, recall- 0.7150900900900901
Metric for svm: accuracy- 0.8062104657849338, precision- 0.8017524644030668, recall- 0.8243243243243243
Metric for rf: accuracy- 0.80448533640023, precision- 0.8246445497630331, recall- 0.7837837837837838
Metric for gb: accuracy- 0.816561242093157, precision- 0.7979057591623037, recall- 0.8581081081081081


# Saving the Model

In [87]:
import pickle

In [88]:
# Persist the tuned gradient-boosting search object to disk
with open('gradientboosted.pkl', 'wb') as f:
  pickle.dump(fitted_models['gb'], f)

In [89]:
# Read the saved model back to check that it round-trips.
# pickle can execute arbitrary code while loading, so only load files you wrote yourself.
with open('gradientboosted.pkl', 'rb') as f:
  reloaded_model = pickle.load(f)

In [None]:
reloaded_model

# Prediction on Test Data

*  The test data set does not have the target variable.
*  The prediction is therefore made from the feature variables, after applying the same pre-processing and one-hot encoding as for the training data.


In [91]:
test_df=pd.read_csv('test.csv')

In [92]:
abt_test=test_df.copy()

In [94]:
#preprocess test data
pre_processing(abt_test)

In [98]:
#onehot encoding on test data, aligned to the columns the models were trained on
abt_test_final = (
    pd.get_dummies(abt_test.drop('PassengerId', axis=1))
    # A category that occurs in only one of the two data sets would otherwise
    # give the test matrix a different set (or order) of indicator columns
    # than X. Columns absent from the test data are added as all-zero and
    # columns unknown to the training data are dropped.
    .reindex(columns=X.columns, fill_value=0)
)


In [102]:
yhat_test=fitted_models['gb'].predict(abt_test_final)

In [106]:
test_df['PassengerId']

0       0013_01
1       0018_01
2       0019_01
3       0021_01
4       0023_01
         ...   
4272    9266_02
4273    9269_01
4274    9271_01
4275    9273_01
4276    9277_01
Name: PassengerId, Length: 4277, dtype: object

In [109]:
# Build the submission from named columns so each column keeps its own dtype
# (stacking the two as rows and transposing turns everything into object dtype)
submission_df = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Transported': yhat_test})

In [111]:
submission_df.columns=['PassengerId','Transported']

# Submit to Kaggle

In [114]:
submission_df.to_csv('kaggle_submission.csv',index=False)

In [121]:
!kaggle competitions submit -c spaceship-titanic -m "Initial Model" -f "kaggle_submission.csv"

100% 56.2k/56.2k [00:03<00:00, 18.0kB/s]
Successfully submitted to Spaceship Titanic