### ***Spaceship Titanic***
Challeng is to predict which passengers were transported by the anomaly using records recovered from the spaceship’s damaged computer system.

#### ***Approach***
***
<img src='https://i.ibb.co/R2g3qTS/space-titanic-V1.png' width='60%'>

In [1]:
import pandas as pd

data = pd.read_csv('data/train.csv')
data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [159]:
def data_clean(df):#handle object and int missing values
    
    for i in ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination']:
        df[i].fillna('Missing',inplace=True)
        
    for i in ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck','Age']:
        df[i].fillna(df[i].mean(),inplace=True)
        
    df.drop(['VIP','Name','PassengerId'],axis=1,inplace=True)
    df['Cabin'] = [i[0] for i in df['Cabin'].astype('string')]
    
    return df

In [160]:
def bring_dic(df): #handle categorical values
    cat_dict = {}
    
    for i in ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination']:
        temp = list(df.groupby([i])['Transported'].mean().sort_values().index)
        cat_dict[i] = {temp[i] : i+1 for i in range(len(temp))}
    
    return cat_dict

In [161]:
def cat_handel(df,cat_dict): #support function
    
    for i in cat_dict.keys():
        df[i] = df[i].map(cat_dict[i])
    
    return df

In [162]:
from sklearn.base import BaseEstimator , TransformerMixin 
## this class isn't used it was just for some testing/trying coustom transformer
class data_preprocess(BaseEstimator,TransformerMixin):
    def __init__(self,dataframe):
        self.dataframe = dataframe
    def fit(self):
        return self
    def transform(self,X=None,y=None):
        data_cleaned = data_clean(self.dataframe)
        data_cat_handled = cat_handel(self.dataframe,bring_dic(self.dataframe))
        
        return data_cat_handled

In [163]:
#### appling all function ####
data_cleaned = data_clean(data)
rel_dic = bring_dic(data)
data_cat_handled = cat_handel(data,rel_dic)

In [164]:
X,y = data.drop('Transported',axis=1),data['Transported']

In [165]:
from sklearn.preprocessing import StandardScaler
std_scl = StandardScaler()
X_std = std_scl.fit_transform(X)

In [166]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
rfc = RandomForestClassifier()

param_grid = { 
    'n_estimators': [150,200,300,400,500],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8,9,10],
    'criterion' :['gini', 'entropy']
}

rscv = RandomizedSearchCV(rfc,param_grid,n_iter=75,n_jobs=-1,random_state=42,cv=3,verbose=2)
rscv.fit(X_std,y)
rscv.best_estimator_ , rscv.best_score_

Fitting 3 folds for each of 75 candidates, totalling 225 fits


(RandomForestClassifier(max_depth=10, max_features='log2', n_estimators=200),
 0.7998402161029167)

In [170]:
tuned_rfc = RandomForestClassifier(max_depth=10, max_features='log2',
                        n_estimators=200)
tuned_rfc.fit(X_std,y)

In [171]:
test = pd.read_csv('data/test.csv')
test.head(2)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers


In [172]:
test_cleaned = data_clean(test)
test_cat_handled = cat_handel(test_cleaned,rel_dic)

In [175]:
test_scl = std_scl.fit_transform(test_cat_handled)
pred = tuned_rfc.predict(test_scl)

l = pd.read_csv('data/test.csv')
sub = pd.DataFrame({
    'PassengerId' : l['PassengerId'],
    'Transported' : pred
})

sub.to_csv('submission_2.csv',index=False)