In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

In [890]:
train_df = pd.read_csv("data/train.csv")
test_df=pd.read_csv("data/test.csv")
(test_df.columns), (train_df.columns)

(Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
        'Ticket', 'Fare', 'Cabin', 'Embarked'],
       dtype='object'),
 Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
        'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
       dtype='object'))

In [891]:
test_df.set_index("PassengerId")

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...
1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


### Feature Engineering using Kaggle provided steps

In [892]:
def transform_df(df, is_test_dataset=None):
    def substrings_in_string(big_string, substrings):
        for substring in substrings:
            if type(big_string)==str:
                if big_string.find(substring) != -1:
                    return substring
        #print (big_string)
        return np.nan

    def replace_titles(x):
        title=x['Title']
        if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
            return 'Mr'
        elif title in ['Countess', 'Mme']:
            return 'Mrs'
        elif title in ['Mlle', 'Ms']:
            return 'Miss'
        elif title =='Dr':
            if x['Sex']=='Male':
                return 'Mr'
            else:
                return 'Mrs'
        else:
            return title


    title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev','Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess','Don', 'Jonkheer']
    df['Title']=df['Name'].map(lambda x: substrings_in_string(x, title_list))


    #replacing all titles with mr, mrs, miss, master
    df['Title']=df.apply(replace_titles, axis=1)


    #Turning cabin number into Deck
    cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
    df['Deck']=df['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))

    #Creating new family_size column
    df['Family_Size']=df['SibSp']+df['Parch']

    #Age*Class
    df['Age*Class']=df['Age']*df['Pclass']

       
    # dropping passengerId and Family_Size
    df.drop(['Family_Size'], axis=1, inplace=True)
    if is_test_dataset == False or is_test_dataset == None:
        df.drop(['PassengerId'], axis=1, inplace=True)
    df.drop(["Cabin","Deck"], axis=1, inplace=True)
    df.drop(["Name","Ticket"], axis=1, inplace=True)
     #Imputation using ColumnTransformer
    from sklearn.compose import ColumnTransformer
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import LabelEncoder
    si1=SimpleImputer(strategy='mean');
    si2=SimpleImputer(strategy='constant',fill_value='S');
    le= LabelEncoder()

    num_cols = ["Age", "Age*Class","Fare"];
    emb_col = ["Embarked"]

    #Imputing Southampton to Embarked since it is the starting point
    ct = ColumnTransformer([
        ("mean_", si1,num_cols),
        ("emb_", si2,emb_col)
    ])
    fill_mean_emb=ct.fit_transform(df)
    filled_mean_emb = pd.DataFrame(data=fill_mean_emb,columns=["Age","Age*Class","Fare","Embarked"])

    for column in filled_mean_emb.columns:
        df[column]=filled_mean_emb[column]
    df["Fare"].fillna(value=df["Fare"].mean(), inplace=True)
    return df


In [893]:
train_df=transform_df(train_df)
test_df=transform_df(test_df, True)

In [894]:
test_df.set_index("PassengerId")

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Age*Class
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
892,3,male,34.5,0,0,7.8292,Q,Mr,103.5
893,3,female,47.0,1,0,7.0000,S,Mrs,141.0
894,2,male,62.0,0,0,9.6875,Q,Mr,124.0
895,3,male,27.0,0,0,8.6625,S,Mr,81.0
896,3,female,22.0,1,1,12.2875,S,Mrs,66.0
...,...,...,...,...,...,...,...,...,...
1305,3,male,30.27259,0,0,8.0500,S,Mr,59.033373
1306,1,female,39.0,0,0,108.9000,C,Mr,39.0
1307,3,male,38.5,0,0,7.2500,S,Mr,115.5
1308,3,male,30.27259,0,0,8.0500,S,Mr,59.033373


In [849]:
test_df.isna().sum(), train_df.isna().sum()

(PassengerId    0
 Pclass         0
 Sex            0
 Age            0
 SibSp          0
 Parch          0
 Fare           0
 Embarked       0
 Title          0
 Age*Class      0
 dtype: int64,
 Survived     0
 Pclass       0
 Sex          0
 Age          0
 SibSp        0
 Parch        0
 Fare         0
 Embarked     0
 Title        0
 Age*Class    0
 dtype: int64)

### Data Preparation or Preprocessing

In [850]:
train_df['Age']=train_df['Age'].astype(int)
train_df['Age*Class']=train_df['Age*Class'].astype(int)
train_df['Sex']=train_df['Sex'].apply(lambda x: 1 if x=='male' else 0)

In [851]:
test_df['Age']=test_df['Age'].astype(int)
test_df['Age*Class']=test_df['Age*Class'].astype(int)
test_df['Sex']=test_df['Sex'].apply(lambda x: 1 if x=='male' else 0)

In [852]:
#pd.get_dummies(train_df, columns=["Title"])
def encode_ordinal(df, column):
    from sklearn.preprocessing import OrdinalEncoder
    ord1=OrdinalEncoder()
    ord1=ord1.fit([df[column]])
    df[column]=ord1.fit_transform(df[[column]])
    return df;

In [853]:
train_df=encode_ordinal(train_df,'Title')
train_df=encode_ordinal(train_df,'Embarked')

test_df=encode_ordinal(test_df,'Title')
test_df=encode_ordinal(test_df,'Embarked')

# Modelling

In [806]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

In [807]:
MODEL_NAMES= {"RandomForestClassifier": RandomForestClassifier,"BaggingClassifier" : BaggingClassifier,
              "KNeighborsClassifier":KNeighborsClassifier,
             "SVC":SVC,"LinearSVC": LinearSVC}
MODEL_NAMES
SCORES= {"RandomForestClassifier": 0,"BaggingClassifier" :0,"KNeighborsClassifier":0,
        "SVC":0, "LinearSVC":LinearSVC}


In [808]:
X, y = train_df.drop("Survived",axis=1), train_df["Survived"]

In [809]:
for modelname,model in MODEL_NAMES.items():
    clf = model()
    clf.fit(X,y)
    #clf.feature_importances_
    np.random.seed(42)
    SCORES[modelname]=cross_val_score(clf, X,y,cv=5).mean()

In [810]:
SCORES

{'RandomForestClassifier': 0.8204444165463561,
 'BaggingClassifier': 0.8193145439708743,
 'KNeighborsClassifier': 0.6858075450379764,
 'SVC': 0.7049588851923922,
 'LinearSVC': 0.7407632916954366}

In [811]:
clf1 = RandomForestClassifier()
clf1.fit(X,y)
clf1.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [812]:
#GridSearchCV for RandomForestClassifier 
from sklearn.model_selection import GridSearchCV

param_grid={
    "n_estimators" :[10,50,100],
    "max_features" : ["sqrt","log2"],
    "max_depth": [20,90],
    "min_samples_split": [3,10],
    "min_samples_leaf": [2,10]
    
}

In [813]:
from sklearn.model_selection import train_test_split

In [814]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [815]:
param_grid={
    "n_estimators" :[10,50,100],
    "max_features" : ["sqrt","log2"],
    "max_depth": [20,90],
    "min_samples_split": [3,5,10],
    "min_samples_leaf": [2,10]
}
gsc = GridSearchCV(estimator=clf1,param_grid=param_grid,n_jobs=8)
gsc.fit(X_train,y_train)

GridSearchCV(estimator=RandomForestClassifier(), n_jobs=8,
             param_grid={'max_depth': [20, 90],
                         'max_features': ['sqrt', 'log2'],
                         'min_samples_leaf': [2, 10],
                         'min_samples_split': [3, 5, 10],
                         'n_estimators': [10, 50, 100]})

In [816]:
gsc.best_score_, gsc.best_estimator_

(0.8369348960898257,
 RandomForestClassifier(max_depth=90, max_features='sqrt', min_samples_leaf=2,
                        min_samples_split=10, n_estimators=10))

### Using Best Model 

In [854]:
model = RandomForestClassifier(max_depth=20, max_features='log2', 
                       min_samples_leaf=2, min_samples_split=5, 
                       n_estimators=10)
model.fit(X,y)
np.random.seed(42)
cross_val_score(model, X,y,cv=5).mean()

0.8238152030632101

### Running Predictions on Test Dataset

In [875]:
test_df.head()
cols=test_df.columns.to_list()
cols.remove("PassengerId")


In [877]:
preds=model.predict(test_df[cols])

###  Submissions

In [885]:
submissions = pd.DataFrame({"PassengerId":test_df.PassengerId, "Survived":preds})
submissions.reset_index(drop=True, inplace=True)

In [888]:
submissions.to_csv("data/my_submissions.csv", index=False)