In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


This notebook is based on:
https://www.kaggle.com/code/cocorin/yh-cur-titanic-top-4-with-ensemble-modeling


Workflow:
##### 2 Load and check data

    2.1 load data
    2.2 Outlier detection
    2.3 joining train and test set
    2.4 null and missing values

##### 3 Feature analysis

    3.1 Numerical values
    3.2 Categorical values

##### 4 Filling missing Values

##### 5 Feature engineering

    5.1 tool 1
    5.2 tool 2
    5.3 tool 3
    5.4 tool 4

##### 6 Modeling

    6.1 Simple modeling
        6.1.1 Cross validate models.
        6.1.2 Hyperparameter tunning for best models
        6.1.3 Plot learning curves
        6.1.4 Feature importance of the tree based classifiers
    6.2 Ensemble modeling
        6.2.1 Combining models
    6.3 Prediction
        6.3.1 Predict and Submit results



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

from collections import Counter

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve

print('all imports loaded')

all imports loaded


In [3]:
# Load data
# make sure to add your data to the notebook look --> and click + add data 
train = pd.read_csv('../input/titanic/train.csv')
test = pd.read_csv('../input/titanic/test.csv')

## join train and test datasets in order to obtain the same number of features during categorical conversion
train_len= len(train)
dataset= pd.concat(objs= [train, test], axis= 0).reset_index(drop= True)

In [4]:
# Outlier detection

def detect_outliers(df, n, features):
    '''
    Takes a dataframe df of features and returns a list of the indices corresponding
    to the obeservations containing more than n outliers according to the Tukey method
    '''
    outlier_indices= []
    
    # iterate over features(columns)
    for col in features:
        # 1st quartile (25%)
        Q1 = np.percentile(df[col], 25)
        # 3rd quartile (75%)
        Q3 = np.percentile(df[col], 75)
        # Interquartile range (IQR)
        IQR = Q3 - Q1
        
        # outlier step
        outlier_step= 1.5* IQR
        
        # Determine a list of indices of outliers for feature col
        outlier_list_col= df[(df[col]< Q1 - outlier_step) | (df[col]> Q3 + outlier_step)].index
        
        # append the found outlier indices for col to the list of outlier in dices
        outlier_indices.extend(outlier_list_col)
        
    # select observations contaning more than 2 outliers
    outlier_indices= Counter(outlier_indices)
    multiple_outliers= list(k for k, v in outlier_indices.items() if v> n)
    
    return multiple_outliers

# detect outliers from selected columns
Outliers_to_drop= detect_outliers(train, 2, ['Age', 'SibSp', 'Parch', 'Fare'])

In [5]:
# Show the outliers row
train.loc[Outliers_to_drop] 

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
27,28,0,1,"Fortune, Mr. Charles Alexander",male,19.0,3,2,19950,263.0,C23 C25 C27,S
88,89,1,1,"Fortune, Miss. Mabel Helen",female,23.0,3,2,19950,263.0,C23 C25 C27,S
159,160,0,3,"Sage, Master. Thomas Henry",male,,8,2,CA. 2343,69.55,,S
180,181,0,3,"Sage, Miss. Constance Gladys",female,,8,2,CA. 2343,69.55,,S
201,202,0,3,"Sage, Mr. Frederick",male,,8,2,CA. 2343,69.55,,S
324,325,0,3,"Sage, Mr. George John Jr",male,,8,2,CA. 2343,69.55,,S
341,342,1,1,"Fortune, Miss. Alice Elizabeth",female,24.0,3,2,19950,263.0,C23 C25 C27,S
792,793,0,3,"Sage, Miss. Stella Anna",female,,8,2,CA. 2343,69.55,,S
846,847,0,3,"Sage, Mr. Douglas Bullen",male,,8,2,CA. 2343,69.55,,S
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.55,,S


In [6]:
# uncomment the line below to drop outliers
train= train.drop(Outliers_to_drop, axis= 0).reset_index(drop= True)

In [7]:
# Fill empty and NaNs values with NaN
dataset= dataset.fillna(np.nan)

# Check for Null values
dataset.isnull().sum().sort_values(ascending= False)

Cabin          1014
Survived        418
Age             263
Embarked          2
Fare              1
PassengerId       0
Pclass            0
Name              0
Sex               0
SibSp             0
Parch             0
Ticket            0
dtype: int64