In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
sns.set_style("whitegrid")
pd.set_option('display.max_columns', 100)
plt.rcParams['font.size'] = 10

### Inspection

In [2]:
gender_df = pd.read_csv("gender_submission.csv")

In [3]:
gender_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [4]:
test_df = pd.read_csv("test.csv")

In [5]:
test_df.head(10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


In [6]:
train_df = pd.read_csv("train (3).csv")

In [7]:
gender_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  418 non-null    int64
 1   Survived     418 non-null    int64
dtypes: int64(2)
memory usage: 6.7 KB


In [8]:
# A cell renders only its final expression, so the test-set summary has to be
# displayed explicitly; otherwise it is computed and thrown away.
display(test_df.isna().sum())
train_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [9]:
# Numeric feature columns. The PassengerId row identifier is excluded by name
# rather than by slicing off the first position, so the result does not depend
# on the column order of the CSV.
numerical_cols = [
    col
    for col in test_df.select_dtypes(np.number).columns
    if col != "PassengerId"
]
# Text-valued (object dtype) feature columns.
categorical_cols = test_df.select_dtypes('object').columns.tolist()

In [10]:
input_cols = numerical_cols + categorical_cols
input_cols

['Pclass',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Name',
 'Sex',
 'Ticket',
 'Cabin',
 'Embarked']

In [11]:
target_cols = gender_df.columns[1:]
target_cols

Index(['Survived'], dtype='object')

In [12]:
"""
Each name serves as a unique identifier.
Sex has two unique values: male and female.
"""
train_df[categorical_cols].nunique()

Name        891
Sex           2
Ticket      681
Cabin       147
Embarked      3
dtype: int64

In [13]:
test_df[categorical_cols].nunique()

Name        418
Sex           2
Ticket      363
Cabin        76
Embarked      3
dtype: int64

In [14]:
train_df[numerical_cols].nunique()
# PassengerId is a unique identifier for each row.
# Survived is either 0 = No or 1 = Yes.
# (Neither PassengerId nor Survived is part of numerical_cols, so they do not
# appear in the result of this cell.)
# Pclass is the ticket class: 1 = 1st, 2 = 2nd, 3 = 3rd.

Pclass      3
Age        88
SibSp       7
Parch       7
Fare      248
dtype: int64

In [15]:
test_df[numerical_cols].nunique()

Pclass      3
Age        79
SibSp       7
Parch       8
Fare      169
dtype: int64

### Checking for missing values

In [16]:
train_df[categorical_cols].isna().sum()

Name          0
Sex           0
Ticket        0
Cabin       687
Embarked      2
dtype: int64

In [17]:
test_df[categorical_cols].isna().sum()

Name          0
Sex           0
Ticket        0
Cabin       327
Embarked      0
dtype: int64

In [18]:
train_df[numerical_cols].isna().sum()

Pclass      0
Age       177
SibSp       0
Parch       0
Fare        0
dtype: int64

In [19]:
test_df[numerical_cols].isna().sum()

Pclass     0
Age       86
SibSp      0
Parch      0
Fare       1
dtype: int64

### Filling in missing values in categorical variables (Embarked and Cabin)

In [20]:
train_df['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [21]:
train_df['Embarked'] = train_df['Embarked'].fillna("Unknown")

In [22]:
train_df['Cabin'] = train_df['Cabin'].fillna("Unknown")

In [23]:
test_df['Cabin'] = test_df['Cabin'].fillna("Unknown")

### Filling in missing values in numerical variables (Age)

## Missing data is also data
#### Preserving the missing values

In [24]:
train_map = {False: 0, True: 1}
train_df['Missing_age'] = train_df['Age'].isna().map(train_map)

In [25]:
test_df['Missing_age'] = test_df['Age'].isna().map(train_map)

In [26]:
train_df.tail(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Missing_age
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,Unknown,S,0
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,Unknown,S,1
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C,0
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,Unknown,Q,0


In [27]:
train_df['Age'].isna().sum()

177

In [28]:
train_df['Name'].head(10)

0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
5                                     Moran, Mr. James
6                              McCarthy, Mr. Timothy J
7                       Palsson, Master. Gosta Leonard
8    Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)
9                  Nasser, Mrs. Nicholas (Adele Achem)
Name: Name, dtype: object

In [29]:
def insert_social_stats(df):
    """Add a ``Social_status`` column derived from the honorific in ``Name``.

    Each name is labelled "Mr", "Mrs", "Miss", "Master" or "Dr" when it
    contains that honorific as a whole word followed by a period. If a name
    contains several of them, the first one in that priority order wins.
    Names with none of these honorifics, and values that are not strings
    (e.g. NaN), are labelled "Others".

    The frame is modified in place; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Name`` column. Only ``df`` itself is read, and any
        index is accepted (it does not have to be a default RangeIndex).
    """
    import re

    # Order matters: it is the priority used when a name holds several titles.
    social_status = ("Mr", "Mrs", "Miss", "Master", "Dr")
    title_patterns = [
        (title, re.compile(rf"\b{title}\b\.")) for title in social_status
    ]

    def _classify(name):
        # Non-string entries carry no honorific to search for.
        if not isinstance(name, str):
            return "Others"
        for title, pattern in title_patterns:
            if pattern.search(name):
                return title
        return "Others"

    # A single whole-column assignment: no row-by-row .loc writes, so there is
    # no need to touch (and later reset) the global warning filters.
    df["Social_status"] = df["Name"].map(_classify)

In [30]:
insert_social_stats(train_df)

In [None]:
insert_social_stats(test_df)

In [None]:
train_df["Social_status"].value_counts()

In [None]:
test_df["Social_status"].value_counts()

In [None]:
train_df[numerical_cols].nunique()

In [None]:
sns.catplot(
    x="Survived",
    y="Age",
    hue="Social_status",
    data=train_df,
    kind="strip",
    jitter=True,
    order=[1, 0],
)

In [None]:
sns.catplot(
    x="Survived",
    y="Fare",
    hue="Social_status",
    data=train_df,
    kind="bar",
    order=[1, 0],
)

In [None]:
train_df[train_df['Age'].isna()].head(10)

#### Filling in missing ages based on social status

In [None]:
import math


def provide_age_based_on_social_stats(df, reference_df=None):
    """Fill missing ``Age`` values with the floored mean age of the same status.

    For every ``Social_status`` value, rows of ``df`` whose ``Age`` is missing
    receive ``floor(mean(Age))`` computed over the rows of that status whose
    age is present. If a status has no observed age at all, its missing rows
    are left as NaN (there is nothing to impute from).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Age`` and ``Social_status`` columns. Modified in place.
    reference_df : pandas.DataFrame, optional
        Frame from which the per-status means are computed. Defaults to
        ``df`` itself. Pass the training frame here to impute another frame
        with training-set statistics.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after filling.
    """
    source = df if reference_df is None else reference_df

    # Mean of the observed ages per status; statuses without any observed age
    # yield NaN and are dropped so math.floor is never called on NaN.
    mean_age_by_status = source.groupby("Social_status")["Age"].mean().dropna()

    missing_age = df["Age"].isna()
    for status, mean_age in mean_age_by_status.items():
        rows = missing_age & (df["Social_status"] == status)
        if rows.any():
            df.loc[rows, "Age"] = math.floor(mean_age)

    return df


In [None]:
provide_age_based_on_social_stats(train_df)

In [None]:
provide_age_based_on_social_stats(test_df)

In [None]:
train_df[train_df['Age'].isna()]['Social_status'].value_counts()

In [None]:
sns.relplot(x='Sex', y='Age', data=train_df, hue="Social_status")

In [None]:
from sklearn.impute import SimpleImputer

# Learn the per-column means from the training data only. Fitting on the test
# set would make the fill values depend on the data being predicted; the
# fitted imputer can still be applied to test_df[numerical_cols] afterwards.
imputer = SimpleImputer(strategy="mean").fit(train_df[numerical_cols])

In [None]:
# Mean fare over the test passengers (missing fares are skipped by mean()).
test_df["Fare"].mean()

### Feature Engineering

### Categorical Variables

In [None]:
# Number of training passengers per embarkation value.
train_df["Embarked"].value_counts()

In [None]:
train_df["Sex"].value_counts()

In [None]:
sex_code = {"male": 1, "female": 0}
train_df['Sex_code'] = train_df['Sex'].map(sex_code)

In [None]:
# Pass the frame via the `data` keyword, as the other catplot calls in this
# notebook do; a positional first argument is only accepted as the data frame
# by newer seaborn releases.
sns.catplot(data=train_df, x='Sex', kind='count')

In [None]:
train_df['Ticket'].nunique()

In [None]:
train_df['Ticket'].value_counts()

In [None]:
# Attach to every row the number of training passengers sharing its ticket.
temp = train_df["Ticket"].value_counts()
train_df['Ticket_counts'] = train_df["Ticket"].map(temp)

In [None]:
train_df.tail(30)

In [None]:
len(train_df[train_df['Ticket_counts'] < 4])

In [None]:
# Collapse every ticket shared by fewer than 4 passengers into one "Others"
# bucket. A single .loc assignment writes to train_df itself; the previous
# chained form train_df["Ticket"][i] = ... may write to a temporary copy
# (SettingWithCopyWarning) and does nothing under pandas Copy-on-Write.
train_df.loc[train_df['Ticket_counts'] < 4, "Ticket"] = "Others"

In [None]:
len(train_df[train_df['Ticket']  != "Others"])

In [None]:
train_df[categorical_cols].nunique()

In [None]:
train_df['Embarked'].unique()

In [None]:
train_df

In [None]:
test_df[["Ticket", "Embarked", "Cabin"]].nunique()