# Data preparation

Help: https://www.kaggle.com/code/startupsci/titanic-data-science-solutions

In [1]:
import jupyter_black
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
jupyter_black.load()

In [3]:
le = LabelEncoder()
scaler = StandardScaler()

## Load

In [4]:
gender_submission = pd.read_csv(
    "./datasets/gender_submission.csv", index_col="PassengerId"
)
gender_submission

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,1
894,0
895,0
896,1
...,...
1305,0
1306,1
1307,0
1308,0


In [5]:
train_df = pd.read_csv("./datasets//train.csv", index_col="PassengerId")
train_df

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [6]:
test_df = pd.read_csv("./datasets/test.csv", index_col="PassengerId")
test_df

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...
1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [7]:
df = pd.concat([train_df, test_df])
df["isTest"] = df.index.isin(test_df.index).astype(int)
df

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,isTest
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,0
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,0
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,0
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,1
1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,1
1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,1
1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,1


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1309 entries, 1 to 1309
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    float64
 1   Pclass    1309 non-null   int64  
 2   Name      1309 non-null   object 
 3   Sex       1309 non-null   object 
 4   Age       1046 non-null   float64
 5   SibSp     1309 non-null   int64  
 6   Parch     1309 non-null   int64  
 7   Ticket    1309 non-null   object 
 8   Fare      1308 non-null   float64
 9   Cabin     295 non-null    object 
 10  Embarked  1307 non-null   object 
 11  isTest    1309 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB


In [9]:
(df.isna().sum() / len(df)).sort_values(ascending=False)

Cabin       0.774637
Survived    0.319328
Age         0.200917
Embarked    0.001528
Fare        0.000764
Pclass      0.000000
Name        0.000000
Sex         0.000000
SibSp       0.000000
Parch       0.000000
Ticket      0.000000
isTest      0.000000
dtype: float64

# Prepare

In [10]:
df

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,isTest
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,0
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,0
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,0
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,1
1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,1
1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,1
1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,1


## Pclass

In [11]:
df["Pclass"].value_counts(dropna=False, normalize=True)

Pclass
3    0.541635
1    0.246753
2    0.211612
Name: proportion, dtype: float64

## Name

In [12]:
df[["Name"]]

Unnamed: 0_level_0,Name
PassengerId,Unnamed: 1_level_1
1,"Braund, Mr. Owen Harris"
2,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
3,"Heikkinen, Miss. Laina"
4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
5,"Allen, Mr. William Henry"
...,...
1305,"Spector, Mr. Woolf"
1306,"Oliva y Ocana, Dona. Fermina"
1307,"Saether, Mr. Simon Sivertsen"
1308,"Ware, Mr. Frederick"


In [13]:
df["Name_FirstName"] = df["Name"].str.split(",", regex=False).str[0].str.strip()
df["Name_Title"] = df["Name"].str.split(",").str[1].str.split(". ").str[0].str.strip()
df["Name_LastName"] = (
    df["Name"].str.split(",").str[1].str.split(". ").str[1].str.strip()
)

df

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,isTest,Name_FirstName,Name_Title,Name_LastName
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,0,Braund,Mr,Owe
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0,Cumings,Mrs,Joh
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,0,Heikkinen,Miss,Laina
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,0,Futrelle,Mrs,Jacque
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,0,Allen,Mr,Willia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,1,Spector,Mr,Woolf
1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,1,Oliva y Ocana,Dona,Fermina
1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,1,Saether,Mr,Simo
1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,1,Ware,Mr,Frederick


### FirstName

In [14]:
df["Name_FirstName"].value_counts()

Name_FirstName
Andersson    11
Sage         11
Goodwin       8
Asplund       8
Davies        7
             ..
Milling       1
Maisner       1
Goncalves     1
Campbell      1
Saether       1
Name: count, Length: 875, dtype: int64

In [15]:
cnt = df["Name_FirstName"].value_counts()

df["Name_FirstName"] = df["Name_FirstName"].map(
    lambda x: "Others" if x in cnt[cnt == 1].index else x
)
df["Name_FirstName"].value_counts()

Name_FirstName
Others                  637
Sage                     11
Andersson                11
Goodwin                   8
Asplund                   8
                       ... 
Fleming                   2
Penasco y Castellana      2
Abelson                   2
Lahtinen                  2
Gibson                    2
Name: count, Length: 239, dtype: int64

### Title

In [16]:
df["Name_Title"].value_counts()

Name_Title
Mr          757
Miss        260
Mrs         197
Master       61
Rev           8
Dr            8
Col           4
Mlle          2
Major         2
Ms            2
Lady          1
Sir           1
Mme           1
Don           1
Capt          1
th            1
Jonkheer      1
Dona          1
Name: count, dtype: int64

In [17]:
cnt = df["Name_Title"].value_counts()

df["Name_Title"] = df["Name_Title"].map(
    lambda x: "Others" if x in cnt[cnt == 1].index else x
)
df["Name_Title"].value_counts()

Name_Title
Mr        757
Miss      260
Mrs       197
Master     61
Others      8
Rev         8
Dr          8
Col         4
Ms          2
Major       2
Mlle        2
Name: count, dtype: int64

### LastName

In [18]:
df["Name_LastName"].value_counts()

Name_LastName
Willia         55
Joh            41
Charle         27
Thoma          23
Georg          23
               ..
Ida             1
Carl/Charle     1
Shadrach        1
Juha            1
Michae          1
Name: count, Length: 640, dtype: int64

In [19]:
cnt = df["Name_LastName"].value_counts()

df["Name_LastName"] = df["Name_LastName"].map(
    lambda x: "Others" if x in cnt[cnt == 1].index else x
)
df["Name_LastName"].value_counts()

Name_LastName
Others     457
Willia      55
Joh         41
Charle      27
Thoma       23
          ... 
Philip       2
Henry        2
Gerious      2
Jako         2
Phili        2
Name: count, Length: 184, dtype: int64

In [20]:
df["Name_FirstName"] = le.fit_transform(df["Name_FirstName"])
df["Name_Title"] = le.fit_transform(df["Name_Title"])
df["Name_LastName"] = le.fit_transform(df["Name_LastName"])

In [21]:
df.drop("Name", axis=1, inplace=True)

## Sex

In [22]:
df["Sex"] = (df["Sex"] == "male").astype(int)
df["Sex"].value_counts()

Sex
1    843
0    466
Name: count, dtype: int64

## SibSp / Parch (of siblings / of parents)

In [23]:
df["SibSp"].value_counts()

SibSp
0    891
1    319
2     42
4     22
3     20
8      9
5      6
Name: count, dtype: int64

In [24]:
df["Parch"].value_counts()

Parch
0    1002
1     170
2     113
3       8
5       6
4       6
6       2
9       2
Name: count, dtype: int64

In [25]:
df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
df["FamilySize"].value_counts()

FamilySize
1     790
2     235
3     159
4      43
6      25
5      22
7      16
11     11
8       8
Name: count, dtype: int64

In [26]:
df["isAlone"] = (df["FamilySize"] == 1).astype(int)
df["isAlone"].value_counts()

isAlone
1    790
0    519
Name: count, dtype: int64

## Ticket

In [27]:
df[["Ticket"]][-30:]

Unnamed: 0_level_0,Ticket
PassengerId,Unnamed: 1_level_1
1280,364858
1281,349909
1282,12749
1283,PC 17592
1284,C.A. 2673
1285,C.A. 30769
1286,315153
1287,13695
1288,371109
1289,13567


In [28]:
df["Ticket"] = df["Ticket"].str.split(" ").str[0].str.upper()
df["Ticket"]

PassengerId
1              A/5
2               PC
3         STON/O2.
4           113803
5           373450
           ...    
1305          A.5.
1306            PC
1307    SOTON/O.Q.
1308        359309
1309          2668
Name: Ticket, Length: 1309, dtype: object

In [29]:
df["Ticket"] = df["Ticket"].map(lambda x: "X" if x.isnumeric() else x.replace(".", ""))
df["Ticket"] = df["Ticket"].map(lambda x: x.split("/")[0] if "/" in x else x)
df["Ticket"].value_counts()

Ticket
X        957
PC        92
CA        69
A         36
SC        29
SOTON     27
STON      22
W         15
FCC        9
SO         8
C          8
SOC        7
PP         4
LINE       4
FC         3
A5         3
P          2
AQ         2
WEP        2
WE         2
SW         2
A4         1
SCO        1
FA         1
SOP        1
SP         1
LP         1
Name: count, dtype: int64

In [30]:
df["Ticket"] = le.fit_transform(df["Ticket"])

## Fare

In [31]:
df[["Fare"]]

Unnamed: 0_level_0,Fare
PassengerId,Unnamed: 1_level_1
1,7.2500
2,71.2833
3,7.9250
4,53.1000
5,8.0500
...,...
1305,8.0500
1306,108.9000
1307,7.2500
1308,8.0500


In [32]:
df["Fare"].describe()

count    1308.000000
mean       33.295479
std        51.758668
min         0.000000
25%         7.895800
50%        14.454200
75%        31.275000
max       512.329200
Name: Fare, dtype: float64

In [33]:
df[df["Fare"].isna()]

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,isTest,Name_FirstName,Name_Title,Name_LastName,FamilySize,isAlone
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1044,,3,1,60.5,0,0,26,,,S,1,175,6,174,1,1


In [34]:
df["Fare"].fillna(df["Fare"].mean(), inplace=True)

In [35]:
df["Fare"] = scaler.fit_transform(df[["Fare"]])
df["Fare"].describe()

count    1.309000e+03
mean     2.442660e-17
std      1.000382e+00
min     -6.437751e-01
25%     -4.911082e-01
50%     -3.643001e-01
75%     -3.906640e-02
max      9.262219e+00
Name: Fare, dtype: float64

## Embarked

In [36]:
df["Embarked"].value_counts(dropna=False, normalize=True)

Embarked
S      0.698243
C      0.206264
Q      0.093965
NaN    0.001528
Name: proportion, dtype: float64

In [37]:
df[df["Embarked"].isna()]

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,isTest,Name_FirstName,Name_Title,Name_LastName,FamilySize,isAlone
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
62,1.0,1,0,38.0,0,0,26,0.903042,B28,,0,175,4,149,1,1
830,1.0,1,0,62.0,0,0,26,0.903042,B28,,0,175,7,73,1,1


In [38]:
df["Embarked"].fillna("S", inplace=True)

In [39]:
df["Embarked"] = le.fit_transform(df["Embarked"])

In [40]:
df["Embarked"].value_counts()

Embarked
2    916
0    270
1    123
Name: count, dtype: int64

# Save

In [41]:
df

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,isTest,Name_FirstName,Name_Title,Name_LastName,FamilySize,isAlone
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,0.0,3,1,22.0,1,0,0,-0.503595,,2,0,29,6,150,2,0
2,1.0,1,0,38.0,1,0,12,0.734503,C85,0,0,61,7,104,2,0
3,1.0,3,0,26.0,0,0,21,-0.490544,,2,0,175,4,149,1,1
4,1.0,1,0,35.0,1,0,26,0.382925,C123,2,0,88,7,96,2,0
5,0.0,3,1,35.0,0,0,26,-0.488127,,2,0,5,6,182,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,,3,1,,0,0,2,-0.488127,,2,1,175,6,149,1,1
1306,,1,0,39.0,0,0,12,1.461829,C105,0,1,175,9,149,1,1
1307,,3,1,38.5,0,0,19,-0.503595,,2,1,175,6,149,1,1
1308,,3,1,,0,0,26,-0.488127,,2,1,220,6,72,1,1


In [42]:
(df.isna().sum() / len(df)).sort_values(ascending=False)

Cabin             0.774637
Survived          0.319328
Age               0.200917
Pclass            0.000000
Sex               0.000000
SibSp             0.000000
Parch             0.000000
Ticket            0.000000
Fare              0.000000
Embarked          0.000000
isTest            0.000000
Name_FirstName    0.000000
Name_Title        0.000000
Name_LastName     0.000000
FamilySize        0.000000
isAlone           0.000000
dtype: float64

In [43]:
df.to_csv("./datasets/prepared.csv")