In [2]:
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('Raw Data/test.csv')

df.to_csv('testing.csv', index=False)

In [6]:
df["Name Length"] = df["Name"].apply(len)
df["Name Length"].describe()

count    418.000000
mean      27.483254
std        9.971228
min       13.000000
25%       20.000000
50%       25.000000
75%       30.750000
max       63.000000
Name: Name Length, dtype: float64

In [7]:
df["Pclass"].describe()

count    418.000000
mean       2.265550
std        0.841838
min        1.000000
25%        1.000000
50%        3.000000
75%        3.000000
max        3.000000
Name: Pclass, dtype: float64

In [8]:
df["Sex"].describe()

count      418
unique       2
top       male
freq       266
Name: Sex, dtype: object

In [9]:
df["Age"].describe()

count    332.000000
mean      30.272590
std       14.181209
min        0.170000
25%       21.000000
50%       27.000000
75%       39.000000
max       76.000000
Name: Age, dtype: float64

In [10]:
df["AgeEstimated"] = df["Age"].apply(lambda x: 1 if pd.isna(x) or (isinstance(x, float) and x % 1 == 0.5) else 0)

In [11]:
print("Miss.:", df[df["Name"].str.contains("Miss.")]["Age"].mean())
print("Mr.:", df[df["Name"].str.contains("Mr.")]["Age"].mean())
print("Mrs.:", df[df["Name"].str.contains("Mrs.")]["Age"].mean())
print("Master.:", df[df["Name"].str.contains("Master.")]["Age"].mean())

Miss.: 21.774843750000002
Mr.: 33.7469387755102
Mrs.: 38.903225806451616
Master.: 7.406470588235294


In [None]:
age_map = {
    "Miss.": 21.8,
    "Mr.": 33.7,
    "Mrs.": 38.9,
    "Master.": 7.4
}

def fill_age(row):
    if pd.isna(row["Age"]):
        for title, avg_age in age_map.items():
            if title in row["Name"]:
                return avg_age
        return row["Age"]
    return row["Age"]

df["Age"] = df.apply(fill_age, axis=1)

In [13]:
df["Age"].describe()

count    417.000000
mean      30.444125
std       13.070221
min        0.170000
25%       22.000000
50%       30.000000
75%       37.000000
max       76.000000
Name: Age, dtype: float64

In [14]:
print(df[df["Age"].isna()])

    PassengerId  Pclass                     Name     Sex  Age  SibSp  Parch  \
88          980       3  O'Donoghue, Ms. Bridget  female  NaN      0      0   

    Ticket  Fare Cabin Embarked  Name Length  AgeEstimated  
88  364856  7.75   NaN        Q           23             1  


In [15]:
df["Age"].fillna(21.8, inplace=True)

In [16]:
df["SibSp"].describe()

count    418.000000
mean       0.447368
std        0.896760
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        8.000000
Name: SibSp, dtype: float64

In [17]:
df["Parch"].describe()

count    418.000000
mean       0.392344
std        0.981429
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        9.000000
Name: Parch, dtype: float64

In [18]:
print(df[df["SibSp"] == 0].shape[0], df[df["SibSp"] == 1].shape[0])
print(df[df["Parch"] == 0].shape[0], df[df["Parch"] == 1].shape[0])

283 110
324 52


In [19]:
df["Family_Size"] = df["SibSp"] + df["Parch"] + 1

In [20]:
df["isAlone"] = (df["Family_Size"] == 1).astype(int)

In [21]:
df.drop("Family_Size", axis=1, inplace=True)

In [22]:
df["Ticket"].describe()

count          418
unique         363
top       PC 17608
freq             5
Name: Ticket, dtype: object

In [23]:
df["Ticket_Prefix"] = df["Ticket"].astype(str).str[0]

In [24]:
print(df["Ticket_Prefix"].value_counts())

Ticket_Prefix
3    128
2     95
1     64
S     33
P     33
C     30
A     13
W      6
F      6
7      4
6      3
4      1
9      1
L      1
Name: count, dtype: int64


In [25]:
df["Fare"].describe()

count    417.000000
mean      35.627188
std       55.907576
min        0.000000
25%        7.895800
50%       14.454200
75%       31.500000
max      512.329200
Name: Fare, dtype: float64

In [26]:
df["Fare"].fillna(35.62, inplace=True)

In [27]:
df["Cabin"].describe()

count                  91
unique                 76
top       B57 B59 B63 B66
freq                    3
Name: Cabin, dtype: object

In [28]:
cabin_first_letter_counts = df[df["Cabin"].notna()]["Cabin"].astype(str).str[0].value_counts()
print(cabin_first_letter_counts)

Cabin
C    35
B    18
D    13
E     9
F     8
A     7
G     1
Name: count, dtype: int64


In [29]:
cabin_survival_map = {
    "A": 0.47,
    "B": 0.74,
    "C": 0.59,
    "D": 0.76,
    "E": 0.75,
    "F": 0.62,
    "G": 0.50,
    "T": 0.00
}

def cabin_info(row):
    if pd.isna(row["Cabin"]):
        return 0.30
    first_letter = str(row["Cabin"])[0]
    return cabin_survival_map.get(first_letter, 0.30)

df["CabinInfo"] = df.apply(cabin_info, axis=1)

In [30]:
df.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'Name Length', 'AgeEstimated',
       'isAlone', 'Ticket_Prefix', 'CabinInfo'],
      dtype='object')

In [31]:
df["Embarked"].describe()

count     418
unique      3
top         S
freq      270
Name: Embarked, dtype: object

In [32]:
df.drop(columns=["Name", "Ticket", "Cabin"], inplace=True)

In [None]:
df.to_csv("final_testing.csv", index=False)


In [34]:
print(df.isna().sum())

PassengerId      0
Pclass           0
Sex              0
Age              0
SibSp            0
Parch            0
Fare             0
Embarked         0
Name Length      0
AgeEstimated     0
isAlone          0
Ticket_Prefix    0
CabinInfo        0
dtype: int64


In [None]:
#now we go to model_training_and_testing.ipynb to encode features and train the model.