In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from sklearn.metrics import accuracy_score
from tensorflow.keras.losses import BinaryCrossentropy

In [2]:
df = pd.read_csv("train.csv")

In [102]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Embarked,IsAlone,AgeGroup,FareGroup,Title
0,0,3,0,0,0,2,0,4
1,1,1,1,1,0,1,3,2
2,1,3,1,0,1,2,1,3
3,1,1,1,0,0,1,3,2
4,0,3,0,0,1,1,1,4


### We won't require PassengerId, Name, and Ticket for building a predictive model because I am assuming that these features don't contribute to the survival chances.

Let's use Name later to create a column "Title". It might have an effect on the chance of survival.

In [4]:
df = df.drop(["PassengerId", "Ticket"], axis=1)

In [5]:
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,,S


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Fare      891 non-null    float64
 8   Cabin     204 non-null    object 
 9   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 69.7+ KB


In [7]:
df.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [8]:
df.shape

(891, 10)

In [9]:
df.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [10]:
df["Cabin"].value_counts()

Cabin
G6             4
C23 C25 C27    4
B96 B98        4
F2             3
D              3
              ..
E17            1
A24            1
C50            1
B42            1
C148           1
Name: count, Length: 147, dtype: int64

### Dropping Cabin because it has a large number of NaN values.

In [11]:
df.drop("Cabin", axis=1, inplace=True)

In [12]:
df.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

### Fill some NaN values

We can fill the NaN values of Age with its median value

In [13]:
df['Age'].fillna(df["Age"].dropna().median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df["Age"].dropna().median(), inplace=True)


In [14]:
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


In [15]:
df.dropna(inplace=True)

In [16]:
df.isnull().sum()

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [17]:
df.shape

(889, 9)

In [18]:
df[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Pclass,Survived
0,1,0.626168
1,2,0.472826
2,3,0.242363


In [19]:
df[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Sex,Survived
0,female,0.740385
1,male,0.188908


In [20]:
df[['Survived', 'SibSp']].groupby(['SibSp'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,SibSp,Survived
1,1,0.535885
2,2,0.464286
0,0,0.343234
3,3,0.25
4,4,0.166667
5,5,0.0
6,8,0.0


In [21]:
df[['Survived', 'Parch']].groupby(['Parch'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Parch,Survived
3,3,0.6
1,1,0.550847
2,2,0.5
0,0,0.341716
5,5,0.2
4,4,0.0
6,6,0.0


In [22]:
df[['Survived', 'Embarked']].groupby(['Embarked'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.336957


### Combining Features

We can combine Parch and SibSp to create a feature called family size

In [23]:
df['FamilySize'] = df['Parch'] + df['SibSp'] + 1

In [24]:
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,2
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,2
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,1
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,2
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,1


Creating a new feature called IsAlone. Its value will be 1 if FamilySize is 1 and 0 if FamilySize > 1

In [25]:
df['IsAlone'] = np.where(df['FamilySize'] == 1, 1, 0)

In [26]:
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,2,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,2,0
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,1,1
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,2,0
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,1,1


In [27]:
df[['Survived', 'IsAlone']].groupby(['IsAlone'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,IsAlone,Survived
0,0,0.50565
1,1,0.300935


We can now drop FamilySize, SibSp, and Parch, replacing them with IsAlone

In [28]:
df.drop(["FamilySize", "SibSp", "Parch"], axis=1, inplace=True)

In [29]:
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Fare,Embarked,IsAlone
0,0,3,"Braund, Mr. Owen Harris",male,22.0,7.25,S,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,71.2833,C,0
2,1,3,"Heikkinen, Miss. Laina",female,26.0,7.925,S,1
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,53.1,S,0
4,0,3,"Allen, Mr. William Henry",male,35.0,8.05,S,1


### Age Groups

Let's replace age with age groups. Assuming that the younger age groups have higher chances of survival.

In [30]:
conditions = [
    (df["Age"] <= 16),
    (df["Age"] > 16) & (df["Age"] <= 32),
    (df["Age"] > 32) & (df["Age"] <= 64),
    (df["Age"] > 64)
]
# Age categories
values = [3, 2, 1, 0]

df["AgeGroup"] = np.select(conditions, values)

In [31]:
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Fare,Embarked,IsAlone,AgeGroup
0,0,3,"Braund, Mr. Owen Harris",male,22.0,7.25,S,0,2
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,71.2833,C,0,1
2,1,3,"Heikkinen, Miss. Laina",female,26.0,7.925,S,1,2
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,53.1,S,0,1
4,0,3,"Allen, Mr. William Henry",male,35.0,8.05,S,1,1


In [32]:
df[['Survived', 'AgeGroup']].groupby(['AgeGroup'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,AgeGroup,Survived
3,3,0.55
1,1,0.407843
2,2,0.344168
0,0,0.090909


We can now drop the age group

In [33]:
df.drop("Age", axis=1, inplace=True)

In [34]:
df.replace({"female": 1, "male": 0, "S": 0, "C": 1, "Q": 2}, inplace=True)

  df.replace({"female": 1, "male": 0, "S": 0, "C": 1, "Q": 2}, inplace=True)


### Fare Category

Let's convert the Fare feature to a categorical column. Assuming that people who paid higher fares have higher chances of survival.

In [35]:
df["FareGroup"] = pd.qcut(df["Fare"], 4)
df[['Survived', 'FareGroup']].groupby(['FareGroup'], as_index=False).mean().sort_values(by='Survived', ascending=False)

  df[['Survived', 'FareGroup']].groupby(['FareGroup'], as_index=False).mean().sort_values(by='Survived', ascending=False)


Unnamed: 0,FareGroup,Survived
3,"(31.0, 512.329]",0.577273
2,"(14.454, 31.0]",0.454955
1,"(7.896, 14.454]",0.303571
0,"(-0.001, 7.896]",0.197309


In [36]:
conditions = [
    (df['Fare'] <= 7.9),
    (df['Fare'] > 7.9) & (df['Fare'] <= 15),
    (df['Fare'] > 15) & (df['Fare'] <= 31),
    (df['Fare'] > 31),
]
# FareBand Values
fare_group_values = [0, 1, 2, 3]

df['FareGroup'] = np.select(conditions, fare_group_values)

In [37]:
df[['Survived', 'FareGroup']].groupby(['FareGroup'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,FareGroup,Survived
3,3,0.577273
2,2,0.469194
1,1,0.297872
0,0,0.197309


In [38]:
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Fare,Embarked,IsAlone,AgeGroup,FareGroup
0,0,3,"Braund, Mr. Owen Harris",0,7.25,0,0,2,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,71.2833,1,0,1,3
2,1,3,"Heikkinen, Miss. Laina",1,7.925,0,1,2,1
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,53.1,0,0,1,3
4,0,3,"Allen, Mr. William Henry",0,8.05,0,1,1,1


We no longer need Fare

In [39]:
df.drop('Fare', axis=1, inplace=True)

### Title

Let's use Name to create the "Title" column and see how it effects survival chances.

In [40]:
df['Title'] = df['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]

In [41]:
df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Title,Survived
5,Lady,1.0
13,Ms,1.0
15,Sir,1.0
10,Mme,1.0
16,the Countess,1.0
9,Mlle,1.0
12,Mrs,0.790323
8,Miss,0.696133
7,Master,0.575
6,Major,0.5


In [42]:
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Embarked,IsAlone,AgeGroup,FareGroup,Title
0,0,3,"Braund, Mr. Owen Harris",0,0,0,2,0,Mr
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,1,0,1,3,Mrs
2,1,3,"Heikkinen, Miss. Laina",1,0,1,2,1,Miss
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,0,0,1,3,Mrs
4,0,3,"Allen, Mr. William Henry",0,0,1,1,1,Mr


In [43]:
df['Title'].value_counts()

Title
Mr              517
Miss            181
Mrs             124
Master           40
Dr                7
Rev               6
Col               2
Mlle              2
Major             2
Ms                1
Mme               1
Don               1
Lady              1
Sir               1
Capt              1
the Countess      1
Jonkheer          1
Name: count, dtype: int64

Clean up rare Titles

In [44]:
title_min_count = 10

title_names = (df['Title'].value_counts() < title_min_count)

In [45]:
title_names

Title
Mr              False
Miss            False
Mrs             False
Master          False
Dr               True
Rev              True
Col              True
Mlle             True
Major            True
Ms               True
Mme              True
Don              True
Lady             True
Sir              True
Capt             True
the Countess     True
Jonkheer         True
Name: count, dtype: bool

In [46]:
df['Title'] = df['Title'].apply(lambda x: 'Misc' if title_names.loc[x] == True else x)

In [47]:
df['Title'].value_counts()

Title
Mr        517
Miss      181
Mrs       124
Master     40
Misc       27
Name: count, dtype: int64

In [48]:
df.replace({'Mr': 4, 'Miss': 3, 'Mrs': 2, 'Master': 1, 'Misc': 0}, inplace=True)

  df.replace({'Mr': 4, 'Miss': 3, 'Mrs': 2, 'Master': 1, 'Misc': 0}, inplace=True)


### Drop Name

In [49]:
df.drop('Name', axis=1, inplace=True)

In [50]:
df.corr()["Survived"].sort_values(ascending=False)

Survived     1.000000
Sex          0.541585
FareGroup    0.301275
Embarked     0.108669
AgeGroup     0.060406
IsAlone     -0.206207
Pclass      -0.335549
Title       -0.413123
Name: Survived, dtype: float64

### Highest Correlation: Sex
### Lowest Correlation: AgeGroup

In [51]:
df.shape

(889, 8)

In [52]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Embarked,IsAlone,AgeGroup,FareGroup,Title
0,0,3,0,0,0,2,0,4
1,1,1,1,1,0,1,3,2
2,1,3,1,0,1,2,1,3
3,1,1,1,0,0,1,3,2
4,0,3,0,0,1,1,1,4


In [53]:
X = df.drop("Survived", axis=1)
Y = df["Survived"]

In [54]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [55]:
print(x_train.head(10))
print(x_test.head(10))

     Pclass  Sex  Embarked  IsAlone  AgeGroup  FareGroup  Title
708       1    1         0        1         2          3      3
240       3    1         1        0         2          1      3
382       3    0         0        1         2          1      4
792       3    1         0        0         2          3      3
683       3    0         0        0         3          3      4
119       3    1         0        0         3          3      3
287       3    0         0        1         2          0      4
313       3    0         0        1         2          0      4
29        3    0         0        1         2          0      4
55        1    0         0        1         2          3      4
     Pclass  Sex  Embarked  IsAlone  AgeGroup  FareGroup  Title
281       3    0         0        1         2          0      4
435       1    1         0        0         3          3      3
39        3    1         1        0         3          1      3
418       2    0         0        1     

In [56]:
print(y_train.head())
print(y_test.head())

708    1
240    0
382    0
792    0
683    0
Name: Survived, dtype: int64
281    0
435    1
39     1
418    0
585    1
Name: Survived, dtype: int64


In [57]:
model = LogisticRegression()

In [58]:
model.fit(x_train, y_train)

In [59]:
y_pred = model.predict(x_test)

In [60]:
y_pred

array([0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       0, 1])

In [61]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 75.84%


In [62]:
acc_log = round(model.score(x_train, y_train) * 100, 2)
acc_log

81.86

In [63]:
svc = SVC()
svc.fit(x_train, y_train)
y_pred_svc = svc.predict(x_test)

accuracy_sv = accuracy_score(y_test, y_pred_svc)
print(f"Accuracy: {accuracy * 100:.2f}%")
acc_svc = round(svc.score(x_train, y_train) * 100, 2)
acc_svc

Accuracy: 75.84%


82.84

In [64]:
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(x_train, y_train)
y_pred_forest = random_forest.predict(x_test)

acc_forest = accuracy_score(y_test, y_pred_forest)
print(f"Model Accuracy: {acc_forest * 100:.2f}%")
random_forest.score(x_train, y_train)
acc_random_forest = round(random_forest.score(x_train, y_train) * 100, 2)
acc_random_forest

Model Accuracy: 76.97%


87.76

In [65]:
x_train.shape

(711, 7)

In [66]:
model = tf.keras.Sequential([
    tf.keras.Input(shape=(7,)),
    tf.keras.layers.Dense(7, activation='relu'),
    tf.keras.layers.Dense(3, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [67]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), 
              loss=BinaryCrossentropy(), 
              metrics=['accuracy']
             )
model.fit(x_train, y_train, epochs=200)

Epoch 1/200
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.6773 - loss: 0.6880
Epoch 2/200
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7967 - loss: 0.5790
Epoch 3/200
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7788 - loss: 0.4841
Epoch 4/200
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7921 - loss: 0.4708
Epoch 5/200
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8065 - loss: 0.4404
Epoch 6/200
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8053 - loss: 0.4566
Epoch 7/200
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7901 - loss: 0.4359
Epoch 8/200
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8220 - loss: 0.4241
Epoch 9/200
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x1c776f312d0>

In [68]:
loss, accuracy = model.evaluate(x_test, y_test, verbose=0)
print(f"Loss: {loss:.4f}")
print(f"Accuracy: {accuracy * 100:.4f}%")

Loss: 0.5160
Accuracy: 79.7753%


### Test Predictions

In [69]:
df_test = pd.read_csv("test.csv")

In [70]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [71]:
df_test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [73]:
df_test.drop("Cabin", inplace=True, axis=1)

In [75]:
df_test['Age'].fillna(df_test["Age"].dropna().median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test['Age'].fillna(df_test["Age"].dropna().median(), inplace=True)


In [76]:
df_test.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           1
Embarked       0
dtype: int64

In [78]:
df_test.shape

(418, 10)

In [79]:
df_test['Fare'].fillna(df_test['Fare'].dropna().median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test['Fare'].fillna(df_test['Fare'].dropna().median(), inplace=True)


In [80]:
df_test.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

### Age Groups

In [82]:
conditions = [
    (df_test['Age'] <= 16),
    (df_test['Age'] > 16) & (df_test['Age'] <= 32),
    (df_test['Age'] > 32) & (df_test['Age'] <= 64),
    (df_test['Age'] > 64)
]

values = [3, 2, 1, 0]

df_test['AgeGroup'] = np.select(conditions, values)

In [83]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,AgeGroup
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q,1
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S,2
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S,2


In [85]:
df_test['FamilySize'] = df_test['Parch'] + df_test['SibSp'] + 1

In [86]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,AgeGroup,FamilySize
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q,1,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S,1,2
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q,1,1
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S,2,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S,2,3


In [89]:
df_test.drop('Age', inplace=True, axis=1)

In [90]:
df_test['IsAlone'] = np.where(df_test['FamilySize'] == 1, 1, 0)

In [91]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare,Embarked,AgeGroup,FamilySize,IsAlone
0,892,3,"Kelly, Mr. James",male,0,0,330911,7.8292,Q,1,1,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,1,0,363272,7.0,S,1,2,0
2,894,2,"Myles, Mr. Thomas Francis",male,0,0,240276,9.6875,Q,1,1,1
3,895,3,"Wirz, Mr. Albert",male,0,0,315154,8.6625,S,2,1,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,1,1,3101298,12.2875,S,2,3,0


In [93]:
df_test.drop(['FamilySize', 'SibSp', 'Parch'], inplace=True, axis=1)

In [94]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Ticket,Fare,Embarked,AgeGroup,IsAlone
0,892,3,"Kelly, Mr. James",male,330911,7.8292,Q,1,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,363272,7.0,S,1,0
2,894,2,"Myles, Mr. Thomas Francis",male,240276,9.6875,Q,1,1
3,895,3,"Wirz, Mr. Albert",male,315154,8.6625,S,2,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,3101298,12.2875,S,2,0


In [95]:
df_test.drop('Ticket', inplace=True, axis=1)

In [97]:
df_test["FareGroup"] = pd.qcut(df_test["Fare"], 4)

In [98]:
df_test['FareGroup'].value_counts()

FareGroup
(-0.001, 7.896]      114
(31.472, 512.329]    105
(14.454, 31.472]     102
(7.896, 14.454]       97
Name: count, dtype: int64

In [99]:
conditions = [
    (df_test['Fare'] <= 7.9),
    (df_test['Fare'] > 7.9) & (df_test['Fare'] <= 14.5),
    (df_test['Fare'] > 14.5) & (df_test['Fare'] <= 31.5),
    (df_test['Fare'] > 31.5),
]
# FareBand Values
fare_group_values = [0, 1, 2, 3]

df_test['FareGroup'] = np.select(conditions, fare_group_values)

In [100]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Fare,Embarked,AgeGroup,IsAlone,FareGroup
0,892,3,"Kelly, Mr. James",male,7.8292,Q,1,1,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,7.0,S,1,0,0
2,894,2,"Myles, Mr. Thomas Francis",male,9.6875,Q,1,1,1
3,895,3,"Wirz, Mr. Albert",male,8.6625,S,2,1,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,12.2875,S,2,0,1


In [101]:
df_test.drop('Fare', inplace=True, axis=1)

In [104]:
df_test['Title'] = df_test['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]

In [105]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Embarked,AgeGroup,IsAlone,FareGroup,Title
0,892,3,"Kelly, Mr. James",male,Q,1,1,0,Mr
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,S,1,0,0,Mrs
2,894,2,"Myles, Mr. Thomas Francis",male,Q,1,1,1,Mr
3,895,3,"Wirz, Mr. Albert",male,S,2,1,1,Mr
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,S,2,0,1,Mrs


In [106]:
df_test['Title'].value_counts()

Title
Mr        240
Miss       78
Mrs        72
Master     21
Col         2
Rev         2
Ms          1
Dr          1
Dona        1
Name: count, dtype: int64

In [107]:
title_min_count = 10

title_names = (df_test['Title'].value_counts() < title_min_count)

In [108]:
title_names

Title
Mr        False
Miss      False
Mrs       False
Master    False
Col        True
Rev        True
Ms         True
Dr         True
Dona       True
Name: count, dtype: bool

In [109]:
df_test['Title'] = df_test['Title'].apply(lambda x: 'Misc' if title_names.loc[x] == True else x)

In [110]:
df_test['Title'].value_counts()

Title
Mr        240
Miss       78
Mrs        72
Master     21
Misc        7
Name: count, dtype: int64

In [112]:
df_test.replace({'Mr': 4, 'Miss': 3, 'Mrs': 2, 'Master': 1, 'Misc': 0}, inplace=True)

  df_test.replace({'Mr': 4, 'Miss': 3, 'Mrs': 2, 'Master': 1, 'Misc': 0}, inplace=True)


In [113]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Embarked,AgeGroup,IsAlone,FareGroup,Title
0,892,3,"Kelly, Mr. James",male,Q,1,1,0,4
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,S,1,0,0,2
2,894,2,"Myles, Mr. Thomas Francis",male,Q,1,1,1,4
3,895,3,"Wirz, Mr. Albert",male,S,2,1,1,4
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,S,2,0,1,2


In [114]:
df_test.drop('Name', inplace=True, axis=1)

In [125]:
df_test.replace({"female": 1, "male": 0, "S": 0, "C": 1, "Q": 2}, inplace=True)

  df_test.replace({"female": 1, "male": 0, "S": 0, "C": 1, "Q": 2}, inplace=True)


In [126]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Embarked,AgeGroup,IsAlone,FareGroup,Title
0,892,3,0,2,1,1,0,4
1,893,3,1,0,1,0,0,2
2,894,2,0,2,1,1,1,4
3,895,3,0,0,2,1,1,4
4,896,3,1,0,2,0,1,2


In [127]:
test_X = df_test.drop("PassengerId", axis=1)

In [128]:
test_X.head()

Unnamed: 0,Pclass,Sex,Embarked,AgeGroup,IsAlone,FareGroup,Title
0,3,0,2,1,1,0,4
1,3,1,0,1,0,0,2
2,2,0,2,1,1,1,4
3,3,0,0,2,1,1,4
4,3,1,0,2,0,1,2


In [129]:
test_X = test_X.to_numpy()

In [130]:
test_X

array([[3, 0, 2, ..., 1, 0, 4],
       [3, 1, 0, ..., 0, 0, 2],
       [2, 0, 2, ..., 1, 1, 4],
       ...,
       [3, 0, 0, ..., 1, 0, 4],
       [3, 0, 0, ..., 1, 1, 4],
       [3, 0, 1, ..., 0, 2, 1]])

In [137]:
test_predict = model.predict(test_X)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 


In [138]:
test_predict

array([[0.02945903],
       [0.0659726 ],
       [0.23199566],
       [0.15081735],
       [0.30199596],
       [0.20088378],
       [0.43058592],
       [0.04156435],
       [0.7297412 ],
       [0.01775465],
       [0.06097174],
       [0.33652475],
       [0.9833167 ],
       [0.04515711],
       [0.9874004 ],
       [0.95348763],
       [0.23199566],
       [0.06967475],
       [0.05230245],
       [0.4051843 ],
       [0.15395124],
       [0.07089791],
       [0.9717766 ],
       [0.14299896],
       [0.96215045],
       [0.03959037],
       [0.9589231 ],
       [0.06967475],
       [0.33652475],
       [0.06918809],
       [0.04515711],
       [0.04156435],
       [0.05794578],
       [0.28762335],
       [0.14299896],
       [0.06967475],
       [0.07443512],
       [0.07443512],
       [0.15081735],
       [0.04605938],
       [0.08035264],
       [0.4161109 ],
       [0.02230623],
       [0.8988413 ],
       [0.9874004 ],
       [0.15081735],
       [0.43058592],
       [0.079

In [141]:
final_predict = np.where(test_predict > 0.5, 1, 0)

In [142]:
final_predict

array([[0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
    

In [143]:
df_test["Survived"] = final_predict

In [144]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Embarked,AgeGroup,IsAlone,FareGroup,Title,Survived
0,892,3,0,2,1,1,0,4,0
1,893,3,1,0,1,0,0,2,0
2,894,2,0,2,1,1,1,4,0
3,895,3,0,0,2,1,1,4,0
4,896,3,1,0,2,0,1,2,0


In [145]:
df_test.shape

(418, 9)

In [146]:
final_submission = df_test[['PassengerId', 'Survived']]

In [147]:
final_submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [148]:
final_submission.to_csv('submission_1.csv', index=False)