# Data preparation

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

from config import TRAIN_DATA_PATH, RESULTS_PATH

In [2]:
import importlib
import utils.visualisation
import utils.feature_engineering

# Reload the project's helper modules so that edits made to them on disk are
# picked up without restarting the kernel. This has to run before the
# `from ... import ...` lines below, which then bind their names from the
# freshly reloaded modules.
importlib.reload(utils.visualisation)
importlib.reload(utils.feature_engineering)

from utils.visualisation import display_df, summarize_df
from utils.feature_engineering import fill_missing_values
from utils.feature_engineering import extract_title, create_family_features, simplify_deck, create_age_group, create_fare_band, create_ticket_prefix
from utils.feature_engineering import normalize_deck, normalize_ticket_prefix

### Load the Titanic Dataset

We begin by importing the necessary libraries and loading the Titanic training dataset.
This dataset contains information about passengers, their demographics, and survival outcome.

We will first explore the structure and completeness of the data before applying any transformations.


In [3]:
# Read the training CSV from the location configured in the config module
df = pd.read_csv(TRAIN_DATA_PATH)

# Preview the freshly loaded frame (10 rows, index column hidden)
display_df(
    df,
    show_index=False,
    rows=10,
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


### Explore and Understand the Data

We check the dataset for:
- Feature types (numerical, categorical)
- Missing values
- Basic statistics (mean, std, min, max)

This step helps us plan further preprocessing.

In [4]:
# Extended overview of the raw data: info, statistics, missing values and
# duplicates, followed by per-column unique counts and dtypes.
summarize_df(df, name="Titanic Dataset", mode="extended")



Shape: 891 rows × 12 columns

🧾 DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB

📊 Statistical Description:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Dooley, Mr. Patrick",male,,,,347082.0,,G6,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.38,2.31,,,29.7,0.52,0.38,,32.2,,
std,257.35,0.49,0.84,,,14.53,1.1,0.81,,49.69,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.12,0.0,0.0,,7.91,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.45,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,



❓ Missing Values:


Unnamed: 0,Missing values
Cabin,687
Age,177
Embarked,2



🔁 Duplicated rows: 0

🔍 Extended Summary:

🔢 Unique Values per Column:


Unnamed: 0,Unique values
Survived,2
Sex,2
Pclass,3
Embarked,3
Parch,7
SibSp,7
Age,88
Cabin,147
Fare,248
Ticket,681



🧪 Data Types:


Unnamed: 0,Type
PassengerId,int64
Survived,int64
Pclass,int64
Name,object
Sex,object
Age,float64
SibSp,int64
Parch,int64
Ticket,object
Fare,float64





### Missing values

In [5]:
# Snapshot of the data before any imputation, for comparison with the summary below
summarize_df(
    df,
    name="Titanic Dataset (Before Missing Value Imputation)",
    mode="basic",
)

# Passenger titles (e.g., Mr, Miss, Dr) are extracted from the names first,
# because the imputation step uses "Title" for its median calculation
df = df.pipe(extract_title).pipe(fill_missing_values)

# Same summary again, now on the imputed frame
summarize_df(
    df,
    name="Titanic Dataset (After Missing Value Imputation)",
    mode="basic",
)



Shape: 891 rows × 12 columns

🧾 DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB

📊 Statistical Description:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Dooley, Mr. Patrick",male,,,,347082.0,,G6,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.38,2.31,,,29.7,0.52,0.38,,32.2,,
std,257.35,0.49,0.84,,,14.53,1.1,0.81,,49.69,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.12,0.0,0.0,,7.91,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.45,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,



❓ Missing Values:


Unnamed: 0,Missing values
Cabin,687
Age,177
Embarked,2



🔁 Duplicated rows: 0



Shape: 891 rows × 13 columns

🧾 DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     891 non-null    object 
 12  Title        891 non-null    object 
dtypes: float64(2), int64(5), object(6)
memory usage: 90.6+ KB

📊 Statistical Description:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
count,891.0,891.0,891.0,891,891,891.0,891.0,891.0,891.0,891.0,204,891,891
unique,,,,891,2,,,,681.0,,147,3,9
top,,,,"Dooley, Mr. Patrick",male,,,,347082.0,,G6,S,Mr
freq,,,,1,577,,,,7.0,,4,646,517
mean,446.0,0.38,2.31,,,29.41,0.52,0.38,,32.2,,,
std,257.35,0.49,0.84,,,13.25,1.1,0.81,,49.69,,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,,
25%,223.5,0.0,2.0,,,21.5,0.0,0.0,,7.91,,,
50%,446.0,0.0,3.0,,,30.0,0.0,0.0,,14.45,,,
75%,668.5,1.0,3.0,,,35.0,1.0,0.0,,31.0,,,



❓ Missing Values:


Unnamed: 0,Missing values
Cabin,687



🔁 Duplicated rows: 0



### Feature Engineering
We apply a set of feature engineering functions to create meaningful categorical and numeric variables.
These include:
- Extracting titles from names (already done in the previous step, because the imputation relies on them)
- Computing family size and isolation status
- Simplifying cabin deck
- Creating age and fare bands
- Extracting ticket prefixes and normalizing rare values


In [6]:
# Apply the feature-engineering steps as a single chain, in the same order as
# before. `df` is rebound only once every step has succeeded, so a failure
# part-way through does not leave the name pointing at a half-processed result.
df = (
    df
    # Create family size and a binary flag for solo travelers
    .pipe(create_family_features)
    # Simplify cabin data to deck letters (e.g., C, D, Unknown)
    .pipe(simplify_deck)
    # Bin ages into human-readable groups
    .pipe(create_age_group)
    # Bin fares into custom intervals and round for simplicity
    .pipe(create_fare_band)
    # Extract ticket prefixes (non-numeric parts)
    .pipe(create_ticket_prefix)
    # Group rare deck values under "Other"
    .pipe(normalize_deck)
    # Keep top 5 ticket prefixes, group others under "Other"
    .pipe(normalize_ticket_prefix)
)

# Preview the engineered frame, then print the extended summary
display_df(df)
summarize_df(df, name="Titanic Dataset (After Feature Engineering)", mode="extended")

🔁 Deck: Replaced 3 rare categories with 'Other': ['F', 'G', 'T']
🔁 TicketPrefix: Replaced 39 rare categories with 'Other': ['A./5.', 'A.5.', 'A/4', 'A/4.', 'A/5.', 'A/S', 'A4.', 'C', 'C.A./SOTON', 'CA', 'CA.', 'F.C.', 'F.C.C.', 'Fa', 'LINE', 'P/PP', 'PP', 'S.C./A.4.', 'S.C./PARIS', 'S.O./P.P.', 'S.O.C.', 'S.O.P.', 'S.P.', 'S.W./PP', 'SC', 'SC/AH', 'SC/PARIS', 'SC/Paris', 'SCO/W', 'SO/C', 'SOTON/O.Q.', 'SOTON/O2', 'SOTON/OQ', 'STON/O2.', 'SW/PP', 'W./C.', 'W.E.P.', 'W/C', 'WE/P']


PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Embarked,Title,FamilySize,IsAlone,Deck,AgeGroup,Fare,FareBand,TicketPrefix
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,S,Mr,2,0,Unknown,young,7.25,0.0,A/5
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,C,Mrs,2,0,C,adult,70.0,50.0,PC
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,S,Miss,1,1,Unknown,young,7.9,7.5,Other
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,S,Mrs,2,0,C,adult,50.0,25.0,
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,S,Mr,1,1,Unknown,adult,8.05,7.5,
6,0,3,"Moran, Mr. James",male,30.0,0,0,330877,Q,Mr,1,1,Unknown,young,8.45,7.5,
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,S,Mr,1,1,E,adult,50.0,25.0,
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,S,Master,5,0,Unknown,kids<3,20.0,15.0,
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,S,Mrs,3,0,Unknown,young,11.0,7.5,
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,C,Mrs,2,0,Unknown,teenager,30.0,25.0,




Shape: 891 rows × 18 columns

🧾 DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   PassengerId   891 non-null    int64   
 1   Survived      891 non-null    int64   
 2   Pclass        891 non-null    int64   
 3   Name          891 non-null    object  
 4   Sex           891 non-null    object  
 5   Age           891 non-null    float64 
 6   SibSp         891 non-null    int64   
 7   Parch         891 non-null    int64   
 8   Ticket        891 non-null    object  
 9   Embarked      891 non-null    object  
 10  Title         891 non-null    object  
 11  FamilySize    891 non-null    int64   
 12  IsAlone       891 non-null    int64   
 13  Deck          891 non-null    object  
 14  AgeGroup      891 non-null    category
 15  Fare          891 non-null    float64 
 16  FareBand      891 non-null    float64 
 17  Tick

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Embarked,Title,FamilySize,IsAlone,Deck,AgeGroup,Fare,FareBand,TicketPrefix
count,891.0,891.0,891.0,891,891,891.0,891.0,891.0,891.0,891,891,891.0,891.0,891,891,891.0,891.0,891.0
unique,,,,891,2,,,,681.0,3,9,,,7,7,,,6.0
top,,,,"Dooley, Mr. Patrick",male,,,,347082.0,S,Mr,,,Unknown,young,,,
freq,,,,1,577,,,,7.0,646,517,,,687,521,,,661.0
mean,446.0,0.38,2.31,,,29.41,0.52,0.38,,,,1.9,0.6,,,32.1,17.29,
std,257.35,0.49,0.84,,,13.25,1.1,0.81,,,,1.61,0.49,,,49.52,15.78,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,,,1.0,0.0,,,0.0,0.0,
25%,223.5,0.0,2.0,,,21.5,0.0,0.0,,,,1.0,0.0,,,7.9,7.5,
50%,446.0,0.0,3.0,,,30.0,0.0,0.0,,,,1.0,1.0,,,14.5,7.5,
75%,668.5,1.0,3.0,,,35.0,1.0,0.0,,,,2.0,1.0,,,30.0,25.0,



❓ Missing Values:
✅ No missing values.

🔁 Duplicated rows: 0

🔍 Extended Summary:

🔢 Unique Values per Column:


Unnamed: 0,Unique values
Survived,2
Sex,2
IsAlone,2
Pclass,3
Embarked,3
FareBand,5
TicketPrefix,6
Deck,7
AgeGroup,7
SibSp,7



🧪 Data Types:


Unnamed: 0,Type
PassengerId,int64
Survived,int64
Pclass,int64
Name,object
Sex,object
Age,float64
SibSp,int64
Parch,int64
Ticket,object
Embarked,object





### Save results

We save the results in pickle format to preserve the structure and data types, and additionally as CSV for simple inspection.

In [7]:
# Build the output locations with pathlib instead of string concatenation, so
# the files land inside RESULTS_PATH whether or not it ends with a separator.
# NOTE(review): assumes RESULTS_PATH names a directory, not a filename prefix — confirm in config.
results_dir = Path(RESULTS_PATH)

# Create the output folder on first run instead of failing on a missing directory
results_dir.mkdir(parents=True, exist_ok=True)

# to keep types
df.to_pickle(results_dir / "titanic_clean.pkl")

# simple format
df.to_csv(results_dir / "titanic_clean.csv", index=False)