In [1]:
import pandas as pd
import os
import re

In [2]:
# Input and output directories, expressed relative to the current working directory
data_root = os.path.join("..", "data")
input_dir = os.path.join(data_root, "raw")
output_dir = os.path.join(data_root, "processed")

# Load the training and test CSV files from the raw-data directory
train_df, test_df = (
    pd.read_csv(os.path.join(input_dir, csv_name))
    for csv_name in ("train.csv", "test.csv")
)


A common and effective workflow is to concatenate the training and test sets into a single DataFrame early in the process. In this notebook the 'Survived' target and the 'PassengerId' column stay in the combined frame; 'Survived' is simply NaN for the test rows, so the two sets can be told apart again later. Working on one frame ensures that all subsequent data cleaning, imputation, and feature engineering steps are applied uniformly across both datasets. This consistency is critical for preventing data processing discrepancies that can lead to model errors or degraded performance when making predictions on the test set.

In [3]:
# Stack the training rows on top of the test rows so every later step sees one frame.
# 'Survived' is kept as a column; it is NaN for the appended test rows.
frames_to_stack = [train_df, test_df]
df = pd.concat(frames_to_stack, axis=0, ignore_index=True, sort=False)
print(f"Combined dataset created with shape: {df.shape}")
df.head()

Combined dataset created with shape: (1309, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Summarize the combined frame: column dtypes and non-null counts per column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB


In [5]:
# Count the missing values in every column of the combined frame
df.isna().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [6]:
# Descriptive statistics (count, mean, std, quartiles, min/max) for the numeric columns
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,1309.0,891.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,0.383838,2.294882,29.881138,0.498854,0.385027,33.295479
std,378.020061,0.486592,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,0.0,2.0,21.0,0.0,0.0,7.8958
50%,655.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,1.0,3.0,39.0,1.0,0.0,31.275
max,1309.0,1.0,3.0,80.0,8.0,9.0,512.3292


# --- 1. Handling 'Embarked' ---

In [7]:
# Fill missing ports with the most frequent value among the training rows.
# .mode() returns a Series (ties are possible), so take its first entry.
embarked_mode = train_df['Embarked'].mode()[0]
embarked_is_missing = df['Embarked'].isna()
df.loc[embarked_is_missing, 'Embarked'] = embarked_mode
print(f"Missing 'Embarked' values imputed with mode: '{embarked_mode}'")

Missing 'Embarked' values imputed with mode: 'S'


# --- 2. Handling 'Fare' ---

In [8]:
# Impute each missing 'Fare' with the median fare of that passenger's own Pclass.
# The mask is taken before filling so the affected classes can still be reported.
missing_fare_mask = df['Fare'].isnull()
if missing_fare_mask.any():
    # Median fare per class; missing fares are skipped by median()
    median_fare_by_pclass = df.groupby('Pclass')['Fare'].median()
    # Map every row to its class median, then use it only where 'Fare' is missing.
    # This stays correct when several fares are missing in different classes.
    df['Fare'] = df['Fare'].fillna(df['Pclass'].map(median_fare_by_pclass))
    # Report one line per class that actually received an imputed value
    for pclass in sorted(df.loc[missing_fare_mask, 'Pclass'].unique()):
        print(f"Missing 'Fare' value imputed with median fare for Pclass {pclass}: {median_fare_by_pclass[pclass]:.4f}")
else:
    print("No missing 'Fare' values found.")

Missing 'Fare' value imputed with median fare for Pclass 3: 8.0500


# --- 3. Handling 'Cabin' (Feature Engineering) ---

In [9]:
# Flag whether a cabin was recorded at all (1 = present, 0 = missing)
df['CabinAssigned'] = df['Cabin'].notna().astype(int)
print("Engineered feature 'CabinAssigned' (1 for cabin present, 0 for missing).")

# Deck is the first character of the cabin string; missing cabins become 'U' (Unknown).
# The vectorized .str accessor replaces a per-row lambda and yields NaN instead of
# raising if a cabin string is ever empty.
df['Deck'] = df['Cabin'].fillna('U').str[0]
print("Engineered feature 'Deck' by extracting the first letter of the cabin number.")

# Drop the raw 'Cabin' column by rebinding df instead of mutating it in place
df = df.drop(columns='Cabin')
print("Original 'Cabin' column dropped.")

Engineered feature 'CabinAssigned' (1 for cabin present, 0 for missing).
Engineered feature 'Deck' by extracting the first letter of the cabin number.
Original 'Cabin' column dropped.


# --- 4. Handling 'Age' (Advanced Title-Based Imputation) ---

In [10]:
# Step 4a: Extract Titles
# The title is the first run of letters that follows a space and ends with a period
# (e.g. "Braund, Mr. Owen Harris" -> "Mr"). str.extract returns NaN for a name that
# does not match, instead of failing on a missing regex match.
df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Surface unmatched names explicitly so they do not silently skip later imputation
untitled_count = int(df['Title'].isna().sum())
if untitled_count:
    print(f"Warning: {untitled_count} name(s) had no recognizable title; 'Title' left as NaN.")
print("Extracted titles from 'Name' column.")

Extracted titles from 'Name' column.


In [11]:
# Step 4b: Consolidate Titles
# Each broader category lists the raw titles that are folded into it
title_groups = {
    'Miss': ['Mlle', 'Ms'],
    'Mrs': ['Mme'],
    'Official': ['Capt', 'Col', 'Major', 'Dr', 'Rev'],
    'Rare': ['Jonkheer', 'Don', 'Dona', 'Sir', 'Lady', 'Countess'],
}
# Invert the grouping into a raw-title -> category lookup
title_mapping = {
    raw_title: category
    for category, raw_titles in title_groups.items()
    for raw_title in raw_titles
}
# Titles that are not keys of the mapping are left unchanged by replace()
df['Title'] = df['Title'].replace(title_mapping)
print("Consolidated rare and synonymous titles.")

Consolidated rare and synonymous titles.


In [12]:
# Step 4c: Calculate Median Age per Title Group
# Group the 'Age' column by each row's consolidated title; median() skips missing ages
median_ages = df['Age'].groupby(df['Title']).median()
print("\nMedian Age per Title Group:", median_ages, sep="\n")


Median Age per Title Group:
Title
Master       4.0
Miss        22.0
Mr          29.0
Mrs         35.0
Official    49.5
Rare        39.5
Name: Age, dtype: float64


In [13]:
# Step 4d: Impute Missing Age Values
# Look up the median age of every row's title group, then use that value only
# where 'Age' is missing; ages that are already present are left untouched
title_median_per_row = df['Title'].map(median_ages)
df['Age'] = df['Age'].fillna(title_median_per_row)

print("\nMissing 'Age' values imputed based on title-specific medians.")


Missing 'Age' values imputed based on title-specific medians.


In [16]:
# --- Final Check ---
# Columns that were imputed above and should no longer contain missing values
imputed_columns = ['Age', 'Fare', 'Embarked']
remaining_missing = df[imputed_columns].isna().sum()

print("\n--- Preprocessing Complete ---")
print("Verifying no more missing values in key columns:")
print(remaining_missing)

# Show the first rows, including the newly engineered features
print("\nSample of the preprocessed data:")
df.head()


--- Preprocessing Complete ---
Verifying no more missing values in key columns:
Age         0
Fare        0
Embarked    0
dtype: int64

Sample of the preprocessed data:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,CabinAssigned,Deck,Title
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,0,U,Mr
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,1,C,Mrs
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,0,U,Miss
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,1,C,Mrs
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,0,U,Mr


In [17]:
# Make sure the output directory exists first: to_csv does not create missing
# parent folders and would fail on a fresh checkout without data/processed
os.makedirs(output_dir, exist_ok=True)

# Write the combined, preprocessed frame (train and test rows) without the index column
df.to_csv(os.path.join(output_dir, "processed_data.csv"), index=False)

print(f"Processed dataset shape: {df.shape}")
print("\nProcessed datasets saved at", output_dir)


Processed dataset shape: (1309, 14)

Processed datasets saved at ..\data\processed
