In [21]:
import pandas as pd
import numpy as np
import os

In [22]:
# Directory containing the processed CSV files, one level above the working directory.
data_dir = os.path.join(
    os.pardir,
    "data",
    "processed",
)

# --- Part 1: Load Preprocessed Data ---

In [23]:
# Load processed_data.csv from data_dir, report its dimensions, and preview the first rows.
processed_csv = os.path.join(data_dir, 'processed_data.csv')
df = pd.read_csv(processed_csv)
print("Cleaned and combined dataset loaded successfully.")
print(f"Loaded dataset with shape: {df.shape}")
df.head()

Cleaned and combined dataset loaded successfully.
Loaded dataset with shape: (1309, 14)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,CabinAssigned,Deck,Title
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,0,U,Mr
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,1,C,Mrs
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,0,U,Miss
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,1,C,Mrs
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,0,U,Mr


# --- Part 2: Advanced Feature Engineering ---

In [24]:
print("--- Performing Advanced Feature Engineering ---")


def _ticket_prefix(ticket):
    """Return the first whitespace-separated token of `ticket`, or 'NUM' if that token is all digits."""
    head = ticket.split()[0]
    return 'NUM' if head.isdigit() else head


# Feature 1: ticket prefix, with '.' and '/' removed from the extracted token.
raw_prefix = df['Ticket'].apply(_ticket_prefix)
df['Ticket_Prefix'] = raw_prefix.str.replace(r'[\./]', '', regex=True)

# Feature 2: family size (SibSp + Parch + 1) and a three-level size label.
# Size 1 -> 'Alone', size >= 5 -> 'Large', everything else -> 'Medium'.
family_size = df['SibSp'] + df['Parch'] + 1
df['FamilySize'] = family_size
df['FamilySize_Group'] = np.select(
    [family_size == 1, family_size >= 5],
    ['Alone', 'Large'],
    default='Medium',
)

# Feature 3: interaction terms (Age x Pclass, and Fare divided by family size).
df['Age_Class'] = df['Age'].mul(df['Pclass'])
df['Fare_per_Person'] = df['Fare'].div(family_size)

--- Performing Advanced Feature Engineering ---


# --- Part 3: Final Data Preparation ---

In [25]:
print("--- Preparing Data for Modeling ---")

# Columns removed before encoding (described in this notebook as redundant or original columns).
cols_to_drop = ['Name', 'Ticket', 'SibSp', 'Parch', 'Age', 'Fare']

# Work on a trimmed copy rather than dropping in place: `df` is rebound only after
# every step below has succeeded, so an error part-way through leaves it untouched.
trimmed = df.drop(columns=cols_to_drop)

# One-hot encode every remaining object/category column. drop_first=True yields
# k-1 indicator columns for a feature with k levels.
categorical_cols = trimmed.select_dtypes(include=['object', 'category']).columns
df = pd.get_dummies(trimmed, columns=categorical_cols, drop_first=True)

--- Preparing Data for Modeling ---


In [26]:
# Preview the first five rows of the encoded frame.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,CabinAssigned,FamilySize,Age_Class,Fare_per_Person,Sex_male,Embarked_Q,Embarked_S,...,Ticket_Prefix_SOTONOQ,Ticket_Prefix_SP,Ticket_Prefix_STONO,Ticket_Prefix_STONO2,Ticket_Prefix_STONOQ,Ticket_Prefix_SWPP,Ticket_Prefix_WC,Ticket_Prefix_WEP,FamilySize_Group_Large,FamilySize_Group_Medium
0,1,0.0,3,0,2,66.0,3.625,True,False,True,...,False,False,False,False,False,False,False,False,False,True
1,2,1.0,1,1,2,38.0,35.64165,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,3,1.0,3,0,1,78.0,7.925,False,False,True,...,False,False,False,True,False,False,False,False,False,False
3,4,1.0,1,1,2,35.0,26.55,False,False,True,...,False,False,False,False,False,False,False,False,False,True
4,5,0.0,3,0,1,105.0,8.05,True,False,True,...,False,False,False,False,False,False,False,False,False,False


In [31]:
# --- Part 4: Save the Fully Processed Data ---
# Write the encoded frame into data_dir; index=False keeps the row index out of the CSV.
output_path = os.path.join(data_dir, 'titanic_fully_processed.csv')
df.to_csv(output_path, index=False)
# The closing quote after the path was missing, leaving the path unterminated in the log line.
print(f"\nFully processed data saved to '{output_path}'")
print(f"Final dataset shape: {df.shape}")
print("--- Feature Engineering Script Finished ---")


Fully processed data saved to '..\data\processed\titanic_fully_processed.csv
Final dataset shape: (1309, 61)
--- Feature Engineering Script Finished ---
