# Data Preprocessing

This is my effort to turn the given data into an enhanced form that improves the predictive capability of models trained on it.

## Imports

In [None]:
import os
import polars as pl

from polars import Expr, LazyFrame, DataFrame, Series

## Process Data
Apply the same processing to the training and testing data.

In [None]:
# Lazily scan the training and testing CSV files.
train_data: LazyFrame
test_data: LazyFrame
train_data, test_data = (
    pl.scan_csv(csv_path, has_header=True)
    for csv_path in ("../data/train.csv", "../data/test.csv")
)

# `Survived` is dropped from the training frame before it is stacked on top of
# the testing frame.
Xs: LazyFrame = pl.concat([train_data.drop("Survived"), test_data])

### Extract Titles

Extract the titles from the _train_ and _test_ datasets. Save the titles to a file.

In [None]:
# Collect every distinct title found in the combined train + test names.
# The pattern is the same one `process_data` uses to derive the `title`
# feature, so the saved list shows the values that `title_map` has to cover.
titles: LazyFrame = (
    Xs.select(title=pl.col("Name").str.extract(r",\s*(\w+)\.\s*"))
    .drop_nulls()  # names without a recognisable title yield null
    .unique()
    .sort("title")
)

titles.collect().write_csv("../data/uniques_titles.csv")

From the unique title list, define a map of titles to integers. The _title_ is a good proxy for _gender_, but it also contains a _social standing_ component. This makes it a somewhat better predictor of _survival_ than _gender_ alone.

In [None]:
# Integer code -> titles sharing that code. The insertion order of the groups
# (and of the titles inside each group) determines the order of `title_map`.
_title_groups: dict[int, tuple[str, ...]] = {
    # Common Titles
    1: ("Mr",),
    2: ("Ms", "Mrs", "Mme"),
    3: ("Master",),
    4: ("Miss", "Mlle"),
    # Distinguished Titles
    5: ("Capt", "Col", "Dr", "Major", "Rev"),
    # Royal Titles
    6: ("Don", "Dona", "Jonkheer", "Lady", "Sir"),
}

# Flatten the groups into the title -> code lookup.
title_map: dict[str, int] = {
    title: code for code, group in _title_groups.items() for title in group
}

### Features

Calculate the mean ages used to fill in records that do not have an age. A separate _mean_ age is calculated for each of the "Master.", "Miss.", "Mr." and "Mrs." titles. For all other titles, the mean age of the passengers with none of these four titles is used. Titles are embedded in the _Name_ column.

In [None]:
# Mean age per title group, computed over the combined train + test rows in a
# single lazy query (one `collect()` instead of one per group).
_name: Expr = pl.col("Name")
_age: Expr = pl.col("Age")

mean_unknown_ages: DataFrame = (
    Xs.select(
        # Match the literal title token including its trailing dot. A plain
        # "Mr" pattern would also match every "Mrs." row and pull the "Mr"
        # mean towards the "Mrs" ages.
        mean_master_age=_age.filter(_name.str.contains("Master.", literal=True)).mean(),
        mean_miss_age=_age.filter(_name.str.contains("Miss.", literal=True)).mean(),
        mean_mr_age=_age.filter(_name.str.contains("Mr.", literal=True)).mean(),
        mean_mrs_age=_age.filter(_name.str.contains("Mrs.", literal=True)).mean(),
        # Everyone whose name contains none of the four titles above.
        mean_remaining_age=_age.filter(~_name.str.contains("Master|Miss|Mr|Mrs")).mean(),
    )
    # Round every mean (including the "remaining" one) to one decimal place.
    .with_columns(pl.all().round(1))
    .collect()
)

### Engineered features

| Feature              | Description                                |
|---------------------:|:-------------------------------------------|
| `sku`                | The class of the passenger                 |
| `family_size`        | How many family members travel together    |
| `origin`             | The port of embarkation                    |
| `title`              | The title of the passenger                 |
| `has_cabin`          | The passenger has a cabin                  |
| `fare`               | The fare of the passenger                  |
| `age`                | The age of the passenger                   |
| `father_with_family` | Father with his family                     |
| `child_with_family`  | A child less than 12 years old in a family |


#### sku
Since `class` is a reserved keyword in Python, I will use `sku` instead.


#### family_size
The `family_size` is the sum of the `SibSp` and `Parch` columns.


#### origin
The `origin` is the port of embarkation. It is a categorical feature, encoded as an integer value for each port. I will consider one-hot encoding if it improves model performance.


#### title
The `title` is the title of the passenger. It is a categorical feature, encoded as an integer value for each title. `title` is a good proxy for `gender`, but it also contains a _social standing_ component. This makes it a marginally better predictor of _survival_ than gender alone. See how it is extracted above.


#### has_cabin
The `has_cabin` is a binary feature. It is `1` if the passenger has a cabin and `0` otherwise.


#### fare
The `fare` is the fare of the passenger. It is a continuous feature mapped into a number of bins. The boundaries of the bins are:
+ 0 to 7.91
+ 7.91 to 14.454
+ 14.454 to 31
+ 31 to 512.329

If there is no fare, the _median_ fare is used to avoid skewing by outliers.


#### age
The `age` is the age of the passenger. The null values are filled with the mean ages as calculated above.


#### father_with_family
The `father_with_family` is a binary feature. It is `1` if the passenger is a father with his family and `0` otherwise. It is a combination of the `title` and _Parch_ and _SibSp_ features.


#### child_with_family
The `child_with_family` is a binary feature. It is `1` if the passenger is a child less than 12 years old in a family and `0` otherwise.

In [None]:
# fmt: off
def build_null_age_expr(title: str) -> Expr:
    """Build a mask for rows whose `Name` matches `title` and whose `Age` is null.

    `title` is handed to `str.contains` unchanged, so it matches anywhere in
    the name (e.g. "Mr" also matches names that contain "Mrs").
    """
    name_matches: Expr = pl.col("Name").str.contains(title)
    age_missing: Expr = pl.col("Age").is_null()
    return name_matches & age_missing


# Masks selecting the rows with a missing age, one per title group.
null_master_age: Expr = build_null_age_expr(title="Master")
null_miss_age: Expr = build_null_age_expr(title="Miss")
null_mrs_age: Expr = build_null_age_expr(title="Mrs")
null_mr_age: Expr = build_null_age_expr(title="Mr")
# Missing age and a name that contains none of the four titles above.
null_person_age: Expr = (
    pl.col("Age").is_null()
    & pl.col("Name").str.contains("Master|Miss|Mr|Mrs").not_()
)

# Mean-age columns pulled out of the `mean_unknown_ages` frame.
mean_master_age: Series = mean_unknown_ages.get_column("mean_master_age")
mean_miss_age: Series = mean_unknown_ages.get_column("mean_miss_age")
mean_mr_age: Series = mean_unknown_ages.get_column("mean_mr_age")
mean_mrs_age: Series = mean_unknown_ages.get_column("mean_mrs_age")
mean_remaining_age: Series = mean_unknown_ages.get_column("mean_remaining_age")


def process_data(data: LazyFrame) -> LazyFrame:
    """Derive the engineered feature columns from a raw passenger frame.

    Args:
        data: Lazy frame providing the raw columns `Pclass`, `SibSp`, `Parch`,
            `Embarked`, `Name`, `Cabin`, `Fare` and `Age`.

    Returns:
        A lazy frame containing only the engineered columns `sku`,
        `family_size`, `origin`, `title`, `has_cabin`, `fare`, `age`,
        `father_with_family` and `child_with_family`.
    """
    relatives: Expr = pl.col("SibSp") + pl.col("Parch")

    # Fill a missing fare with the median *before* binning, so the row gets the
    # bin of the median fare (1-4) instead of the raw median value truncated by
    # the UInt8 cast.
    fare: Expr = pl.col("Fare").fill_null(pl.col("Fare").median())

    # Match the literal "Mr." token; a plain "Mr" pattern would also flag
    # every "Mrs." row as a father.
    is_mr: Expr = pl.col("Name").str.contains("Mr.", literal=True)

    return data.select(
        sku=pl.col("Pclass").rank(method="dense"),
        family_size=relatives,
        origin=pl.col("Embarked").fill_null(strategy="forward").rank(method="dense"),
        # Titles missing from `title_map` share one extra code past the largest mapped one.
        title=pl.col("Name")
            .str.extract(r",\s*(\w+)\.\s*")
            .replace_strict(title_map, default=max(title_map.values()) + 1, return_dtype=pl.UInt8),
        has_cabin=pl.col("Cabin").is_not_null().cast(pl.UInt8),
        fare=pl.when(fare.le(7.91)).then(1)
            .when(fare.is_between(7.91, 14.454, closed='left')).then(2)
            .when(fare.is_between(14.454, 31.0, closed='left')).then(3)
            .otherwise(4).cast(pl.UInt8),
        # `null_mrs_age` must be tested before `null_mr_age`: the "Mr" pattern
        # also matches names containing "Mrs".
        age=pl.when(null_master_age).then(mean_master_age)
            .when(null_miss_age).then(mean_miss_age)
            .when(null_mrs_age).then(mean_mrs_age)
            .when(null_mr_age).then(mean_mr_age)
            .when(null_person_age).then(mean_remaining_age)
            .otherwise(pl.col("Age")),
        father_with_family=pl.when(is_mr & (relatives > 2))
            .then(1)
            .otherwise(0).cast(pl.UInt8),
        # Uses the raw `Age`; rows with a missing age fall through to 0.
        child_with_family=pl.when(pl.col("Age").lt(12.0) & (relatives > 1))
            .then(1)
            .otherwise(0).cast(pl.UInt8)
    )


# Labels: the `Survived` column of the training data, renamed to `y`.
train_ys: LazyFrame = train_data.select(pl.col("Survived").alias("y"))

# Features: the same processing applied to the training and testing data.
train_Xs: LazyFrame
test_Xs: LazyFrame
train_Xs, test_Xs = map(process_data, (train_data, test_data))
# fmt: on

## Save the Processed Data

In [None]:
def save_processed_data(df: LazyFrame, path: str) -> None:
    """Collect a lazy frame and write it to `../data/<path>.csv`.

    Args:
        df: Lazy frame to materialise; it is collected before being written.
        path: File name without the `.csv` extension, relative to `../data`.
    """
    target: str = os.path.join("../data", path + ".csv")
    df.collect().write_csv(target)

# Write each processed frame to a CSV file named after its variable.
for frame, stem in ((train_Xs, "train_Xs"), (train_ys, "train_ys"), (test_Xs, "test_Xs")):
    save_processed_data(frame, stem)