# Data Preprocessing

## Imports

In [None]:
import polars as pl

from polars import LazyFrame, DataFrame

## Process Data
Apply the same processing to the training and testing data.

Access the data lazily.

In [None]:
# Set up lazy scans of the raw train/test CSVs (the first row of each is the header).
train_data: LazyFrame
test_data: LazyFrame
train_data, test_data = (
    pl.scan_csv(csv_path, has_header=True)
    for csv_path in ("../data/train.csv", "../data/test.csv")
)

Define a map of titles to integers. The title data was extracted from the _Name_ column in the training and testing data.

In [None]:
# Map each title found in the 'Name' column to an integer code. Titles that
# share a code are listed together; insertion order follows the group order.
title_map: dict[str, int] = {
    title: code
    for code, titles in (
        # Common Titles
        (1, ("Mr",)),
        (2, ("Ms", "Mrs", "Mme")),
        (3, ("Master",)),
        (4, ("Miss", "Mlle")),
        # Distinguished Titles
        (5, ("Capt", "Col", "Dr", "Major", "Rev")),
        # Royal Titles
        (6, ("Countess", "Don", "Dona", "Jonkheer", "Lady", "Sir")),
    )
    for title in titles
}

### Features

Impute ages for records that do not have one. For passengers titled "Master.", "Miss.", "Mr.", or "Mrs.", use the _mean_ age of that title group; for all other titles, use the mean age of the entire dataset. Titles are embedded in the _Name_ column.

> [!NOTE]
> The following code is pseudo-code. It may not be executable.

The ages were calculated roughly as follows:

```python
all_data = pl.concat(
    pl.read_csv("data/train.csv"),
    pl.read_csv("data/test.csv"),
)

mean_miss_age: float = all_data.filter(pl.col("Name").str.contains("Miss.")).get_column("Age").mean()
mean_master_age: float = all_data.filter(pl.col("Name").str.contains("Master.")).get_column("Age").mean()
mean_mrs_age: float = all_data.filter(pl.col("Name").str.contains("Mrs.")).get_column("Age").mean()
mean_mr_age: float = all_data.filter(pl.col("Name").str.contains("Mr.")).get_column("Age").mean()

mean_ages: DataFrame = all_data.with_columns(
    miss=pl.when(null_age_miss_filter).then(mean_miss_age).otherwise(pl.col("Age")),
    master=pl.when(null_age_master_filter).then(mean_master_age).otherwise(pl.col("Age")),
    mrs=pl.when(null_age_mrs_filter).then(mean_mrs_age).otherwise(pl.col("Age")),
    mr=pl.when(null_age_mr_filter).then(mean_mr_age).otherwise(pl.col("Age")),
    mean=pl.col("Age").mean(),
)
```

In [None]:
# Fixed per-title mean ages, used to fill in missing 'Age' values.
mean_master_age: float = 4.5   # passengers titled "Master"
mean_miss_age: float = 21.5    # passengers titled "Miss"
mean_mr_age: float = 33.0      # passengers titled "Mr"
mean_mrs_age: float = 35.5     # passengers titled "Mrs"

Prepare the features for training and inference:
+ Create a feature called `sku` from the 'Pclass' column
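+ Create a feature called `n_family` by adding the 'SibSp' and 'Parch' columns
+ Create a feature called `origin` by forward-filling missing 'Embarked' values and dense-ranking the column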
+ Create a feature called `title` by extracting the title from the 'Name' column and mapping it to an integer
+ Create a feature called `has_cabin` by checking if the 'Cabin' column is not null
+ Create a feature called `fare` by binning the 'Fare' column into 4 categories
+ Create a feature called `age` by filling missing 'Age' values with the calculated per-title mean ages (or the overall mean age for other titles)
+ Create a feature called `is_father` by checking whether the 'Name' column contains the title 'Mr.' and the sum of 'Parch' and 'SibSp' is greater than 2
+ Create a feature called `is_lt12fam` by checking if the 'Age' is less than 12 and the sum of 'Parch' and 'SibSp' is greater than 1

In [None]:
# fmt: off
def _build_features(data: LazyFrame) -> LazyFrame:
    """Select the model features from a raw passenger frame.

    The training and testing data go through this single definition so the
    two feature sets are always produced by identical expressions.

    Args:
        data: Lazy frame providing the 'Pclass', 'SibSp', 'Parch', 'Embarked',
            'Name', 'Cabin', 'Fare' and 'Age' columns.

    Returns:
        A lazy frame with the columns `sku`, `n_family`, `origin`, `title`,
        `has_cabin`, `fare`, `age`, `is_father` and `is_lt12fam`, in that order.
    """
    name = pl.col("Name")
    age = pl.col("Age")
    age_missing = age.is_null()
    family_size = pl.col("SibSp") + pl.col("Parch")
    # The title is the word between the comma and the following period,
    # e.g. "Mr" in "Surname, Mr. Given".
    title_text = name.str.extract(r",\s*(\w+)\.\s*")
    # Impute missing fares BEFORE binning so an imputed fare receives a bin
    # code (1-4) instead of the raw mean being cast to an integer.
    fare_filled = pl.col("Fare").fill_null(pl.col("Fare").mean())

    return data.select(
        sku=pl.col("Pclass").rank(method="dense"),
        n_family=family_size,
        origin=pl.col("Embarked").fill_null(strategy="forward").rank(method="dense"),
        # Titles missing from `title_map` get one code past the largest mapped code.
        title=title_text.replace_strict(
            title_map, default=max(title_map.values()) + 1, return_dtype=pl.UInt8
        ),
        has_cabin=pl.col("Cabin").is_not_null().cast(pl.UInt8),
        # Branches are evaluated in order, so each later branch only sees fares
        # above the previous threshold: <=7.91, <14.454, <31.0, everything else.
        fare=pl.when(fare_filled.le(7.91)).then(1)
            .when(fare_filled.lt(14.454)).then(2)
            .when(fare_filled.lt(31.0)).then(3)
            .otherwise(4).cast(pl.UInt8),
        # Order matters: "Mrs" must be tested before "Mr" because the plain
        # substring "Mr" also occurs inside "Mrs".
        age=pl.when(age_missing & name.str.contains("Master")).then(mean_master_age)
            .when(age_missing & name.str.contains("Miss")).then(mean_miss_age)
            .when(age_missing & name.str.contains("Mrs")).then(mean_mrs_age)
            .when(age_missing & name.str.contains("Mr")).then(mean_mr_age)
            .when(age_missing).then(age.mean())
            .otherwise(age),
        # is father with family: compare the extracted title exactly so that
        # "Mrs" is not counted as "Mr".
        is_father=pl.when((title_text == "Mr") & (family_size > 2))
            .then(1)
            .otherwise(0).cast(pl.UInt8),
        # Child under 12 in family: both comparisons are parenthesised because
        # `&` binds tighter than `>` and looser than `+` in Python.
        is_lt12fam=pl.when(age.lt(12) & (family_size > 1))
            .then(1)
            .otherwise(0).cast(pl.UInt8),
    )


train_Xs: LazyFrame = _build_features(train_data)
train_ys: LazyFrame = train_data.select(y=pl.col("Survived"))

test_Xs: LazyFrame = _build_features(test_data)
# fmt: on

### View Some of the Processed Data (Optional)

In [None]:
# Materialize the lazy training features and keep the first 15 rows for display.
sample_train_features: DataFrame = (
    train_Xs
    .collect()
    .head(n=15)
)
sample_train_features

In [None]:
# Materialize the lazy testing features and keep the first 15 rows for display.
sample_test_features: DataFrame = (
    test_Xs
    .collect()
    .head(n=15)
)
sample_test_features

## Save the Processed Data

In [None]:
# Collect each lazy frame and write it next to the raw data, in this order:
# training features, training labels, testing features.
for _frame, _csv_path in (
    (train_Xs, "../data/train_Xs.csv"),
    (train_ys, "../data/train_ys.csv"),
    (test_Xs, "../data/test_Xs.csv"),
):
    _frame.collect().write_csv(_csv_path)
