In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor

In [3]:
# Seed every random number generator this notebook relies on so that
# Restart & Run All reproduces the same results. torch has its own RNG
# (used e.g. for nn.Linear weight initialisation), so seeding NumPy alone
# is not enough.
RANDOM_SEED = 71
# torch.manual_seed returns a Generator; it is called first so the cell's
# last expression is np.random.seed (returns None) and nothing is displayed.
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [4]:
# !python3 -m pip freeze > requirements.txt

## Split Data into Training and Test Sets

In [5]:
import pandas as pd

# data_df = pd.read_csv("insurance.csv")

In [6]:
# data_df

In [7]:
from sklearn.model_selection import train_test_split

# train_df, test_df = train_test_split(data_df, train_size=0.8)

# train_df.to_csv("insurance_train.csv", index=False)
# test_df.to_csv("insurance_test.csv", index=False)

## Define a Linear Regression Model

In [8]:
class LinearRegression(nn.Module):
    """Linear regression as a single fully connected layer with one output."""

    def __init__(self, input_dim: int) -> None:
        """Create the model.

        Args:
            input_dim: Number of input features fed to the linear layer.
        """
        super().__init__()
        self.input_dim = input_dim
        # One output unit: the layer computes x @ W.T + b.
        self.fc1 = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x: Tensor) -> Tensor:
        """Apply the linear layer to ``x`` and return its output."""
        return self.fc1(x)

## Preprocess the Training Dataset

In [9]:
# train_df = pd.read_csv("insurance_train.csv")
# test_df = pd.read_csv("insurance_test.csv")

In [10]:
# train_df

In [11]:
# CSV files holding the two data splits.
train_file_name, test_file_name = "insurance_train.csv", "insurance_test.csv"

# Column used as the regression target.
label_name = "charges"

# Feature columns, grouped by how they are preprocessed.
float_feature_names = ["age", "bmi", "children"]
id_list_feature_names = ["sex", "smoker", "region"]
# No columns are configured for these two groups.
id_score_list_feature_names = list()
embedding_feature_names = list()

In [12]:
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass


@dataclass
class InputData:
    """Raw features and labels as loaded from a data file.

    The fields hold pandas objects, not NumPy arrays: ``DataReader`` fills
    them with column selections of the DataFrame it reads, and
    ``InputTransform`` relies on ``.loc`` / ``.values`` being available.
    """

    # One row per example, one column per selected feature name.
    features: pd.DataFrame
    # The label column taken from the same frame as ``features``.
    labels: pd.Series


class DataReader:
    """Reads a CSV file and selects the configured feature and label columns.

    Feature names are supplied in four groups (float, id_list, id_score_list,
    embedding). ``feature_names`` is the concatenation of the groups in that
    order and fixes the column order of the features returned by a call.
    """

    def __init__(
        self, 
        label_name: str,
        file_name: str, 
        float_feature_names: Optional[List[str]] = None, 
        id_list_feature_names: Optional[List[str]] = None, 
        id_score_list_feature_names: Optional[List[str]] = None, 
        embedding_feature_names: Optional[List[str]] = None, 
    ) -> None:
        """Store the file name, label name and feature-name groups.

        Args:
            label_name: Name of the label column.
            file_name: Path of the CSV file read on each call.
            float_feature_names: Names of the float feature columns.
            id_list_feature_names: Names of the id_list feature columns.
            id_score_list_feature_names: Names of the id_score_list columns.
            embedding_feature_names: Names of the embedding columns.

        Raises:
            ValueError: If all feature groups together contain no name.
        """
        self.file_name = file_name
        self.label_name = label_name

        # A group that was not provided is treated as an empty list.
        self.float_feature_names = (
            [] if float_feature_names is None else float_feature_names
        )
        self.id_list_feature_names = (
            [] if id_list_feature_names is None else id_list_feature_names
        )
        self.id_score_list_feature_names = (
            [] if id_score_list_feature_names is None else id_score_list_feature_names
        )
        self.embedding_feature_names = (
            [] if embedding_feature_names is None else embedding_feature_names
        )

        # Group order here defines the column order of the loaded features.
        self.feature_names = (
            self.float_feature_names
            + self.id_list_feature_names
            + self.id_score_list_feature_names
            + self.embedding_feature_names
        )
        if len(self.feature_names) == 0:
            raise ValueError("Need to at least set up one feature name.")

    def __call__(self) -> InputData:
        """Read ``file_name`` and return its feature and label columns."""
        frame = pd.read_csv(self.file_name)
        return InputData(
            features=frame.loc[:, self.feature_names],
            labels=frame.loc[:, self.label_name],
        )

In [13]:
# Reader over the training CSV, using the float and id_list feature groups.
data_reader = DataReader(
    label_name=label_name,
    file_name=train_file_name,
    id_list_feature_names=id_list_feature_names,
    float_feature_names=float_feature_names,
)

# Calling the reader loads the file and returns the features and labels.
input_data = data_reader()

In [14]:
data_reader.float_feature_names

['age', 'bmi', 'children']

In [15]:
input_data

InputData(features=      age     bmi  children     sex smoker     region
0      31  26.885         1    male     no  northeast
1      55  38.280         0    male     no  southeast
2      20  35.310         1    male     no  southeast
3      60  24.320         1    male     no  northwest
4      34  26.730         1  female     no  southeast
...   ...     ...       ...     ...    ...        ...
1065   28  26.510         2  female     no  southeast
1066   39  34.100         3  female     no  southwest
1067   60  25.840         0  female     no  northwest
1068   39  22.800         3  female     no  northeast
1069   23  28.490         1  female    yes  southeast

[1070 rows x 6 columns], labels=0        4441.21315
1       10226.28420
2       27724.28875
3       13112.60480
4        5002.78270
           ...     
1065     4340.44090
1066     7418.52200
1067    28923.13692
1068     7985.81500
1069    18328.23810
Name: charges, Length: 1070, dtype: float64)

In [29]:
from collections import OrderedDict
from copy import deepcopy
from typing import Dict, OrderedDict


@dataclass
class InputTransformedData:
    """Container for the per-group arrays produced by ``InputTransform``."""

    # Values of the float feature columns.
    float_features: np.ndarray
    # id_list columns encoded as int64 category indices.
    id_list_features: np.ndarray
    # Values of the id_score_list columns, cast to float64.
    id_score_list_features: np.ndarray
    # Values of the embedding feature columns.
    embedding_features: np.ndarray
    # Label values as an array.
    labels: np.ndarray


class InputTransform:
    """Turns loaded input data into one NumPy array per feature group.

    Float and embedding columns are passed through as arrays, id_score_list
    columns are cast to float64, and id_list (categorical) columns are
    encoded as int64 indices using a per-column vocabulary.

    With ``is_train=True`` the vocabulary is rebuilt from the data on every
    call. With ``is_train=False`` the existing vocabulary is reused; supply
    it through ``id_list_features_metadata`` (for example the attribute of
    the same name from a transform that was run on the training data).
    """

    def __init__(
        self,
        data_reader: "DataReader",
        is_train: bool = True,
        id_list_features_metadata: Optional[Dict[str, Dict[str, int]]] = None,
    ) -> None:
        """Create the transform.

        Args:
            data_reader: Object exposing the four ``*_feature_names`` lists
                that define which columns belong to which feature group.
            is_train: Whether calls should (re)build the id_list vocabulary
                from the data they receive.
            id_list_features_metadata: Optional pre-built vocabulary,
                mapping feature name -> {category value -> index}.
        """
        self.data_reader = data_reader
        self.is_train = is_train
        self.id_list_features_metadata = id_list_features_metadata

    def __call__(
        self,
        input_data: "InputData",
    ) -> "InputTransformedData":
        """Transform ``input_data`` and return the per-group arrays.

        Raises:
            ValueError: If id_list columns must be encoded but no vocabulary
                is available (``is_train=False`` and none was supplied).
        """
        labels_df = input_data.labels
        (
            float_np,
            id_list_np,
            id_score_list_np,
            embedding_np,
        ) = self.get_feature_groups_data(input_data.features)

        # In training mode the vocabulary is learned from this very data.
        if self.is_train:
            self.id_list_features_metadata = self.get_id_list_features_metadata(
                id_list_np
            )

        return InputTransformedData(
            float_features=float_np,
            id_list_features=self.id_list_features_transform(id_list_np),
            id_score_list_features=self.id_score_list_features_transform(
                id_score_list_np
            ),
            embedding_features=embedding_np,
            labels=labels_df.values,
        )

    def get_feature_groups_data(
        self,
        features_df: pd.DataFrame,
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """Split ``features_df`` into (float, id_list, id_score_list, embedding) arrays.

        Each array holds the values of the columns named in the matching
        ``data_reader`` list; a group with no names yields zero columns.
        """
        reader = self.data_reader
        group_names = (
            reader.float_feature_names,
            reader.id_list_feature_names,
            reader.id_score_list_feature_names,
            reader.embedding_feature_names,
        )
        float_np, id_list_np, id_score_list_np, embedding_np = (
            features_df.loc[:, names].values for names in group_names
        )
        return float_np, id_list_np, id_score_list_np, embedding_np

    def get_id_list_features_metadata(
        self,
        id_list_examples_np: np.ndarray,
    ) -> Dict[str, Dict[str, int]]:
        """Build the vocabulary of every id_list column.

        Returns:
            An ordered mapping feature name -> {category value -> index},
            where indices follow the sorted order returned by ``np.unique``.
        """
        # Imported locally so the result is a real collections.OrderedDict
        # regardless of which ``OrderedDict`` name is bound at module level.
        import collections

        feature_names = self.data_reader.id_list_feature_names
        metadata = collections.OrderedDict()
        for c in range(id_list_examples_np.shape[1]):
            vocabulary = np.unique(id_list_examples_np[:, c])
            metadata[feature_names[c]] = {
                value: idx for idx, value in enumerate(vocabulary)
            }
        return metadata

    def id_list_features_transform(
        self,
        id_list_features_np: np.ndarray,
    ) -> np.ndarray:
        """Encode id_list columns as int64 category indices.

        A value missing from a column's vocabulary is mapped to
        ``len(vocabulary)``, i.e. the largest known index + 1.

        Raises:
            ValueError: If there is at least one column to encode but
                ``id_list_features_metadata`` has not been set.
        """
        values = np.asarray(id_list_features_np)
        n_columns = values.shape[1]

        # Input with zero id_list columns needs no vocabulary at all.
        if n_columns > 0 and self.id_list_features_metadata is None:
            raise ValueError(
                "id_list_features_metadata is not set: run the transform with "
                "is_train=True first or pass id_list_features_metadata."
            )

        # Write the codes straight into an integer array instead of storing
        # them in a copy of the (object-typed) input and casting afterwards.
        encoded = np.empty(values.shape, dtype=np.int64)
        for c in range(n_columns):
            mapping = self.id_list_features_metadata[
                self.data_reader.id_list_feature_names[c]
            ]
            unknown_idx = len(mapping)
            encoded[:, c] = [mapping.get(value, unknown_idx) for value in values[:, c]]
        return encoded

    def id_score_list_features_transform(
        self,
        id_score_list_features_np: np.ndarray,
    ) -> np.ndarray:
        """Return a float64 copy of the id_score_list values."""
        # astype copies by default, so the caller's array is left untouched.
        return np.asarray(id_score_list_features_np).astype(np.float64)


In [17]:
input_transform = InputTransform(data_reader=data_reader)

In [18]:
input_transformed_data = input_transform(input_data)

In [19]:
input_transformed_data

InputTransformedData(float_features=array([[31.   , 26.885,  1.   ],
       [55.   , 38.28 ,  0.   ],
       [20.   , 35.31 ,  1.   ],
       ...,
       [60.   , 25.84 ,  0.   ],
       [39.   , 22.8  ,  3.   ],
       [23.   , 28.49 ,  1.   ]]), id_list_features=array([[1, 0, 0],
       [1, 0, 2],
       [1, 0, 2],
       ...,
       [0, 0, 1],
       [0, 0, 0],
       [0, 1, 2]]), id_score_list_features=array([], shape=(1070, 0), dtype=float64), embedding_features=array([], shape=(1070, 0), dtype=float64), labels=array([ 4441.21315, 10226.2842 , 27724.28875, ..., 28923.13692,
        7985.815  , 18328.2381 ]))

In [21]:
# Pull the raw feature and label objects back out of the loaded data.
features_df, labels_df = input_data.features, input_data.labels

# Split the feature frame into one array per feature group.
float_features_np, id_list_features_np, id_score_list_features_np, embedding_features_np = (
    input_transform.get_feature_groups_data(features_df)
)

In [22]:
float_features_np

array([[31.   , 26.885,  1.   ],
       [55.   , 38.28 ,  0.   ],
       [20.   , 35.31 ,  1.   ],
       ...,
       [60.   , 25.84 ,  0.   ],
       [39.   , 22.8  ,  3.   ],
       [23.   , 28.49 ,  1.   ]])

In [23]:
id_list_features_np

array([['male', 'no', 'northeast'],
       ['male', 'no', 'southeast'],
       ['male', 'no', 'southeast'],
       ...,
       ['female', 'no', 'northwest'],
       ['female', 'no', 'northeast'],
       ['female', 'yes', 'southeast']], dtype=object)

In [24]:
embedding_features_np

array([], shape=(1070, 0), dtype=float64)

In [25]:
id_list_features_metadata = input_transform.get_id_list_features_metadata(id_list_features_np)

In [26]:
id_list_features_metadata

OrderedDict([('sex', {'female': 0, 'male': 1}),
             ('smoker', {'no': 0, 'yes': 1}),
             ('region',
              {'northeast': 0,
               'northwest': 1,
               'southeast': 2,
               'southwest': 3})])

In [27]:
id_list_features_transformed_np = input_transform.id_list_features_transform(id_list_features_np)

In [28]:
print(id_list_features_transformed_np.dtype)
id_list_features_transformed_np

int64


array([[1, 0, 0],
       [1, 0, 2],
       [1, 0, 2],
       ...,
       [0, 0, 1],
       [0, 0, 0],
       [0, 1, 2]])

In [31]:
# TODO: repeat this for every id_list column; only column 0 is encoded here.
first_id_list_column = id_list_features_transformed_np[:, 0]
id_list_feature_one_hot_transformed = F.one_hot(torch.LongTensor(first_id_list_column))
# Show the encoding with its last one-hot column dropped.
id_list_feature_one_hot_transformed[:, :-1]


tensor([[0],
        [0],
        [0],
        ...,
        [1],
        [1],
        [1]])

In [None]:
# `float_examples_raw_df` is not defined anywhere in this notebook; use the
# float feature array extracted above. Stacking an (n, 0) block next to it
# shows that an empty feature group leaves the array unchanged.
np.hstack([float_features_np, np.array([]).reshape((len(float_features_np), 0))])

In [None]:
from typing import Any, Tuple
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from torchvision.transforms import ToTensor, Lambda


# DataLoader settings: examples per batch, and whether to shuffle.
batch_size, is_shuffle = 32, True

class CustomDataset(Dataset):
    """Map-style dataset over the examples and labels loaded by a data reader.

    ``data_reader`` is called once at construction. Its result may be an
    object exposing ``features`` and ``labels`` attributes, or a plain
    ``(examples, labels)`` pair. pandas objects are converted to NumPy
    arrays so that items are always looked up by position.
    """

    def __init__(
        self, 
        data_reader: "DataReader", 
        input_transform: Any = None, 
        target_transform: Any = None
    ) -> None:
        """Load the data and remember the optional per-item transforms.

        Args:
            data_reader: Zero-argument callable returning the data.
            input_transform: Optional callable applied to each example.
            target_transform: Optional callable applied to each label.
        """
        loaded = data_reader()
        # An object with .features/.labels cannot be tuple-unpacked, so read
        # the attributes when they exist and fall back to unpacking a pair.
        if hasattr(loaded, "features") and hasattr(loaded, "labels"):
            examples, labels = loaded.features, loaded.labels
        else:
            examples, labels = loaded
        self.examples = self._as_positional(examples)
        self.labels = self._as_positional(labels)
        self.input_transform = input_transform
        self.target_transform = target_transform

    @staticmethod
    def _as_positional(data: Any) -> Any:
        """Convert pandas objects to NumPy; return anything else unchanged."""
        # DataFrame/Series `[]` indexing is label-based, which breaks
        # integer-position lookups; a NumPy array indexes by position.
        return data.to_numpy() if hasattr(data, "to_numpy") else data

    def __len__(self) -> int:
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx: int) -> Tuple[Any, Any]:
        """Return the ``(example, label)`` pair at position ``idx``.

        The transforms, when set, are applied to the example and to the
        label respectively.
        """
        example = self.examples[idx]
        label = self.labels[idx]
        if self.input_transform is not None:
            example = self.input_transform(example)
        if self.target_transform is not None:
            label = self.target_transform(label)
        return example, label

## Dataset & DataLoader

In [None]:
# `housing_data_reader` is not defined in this notebook; the reader built
# above for the training file is `data_reader`.
train_dataset = CustomDataset(
    data_reader=data_reader,
)

# The test split uses the same label and feature columns, read from the test file.
test_dataset = CustomDataset(
    data_reader=DataReader(
        file_name=test_file_name,
        label_name=label_name,
        float_feature_names=float_feature_names,
        id_list_feature_names=id_list_feature_names,
    ),
)

# Keep the original variable name available.
dataset = train_dataset

In [None]:
# Batch both splits with the shared batch size and shuffle setting.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=is_shuffle,
)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=is_shuffle,
)

In [None]:
train_examples, train_labels = next(iter(train_dataloader))

In [None]:
train_examples[:3]

In [None]:
float_feature_names = 