In [107]:
import typing as t
import nltk
from pathlib import Path
import torch

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split

from torch.utils.data import DataLoader, Dataset, Subset, random_split

In [108]:
# Fetch the NLTK resources by name; packages that are already present are
# reported as up to date instead of being downloaded again.
for resource_name in ("punkt", "stopwords", "wordnet", "omw-1.4"):
    nltk.download(resource_name)
# Kept outside the loop as the cell's final expression so that its return
# value is still displayed as the cell output.
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\super\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\super\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\super\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\super\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\super\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [109]:
# Root folder of the dataset files. The path is relative, so it is resolved
# against the working directory the notebook is run from.
DATA_DIR: Path = Path("data/")

In [110]:
def torch_train_test_split(
    dataset: t.Union[Dataset, t.Sized],
    train_part: float,
    generator: t.Optional[torch.Generator] = None,
) -> t.Tuple[Subset, Subset]:
    """Randomly split ``dataset`` into a train subset and a test subset.

    Args:
        dataset: Dataset to split; ``len(dataset)`` must be supported.
        train_part: Fraction of the samples assigned to the train subset.
            Must lie within ``[0, 1]``.
        generator: Optional random generator forwarded to ``random_split`` so
            the split can be reproduced. When omitted, ``random_split`` falls
            back to its own default generator.

    Returns:
        A ``(train_subset, test_subset)`` pair. The train subset holds
        ``round(train_part * len(dataset))`` samples, the test subset the rest.

    Raises:
        ValueError: If ``train_part`` is outside ``[0, 1]`` (NaN included).
    """
    # An out-of-range fraction yields a negative or oversized length whose sum
    # still equals len(dataset), so random_split would not reject it.
    if not 0.0 <= train_part <= 1.0:
        raise ValueError(f"train_part must be within [0, 1], got {train_part!r}")

    total = len(dataset)
    train_size = round(train_part * total)
    test_size = total - train_size

    # Only forward the generator when one was given, so the default behaviour
    # of random_split is left untouched.
    split_kwargs = {} if generator is None else {"generator": generator}
    train_dataset, test_dataset = random_split(
        dataset, lengths=(train_size, test_size), **split_kwargs
    )
    return train_dataset, test_dataset

# Regression

## concrete.csv

In [111]:
class ConcreteDataset(Dataset):
    """Regression dataset of standardized features paired with float targets.

    The features are standardized with a ``StandardScaler``. A scaler that is
    created here, or that is passed in unfitted, is fitted on ``x``. A scaler
    that is already fitted is only applied and never refitted, so a held-out
    split can reuse the statistics of the training split without replacing
    them with its own.

    Attributes:
        scaler: Scaler used to standardize the features.
        n_features: Number of feature columns in ``data``.
        data: Float tensor of standardized features, one row per sample.
        targets: Float tensor of targets with shape ``(n_samples, 1)``.
    """

    scaler: StandardScaler
    n_features: int
    data: torch.Tensor
    targets: torch.Tensor

    def __init__(self, x: pd.DataFrame, y: pd.Series, scaler: t.Optional[StandardScaler] = None):
        """Standardize ``x`` and store it together with ``y`` as tensors.

        Args:
            x: Feature table, one row per sample.
            y: Target values aligned with the rows of ``x``.
            scaler: Optional scaler to reuse. If it is already fitted it is
                applied as is; otherwise it is fitted on ``x``. When omitted,
                a new ``StandardScaler`` is created and fitted on ``x``.
        """
        self.scaler = StandardScaler() if scaler is None else scaler
        # ``scale_`` is only set by fitting. Refitting a fitted scaler would
        # overwrite the statistics of the split it was fitted on and leak the
        # statistics of this split into the shared scaler.
        if hasattr(self.scaler, "scale_"):
            features = self.scaler.transform(x)
        else:
            features = self.scaler.fit_transform(x)
        self.data = torch.tensor(features, dtype=torch.float)
        # Targets become a column vector: one row per sample, one column.
        self.targets = torch.tensor(y.to_numpy(), dtype=torch.float).unsqueeze(1)
        self.n_features = self.data.size(1)

    def __len__(self):
        """Return the number of samples."""
        return self.targets.size(0)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        return self.data[idx], self.targets[idx]

In [112]:
# Read the concrete table from the regression data folder and preview its
# first five rows.
concrete_df = pd.read_csv(filepath_or_buffer=DATA_DIR / "regression/concrete.csv")
concrete_df.head(n=5)

Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [113]:
# Separate the regression target from the feature columns.
target_col = "strength"
y = concrete_df[target_col]
X = concrete_df.drop(columns=[target_col])

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# The test dataset is given the scaler instance held by the train dataset.
concrete_train_dataset = ConcreteDataset(X_train, y_train)
concrete_test_dataset = ConcreteDataset(
    X_test,
    y_test,
    scaler=concrete_train_dataset.scaler,
)

# Displayed as (train size, test size, number of features).
(
    len(concrete_train_dataset),
    len(concrete_test_dataset),
    concrete_train_dataset.n_features,
)

(824, 206, 8)

In [114]:
# Training loader: shuffled batches of 4 samples.
concrete_train_dataloader = DataLoader(
    dataset=concrete_train_dataset,
    batch_size=4,
    shuffle=True,
)
# Test loader: the whole test split delivered as one batch.
concrete_test_dataloader = DataLoader(
    dataset=concrete_test_dataset,
    batch_size=len(concrete_test_dataset),
)

In [115]:
# Pull a single batch from the training loader to inspect it: a features
# tensor followed by a targets tensor.
next(iter(concrete_train_dataloader))

[tensor([[ 0.9817, -0.8811, -0.8462,  0.2151, -1.0598,  0.9678, -0.4215, -0.6279],
         [-1.1888,  1.2042,  1.3720,  0.5888,  0.4489, -1.1806, -0.9149, -0.2880],
         [ 0.5724, -0.8811, -0.8462,  0.2011, -1.0598,  1.2534, -0.2328, -0.6279],
         [-0.2671, -0.8811,  0.9888,  0.3319, -0.0876,  0.7286, -0.1909,  0.8774]]),
 tensor([[24.0000],
         [28.9900],
         [21.1800],
         [43.5800]])]

## gold.csv

## house.csv

## insurance.csv

## vehicle.csv

# Feature classification

# Image classification

# Text classification