# Data Imports

In [60]:
# let's start with the data and see how it goes
import os
import pandas as pd
HOME = os.getcwd()
train_csv = os.path.join(HOME, 'data', 'train.csv')
test_csv = os.path.join(HOME, 'data', 'test.csv')

df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)
# set the columns names to lower case 

df_train.columns = [c.lower() for c in df_train.columns]
df_test.columns = [c.lower() for c in df_test.columns]

# remove unnecessary columns
df_train.drop(columns=['helpfulness', 'score'], inplace=True)
df_test.drop(columns=['helpfulness', 'score'], inplace=True)

In [61]:
# add a small piece of code to call the pytorch_modular code
from pathlib import Path
import sys

current = HOME
while 'src' not in os.listdir(current):
    current = Path(current).parent

sys.path.append(str(current))
sys.path.append(os.path.join(current, 'src'))

In [62]:
df_train.head()

Unnamed: 0,title,text,category
0,Golden Valley Natural Buffalo Jerky,The description and photo on this product need...,grocery gourmet food
1,Westing Game,This was a great book!!!! It is well thought t...,toys games
2,Westing Game,"I am a first year teacher, teaching 5th grade....",toys games
3,Westing Game,I got the book at my bookfair at school lookin...,toys games
4,I SPY A is For Jigsaw Puzzle 63pc,Hi! I'm Martine Redman and I created this puzz...,toys games


In [63]:
df_test.head()

Unnamed: 0,id,title,text
0,0,PetSafe Staywell Pet Door with Clear Hard Flap,We've only had it installed about 2 weeks. So ...
1,1,"Kaytee Timothy Cubes, 1-Pound",My bunny had a hard time eating this because t...
2,2,Body Back Buddy,would never in a million years have guessed th...
3,3,SnackMasters California Style Turkey Jerky,"Being the jerky fanatic I am, snackmasters han..."
4,4,Premier Busy Buddy Tug-a-Jug Treat Dispensing ...,Wondered how quick my dog would catch on to th...


In [64]:
import nltk 
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

try:
    STOP_WORDS = list(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    STOP_WORDS = list(stopwords.words('english'))

In [65]:
# preprocessing functions
import re
from typing import List

def to_lower(text: str) -> str:
    return text.lower()

def no_extra_spaces(text: str) -> str:
    return re.sub('\s+', ' ', text)

def no_extra_chars(text: str) -> str:
    return re.sub(r'[^a-zA-Z\s,!.;:-]+', ' ', text) 

text = 'aaa5531--==-||"z2::,.a'

def remove_stop_words(text: str,
                      tokenizer: TweetTokenizer = None) -> str:
    text = to_lower(text)    
    tokenizer = TweetTokenizer() if tokenizer is None else tokenizer
    tokens = tokenizer.tokenize(text)
    # if the remove_stop_words argument is set to True, then filter stop words
    tokens = [t.strip() for t in tokens if t not in STOP_WORDS] 
    return " ".join(tokens)

def process(text: str) -> str:
    # first lower, remove extrac chracters
    text1 = to_lower(no_extra_chars(text))
    # remove redundant words
    text2 = remove_stop_words(text1)
    # remove extra spaces
    return no_extra_spaces(text2)

import random
random.seed(69)
example = df_train['text'][int(random.random() * len(df_train))]
print(example)
print(process(example))

# # drop the 'text' column as only the title will be used for classification
df_train.drop(columns=['text'], inplace=True)
df_test.drop(columns=['text'], inplace=True)

# 16 rows have missing values in the 'title' column, remove them
df_train.dropna(inplace=True)

See the title of this review. Fortunately, I am a packrat, and kept a bunch of hole repair kits from various blow up things that we have gone through over the years. Does not come with a hole repair kit though, just to warn you. Anyway, it is back in black and bouncing our 3 year old all over the place. Indoor only, I would say. Very highly recommended, in spite of a hole within a week of use. Hope that this is the first and last one... probably not.
see title review . fortunately , packrat , kept bunch hole repair kits various blow things gone years . come hole repair kit though , warn . anyway , back black bouncing year old place . indoor , would say . highly recommended , spite hole within week use . hope first last one ... probably .


In [66]:
print(df_train.isna().sum())
print("#" * 100)
print(df_test.isna().sum())

title       0
category    0
dtype: int64
####################################################################################################
id       0
title    5
dtype: int64


In [67]:
import numpy as np

cat2idx = {
    'toys games': 0,
    'health personal care': 1,
    'beauty': 2,
    'baby products': 3,
    'pet supplies': 4,
    'grocery gourmet food': 5,
}

idx2cat = {
    0:'toys games',
    1:'health personal care',
    2:'beauty',
    3:'baby products',
    4:'pet supplies',
    5:'grocery gourmet food' 
}

# making sure the dataframes are ready for training
def df_process_data(row):
    if isinstance(row['title'], float):
        row['title'] = random.choice(list(cat2idx.keys()))
        return row
    row['title'] = process(row['title'])
    return row

def df_process_labels(row):
    row['category'] = process(row['category'])
    # map it to an integer
    row['category'] = cat2idx[row['category']]
    return row

# process the fields
df_train = df_train.apply(df_process_data, axis=1)
# process the labels
df_train = df_train.apply(df_process_labels, axis=1)
# process the data is the test split
df_test = df_test.apply(df_process_data, axis=1)

In [68]:
from sklearn.model_selection import train_test_split
train_data, val_data = train_test_split(df_train, test_size=0.15, stratify=df_train['category'], random_state=69)

# Embeddings

In [69]:
# in the rest of the code I will be using the d
import torch
from transformers import AutoModel, AutoTokenizer
NOTEBOOK_DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
CHECKPOINT = 'distilbert-base-uncased' # let's keep it simple as for the first iteration
MODEL = AutoModel.from_pretrained(CHECKPOINT).to(NOTEBOOK_DEVICE)
TOKENIZER = AutoTokenizer.from_pretrained(CHECKPOINT)

# Train Loaders

In [70]:
from torch.utils.data import DataLoader, Dataset

def collate_function(batch: List[str]):
    # batch will represent a list of tuples (text, category) 
    x, y = [list(row) for row in zip(*batch)]
    # convert both labels and data to tensors
    y_tensor = torch.FloatTensor(y).to(device=NOTEBOOK_DEVICE)
    embeddings = MODEL(**TOKENIZER(x, padding=True, return_tensors='pt').to(NOTEBOOK_DEVICE)).last_hidden_state # make sure to return tensors
    return embeddings.to(NOTEBOOK_DEVICE), y_tensor

# let's create a dataset object really quick:
class LabeledReviewDS(Dataset):
    def __init__(self, data: pd.DataFrame) -> None:
        super().__init__()
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index) -> tuple[str, int]:
        return tuple(self.data.iloc[index, :2])

# let's set the random seed

torch.manual_seed(69)

train_ds = LabeledReviewDS(train_data)
val_ds = LabeledReviewDS(val_data)

# create the dataloaders
train_dl = DataLoader(dataset=train_ds, batch_size=32, shuffle=True, collate_fn=collate_function, drop_last=True)
val_dl = DataLoader(dataset=val_ds, batch_size=32, shuffle=False, collate_fn=collate_function)

# Training code:

In [71]:
import warnings
import random
import torch

from typing import Union, Dict, Tuple
from torch import nn
from torch.utils.data import DataLoader
from torch.optim import lr_scheduler

def get_default_device():
    return 'cuda' if torch.cuda.is_available() else 'cpu' 

ACCURACY = 'accuracy'


def accuracy(y_pred: torch.tensor, y: torch.tensor) -> float:
    # squeeze values if needed
    value = (y_pred == y).type(torch.float32).mean().item()
    return value

def _set_default_parameters(device: str = None,
                            metrics: Union[str, Dict[str, callable]] = None
                            ) -> Tuple[str, Dict[str, callable]]:
    # set default arguments
    device = get_default_device() if device is None else device

    # a None 'metrics' will map only to accuracy
    metrics = {ACCURACY: accuracy} if metrics is None else metrics

    return device, metrics


In [72]:
TRAIN_LOSS = 'train_loss'
VAL_LOSS = 'val_loss'
TEST_LOSS = 'test_loss'

OPTIMIZER = 'optimizer'
SCHEDULER = 'scheduler'
OUTPUT_LAYER = 'output_layer'
LOSS_FUNCTION = 'loss_function'
METRICS = 'metrics'
MIN_TRAIN_LOSS = 'min_train_loss'
MIN_VAL_LOSS = 'min_val_loss'
MAX_EPOCHS = 'max_epochs'
DEVICE = 'device'
PROGRESS = 'progress'
REPORT_EPOCH = 'report_epoch'

# the number of epochs to discard before considering the best model
MIN_EVALUATION_EPOCH = 'min_evaluation_epoch'

# if the model does not reach a lower training loss than the current lowest loss after 'n' consecutive epochs,
# the training will stop
NO_IMPROVE_STOP = 'no_improve_stop'
DEBUG = 'debug'

In [73]:
def train_per_epoch(model: nn.Module,
                    train_dataloader: DataLoader[torch.tensor],
                    loss_function: nn.Module,
                    optimizer: torch.optim.Optimizer,
                    output_layer: Union[nn.Module, callable],
                    scheduler: lr_scheduler,
                    device: str = None,
                    metrics: Union[str, Dict[str, callable]] = None,
                    debug: bool = False
                    ) -> Dict[str, float]:

    # set the default arguments
    device, metrics = _set_default_parameters(device, metrics)

    # set the model to correct device and state
    model.train()
    model.to(device)
    # set the train loss and metrics
    train_loss, train_metrics = 0, dict([(name, 0) for name, _ in metrics.items()])

    # make sure to set the `drop_last` field in the dataloader to True,
    # as it might affect the metrics
    if hasattr(train_dataloader, 'drop_last'):
        if not train_dataloader.drop_last:
            raise ValueError(f"Please make sure to set the parameter 'drop_last' in the dataloader"
                             f"to {True} to avoid any misleading decrease in performance")

    # make sure the train_dataloader shuffles the data
    if hasattr(train_dataloader, 'shuffle'):
        train_dataloader.shuffle = True

    for _, (x, y) in enumerate(train_dataloader):
        # depending on the type of the dataset and the dataloader, the labels can be either 1 or 2 dimensional tensors
        # the first step is to squeeze them
        y = torch.squeeze(y, dim=-1)
        # THE LABELS MUST BE SET TO THE LONG DATATYPE
        x, y = x.to(device), y.to(torch.long).to(device)
        # pass the 1-dimensional label tensor to the loss function. In case the loss function expects 2D tensors, then
        # the exception will be caught and the extra dimension will be added
        y_pred = model(x)
        try:
            batch_loss = loss_function(y_pred, y)
        except RuntimeError:
            # un-squeeze the y
            new_y = torch.unsqueeze(y, dim=-1).to(torch.long).to(device)
            warnings.warn(
                f"An extra dimension has been added to the labels vectors"
                f"\nold shape: {y.shape}, new shape: {new_y.shape}")
            batch_loss = loss_function(y_pred, new_y)

        train_loss += batch_loss.item()
        optimizer.zero_grad()
        batch_loss.backward()
        # optimizer's step
        optimizer.step()

        y_pred_class = output_layer(y_pred)

        # calculate the metrics for
        for metric_name, metric_func in metrics.items():
            train_metrics[metric_name] += metric_func(y_pred_class, y)

    # update the learning rate at the end of each epoch
    if scheduler is not None:
        scheduler.step()

    # average the loss and the metrics
    # make sure to add the loss before averaging the 'train_loss' variable
    train_metrics[TRAIN_LOSS] = train_loss
    for metric_name, _ in train_metrics.items():
        train_metrics[metric_name] /= len(train_dataloader)

    return train_metrics


In [74]:
def val_per_epoch(model: nn.Module,
                  dataloader: DataLoader[torch.tensor],
                  loss_function: nn.Module,
                  output_layer: Union[nn.Module, callable],
                  device: str = None,
                  metrics: Union[str, Dict[str, callable]] = None,
                  debug: bool = False
                  ) -> Dict[str, float]:
    """
    This function evaluates a given model on a given test split of a dataset
    Args:
        debug:
        model: the given model
        dataloader: the loader guaranteeing access to the test split
        loss_function: the loss function the model tries to minimize on the test split
        output_layer: The model is assumed to output logits, this layer converts them to labels (to compute metrics)
        metrics: defaults to only accuracy
        device: The device on which the model will run
    Returns: A dictionary with loss value on the training data as well as the different given metrics
    """
    # set the default arguments
    device, metrics = _set_default_parameters(device, metrics)

    val_loss, val_metrics = 0, dict([(name, 0) for name, _ in metrics.items()])

    # set the model to the correct device and state
    model.eval()
    model.to(device)

    # Turn on inference context manager
    with torch.inference_mode():
        # Loop through DataLoader batches
        for _, (x, y) in enumerate(dataloader):
            # depending on the type of the dataset and the dataloader, the labels can be either 1 or 2 dimensional tensors
            # the first step is to squeeze them
            y = torch.squeeze(y, dim=-1)
            # THE LABELS MUST BE SET TO THE LONG DATATYPE
            x, y = x.to(device), y.to(torch.long).to(device)
            y_pred = model(x)
            
            # calculate the loss, and backprop
            try:
                loss = loss_function(y_pred, y)
            except RuntimeError:
                # un-squeeze the y
                new_y = torch.unsqueeze(y, dim=-1)
                warnings.warn(
                    f"An extra dimension has been added to the labels vectors"
                    f"\nold shape: {y.shape}, new shape: {new_y.shape}")
                loss = loss_function(y_pred, new_y.float())

            val_loss += loss.item()

            predictions = output_layer(y_pred)

            for metric_name, metric_func in metrics.items():
                val_metrics[metric_name] += metric_func(y, predictions)

    # make sure to add the loss without averaging the 'val_loss' variable
    val_metrics['val_loss'] = val_loss
    for name, metric_value in val_metrics.items():
        val_metrics[name] = metric_value / len(dataloader)

    return val_metrics


In [75]:
from typing import Any, List, Optional
##################################################################################################################
# UTILITY TRAINING FUNCTIONS:

# let's define a function to validate the passed training configuration
def _validate_training_configuration(train_configuration: Dict) -> Dict[str, Any]:
    # first step: extract the necessary parameters for the training: optimizer and scheduler
    optimizer = train_configuration.get(OPTIMIZER, None)
    scheduler = train_configuration.get(SCHEDULER, None)

    # set the default multi-class classification loss
    train_configuration[LOSS_FUNCTION] = train_configuration.get(LOSS_FUNCTION, nn.CrossEntropyLoss())

    # the default output layer: argmax: since only the default loss expects logits: the predictions need hard labels
    def default_output(x: torch.Tensor) -> torch.Tensor:
        return x.argmax(dim=-1)

    train_configuration[OUTPUT_LAYER] = train_configuration.get(OUTPUT_LAYER, default_output)

    necessary_training_params = [(OPTIMIZER, optimizer),
                                 (SCHEDULER, scheduler)]

    # make sure these parameters are indeed passed to the train_model function
    for name, tp in enumerate(necessary_training_params):
        if tp is None:
            raise TypeError(f"The argument {name} is expected to be passed as non-None to the configuration\n"
                            f"Found: {type(tp)}")

    # set the default parameters
    train_configuration[METRICS] = train_configuration.get(METRICS, {ACCURACY: accuracy})
    train_configuration[MIN_TRAIN_LOSS] = train_configuration.get(MIN_TRAIN_LOSS, None)
    train_configuration[MIN_VAL_LOSS] = train_configuration.get(MIN_VAL_LOSS, None)
    train_configuration[MAX_EPOCHS] = train_configuration.get(MAX_EPOCHS, 50)
    train_configuration[MIN_EVALUATION_EPOCH] = train_configuration.get(MIN_EVALUATION_EPOCH,
                                                                           train_configuration[MAX_EPOCHS] // 10)

    train_configuration[DEVICE] = train_configuration.get(DEVICE, get_default_device())
    train_configuration[PROGRESS] = train_configuration.get(PROGRESS, True)
    train_configuration[REPORT_EPOCH] = train_configuration.get(REPORT_EPOCH, None)
    # the default value will be set to 5% of the max number of epochs
    train_configuration[NO_IMPROVE_STOP] = train_configuration.get(NO_IMPROVE_STOP,
                                                                      train_configuration[MAX_EPOCHS] * 0.15)

    train_configuration[DEBUG] = train_configuration.get(DEBUG, False)

    return train_configuration


In [76]:
from torch.utils.tensorboard import SummaryWriter

def _track_performance(performance_dict: Dict[str, List[float]],
                       train_loss: float,
                       val_loss: float,
                       train_metric: Dict[str, float],
                       val_metrics: Dict[str, float]) -> None:
    # add the losses first
    performance_dict[TRAIN_LOSS].append(train_loss)
    performance_dict[VAL_LOSS].append(val_loss)

    # update train metrics
    for metric_name, metric_value in train_metric.items():
        performance_dict[f'train_{metric_name}'].append(metric_value)

    # update val metrics
    for metric_name, metric_value in val_metrics.items():
        performance_dict[f'val_{metric_name}'].append(metric_value)


def _set_summary_writer(writer: SummaryWriter,
                        epoch_train_loss,
                        epoch_val_loss,
                        epoch_train_metrics,
                        epoch_val_metrics,
                        epoch) -> None:
    # track loss results
    writer.add_scalars(main_tag='Loss',
                       tag_scalar_dict={"train_loss": epoch_train_loss, 'val_loss': epoch_val_loss},
                       global_step=epoch)

    for name, m in epoch_train_metrics.items():
        writer.add_scalars(main_tag=name,
                           tag_scalar_dict={f"train_{name}": m, f"val_{name}": epoch_val_metrics[name]},
                           global_step=epoch)

    writer.close()

In [77]:
from datetime import datetime
import re

def default_file_name(hour_ok: bool = True,
                      minute_ok: bool = True):
    # Get timestamp of current date (all experiments on certain day live in same folder)
    current_time = datetime.now()
    current_hour = current_time.hour
    current_minute = current_time.minute
    timestamp = datetime.now().strftime("%Y-%m-%d")  # returns current date in YYYY-MM-DD format
    # now it is much more detailed: better tracking
    timestamp += f"-{(current_hour if hour_ok else '')}-{current_minute if minute_ok else ''}"

    # make sure to remove any '-' left at the end
    timestamp = re.sub(r'-+$', '', timestamp)
    return timestamp


In [78]:
def abs_path(path: Union[str, Path]) -> Path:
    return Path(path) if os.path.isabs(path) else Path(os.path.join(os.getcwd(), path))

def process_save_path(save_path: Union[str, Path, None],
                      dir_ok: bool = True,
                      file_ok: bool = True,
                      condition: callable = None,
                      error_message: str = 'error!!') -> Union[str, Path, None]:
    if save_path is not None:
        # first make the save_path absolute
        save_path = abs_path(save_path)
        assert not \
            ((not file_ok and os.path.isfile(save_path)) or
             (not dir_ok and os.path.isdir(save_path))), \
            f'MAKE SURE NOT TO PASS A {"directory" if not dir_ok else "file"}'

        assert condition is None or condition(save_path), error_message

        # create the directory if needed
        if not os.path.isfile(save_path):
            os.makedirs(save_path, exist_ok=True)

    return save_path


In [79]:
def __verify_extension(p):
    return os.path.basename(p).endswith('.pt') or os.path.basename(p).endswith('.pth')


def save_model(model: nn.Module, path: Union[str, Path] = None) -> None:
    # the time of saving the model
    now = datetime.now()
    file_name = "-".join([str(now.month), str(now.day), str(now.hour), str(now.minute)])
    # add the extension
    file_name += '.pt'

    # first check if the path variable is None:
    path = path if path is not None else os.path.join(os.getcwd(), file_name)

    # process the path
    path = process_save_path(path,
                             dir_ok=True,
                             file_ok=True,
                             condition=lambda p: not os.path.isfile(p) or __verify_extension(p),
                             error_message='MAKE SURE THE FILE PASSED IS OF THE CORRECT EXTENSION')

    if os.path.isdir(path):
        path = os.path.join(path, file_name)

    # finally save the model.
    torch.save(model.state_dict(), path)


In [80]:
import pickle
def create_summary_writer(parent_dir: Union[str, Path],
                          experiment_name: str = None,
                          model_name: str = None,
                          return_path: bool = False) -> Union[SummaryWriter, tuple[SummaryWriter, Path]]:
    timestamp = default_file_name()
    # process the parent_dir first
    parent_dir = process_save_path(parent_dir, file_ok=False, dir_ok=True)

    # set the default values
    experiment_name = experiment_name if experiment_name is not None else f'experience_{len(os.listdir(parent_dir))}'
    exp_dir = os.path.join(parent_dir, experiment_name)

    # create the directory if needed
    os.makedirs(exp_dir, exist_ok=True)
    model_name = model_name if model_name is not None else f'experience_{len(os.listdir(parent_dir))}'

    log_dir = os.path.join(parent_dir, experiment_name, model_name, timestamp)
    os.makedirs(log_dir, exist_ok=True)

    print(f"[INFO] Created SummaryWriter, saving to: {log_dir}...")
    if return_path:
        return SummaryWriter(log_dir=log_dir), Path(log_dir)

    return SummaryWriter(log_dir=log_dir)


def save_info(save_path: Union[Path, str],
              details: dict[str, object],
              details_folder: str = 'details'):
    save_path = process_save_path(os.path.join(save_path, details_folder), dir_ok=True, file_ok=False)

    for name, obj in details.items():
        p = os.path.join(save_path, (name + '.pkl'))
        with open(p, 'wb') as f:
            pickle.dump(obj, f)


In [81]:
def _report_performance(train_loss: float,
                        val_loss: float,
                        train_metrics: Dict[str, float],
                        val_metrics: Dict[str, float]) -> None:
    print("#" * 25)
    print(f"training loss: {train_loss}")

    for metric_name, metric_value in train_metrics.items():
        print(f"train_{metric_name}: {metric_value}")

    print(f"validation loss : {val_loss}")
    for metric_name, metric_value in val_metrics.items():
        print(f"val_{metric_name}: {metric_value}")
    print("#" * 25)

In [82]:
from tqdm import tqdm
# THE MAIN TRAINING FUNCTION:
def train_model(model: nn.Module,
                train_dataloader: DataLoader[torch.Tensor],
                test_dataloader: DataLoader[torch.Tensor],
                train_configuration: Dict,
                log_dir: Optional[Union[Path, str]] = None,
                save_path: Optional[Union[Path, str]] = None,
                ):
    # set the default parameters
    train_configuration = _validate_training_configuration(train_configuration)

    save_path = save_path if save_path is not None else log_dir

    performance_dict = {TRAIN_LOSS: [],
                        VAL_LOSS: []}

    metrics = train_configuration[METRICS]

    # save 2 copies: val and train for each metric
    for name, _ in metrics.items():
        performance_dict[f'train_{name}'] = []
        performance_dict[f'val_{name}'] = []

    # best_model, best_loss = None, None
    min_training_loss, no_improve_counter, best_model = float('inf'), 0, None

    # in addition to the model save all the details:
    # build the details:
    details = {OPTIMIZER: train_configuration[OPTIMIZER],
               SCHEDULER: train_configuration[SCHEDULER],
               MAX_EPOCHS: train_configuration[MAX_EPOCHS],
               MIN_TRAIN_LOSS: train_configuration[MIN_TRAIN_LOSS],
               MIN_VAL_LOSS: train_configuration[MIN_VAL_LOSS]}

    # before proceeding with the training, let's set the summary writer
    writer = None if log_dir is None else create_summary_writer(log_dir)

    for epoch in tqdm(range(train_configuration[MAX_EPOCHS])):

        epoch_train_metrics = train_per_epoch(model=model,
                                              train_dataloader=train_dataloader,
                                              loss_function=train_configuration[LOSS_FUNCTION],
                                              optimizer=train_configuration[OPTIMIZER],
                                              output_layer=train_configuration[OUTPUT_LAYER],
                                              scheduler=train_configuration[SCHEDULER],
                                              device=train_configuration[DEVICE],
                                              debug=train_configuration[DEBUG])

        epoch_val_metrics = val_per_epoch(model=model, dataloader=test_dataloader,
                                          loss_function=train_configuration[LOSS_FUNCTION],
                                          output_layer=train_configuration[OUTPUT_LAYER],
                                          device=train_configuration[DEVICE],
                                          debug=train_configuration[DEBUG])

        epoch_train_loss = epoch_train_metrics[TRAIN_LOSS]
        del (epoch_train_metrics[TRAIN_LOSS])

        epoch_val_loss = epoch_val_metrics[VAL_LOSS]
        del (epoch_val_metrics[VAL_LOSS])

        no_improve_counter = no_improve_counter + 1 if min_training_loss < epoch_train_loss else 0

        if min_training_loss > epoch_train_loss:
            # save the model with the lowest training error
            min_training_loss = epoch_train_loss
            best_model = model

        if (train_configuration[REPORT_EPOCH] is not None
                and epoch % train_configuration[REPORT_EPOCH] == 0):
            _report_performance(epoch_train_loss,
                                epoch_val_loss,
                                epoch_train_metrics,
                                epoch_val_metrics)

        # save the model's performance for this epoch
        _track_performance(performance_dict=performance_dict,
                           train_loss=epoch_train_loss,
                           val_loss=epoch_val_loss,
                           train_metric=epoch_train_metrics,
                           val_metrics=epoch_val_metrics)

        _set_summary_writer(writer,
                            epoch_train_loss=epoch_train_loss,
                            epoch_val_loss=epoch_val_loss,
                            epoch_train_metrics=epoch_train_metrics,
                            epoch_val_metrics=epoch_val_metrics,
                            epoch=epoch
                            )

        # check if the losses reached the minimum thresholds
        if ((train_configuration[MIN_TRAIN_LOSS] is not None and
             train_configuration[MIN_TRAIN_LOSS] >= epoch_train_loss) or

                (train_configuration[MIN_VAL_LOSS] is not None
                 and train_configuration[MIN_VAL_LOSS] >= epoch_val_loss)):
            # the first state that reaches lower scores than the specified thresholds
            # is consequently the model's best state
            break

        # abort training if the training loss did not decrease 
        if no_improve_counter >= train_configuration[NO_IMPROVE_STOP]:
            warnings.warn(f"The training loss did not improve for {no_improve_counter} consecutive epochs."
                          f"\naborting training!!", category=RuntimeWarning)
            break

    save_info(save_path=log_dir, details=details)
    save_model(best_model, path=save_path)
    return performance_dict



In [83]:
def load_model(base_model: nn.Module,
               path: Union[str, Path]) -> nn.Module:
    # first process the path
    path = process_save_path(path,
                             dir_ok=False,
                             file_ok=True,
                             condition=lambda p: not os.path.isfile(p) or __verify_extension(p),
                             error_message='MAKE SURE THE FILE PASSED IS OF THE CORRECT EXTENSION')

    base_model.load_state_dict(torch.load(path))

    return base_model


# Train A model

In [84]:
from torch import nn
from torch.nn.functional import leaky_relu

class SeqClassModel(nn.Module):
    def __init__(self, 
                in_features: int,
                hidden_size: int, 
                num_classes: int, 
                num_layers: int = 2, 
                dropout: float=0.25, 
                *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.output_units = num_classes if num_classes > 2 else 1
        self.rnn = nn.LSTM(input_size=in_features, 
                           hidden_size=hidden_size, 
                           dropout=dropout, 
                           num_layers=num_layers,
                           bidirectional=True, # bidiretional RNN are more powerful
                           batch_first=True # easier manipulation
                           )
        # 2: comes from the fact that the lstm is bidirectional, the rest is similar to the LSTM documention Pytorch
        linear_input_dim = 2 * num_layers * hidden_size 
        self.batch_layer= nn.BatchNorm1d(num_features=linear_input_dim)
        # self.relu_layer = nn.LeakyReLU()
        self.head = nn.Linear(in_features=linear_input_dim, out_features=self.output_units)
        
    def forward(self, x: torch.Tensor):
        # first pass it through the rnn
        _, (hidden_state, _) = self.rnn(x)
        batch_size = hidden_state.shape[1]
        # first permuting channels: batch_size as dimensions '0' 
        # only only the last lstm layer
        hidden_state = hidden_state.permute((1, 0, 2)).reshape((batch_size, -1))
        return self.head.forward(self.batch_layer(hidden_state))


In [85]:
from torch.optim import AdamW
from torch.optim.lr_scheduler import LinearLR
from torchmetrics.classification import MulticlassF1Score, MulticlassAccuracy

base_model = SeqClassModel(in_features=768, hidden_size=128, num_classes=6)
optimizer = AdamW(base_model.parameters(), lr=0.01)
scheduler = LinearLR(optimizer, start_factor=1.0, end_factor=0.005, total_iters=100)

accuracy_metric, f1_metric = MulticlassAccuracy(num_classes=6), MulticlassF1Score(num_classes=6)

metrics = {'accuracy': accuracy_metric, 'f1_score': f1_metric}


train_configuration = {'optimizer': optimizer,
                        'scheduler': scheduler,
                        'min_val_loss': 10 ** -4,
                        'max_epochs': 100,
                        'report_epoch': 5,
                        'device': NOTEBOOK_DEVICE, 
                        'metrics': metrics,
                        'no_improve_stop': 30
                        }

In [86]:
# import src.pytorch_modular.image_classification.engine_classification as cls
results = train_model(base_model, train_dl, val_dl, train_configuration,    
                            log_dir=os.path.join(HOME, 'runs'),         
                            save_path=os.path.join(HOME, 'saved_models'))   

[INFO] Created SummaryWriter, saving to: /home/ayhem18/DEV/My_Kaggle_Repo/amazon_reviews/runs/experience_19/experience_19/2023-09-19-18-21...


 50%|█████     | 1/2 [00:54<00:54, 54.16s/it]

#########################
training loss: 0.6592031277982753
train_accuracy: 0.7730402542372882
validation loss : 0.5822132710129657
val_accuracy: 0.8122387921556513
#########################


100%|██████████| 2/2 [01:48<00:00, 54.21s/it]

#########################
training loss: 0.4895275359913679
train_accuracy: 0.832391713747646
validation loss : 0.4638859727122682
val_accuracy: 0.8458634120352725
#########################





# Inference Code

In [88]:
# time to set the inference part of the script
_VALID_RETURN_TYPES = ['np', 'pt', 'list']
# relatively small test splits (that can fit th memory)
from torchvision import transforms as tr


class InferenceDirDataset(Dataset):
    def __init__(self,
                 test_dir: Union[str, Path],
                 transformations: tr) -> None:
        # the usual base class constructor call
        super().__init__()
        test_data_path = process_save_path(test_dir, file_ok=False, dir_ok=True)
        self.data = [os.path.join(test_data_path, file_name) for file_name in os.listdir(test_data_path)]
        self.t = transformations

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index) -> int:
        # don't forget to apply the transformation before returning the index-th element in the directory
        return self.t(Image.open(self.data[index]))


In [89]:
def _set_inference_loader(inference_source_data: Union[DataLoader[torch.tensor], Path, str],
                          transformations: tr = None) -> DataLoader:
    # the input to this function should be validated
    if isinstance(inference_source_data, (Path, str)):

        warnings.warn(f"The inference source data was passed as a path to a directory..."
                      f"\nBuilding the dataloader", category=RuntimeWarning)

        # make sure the transformations argument is passed
        if transformations is None:
            raise TypeError("The 'transformations' argument must be passed if the data source is a directory"
                            f"\nFound: {transformations}")
        ds = InferenceDirDataset(inference_source_data, transformations)
        dataloader = DataLoader(ds,
                                batch_size=100,
                                shuffle=False,  # shuffle false to keep the original order of the test-split samples
                                num_workers=os.cpu_count() // 2)
        return dataloader

    return inference_source_data


In [90]:
import itertools
from torchvision import transforms as tr

def inference(model: nn.Module,
              inference_source_data: Union[DataLoader, Path, str],
              transformation: tr = None,
              output_layer: Union[nn.Module, callable] = None,
              device: str = None,
              return_tensor: str = 'np'
              ) -> Union[np.ndarray, torch.tensor, List[int]]:
    # first let's make sure our loader is set
    loader = _set_inference_loader(inference_source_data,
                                   transformation)

    device = get_default_device() if device is None else device
    # make sure the return_tensor argument is a set to a valid value
    if return_tensor not in _VALID_RETURN_TYPES:
        raise ValueError(f'the `return_tensor` argument is expected to be among {_VALID_RETURN_TYPES}\n'
                         f'found: {return_tensor}')

    def default_output(x: torch.Tensor):
        return x.argmax(dim=-1)

    # the default output layer is the softmax layer: (reduced to argmax)
    output_layer = default_output if output_layer is None else output_layer
    # set to the inference mode
    model.eval()
    model.to(device)

    with torch.inference_mode():
        result = [output_layer(model.forward(X.to(device))) for X in loader]

    # now we have a list of pytorch tensors
    if return_tensor == 'pt':
        res = torch.stack(result)
        res = torch.squeeze(res, dim=-1)

    else:
        # convert res to a list of lists
        res = [torch.squeeze(r, dim=-1).cpu().tolist() for r in result]
        # flatten the list
        res = list(itertools.chain(*res))
        res = np.asarray(res) if return_tensor == 'np' else res
    return res


# Inference 

In [92]:
# let's make the damn submission
from src.pytorch_modular.pytorch_utilities import load_model
# base_model = SeqClassModel(in_features=768, hidden_size=128, num_classes=6)
# base_model = load_model(base_model=base_model, path=os.path.join(HOME, 'saved_models', '9-17-15-10.pt'))
# let's create a dataset object really quick:
class TestReviewDS(Dataset):
    def __init__(self, data: pd.DataFrame) -> None:
        super().__init__()
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index) -> tuple[str, int]:
        return self.data.iloc[index, 1]

# we need a different callate_function
def test_collate_function(batch):
    embeddings = MODEL(**TOKENIZER(batch, padding=True, return_tensors='pt').to(NOTEBOOK_DEVICE)).last_hidden_state # make sure to return tensors
    return embeddings.to(NOTEBOOK_DEVICE)
    
# let's set the random seed

torch.manual_seed(69)

test_ds = TestReviewDS(data=df_test)
test_loader = DataLoader(test_ds, batch_size=32, shuffle=False, collate_fn=test_collate_function)
# next(iter(test_loader)).shape
predictions = inference(base_model, inference_source_data=test_loader, return_tensor='list')
# convert the numerical labels to the string ones
predictions = [idx2cat[p] for p in predictions]

In [93]:
submission = pd.DataFrame(data={"id": df_test['id'].tolist(), "Category": predictions})
sub_dir = os.path.join(HOME, 'submissions')
submission.to_csv(os.path.join(sub_dir, f'sub_{len(os.listdir(sub_dir)) + 1}.csv'), index=False)