# Data Imports

In [1]:
# let's start with the data and see how it goes
import os
import pandas as pd
# every path below is resolved relative to the directory the notebook is started from
HOME = os.getcwd()
train_csv = os.path.join(HOME, 'data', 'train.csv')
test_csv = os.path.join(HOME, 'data', 'test.csv')

# for each split: read the csv, lower-case the column names,
# then discard the 'helpfulness' and 'score' columns, which are not used afterwards
df_train = (
    pd.read_csv(train_csv)
    .rename(columns=str.lower)
    .drop(columns=['helpfulness', 'score'])
)
df_test = (
    pd.read_csv(test_csv)
    .rename(columns=str.lower)
    .drop(columns=['helpfulness', 'score'])
)

In [2]:
# add a small piece of code to call the pytorch_modular code
from pathlib import Path
import sys

# Walk up from the working directory until a folder holding a 'src' entry is found,
# then expose both that folder and its 'src' sub-folder to the import system.
current = Path(HOME)
while 'src' not in os.listdir(current):
    if current.parent == current:
        # the filesystem root is its own parent: stop here instead of looping forever
        raise FileNotFoundError(f"no 'src' entry found in {HOME} or any of its parent folders")
    current = current.parent

for import_root in (str(current), os.path.join(current, 'src')):
    # re-running the cell must not pile up duplicate entries
    if import_root not in sys.path:
        sys.path.append(import_root)

In [3]:
df_train.head()

Unnamed: 0,title,text,category
0,Golden Valley Natural Buffalo Jerky,The description and photo on this product need...,grocery gourmet food
1,Westing Game,This was a great book!!!! It is well thought t...,toys games
2,Westing Game,"I am a first year teacher, teaching 5th grade....",toys games
3,Westing Game,I got the book at my bookfair at school lookin...,toys games
4,I SPY A is For Jigsaw Puzzle 63pc,Hi! I'm Martine Redman and I created this puzz...,toys games


In [4]:
df_test.head()

Unnamed: 0,id,title,text
0,0,PetSafe Staywell Pet Door with Clear Hard Flap,We've only had it installed about 2 weeks. So ...
1,1,"Kaytee Timothy Cubes, 1-Pound",My bunny had a hard time eating this because t...
2,2,Body Back Buddy,would never in a million years have guessed th...
3,3,SnackMasters California Style Turkey Jerky,"Being the jerky fanatic I am, snackmasters han..."
4,4,Premier Busy Buddy Tug-a-Jug Treat Dispensing ...,Wondered how quick my dog would catch on to th...


In [5]:
import nltk 
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

# reading the corpus raises LookupError when it is not available locally;
# in that case download it once and read it again
try:
    english_stop_words = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    english_stop_words = stopwords.words('english')
STOP_WORDS = list(english_stop_words)

In [6]:
# preprocessing functions
import re
from typing import List

def to_lower(text: str) -> str:
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

def no_extra_spaces(text: str) -> str:
    """Collapse every run of whitespace (spaces, tabs, newlines) into a single space.

    Leading and trailing whitespace is collapsed as well, but not removed.
    """
    # raw string: '\s' inside a regular string literal is an invalid escape sequence
    return re.sub(r'\s+', ' ', text)

def no_extra_chars(text: str) -> str:
    """Replace each run of disallowed characters with a single space.

    Allowed characters are ASCII letters, whitespace and the punctuation
    marks ``, ! . ; : -``; anything else (digits, symbols, ...) is replaced.
    """
    disallowed_run = re.compile(r'[^a-zA-Z\s,!.;:-]+')
    return disallowed_run.sub(' ', text)

# sample string mixing letters, digits and symbols; not referenced again in the visible cells
text = 'aaa5531--==-||"z2::,.a'

def remove_stop_words(text: str,
                      tokenizer: TweetTokenizer = None) -> str:
    """Lower-case ``text``, tokenize it and drop the tokens listed in ``STOP_WORDS``.

    Args:
        text: the string to filter.
        tokenizer: object exposing ``tokenize(str)``; a new ``TweetTokenizer``
            is created when it is left to ``None``.

    Returns:
        The remaining tokens, stripped and joined with single spaces.
    """
    lowered = to_lower(text)
    if tokenizer is None:
        tokenizer = TweetTokenizer()
    kept_tokens = []
    for token in tokenizer.tokenize(lowered):
        if token in STOP_WORDS:
            continue
        kept_tokens.append(token.strip())
    return " ".join(kept_tokens)

def process(text: str) -> str:
    """Run the full cleaning pipeline on ``text``.

    Steps, in order: replace disallowed characters, lower-case,
    remove stop words, collapse repeated whitespace.
    """
    cleaned = no_extra_chars(text)
    lowered = to_lower(cleaned)
    without_stop_words = remove_stop_words(lowered)
    return no_extra_spaces(without_stop_words)

import random
random.seed(69)
# show the effect of the cleaning pipeline on one pseudo-randomly chosen training review
example_position = int(random.random() * len(df_train))
example = df_train['text'][example_position]
print(example)
print(process(example))

# only the title is used for classification, so the review body is discarded in both splits
df_train = df_train.drop(columns=['text'])
df_test = df_test.drop(columns=['text'])

# discard the training rows that still hold a missing value
# (the original note reports 16 rows with a missing 'title')
df_train = df_train.dropna()

See the title of this review. Fortunately, I am a packrat, and kept a bunch of hole repair kits from various blow up things that we have gone through over the years. Does not come with a hole repair kit though, just to warn you. Anyway, it is back in black and bouncing our 3 year old all over the place. Indoor only, I would say. Very highly recommended, in spite of a hole within a week of use. Hope that this is the first and last one... probably not.
see title review . fortunately , packrat , kept bunch hole repair kits various blow things gone years . come hole repair kit though , warn . anyway , back black bouncing year old place . indoor , would say . highly recommended , spite hole within week use . hope first last one ... probably .


In [7]:
# per-column count of missing values for each split, separated by a banner line
train_missing = df_train.isna().sum()
test_missing = df_test.isna().sum()
print(train_missing)
print("#" * 100)
print(test_missing)

title       0
category    0
dtype: int64
####################################################################################################
id       0
title    5
dtype: int64


In [8]:
import numpy as np

# category name -> integer class index (the position in the list below)
cat2idx = {
    category: index
    for index, category in enumerate([
        'toys games',
        'health personal care',
        'beauty',
        'baby products',
        'pet supplies',
        'grocery gourmet food',
    ])
}

# integer class index -> category name, derived from the mapping above
idx2cat = {index: category for category, index in cat2idx.items()}

# making sure the dataframes are ready for training
def df_process_data(row):
    """Clean the 'title' field of one dataframe row (meant for ``DataFrame.apply(..., axis=1)``).

    A title that is a float is treated as missing (pandas stores a missing
    string as the float NaN) and is replaced by a randomly chosen category
    name used as placeholder text. Any other title goes through ``process``.

    Returns:
        A copy of ``row`` with the updated 'title'.
    """
    # work on a copy: pandas does not support functions that mutate the object handed to apply
    row = row.copy()
    if isinstance(row['title'], float):
        row['title'] = random.choice(list(cat2idx.keys()))
        return row
    row['title'] = process(row['title'])
    return row

def df_process_labels(row):
    """Turn the 'category' field of one dataframe row into its integer class index.

    The category string is first cleaned with ``process`` and then looked up
    in ``cat2idx``.

    Returns:
        A copy of ``row`` whose 'category' is the integer index.

    Raises:
        KeyError: if the cleaned category is not a key of ``cat2idx``.
    """
    # work on a copy: pandas does not support functions that mutate the object handed to apply
    row = row.copy()
    cleaned_category = process(row['category'])
    # map it to an integer
    row['category'] = cat2idx[cleaned_category]
    return row

# training split: clean the titles first, then convert the labels to class indices
df_train = df_train.apply(df_process_data, axis=1).apply(df_process_labels, axis=1)
# test split: only the titles are processed
df_test = df_test.apply(df_process_data, axis=1)

In [9]:
from sklearn.model_selection import train_test_split
# hold out 15% of the labelled rows for validation, keeping the category proportions
train_data, val_data = train_test_split(
    df_train,
    test_size=0.15,
    stratify=df_train['category'],
    random_state=69,
)

# Embeddings

In [10]:
# in the rest of the code I will be using the DistilBERT checkpoint loaded below
import torch
from transformers import AutoModel, AutoTokenizer
# run on the GPU whenever one is visible to torch
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

CHECKPOINT = 'distilbert-base-uncased' # let's keep it simple as for the first iteration
# tokenizer and encoder are loaded from the same checkpoint; the encoder is moved to DEVICE
TOKENIZER = AutoTokenizer.from_pretrained(CHECKPOINT)
MODEL = AutoModel.from_pretrained(CHECKPOINT).to(DEVICE)

# Train Loaders

In [11]:
from torch.utils.data import DataLoader, Dataset

def collate_function(batch: List[tuple[str, int]]):
    """Turn a list of (text, category) samples into (embeddings, labels) tensors.

    The texts are tokenized with ``TOKENIZER`` (padded to the longest one in
    the batch) and encoded with ``MODEL``; its ``last_hidden_state`` is returned
    as the batch features.

    Args:
        batch: list of (text, category) tuples, as produced by the dataset.

    Returns:
        A tuple ``(embeddings, y_tensor)``, both on ``DEVICE``.
        NOTE(review): the labels are built as a float tensor, as before; confirm
        the training engine casts them to the dtype its loss expects.
    """
    x, y = [list(column) for column in zip(*batch)]
    # convert both labels and data to tensors
    y_tensor = torch.FloatTensor(y).to(device=DEVICE)
    encoded = TOKENIZER(x, padding=True, return_tensors='pt').to(DEVICE) # make sure to return tensors
    # the encoder only serves as a feature extractor here: without no_grad every batch
    # would build (and later back-propagate through) an autograd graph over the whole encoder
    with torch.no_grad():
        embeddings = MODEL(**encoded).last_hidden_state
    return embeddings.to(DEVICE), y_tensor

# let's create a dataset object really quick:
class LabeledReviewDS(Dataset):
    """Dataset view over a labelled dataframe.

    Each item is the tuple made of the first two columns of the requested row;
    in this notebook those columns are the title and the category index.
    """

    def __init__(self, data: pd.DataFrame) -> None:
        super().__init__()
        # the dataframe is stored as is, no copy is made
        self.data = data

    def __len__(self):
        """Number of rows of the wrapped dataframe."""
        return self.data.shape[0]

    def __getitem__(self, index) -> tuple[str, int]:
        """Return the first two columns of the row at position ``index`` as a tuple."""
        leading_pair = self.data.iloc[index, :2]
        return tuple(leading_pair)

# seed torch before the loaders are built
torch.manual_seed(69)

train_ds, val_ds = LabeledReviewDS(train_data), LabeledReviewDS(val_data)

# training batches are shuffled and the last incomplete batch is dropped;
# validation batches keep the original order and every sample
train_dl = DataLoader(
    dataset=train_ds,
    batch_size=32,
    shuffle=True,
    drop_last=True,
    collate_fn=collate_function,
)
val_dl = DataLoader(
    dataset=val_ds,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_function,
)

In [12]:
# next(iter(train_dl))
# looks like our data is loaded and ready to go, time to build a model!!

# Train a Model

In [13]:
from torch import nn
from torch.nn.functional import leaky_relu

class SeqClassModel(nn.Module):
    """Sequence classifier: bidirectional LSTM -> batch norm -> linear head.

    The final hidden states of every LSTM layer and of both directions are
    concatenated, normalized and mapped to the output scores.

    Args:
        in_features: size of each element of the input sequence.
        hidden_size: hidden size of the LSTM (per layer and per direction).
        num_classes: number of classes; with 2 classes or fewer a single output unit is used.
        num_layers: number of stacked LSTM layers.
        dropout: dropout applied between stacked LSTM layers (ignored with a single layer).
        *args, **kwargs: forwarded to ``nn.Module.__init__``.
    """

    def __init__(self, 
                in_features: int,
                hidden_size: int, 
                num_classes: int, 
                num_layers: int = 2, 
                dropout: float=0.25, 
                *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.output_units = num_classes if num_classes > 2 else 1
        self.rnn = nn.LSTM(input_size=in_features, 
                           hidden_size=hidden_size, 
                           # nn.LSTM only applies dropout between stacked layers: with a single
                           # layer a non-zero value has no effect and only triggers a warning
                           dropout=dropout if num_layers > 1 else 0.0, 
                           num_layers=num_layers,
                           bidirectional=True, # bidirectional RNNs are more powerful
                           batch_first=True # easier manipulation
                           )
        # 2: comes from the fact that the lstm is bidirectional; the final hidden state
        # holds one vector of size hidden_size per layer and per direction
        linear_input_dim = 2 * num_layers * hidden_size 
        self.batch_layer= nn.BatchNorm1d(num_features=linear_input_dim)
        self.head = nn.Linear(in_features=linear_input_dim, out_features=self.output_units)
        
    def forward(self, x: torch.Tensor):
        """Compute the raw output scores for a batch of sequences.

        Args:
            x: batch-first input; assumed to be (batch, seq_len, in_features) - TODO confirm with the loaders.

        Returns:
            Tensor of shape (batch, output_units); no activation is applied.
        """
        # only the final hidden state is used: (2 * num_layers, batch, hidden_size)
        _, (hidden_state, _) = self.rnn(x)
        batch_size = hidden_state.shape[1]
        # put the batch dimension first, then flatten the states of all layers and
        # directions into one feature vector per sample
        hidden_state = hidden_state.permute((1, 0, 2)).reshape((batch_size, -1))
        # call the modules themselves (not .forward) so that registered hooks are executed
        return self.head(self.batch_layer(hidden_state))


In [16]:
from torch.optim import AdamW
from torch.optim.lr_scheduler import LinearLR
from torchmetrics.classification import MulticlassF1Score, MulticlassAccuracy

# classifier fed with 768-dimensional embeddings, predicting one of the 6 categories
base_model = SeqClassModel(in_features=768, hidden_size=128, num_classes=6)
optimizer = AdamW(base_model.parameters(), lr=0.01)
# the learning-rate factor goes linearly from 1.0 down to 0.005 over 100 scheduler steps
scheduler = LinearLR(optimizer, start_factor=1.0, end_factor=0.005, total_iters=100)

accuracy_metric = MulticlassAccuracy(num_classes=6)
f1_metric = MulticlassF1Score(num_classes=6)
metrics = dict(accuracy=accuracy_metric, f1_score=f1_metric)

# settings handed to the project's training engine
train_configuration = dict(
    optimizer=optimizer,
    scheduler=scheduler,
    min_val_loss=10 ** -4,
    max_epochs=100,
    report_epoch=5,
    device=DEVICE,
    metrics=metrics,
    no_improve_stop=30,
)

In [17]:
import src.pytorch_modular.image_classification.engine_classification as cls
# launch the training with the project's engine; the 'runs' and 'saved_models'
# folders under HOME are passed as log and save locations
results = cls.train_model(
    base_model,
    train_dl,
    val_dl,
    train_configuration,
    log_dir=os.path.join(HOME, 'runs'),
    save_path=os.path.join(HOME, 'saved_models'),
)

[INFO] Created SummaryWriter, saving to: /home/ayhem18/DEV/My_Kaggle_Repo/amazon_reviews/runs/experience_8...


  1%|          | 1/100 [00:52<1:25:53, 52.06s/it]

#########################
training loss: 0.6558893404325747
train_accuracy: 0.7721792459487915
train_f1_score: 0.7497310638427734
validation loss : 0.49693129116550405
val_accuracy: 0.8303143382072449
val_f1_score: 0.8115997910499573
#########################


  6%|▌         | 6/100 [05:15<1:22:36, 52.73s/it]

#########################
training loss: 0.33543459741353093
train_accuracy: 0.8898731470108032
train_f1_score: 0.8712399005889893
validation loss : 0.3558555673411552
val_accuracy: 0.8865804076194763
val_f1_score: 0.8709324598312378
#########################


 11%|█         | 11/100 [09:40<1:18:31, 52.93s/it]

#########################
training loss: 0.28026878331102106
train_accuracy: 0.9068214297294617
train_f1_score: 0.8924185037612915
validation loss : 0.3465685739201751
val_accuracy: 0.8956265449523926
val_f1_score: 0.8800507187843323
#########################


 16%|█▌        | 16/100 [14:05<1:14:09, 52.97s/it]

#########################
training loss: 0.24023865084751403
train_accuracy: 0.9250503182411194
train_f1_score: 0.9102699756622314
validation loss : 0.37825810049284014
val_accuracy: 0.8886847496032715
val_f1_score: 0.8720353245735168
#########################


 21%|██        | 21/100 [18:28<1:09:24, 52.71s/it]

#########################
training loss: 0.2081629899479519
train_accuracy: 0.9302944540977478
train_f1_score: 0.9165394902229309
validation loss : 0.3073223424837627
val_accuracy: 0.9077926278114319
val_f1_score: 0.8932238221168518
#########################


 26%|██▌       | 26/100 [22:51<1:04:48, 52.54s/it]

#########################
training loss: 0.19074603160246925
train_accuracy: 0.937451958656311
train_f1_score: 0.9255355596542358
validation loss : 0.3109566688715936
val_accuracy: 0.9085302948951721
val_f1_score: 0.8967682719230652
#########################


 31%|███       | 31/100 [27:14<1:00:33, 52.66s/it]

#########################
training loss: 0.17182372456149894
train_accuracy: 0.9445618391036987
train_f1_score: 0.9313549995422363
validation loss : 0.3265013951471353
val_accuracy: 0.9078767895698547
val_f1_score: 0.8945206999778748
#########################


 36%|███▌      | 36/100 [31:38<56:07, 52.62s/it]  

#########################
training loss: 0.14859041175235013
train_accuracy: 0.9520153999328613
train_f1_score: 0.9418550729751587
validation loss : 0.29632890879354895
val_accuracy: 0.9132164716720581
val_f1_score: 0.9005258679389954
#########################


 41%|████      | 41/100 [36:01<51:47, 52.67s/it]

#########################
training loss: 0.13002125095583064
train_accuracy: 0.9565749168395996
train_f1_score: 0.9474289417266846
validation loss : 0.3078711375524785
val_accuracy: 0.909719705581665
val_f1_score: 0.8984768986701965
#########################


 46%|████▌     | 46/100 [40:24<47:19, 52.58s/it]

#########################
training loss: 0.11265406316442664
train_accuracy: 0.9635230898857117
train_f1_score: 0.9545397758483887
validation loss : 0.30427132342803354
val_accuracy: 0.9247327446937561
val_f1_score: 0.9082689881324768
#########################


 51%|█████     | 51/100 [44:46<42:50, 52.46s/it]

#########################
training loss: 0.1007115059760671
train_accuracy: 0.9663552641868591
train_f1_score: 0.9578993320465088
validation loss : 0.30731326078703786
val_accuracy: 0.9181804656982422
val_f1_score: 0.9109286665916443
#########################


 56%|█████▌    | 56/100 [49:09<38:32, 52.56s/it]

#########################
training loss: 0.08895624943045556
train_accuracy: 0.9690596461296082
train_f1_score: 0.9616992473602295
validation loss : 0.3080096492306032
val_accuracy: 0.9212034940719604
val_f1_score: 0.9120166301727295
#########################


 61%|██████    | 61/100 [53:32<34:08, 52.53s/it]

#########################
training loss: 0.07316058159036529
train_accuracy: 0.9757175445556641
train_f1_score: 0.9690972566604614
validation loss : 0.3381021298874328
val_accuracy: 0.9166561365127563
val_f1_score: 0.9055274724960327
#########################


 66%|██████▌   | 66/100 [57:55<29:47, 52.56s/it]

#########################
training loss: 0.06063577710416715
train_accuracy: 0.9787825345993042
train_f1_score: 0.9733573794364929
validation loss : 0.3365217211007319
val_accuracy: 0.9233630895614624
val_f1_score: 0.9147210121154785
#########################


 71%|███████   | 71/100 [1:02:18<25:25, 52.60s/it]

#########################
training loss: 0.04952614438807085
train_accuracy: 0.9844860434532166
train_f1_score: 0.9787583947181702
validation loss : 0.32695737641621775
val_accuracy: 0.9375408887863159
val_f1_score: 0.926816463470459
#########################


 76%|███████▌  | 76/100 [1:06:41<21:03, 52.63s/it]

#########################
training loss: 0.03962599304531549
train_accuracy: 0.9865507483482361
train_f1_score: 0.9826551675796509
validation loss : 0.3078606569832389
val_accuracy: 0.9306030869483948
val_f1_score: 0.9226606488227844
#########################


 81%|████████  | 81/100 [1:11:04<16:37, 52.52s/it]

#########################
training loss: 0.02994663364164818
train_accuracy: 0.9907798767089844
train_f1_score: 0.9877738952636719
validation loss : 0.3364748696070858
val_accuracy: 0.9356984496116638
val_f1_score: 0.9250094294548035
#########################


 86%|████████▌ | 86/100 [1:15:26<12:13, 52.40s/it]

#########################
training loss: 0.02197880615812006
train_accuracy: 0.9936609268188477
train_f1_score: 0.9913581609725952
validation loss : 0.32877729997469823
val_accuracy: 0.9330189824104309
val_f1_score: 0.9260494709014893
#########################


 91%|█████████ | 91/100 [1:19:49<07:52, 52.51s/it]

#########################
training loss: 0.01732540629366371
train_accuracy: 0.9939391016960144
train_f1_score: 0.9925612807273865
validation loss : 0.3360339563251399
val_accuracy: 0.9358867406845093
val_f1_score: 0.9240445494651794
#########################


 96%|█████████▌| 96/100 [1:24:11<03:29, 52.42s/it]

#########################
training loss: 0.013246412011904914
train_accuracy: 0.9958216547966003
train_f1_score: 0.9946160316467285
validation loss : 0.3258891504607318
val_accuracy: 0.9358094334602356
val_f1_score: 0.9251861572265625
#########################


100%|██████████| 100/100 [1:27:41<00:00, 52.62s/it]


# Inference 

In [18]:
# let's make the damn submission
from src.pytorch_modular.pytorch_utilities import load_model
# base_model = SeqClassModel(in_features=768, hidden_size=128, num_classes=6)
# base_model = load_model(base_model=base_model, path=os.path.join(HOME, 'saved_models', '9-17-15-10.pt'))
# let's create a dataset object really quick:
class TestReviewDS(Dataset):
    """Dataset view over the unlabelled test dataframe.

    Each item is the value stored in the column at position 1 of the requested
    row; in this notebook the test frame holds the columns 'id' and 'title',
    so that value is the title.
    """

    def __init__(self, data: pd.DataFrame) -> None:
        super().__init__()
        # the dataframe is stored as is, no copy is made
        self.data = data

    def __len__(self):
        """Number of rows of the wrapped dataframe."""
        return len(self.data)

    def __getitem__(self, index) -> str:
        """Return the value of the second column for the row at position ``index``."""
        # a single value is returned, not a (text, label) tuple: there are no labels here
        return self.data.iloc[index, 1]

# we need a different callate_function
def test_collate_function(batch):
    """Turn a list of texts into the embeddings used as model input at inference time.

    The texts are tokenized with ``TOKENIZER`` (padded to the longest one in
    the batch) and encoded with ``MODEL``.

    Args:
        batch: list of strings, as produced by the test dataset.

    Returns:
        The ``last_hidden_state`` of ``MODEL`` for the batch, on ``DEVICE``.
    """
    encoded = TOKENIZER(batch, padding=True, return_tensors='pt').to(DEVICE) # make sure to return tensors
    # inference only: no autograd graph is needed for the encoder pass
    with torch.no_grad():
        embeddings = MODEL(**encoded).last_hidden_state
    return embeddings.to(DEVICE)
    
# seed torch before running the inference
torch.manual_seed(69)

test_ds = TestReviewDS(data=df_test)
# keep the original row order so that the predictions line up with the test ids
test_loader = DataLoader(
    test_ds,
    batch_size=32,
    shuffle=False,
    collate_fn=test_collate_function,
)
predicted_indices = cls.inference(base_model, inference_source_data=test_loader, return_tensor='list')
# convert the numerical labels to the string ones
predictions = [idx2cat[class_index] for class_index in predicted_indices]

In [19]:
# one row per test sample: its id and the predicted category name
submission = pd.DataFrame(data={"id": df_test['id'].tolist(), "Category": predictions})
sub_dir = os.path.join(HOME, 'submissions')
# the folder may not exist yet on a fresh checkout; listing it would fail otherwise
os.makedirs(sub_dir, exist_ok=True)

# number the file after the amount of entries already present, skipping any name
# that is already taken so that an earlier submission is never overwritten
submission_number = len(os.listdir(sub_dir)) + 1
submission_path = os.path.join(sub_dir, f'sub_{submission_number}.csv')
while os.path.exists(submission_path):
    submission_number += 1
    submission_path = os.path.join(sub_dir, f'sub_{submission_number}.csv')

submission.to_csv(submission_path, index=False)