# Data Imports

In [24]:
# let's start with the data and see how it goes
import os
import pandas as pd
# the notebook's working directory is treated as the project root
HOME = os.getcwd()
train_csv, test_csv = (os.path.join(HOME, 'data', file_name) for file_name in ('train.csv', 'test.csv'))

df_train, df_test = pd.read_csv(train_csv), pd.read_csv(test_csv)

# normalise the header names to lower case
df_train.columns = [name.lower() for name in df_train.columns]
df_test.columns = [name.lower() for name in df_test.columns]

# 'helpfulness' and 'score' are not used as features, drop them from both splits
df_train = df_train.drop(columns=['helpfulness', 'score'])
df_test = df_test.drop(columns=['helpfulness', 'score'])

In [25]:
import nltk 
from nltk.tokenize import TweetTokenizer
# add a small piece of code to call the pytorch_modular code
from pathlib import Path
import sys

# Walk up from the notebook directory until a folder containing 'src' is found,
# then expose both that folder and its 'src' sub-folder to the import system.
current = Path(HOME)
while not (current / 'src').is_dir():
    if current.parent == current:
        # reached the filesystem root: without this guard the loop never terminates
        raise FileNotFoundError(f"no 'src' directory found in {HOME} or any of its parents")
    current = current.parent

# skip entries already present so re-running the cell does not keep growing sys.path
for import_root in (str(current), os.path.join(current, 'src')):
    if import_root not in sys.path:
        sys.path.append(import_root)

In [26]:
df_train.head()

Unnamed: 0,title,text,category
0,Golden Valley Natural Buffalo Jerky,The description and photo on this product need...,grocery gourmet food
1,Westing Game,This was a great book!!!! It is well thought t...,toys games
2,Westing Game,"I am a first year teacher, teaching 5th grade....",toys games
3,Westing Game,I got the book at my bookfair at school lookin...,toys games
4,I SPY A is For Jigsaw Puzzle 63pc,Hi! I'm Martine Redman and I created this puzz...,toys games


In [27]:
df_test.head()

Unnamed: 0,id,title,text
0,0,PetSafe Staywell Pet Door with Clear Hard Flap,We've only had it installed about 2 weeks. So ...
1,1,"Kaytee Timothy Cubes, 1-Pound",My bunny had a hard time eating this because t...
2,2,Body Back Buddy,would never in a million years have guessed th...
3,3,SnackMasters California Style Turkey Jerky,"Being the jerky fanatic I am, snackmasters han..."
4,4,Premier Busy Buddy Tug-a-Jug Treat Dispensing ...,Wondered how quick my dog would catch on to th...


# Data Preprocessing

In [29]:
# preprocessing functions
import re
from typing import List

def to_lower(text: str) -> str:
    """Return `text` with every cased character converted to lower case."""
    lowered = text.lower()
    return lowered

def no_extra_spaces(text: str) -> str:
    """Collapse every run of whitespace (spaces, tabs, newlines) into a single space.

    Leading and trailing whitespace is collapsed to one space as well, not stripped.
    """
    # raw string: '\s' in a plain string literal is an invalid escape sequence
    # and triggers a warning on recent Python versions
    return re.sub(r'\s+', ' ', text)

def no_extra_chars(text: str) -> str:
    """Replace each run of disallowed characters with one space.

    Allowed characters are ASCII letters, whitespace and the punctuation `, ! . ; : -`.
    """
    disallowed_run = r'[^a-zA-Z\s,!.;:-]+'
    cleaned = re.sub(disallowed_run, ' ', text)
    return cleaned

# scratch input mixing letters, digits and punctuation; not referenced by any later cell visible here
text = 'aaa5531--==-||"z2::,.a'

In [30]:
import requests
# English word list (one word per line) used to filter out non-dictionary tokens
dictionary_file_url = 'https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt'
dict_path = os.path.join(HOME, 'data', 'dictionary.txt') 

# only hit the network when the file is missing; delete the file to force a refresh
if not os.path.exists(dict_path):
    r = requests.get(dictionary_file_url, allow_redirects=True, timeout=60)
    # fail loudly instead of saving an HTTP error page as the word list
    r.raise_for_status()
    os.makedirs(os.path.dirname(dict_path), exist_ok=True)
    with open(dict_path, 'wb') as f:
        f.write(r.content)
    

In [31]:
# Load the downloaded word list into a set for O(1) membership checks.
# Words shorter than 3 letters are discarded.
dictionary = set()
with open(dict_path, 'r', encoding='utf-8') as f:
    for line in f:
        # strip() removes the line terminator; slicing with [:-1] would cut a real
        # character off the final line when the file does not end with a newline
        word = line.strip()
        if len(word) >= 3:
            dictionary.add(word)

len(dictionary)

369652

In [32]:
# download a more comprehensive list of stop words (one word per line)
stopwords_file_url = 'https://raw.githubusercontent.com/stopwords-iso/stopwords-en/master/stopwords-en.txt'
stopwords_path = os.path.join(HOME, 'data', 'stop_words_en.txt') 

# only hit the network when the file is missing; delete the file to force a refresh
if not os.path.exists(stopwords_path):
    r = requests.get(stopwords_file_url, allow_redirects=True, timeout=60)
    # fail loudly instead of saving an HTTP error page as the stop-word list
    r.raise_for_status()
    os.makedirs(os.path.dirname(stopwords_path), exist_ok=True)
    with open(stopwords_path, 'wb') as f:
        f.write(r.content)

In [33]:
# Load the stop-word file into a set; entries shorter than 3 characters are skipped.
STOP_WORDS = set()
with open(stopwords_path, 'r', encoding='utf-8') as f:
    for line in f:
        # strip() removes the line terminator without eating the last character
        # of a final line that has no trailing newline
        word = line.strip()
        if len(word) >= 3:
            STOP_WORDS.add(word)

len(STOP_WORDS)

1000

In [34]:
# fetch the WordNet corpus (the logged output shows it is skipped when already up to date)
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 
# single shared lemmatizer instance, used by filter_text
LEMMATIZER = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /home/ayhem18/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [35]:
def filter_text(text: str, tokenizer = None) -> str:
    """Keep only the meaningful dictionary words of `text` and lemmatize them.

    Args:
        text: the string to filter.
        tokenizer: any object exposing `tokenize(str) -> list of str`;
            a fresh `TweetTokenizer` is used when omitted.

    Returns:
        The surviving tokens, lemmatized and joined by single spaces.
    """
    tokenizer = TweetTokenizer() if tokenizer is None else tokenizer
    kept = []
    for token in tokenizer.tokenize(text):
        # normalise BEFORE the membership tests: both word sets are compared against
        # the lower-cased token, so mixed-case input is filtered the same way
        token = token.strip().lower()
        if token in STOP_WORDS or token not in dictionary:
            continue
        kept.append(LEMMATIZER.lemmatize(token))
    return " ".join(kept)

def process(text: str) -> str:
    """Run the full cleaning pipeline on one piece of text.

    Steps, in order: replace disallowed characters, lower-case, drop stop words and
    non-dictionary tokens (lemmatizing the rest), then collapse whitespace.
    """
    # replace unwanted characters, then lower-case the result
    without_symbols = no_extra_chars(text)
    lowered = to_lower(without_symbols)
    # keep only the informative tokens
    filtered = filter_text(lowered)
    # squeeze whitespace runs into single spaces
    return no_extra_spaces(filtered)

import random
# fixed seed so the same sample review is selected on every run
random.seed(69)
# pick one review and print it before and after cleaning
# NOTE(review): df_train['text'][i] is a label lookup; it matches position i only with the default integer index
example = df_train['text'][int(random.random() * len(df_train))]
print(example)
print(process(example))

# # drop the 'text' column as only the title will be used for classification
# df_train.drop(columns=['text'], inplace=True)
# df_test.drop(columns=['text'], inplace=True)

# the original note reports 16 rows with a missing 'title'; missing values are replaced
# with empty strings here (the rows are kept, not removed)
df_train.fillna(value='', inplace=True)
df_test.fillna(value='', inplace=True)

See the title of this review. Fortunately, I am a packrat, and kept a bunch of hole repair kits from various blow up things that we have gone through over the years. Does not come with a hole repair kit though, just to warn you. Anyway, it is back in black and bouncing our 3 year old all over the place. Indoor only, I would say. Very highly recommended, in spite of a hole within a week of use. Hope that this is the first and last one... probably not.
title review fortunately packrat bunch hole repair kit blow hole repair kit warn black bouncing indoor highly recommended spite hole week hope


In [36]:
# per-column count of missing values in each split, separated by a line of '#'
print(df_train.isna().sum())
print("#" * 100)
print(df_test.isna().sum())

# snapshot both frames before the text-processing cells below reassign df_train / df_test
df_train_org, df_test_org = df_train.copy(), df_test.copy()

title       0
text        0
category    0
dtype: int64
####################################################################################################
id       0
title    0
text     0
dtype: int64


In [37]:
import numpy as np

# category name -> integer class index used as the training label
cat2idx = {
    'toys games': 0,
    'health personal care': 1,
    'beauty': 2,
    'baby products': 3,
    'pet supplies': 4,
    'grocery gourmet food': 5,
}

# inverse mapping (class index -> category name), derived from cat2idx so the
# two tables can never drift apart when a category is added or renumbered
idx2cat = {index: category for category, index in cat2idx.items()}

# making sure the dataframes are ready for training
def df_process_data(row):
    """Clean the 'title' and 'text' fields of one row with `process` and return the row."""
    for field in ('title', 'text'):
        row[field] = process(row[field])
    return row

def df_process_labels(row):
    """Replace the row's category name with its integer index from `cat2idx`."""
    category_name = row['category']
    # a category missing from cat2idx raises KeyError
    row['category'] = cat2idx[category_name]
    return row

# clean the 'title' and 'text' fields of every training row (row-wise apply, one Python call per row)
df_train = df_train.apply(df_process_data, axis=1)
# map the training category names to their integer class indices
df_train = df_train.apply(df_process_labels, axis=1)
# clean the text fields of the test split (it has no labels to map)
df_test = df_test.apply(df_process_data, axis=1)

In [38]:
# build a single 'data' column for both splits: cleaned title and cleaned text joined by a tab
df_train['data'] = df_train['title'] + "\t" + df_train['text']
df_test['data'] = df_test['title'] + "\t" + df_test['text']

# the separate 'text' and 'title' columns are no longer needed once 'data' exists
df_train.drop(columns=['text', 'title'], inplace=True)
df_test.drop(columns=['text', 'title'], inplace=True)

In [39]:
from sklearn.model_selection import train_test_split
# hold out 15% of the labelled rows for validation, stratified on 'category'; fixed random_state for repeatability
train_data, val_data = train_test_split(df_train, test_size=0.15, stratify=df_train['category'], random_state=69)

In [40]:
# train_data['data_len'] = train_data['data'].apply(lambda x: len(x))
# print(max(train_data['data_len']))
# print(min(train_data['data_len']))

# Embeddings

In [41]:
# load the pretrained checkpoint and its tokenizer; the collate functions below use them
# to turn raw text into embeddings (the original note here was cut off mid-sentence)
import torch
from transformers import AutoModel, AutoTokenizer
# run on the GPU when one is available
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
CHECKPOINT = 'distilbert-base-uncased' # let's keep it simple as for the first iteration
MODEL = AutoModel.from_pretrained(CHECKPOINT).to(DEVICE)
TOKENIZER = AutoTokenizer.from_pretrained(CHECKPOINT)

# Train Loaders

In [42]:
from torch.utils.data import DataLoader, Dataset

def collate_function(batch: List[tuple]):
    """Turn a list of (text, category) samples into (embeddings, labels) tensors.

    Args:
        batch: the samples gathered by the DataLoader, one (text, category) tuple each.

    Returns:
        A pair `(embeddings, y_tensor)`: the `last_hidden_state` of MODEL for the
        padded/truncated batch, and the labels as a float tensor, both on DEVICE.
    """
    # split the list of pairs into a list of texts and a list of labels
    texts, labels = [list(column) for column in zip(*batch)]
    # NOTE(review): labels are kept as floats as before; confirm the training engine
    # casts them to class indices before computing the loss
    y_tensor = torch.FloatTensor(labels).to(device=DEVICE)
    encoded = TOKENIZER(texts, padding=True, return_tensors='pt', truncation=True).to(DEVICE)
    # MODEL is a fixed feature extractor (its parameters are not given to the optimizer),
    # so run it without autograd: otherwise each batch builds a graph through the encoder
    # and the backward pass computes gradients that are never used
    with torch.no_grad():
        embeddings = MODEL(**encoded).last_hidden_state
    return embeddings, y_tensor

# let's create a dataset object really quick:
class LabeledReviewDS(Dataset):
    """Dataset view over a DataFrame holding a 'data' (text) and a 'category' (label) column."""

    def __init__(self, data: pd.DataFrame) -> None:
        super().__init__()
        self.data = data

    def __len__(self):
        # one sample per row
        return len(self.data)

    def __getitem__(self, index) -> tuple[str, int]:
        # positional lookup: fetch the row once, then read both fields from it
        sample = self.data.iloc[index]
        return sample['data'], sample['category']

# seed torch's RNG so the shuffling order of the training loader is repeatable

torch.manual_seed(69)

# wrap the two splits produced by train_test_split
train_ds = LabeledReviewDS(train_data)
val_ds = LabeledReviewDS(val_data)

# create the dataloaders: batches are embedded on the fly by collate_function;
# the training loader shuffles and drops the last incomplete batch, the validation loader does neither
train_dl = DataLoader(dataset=train_ds, batch_size=16, shuffle=True, collate_fn=collate_function, drop_last=True)
val_dl = DataLoader(dataset=val_ds, batch_size=16, shuffle=False, collate_fn=collate_function)

In [43]:
# next(iter(train_dl))
# looks our data is loaded and ready to go, time to build a model!!

# Train A model

In [44]:
from torch import nn
from torch.nn.functional import leaky_relu

class SeqClassModel(nn.Module):
    """Bidirectional LSTM sequence classifier.

    The final hidden state of every layer and direction is concatenated,
    batch-normalised and projected onto the output units by a linear head.

    Args:
        in_features: size of each element of the input sequence.
        hidden_size: hidden size of the LSTM (per direction).
        num_classes: number of target classes; one output unit is used when it is 2 or less.
        num_layers: number of stacked LSTM layers.
        dropout: dropout applied between stacked LSTM layers.
    """
    def __init__(self, 
                in_features: int,
                hidden_size: int, 
                num_classes: int, 
                num_layers: int = 2, 
                dropout: float=0.4, 
                *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.output_units = num_classes if num_classes > 2 else 1
        self.rnn = nn.LSTM(input_size=in_features, 
                           hidden_size=hidden_size, 
                           # inter-layer dropout is a no-op with a single layer and only
                           # makes nn.LSTM emit a warning, so disable it in that case
                           dropout=dropout if num_layers > 1 else 0.0, 
                           num_layers=num_layers,
                           bidirectional=True, # both reading directions
                           batch_first=True # input is (batch, sequence, features)
                           )
        # one final hidden state per layer and per direction (the factor 2) is fed to the head
        linear_input_dim = 2 * num_layers * hidden_size 
        self.batch_layer= nn.BatchNorm1d(num_features=linear_input_dim)
        # self.relu_layer = nn.LeakyReLU()
        self.head = nn.Linear(in_features=linear_input_dim, out_features=self.output_units)
        
    def forward(self, x: torch.Tensor):
        """Return the head's output, of shape (batch, output_units), for a batch-first input sequence."""
        # only the final hidden state is needed, not the per-step outputs or the cell state
        _, (hidden_state, _) = self.rnn(x)
        batch_size = hidden_state.shape[1]
        # move the batch axis first, then flatten the (layer * direction, hidden) axes;
        # the states of ALL layers are used, not just the last one
        hidden_state = hidden_state.permute((1, 0, 2)).reshape((batch_size, -1))
        # call the module itself rather than .forward so registered hooks still run
        return self.head(self.batch_layer(hidden_state))


In [45]:
from torch.optim import AdamW
from torch.optim.lr_scheduler import LinearLR
from torchmetrics.classification import MulticlassF1Score, MulticlassAccuracy

# in_features must equal the width of the embeddings returned by the collate function
base_model = SeqClassModel(in_features=768, hidden_size=64, num_classes=6)
optimizer = AdamW(base_model.parameters(), lr=0.01)
# scales the learning rate linearly from 1.0x down to 0.005x over 100 scheduler steps
# NOTE(review): whether a step is an epoch or a batch depends on the training engine — confirm
scheduler = LinearLR(optimizer, start_factor=1.0, end_factor=0.005, total_iters=100)

accuracy_metric, f1_metric = MulticlassAccuracy(num_classes=6), MulticlassF1Score(num_classes=6)

# the keys appear as suffixes in the training log (train_accuracy, val_f1_score, ...)
metrics = {'accuracy': accuracy_metric, 'f1_score': f1_metric}

# settings handed to cls.train_model; the meaning of each key is defined by that engine
# (presumably early-stopping thresholds for 'min_val_loss' / 'no_improve_stop' — verify there)
train_configuration = {'optimizer': optimizer,
                        'scheduler': scheduler,
                        'min_val_loss': 10 ** -4,
                        'max_epochs': 50,
                        'report_epoch': 2,
                        'device': DEVICE, 
                        'metrics': metrics,
                        'no_improve_stop': 10
                        }

In [46]:
import src.pytorch_modular.image_classification.engine_classification as cls
# launch training; the log below shows a SummaryWriter created under log_dir
# (the commented cells later load checkpoints from 'saved_models', i.e. save_path)
results = cls.train_model(base_model, train_dl, val_dl, train_configuration,  
                            log_dir=os.path.join(HOME, 'runs'),         
                            save_path=os.path.join(HOME, 'saved_models'))   

[INFO] Created SummaryWriter, saving to: /home/ayhem18/DEV/My_Kaggle_Repo/amazon_reviews/runs/experience_19...


  2%|▏         | 1/50 [03:08<2:34:01, 188.60s/it]

#########################
training loss: 0.6779630907493479
train_accuracy: 0.737939178943634
train_f1_score: 0.7096965909004211
validation loss : 0.5373522022565206
val_accuracy: 0.809920072555542
val_f1_score: 0.7875585556030273
#########################


  6%|▌         | 3/50 [09:31<2:29:25, 190.75s/it]

#########################
training loss: 0.4981355534090715
train_accuracy: 0.815688967704773
train_f1_score: 0.7916708588600159
validation loss : 0.4798527381916841
val_accuracy: 0.8334441781044006
val_f1_score: 0.8111178874969482
#########################


 10%|█         | 5/50 [15:52<2:22:57, 190.60s/it]

#########################
training loss: 0.45814702941214336
train_accuracy: 0.8262511491775513
train_f1_score: 0.8062968254089355
validation loss : 0.44089928739269574
val_accuracy: 0.8332328200340271
val_f1_score: 0.8132728338241577
#########################


 14%|█▍        | 7/50 [22:14<2:16:48, 190.89s/it]

#########################
training loss: 0.4295128247685292
train_accuracy: 0.8369565606117249
train_f1_score: 0.8170448541641235
validation loss : 0.41169475691517193
val_accuracy: 0.8451753854751587
val_f1_score: 0.8227936029434204
#########################


 18%|█▊        | 9/50 [28:36<2:10:25, 190.87s/it]

#########################
training loss: 0.4206440725712215
train_accuracy: 0.84557044506073
train_f1_score: 0.8252637982368469
validation loss : 0.4336529390712579
val_accuracy: 0.8521341681480408
val_f1_score: 0.8327414393424988
#########################


 22%|██▏       | 11/50 [34:56<2:03:49, 190.50s/it]

#########################
training loss: 0.406242218555773
train_accuracy: 0.8491209149360657
train_f1_score: 0.8297417759895325
validation loss : 0.41055941169460614
val_accuracy: 0.841548502445221
val_f1_score: 0.8220723271369934
#########################


 26%|██▌       | 13/50 [41:20<1:57:49, 191.08s/it]

#########################
training loss: 0.3932165937581483
train_accuracy: 0.8530812859535217
train_f1_score: 0.8326742053031921
validation loss : 0.4221890305578709
val_accuracy: 0.8424361944198608
val_f1_score: 0.8225082755088806
#########################


 30%|███       | 15/50 [47:41<1:51:19, 190.84s/it]

#########################
training loss: 0.38871463636791004
train_accuracy: 0.8526533842086792
train_f1_score: 0.8329665660858154
validation loss : 0.3936349258025487
val_accuracy: 0.8587116599082947
val_f1_score: 0.8401244878768921
#########################


 34%|███▍      | 17/50 [54:00<1:44:34, 190.13s/it]

#########################
training loss: 0.39703962104899043
train_accuracy: 0.8496612310409546
train_f1_score: 0.8311195969581604
validation loss : 0.40311111382643383
val_accuracy: 0.8514856696128845
val_f1_score: 0.8310486674308777
#########################


 38%|███▊      | 19/50 [1:00:20<1:38:14, 190.14s/it]

#########################
training loss: 0.37994036928169866
train_accuracy: 0.8591514825820923
train_f1_score: 0.8407832384109497
validation loss : 0.39710221311450006
val_accuracy: 0.8531365394592285
val_f1_score: 0.8319758772850037
#########################


 42%|████▏     | 21/50 [1:06:40<1:31:52, 190.07s/it]

#########################
training loss: 0.37503389488949496
train_accuracy: 0.8593876361846924
train_f1_score: 0.8416686058044434
validation loss : 0.4022137623329957
val_accuracy: 0.846325695514679
val_f1_score: 0.8282150626182556
#########################


 46%|████▌     | 23/50 [1:13:00<1:25:29, 189.98s/it]

#########################
training loss: 0.3649496432858355
train_accuracy: 0.8669677376747131
train_f1_score: 0.848456859588623
validation loss : 0.41163728429873786
val_accuracy: 0.8515522480010986
val_f1_score: 0.8334006071090698
#########################


 50%|█████     | 25/50 [1:19:20<1:19:07, 189.89s/it]

#########################
training loss: 0.36298816239921483
train_accuracy: 0.8664419054985046
train_f1_score: 0.8491610884666443
validation loss : 0.35268746707836784
val_accuracy: 0.8680956363677979
val_f1_score: 0.8525749444961548
#########################


 54%|█████▍    | 27/50 [1:25:38<1:12:40, 189.57s/it]

#########################
training loss: 0.3585445951328558
train_accuracy: 0.8667318224906921
train_f1_score: 0.8487405180931091
validation loss : 0.3641689727604389
val_accuracy: 0.8589880466461182
val_f1_score: 0.8433343172073364
#########################


 58%|█████▊    | 29/50 [1:31:58<1:06:26, 189.83s/it]

#########################
training loss: 0.34634573701900595
train_accuracy: 0.8702887892723083
train_f1_score: 0.8522975444793701
validation loss : 0.36722110349933307
val_accuracy: 0.8775210976600647
val_f1_score: 0.8614065647125244
#########################


 62%|██████▏   | 31/50 [1:38:18<1:00:06, 189.80s/it]

#########################
training loss: 0.3443183675259352
train_accuracy: 0.8706082701683044
train_f1_score: 0.8526430726051331
validation loss : 0.3616951473802328
val_accuracy: 0.8688241243362427
val_f1_score: 0.849892258644104
#########################


 66%|██████▌   | 33/50 [1:44:37<53:44, 189.70s/it]  

#########################
training loss: 0.3373211648385314
train_accuracy: 0.8740084171295166
train_f1_score: 0.8561446070671082
validation loss : 0.37292206734915573
val_accuracy: 0.8688001036643982
val_f1_score: 0.8525489568710327
#########################


 70%|███████   | 35/50 [1:50:56<47:21, 189.42s/it]

#########################
training loss: 0.3311517212022753
train_accuracy: 0.877812922000885
train_f1_score: 0.8611069321632385
validation loss : 0.38545399195949237
val_accuracy: 0.8460733294487
val_f1_score: 0.830070972442627
#########################


 74%|███████▍  | 37/50 [1:57:14<41:00, 189.26s/it]

#########################
training loss: 0.3308432402794852
train_accuracy: 0.8728024959564209
train_f1_score: 0.8560780882835388
validation loss : 0.3584797342568636
val_accuracy: 0.8686927556991577
val_f1_score: 0.851532518863678
#########################


 78%|███████▊  | 39/50 [2:03:32<34:41, 189.25s/it]

#########################
training loss: 0.32038639595052776
train_accuracy: 0.8785847425460815
train_f1_score: 0.8621760010719299
validation loss : 0.41938770933945974
val_accuracy: 0.8458176255226135
val_f1_score: 0.8295831680297852
#########################


 82%|████████▏ | 41/50 [2:09:52<28:27, 189.67s/it]

#########################
training loss: 0.3182208304992493
train_accuracy: 0.8813996315002441
train_f1_score: 0.864874005317688
validation loss : 0.41232798010110855
val_accuracy: 0.8512557744979858
val_f1_score: 0.8320695757865906
#########################


 86%|████████▌ | 43/50 [2:16:11<22:06, 189.45s/it]

#########################
training loss: 0.3105670741866617
train_accuracy: 0.8881371021270752
train_f1_score: 0.8706713318824768
validation loss : 0.33383782117565475
val_accuracy: 0.8796889781951904
val_f1_score: 0.8663960695266724
#########################


 90%|█████████ | 45/50 [2:22:30<15:47, 189.46s/it]

#########################
training loss: 0.30087869111317045
train_accuracy: 0.8876261115074158
train_f1_score: 0.8718485236167908
validation loss : 0.3514584127267202
val_accuracy: 0.8736597299575806
val_f1_score: 0.8583224415779114
#########################


 94%|█████████▍| 47/50 [2:28:49<09:28, 189.46s/it]

#########################
training loss: 0.29267055119530244
train_accuracy: 0.8920213580131531
train_f1_score: 0.8755929470062256
validation loss : 0.36771172733604907
val_accuracy: 0.8690950274467468
val_f1_score: 0.8554518818855286
#########################


 98%|█████████▊| 49/50 [2:35:07<03:09, 189.24s/it]

#########################
training loss: 0.2787494460427586
train_accuracy: 0.8974595069885254
train_f1_score: 0.8824065923690796
validation loss : 0.37258491743604344
val_accuracy: 0.8707227110862732
val_f1_score: 0.8527376055717468
#########################


100%|██████████| 50/50 [2:38:17<00:00, 189.94s/it]


# Performance Diagnosis
Judging from the TensorBoard output, the model reaches a training loss of 0.01 and an accuracy of 99.6\%, indicating that the model's capacity fits the problem at hand. The saved learning curves clearly indicate that the model overfits, and overfits significantly. The divergence between the validation loss and the training loss starts around the 20th epoch. Let's start the error analysis.

In [47]:
# from src.pytorch_modular.pytorch_utilities import load_model
# import src.pytorch_modular.image_classification.engine_classification as cls
# base_model = SeqClassModel(in_features=768, hidden_size=128, num_classes=6)
# base_model = load_model(base_model=base_model, path=os.path.join(HOME, 'saved_models', '9-18-17-53.pt'))

# # let's create a dataset object really quick:
# class TestReviewDS(Dataset):
#     def __init__(self, data: pd.DataFrame) -> None:
#         super().__init__()
#         self.data = data

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, index) -> tuple[str, int]:
#         return self.data.iloc[index]

# # we need a different callate_function
# def test_collate_function(batch):
#     embeddings = MODEL(**TOKENIZER(batch, padding=True, return_tensors='pt').to(DEVICE)).last_hidden_state # make sure to return tensors
#     return embeddings.to(DEVICE)


# validation_data = TestReviewDS(data=val_data['title'])
# validation_loader = DataLoader(validation_data, batch_size=32, shuffle=False, collate_fn=test_collate_function)

# test_ds = TestReviewDS(data=df_test)
# test_loader = DataLoader(test_ds, batch_size=32, shuffle=False, collate_fn=test_collate_function)

# # next(iter(test_loader)).shape
# val_predictions = cls.inference(base_model, inference_source_data=validation_loader, return_tensor='list')
# # convert the numerical labels to the string ones
# val_predictions = [idx2cat[p] for p in val_predictions]

In [48]:
# print(val_predictions[:5])
# print(val_data['category'][:5])

In [49]:
# val_labels = val_data['category'].values
# from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
# print(accuracy_score(val_labels, val_predictions))
# print(f1_score(val_labels, val_predictions, average='macro'))
# confusion = confusion_matrix(val_labels, val_predictions)
# print(confusion)

In [50]:
# let's calculate the error rate per class
# error_rate_per_class = 1 - confusion[list(range(len(confusion))), [i for i in range(len(confusion))]] / np.sum(confusion, axis=0)
# print(error_rate_per_class)

# # so we can see that classes 1, 2, 3 are the most problematic classes so far
# classes_aug = [idx2cat[i] for i in [1, 2, 3]]
# print(classes_aug)

In [51]:
# let's see how we can translate a couple of sentences
# let's increase the data by 20 percent for each of these categories

# def extract_samples(category: str, ratio: float = 0.2) -> List[str]:
#     # first filter the dataframe
#     all = df_train_org[df_train_org['category'] == category]['title'].tolist()
#     return random.sample(all, k=int(len(all) * ratio))

# samples = [extract_samples(cat) for cat in classes_aug]

In [52]:
# samples[0]

# Inference 

In [53]:
# let's make the damn submission
from src.pytorch_modular.pytorch_utilities import load_model
# base_model = SeqClassModel(in_features=768, hidden_size=128, num_classes=6)
# base_model = load_model(base_model=base_model, path=os.path.join(HOME, 'saved_models', '9-17-15-10.pt'))
# let's create a dataset object really quick:
class TestReviewDS(Dataset):
    """Unlabelled dataset: yields only the text of each row.

    Args:
        data: frame holding the processed reviews.
        text_column: name of the column that contains the text to embed.
    """
    def __init__(self, data: pd.DataFrame, text_column: str = 'data') -> None:
        super().__init__()
        self.data = data
        self.text_column = text_column

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index) -> str:
        # select the text column by name: a positional column index silently returns
        # the wrong field as soon as the column order of the frame changes
        return self.data[self.text_column].iloc[index]

# we need a different callate_function
def test_collate_function(batch):
    """Embed a batch of raw texts (no labels) and return MODEL's `last_hidden_state` on DEVICE."""
    encoded = TOKENIZER(batch, padding=True, return_tensors='pt', truncation=True).to(DEVICE)
    # inference only: no autograd graph is needed for the embedding model
    with torch.no_grad():
        embeddings = MODEL(**encoded).last_hidden_state
    return embeddings
    
# let's set the random seed

torch.manual_seed(69)

# wrap the processed test frame; batches are embedded on the fly and kept in file order (shuffle=False)
test_ds = TestReviewDS(data=df_test)
test_loader = DataLoader(test_ds, batch_size=32, shuffle=False, collate_fn=test_collate_function)
# next(iter(test_loader)).shape
# uses the in-memory base_model from the training cell and the `cls` engine imported there
predictions = cls.inference(base_model, inference_source_data=test_loader, return_tensor='list')
# convert the numerical labels to the string ones
predictions = [idx2cat[p] for p in predictions]

In [55]:
# one row per test review: its id and the predicted category name
submission = pd.DataFrame(data={"id": df_test['id'].tolist(), "Category": predictions})
sub_dir = os.path.join(HOME, 'submissions')
# create the folder on first use; os.listdir below raises if it does not exist
os.makedirs(sub_dir, exist_ok=True)
# files are numbered by how many entries the folder already holds
submission.to_csv(os.path.join(sub_dir, f'sub_{len(os.listdir(sub_dir)) + 1}.csv'), index=False)