<a href="https://colab.research.google.com/github/amhaiskar0921/AmazonProject/blob/main/main_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import pyarrow.parquet as pq
import pandas as pd
import numpy as np

# For data visualization
import seaborn as sns

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
np.random.seed(42)

sample_size = 10000

shopping_data = pq.read_table('/content/drive/MyDrive/Amazon (LA) - Multi-Class Product Classification (Team A)/Datasets/shopping_queries_dataset_examples.parquet')
df = shopping_data.to_pandas().sample(n=sample_size, random_state=42)
shopping_data_p = pq.read_table('/content/drive/MyDrive/Amazon (LA) - Multi-Class Product Classification (Team A)/Datasets/shopping_queries_dataset_products.parquet')
df_p = shopping_data_p.to_pandas().sample(n=sample_size, random_state=42)
df_p = df_p.drop(columns=['product_locale']) #this is a duplicate from shopping_data

# merging
df_merged = pd.merge(df, df_p, on='product_id', how='inner')
print(f"number of products {len(pd.unique(df_merged['product_id']))}")
df_merged_no_null = df_merged.fillna("")
missing_values = df_merged_no_null.isnull().sum()
print(len(df_merged_no_null))

number of products 5319
5548


In [None]:
# Class imbalance exists:
class_counts = df_merged['esci_label'].value_counts()
print(class_counts)

# We will use the Micro averaging F1 Score, because the four classes are unbalanced:
# 65.17% Exacts, 21.91% Substitutes, 2.89% Complements and 10.04% Irrelevants; and this metric
# is robust enough for this situation.

E    3606
S    1226
I     569
C     147
Name: esci_label, dtype: int64


In [None]:
# concat dat
!pip install transformers
import torch
import transformers
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m93.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m70.5 MB/s[0m eta [36m0:00:00[0m
Col

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import urllib
from urllib.request import urlopen

# We looked at the repo with japanese stopwords, and they gave instructions to download it in the shell
# in JavaScript.
# We looked into how to do the same thing in Python, and came across this stack overflow post: https://stackoverflow.com/questions/72149806/exclude-japanese-stopwords-from-file
# Following the code from there:
def get_japanese_stopwords():
  # print("in function")
  iso_path = "https://raw.githubusercontent.com/stopwords-iso/stopwords-ja/master/stopwords-ja.txt"
  iso_file = urllib.request.urlopen(iso_path)
  # print(iso_file)
  stopwords = [line.decode("utf-8").strip() for line in iso_file]

  stopwords = [ss for ss in stopwords if not ss==u'']
  stopwords = set(stopwords)
  # print(len(stopwords))
  return stopwords


In [None]:
def get_all_stopwords():
  stop_words_english = set(stopwords.words('english'))
  stop_words_spanish = set(stopwords.words('spanish'))
  stop_words_japanese = get_japanese_stopwords()
  return stop_words_english, stop_words_spanish, stop_words_japanese

# Add this code before tokenization
def remove_stopwords(text, stop_words_english, stop_words_spanish, stop_words_japanese):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words_english and word.lower()
    not in stop_words_spanish and word.lower() not in stop_words_japanese]
    return ' '.join(filtered_words)

In [None]:
# Initialize an empty list to store the modified input sequences
input_sequences = []
stop_words_english, stop_words_spanish, stop_words_japanese = get_all_stopwords()

# Apply stopword removal to each element in the 'input_sequences' list using a for loop
for text in df_merged_no_null.apply(lambda x: f"[CLS] {x['query']} [SEP] {x['product_title']} [SEP] {x['product_description']} [SEP] {x['product_bullet_point']} [SEP] {x['product_color']} [SEP] {x['product_bullet_point']} [SEP]", axis=1).tolist():
# for text in df_merged_no_null.apply(lambda x: f"[CLS] {x['product_title']} [SEP] {x['product_description']} [SEP] {x['product_bullet_point']} [SEP]", axis=1).tolist():
    cleaned_text = remove_stopwords(text, stop_words_english, stop_words_spanish, stop_words_japanese)
    input_sequences.append(cleaned_text)
print(input_sequences[0], len(input_sequences))

[CLS] yunsey tinte amoniaco [SEP] LIGHT IRRIDIANCE OXIGENADA 20VOL 1000ML L.I, Negro, 1000mL-1L [SEP] [SEP] OXIGENADA 20VOL 1000ML L.I. [SEP] Negro [SEP] OXIGENADA 20VOL 1000ML L.I. [SEP] 5548


In [None]:
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Example text to tokenize
# text = "This is an example sentence."

# Tokenize the text
tokens = tokenizer(input_sequences, padding=True, truncation=True, return_tensors="pt")

# Convert tokenized text to PyTorch tensors
input_ids = tokens['input_ids']
attention_mask = tokens['attention_mask']

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
nltk.download('punkt')

stemmer = PorterStemmer()

for i in range(len(input_sequences)):
  words = word_tokenize(input_sequences[i])
  words = [stemmer.stem(word) for word in words]
  input_sequences[i] = ' '.join(words)

print(input_sequences)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['[ cl ] yunsey tint amoniaco [ sep ] light irridi oxigenada 20vol 1000ml l.i , negro , 1000ml-1l [ sep ] [ sep ] oxigenada 20vol 1000ml l.i . [ sep ] negro [ sep ] oxigenada 20vol 1000ml l.i . [ sep ]', "[ cl ] android phone [ sep ] tracfon carrier-lock samsung galaxi j7 crown 4g lte prepaid smartphon - black - 16gb - sim card includ - cdma ( renew ) [ sep ] [ sep ] 5.5 '' hd screen ; 1.56 ghz octa-cor processor ; android 8.0 oreo 4g lte ; wi-fi capabl ; bluetooth 4.2 wireless technolog ; mp3 player 13mp camera/13mp front face camera ; intern memori 16gb ; support micro sd memori card 400 gb ( not includ ) note : phone carrier lock ; custom must lock devic activ servic fewer 12 month , redeem air time card fewer 12 month , telephon number recycl ported.rear camera resolut : up 4160 x 3120 pixel unlimit talk , text data plan start low $ 20/month [ sep ] black [ sep ] 5.5 '' hd screen ; 1.56 ghz octa-cor processor ; android 8.0 oreo 4g lte ; wi-fi capabl ; bluetooth 4.2 wireless technol

In [None]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

for i in range(len(input_sequences)):
  words = word_tokenize(input_sequences[i])
  words = [lemmatizer.lemmatize(word) for word in words]
  input_sequences[i] = ' '.join(words)

print(input_sequences)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['[ cl ] yunsey tint amoniaco [ sep ] light irridi oxigenada 20vol 1000ml l.i , negro , 1000ml-1l [ sep ] [ sep ] oxigenada 20vol 1000ml l.i . [ sep ] negro [ sep ] oxigenada 20vol 1000ml l.i . [ sep ]', '[ cl ] android phone [ sep ] tracfon carrier-lock samsung galaxi j7 crown 4g lte prepaid smartphon - black - 16gb - sim card includ - cdma ( renew ) [ sep ] [ sep ] 5.5 `` hd screen ; 1.56 ghz octa-cor processor ; android 8.0 oreo 4g lte ; wi-fi capabl ; bluetooth 4.2 wireless technolog ; mp3 player 13mp camera/13mp front face camera ; intern memori 16gb ; support micro sd memori card 400 gb ( not includ ) note : phone carrier lock ; custom must lock devic activ servic fewer 12 month , redeem air time card fewer 12 month , telephon number recycl ported.rear camera resolut : up 4160 x 3120 pixel unlimit talk , text data plan start low $ 20/month [ sep ] black [ sep ] 5.5 `` hd screen ; 1.56 ghz octa-cor processor ; android 8.0 oreo 4g lte ; wi-fi capabl ; bluetooth 4.2 wireless technol

In [None]:
from torch.utils.data import DataLoader, TensorDataset
def get_data_loader(features, max_seq_length, batch_size, shuffle=True):

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

    dataloader = DataLoader(data, shuffle=shuffle, batch_size=batch_size)
    return dataloader
class BertInputItem(object):
  """An item with all the necessary attributes for finetuning BERT."""

  def __init__(self, text, input_ids, input_mask, segment_ids, label_id):
      self.text = text
      self.input_ids = input_ids
      self.input_mask = input_mask
      self.segment_ids = segment_ids
      self.label_id = label_id


In [None]:
def convert_examples_to_inputs(example_texts, example_labels, label2idx, max_seq_length, tokenizer, verbose=0):
      """Loads a data file into a list of `InputBatch`s."""

      input_items = []
      examples = zip(example_texts, example_labels)
      for (ex_index, (text, label)) in enumerate(examples):

          # Create a list of token ids
          input_ids = tokenizer.encode(f"[CLS] {text} [SEP]")
          if len(input_ids) > max_seq_length:
              input_ids = input_ids[:max_seq_length]

          # All our tokens are in the first input segment (id 0).
          segment_ids = [0] * len(input_ids)

          # The mask has 1 for real tokens and 0 for padding tokens. Only real
          # tokens are attended to.
          input_mask = [1] * len(input_ids)

          # Zero-pad up to the sequence length.
          padding = [0] * (max_seq_length - len(input_ids))
          input_ids += padding
          input_mask += padding
          segment_ids += padding

          assert len(input_ids) == max_seq_length
          assert len(input_mask) == max_seq_length
          assert len(segment_ids) == max_seq_length

          label_id = label2idx[label]

          input_items.append(
              BertInputItem(text=text,
                            input_ids=input_ids,
                            input_mask=input_mask,
                            segment_ids=segment_ids,
                            label_id=label_id))


      return input_items

In [None]:
rest_texts, test_texts, rest_labels, test_labels = train_test_split(input_sequences, df_merged_no_null['esci_label'].tolist(), test_size=0.1, random_state=1)
train_texts, dev_texts, train_labels, dev_labels = train_test_split(rest_texts, rest_labels, test_size=0.1, random_state=1)

target_names = list(set(df_merged_no_null['esci_label'].tolist()))
label2idx = {label: idx for idx, label in enumerate(target_names)}
print(label2idx)

{'C': 0, 'S': 1, 'E': 2, 'I': 3}


In [None]:
# inputItem = BertInputItem()
MAX_SEQ_LENGTH = 512
train_features = convert_examples_to_inputs(train_texts, train_labels, label2idx, MAX_SEQ_LENGTH, tokenizer, verbose=0)
dev_features = convert_examples_to_inputs(dev_texts, dev_labels, label2idx, MAX_SEQ_LENGTH, tokenizer)
test_features = convert_examples_to_inputs(test_texts, test_labels, label2idx, MAX_SEQ_LENGTH, tokenizer)

BATCH_SIZE = 16

train_dataloader = get_data_loader(train_features, MAX_SEQ_LENGTH, BATCH_SIZE, shuffle=True)
dev_dataloader = get_data_loader(dev_features, MAX_SEQ_LENGTH, BATCH_SIZE, shuffle=False)
test_dataloader = get_data_loader(test_features, MAX_SEQ_LENGTH, BATCH_SIZE, shuffle=False)

Token indices sequence length is longer than the specified maximum sequence length for this model (808 > 512). Running this sequence through the model will result in indexing errors


In [None]:
df_merged_no_null.shape

(10220, 14)

In [None]:
# Evaluation method with full classification report (from nlp notebooks repo)

def evaluate(model, dataloader):
    model.eval()

    eval_loss = 0
    nb_eval_steps = 0
    predicted_labels, correct_labels = [], []

    for step, batch in enumerate(tqdm(dataloader, desc="Evaluation iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch

        with torch.no_grad():
            tmp_eval_loss, logits = model(input_ids, attention_mask=input_mask,
                                          token_type_ids=segment_ids, labels=label_ids)

        outputs = np.argmax(logits.to('cpu'), axis=1)
        label_ids = label_ids.to('cpu').numpy()

        predicted_labels += list(outputs)
        correct_labels += list(label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps

    correct_labels = np.array(correct_labels)
    predicted_labels = np.array(predicted_labels)

    return eval_loss, correct_labels, predicted_labels

In [None]:
# TRAINING BEGINS HERE

In [None]:
#creating custom model with numerical features
model = AutoModelForSequenceClassification.from_pretrained("ashnadua01/bert-based-multilingual-cased-finetuned-lid")
additional_features = df_merged_no_null[["large_version", "small_version"]] #should we include this in the model?

#work in progress, need to create base model
import torch.optim as optim
from torch import nn

EPOCHS = 1

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(EPOCHS):
    model.train()
    for batch in train_dataloader:
        print(len(batch))
        input_ids, input_mask, segment_ids, label_ids = batch # originally there were 3 values being unpacked here, resulted in an error

        # Clear gradients
        optimizer.zero_grad()

        # Forward pass ,results in RAM overflow
        outputs = model(input_ids, attention_mask=input_mask, token_type_ids=segment_ids, labels=label_ids)

        # Calculate loss
        loss = outputs[0]

        # Backpropagation
        loss.backward()
        optimizer.step()
