<a href="https://colab.research.google.com/github/amhaiskar0921/AmazonProject/blob/main/BERT_Based_Multilingual.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Goal of this notebook: get a BERT model working

##Loading + merging the datasets

In [1]:
from google.colab import drive
drive.mount('/content/drive')

import pyarrow.parquet as pq
import pandas as pd
import numpy as np

# For data visualization
import seaborn as sns

Mounted at /content/drive


In [2]:
# Getting random samples of our raw datasets
np.random.seed(42)

sample_size = 10000

shopping_data = pq.read_table('/content/drive/MyDrive/Amazon (LA) - Multi-Class Product Classification (Team A)/Datasets/shopping_queries_dataset_examples.parquet')
df_examples = shopping_data.to_pandas().sample(n=sample_size, random_state=42)

# This line takes up 7GB ram and a couple of seconds to run
shopping_data_p = pq.read_table('/content/drive/MyDrive/Amazon (LA) - Multi-Class Product Classification (Team A)/Datasets/shopping_queries_dataset_products.parquet')
df_products = shopping_data_p.to_pandas().sample(n=sample_size, random_state=42)

In [3]:
# SAVING RAM BY NOT MAKING COPIES OF EXISTING DATAFRAMES
# Creating the merged dataset, null values replaced with empty strings
# Taken from the esci challenge GitHub notebook
df_examples_products = pd.merge(
    df_examples,
    df_products,
    how='left',
    left_on=['product_locale','product_id'],
    right_on=['product_locale', 'product_id'],
    copy=False
)

# Replacing null values with ""
df_examples_products.fillna("", inplace=True)

# df_examples_products.head(20)

In [4]:
df_examples_products.drop(df_examples_products[df_examples_products.large_version == 0].index, inplace = True)
df_examples_products_train = df_examples_products[df_examples_products["split"] == "train"]
df_examples_products_test = df_examples_products[df_examples_products["split"] == "test"]
df_examples_products.head(10)

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split,product_title,product_description,product_bullet_point,product_brand,product_color
0,40527,100% cotton long sleeve t shirt women,1450,B014WD3WOG,us,E,1,1,test,,,,,
1,1351394,mi full screen tv pro,68591,B07Y25LMNP,es,I,1,1,train,,,,,
2,895892,geocache,44920,B014I55V8S,us,E,0,1,train,,,,,
3,2454618,メンズ t シャツ バックプリント,124137,B086DHGYGF,jp,E,0,1,train,,,,,
4,1302108,mary kay makeup remover eye,66054,B017PCGABI,us,S,1,1,train,,,,,
5,2361724,シーツ,120029,B01M0IJYH4,jp,E,0,1,train,,,,,
6,870918,furniture no scratch spray,43627,B009ETNUJ6,us,E,1,1,train,,,,,
7,1718789,rifle bag,87640,B0897157XP,us,E,0,1,train,,,,,
8,1710062,resin circular sandpaper,87199,B000022339,us,E,1,1,train,,,,,
9,1684868,rca 2 male to 1 female,85882,B095SBFQLZ,us,E,1,1,train,,,,,


##Preprocessing the data

###Getting all the stopwords

In [5]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Getting Japanese stopwords from this [GitHub repo](https://github.com/stopwords-iso/stopwords-ja)

Inspo from: https://stackoverflow.com/questions/72149806/exclude-japanese-stopwords-from-file

In [6]:
import urllib
from urllib.request import urlopen

def get_japanese_stopwords():
  iso_path = "https://raw.githubusercontent.com/stopwords-iso/stopwords-ja/master/stopwords-ja.txt"
  iso_file = urllib.request.urlopen(iso_path)
  stopwords = [line.decode("utf-8").strip() for line in iso_file]

  stopwords = [ss for ss in stopwords if not ss==u'']
  stopwords = set(stopwords)
  return stopwords


NLTK has English and Spanish stopwords

In [7]:
def get_all_stopwords():
  stop_words_english = set(stopwords.words('english'))
  # Merging english and spanish
  stop_words_english_and_spanish = stop_words_english.union(set(stopwords.words('spanish')))
  # Merging japanese with english and spanish
  all_stopwords = stop_words_english_and_spanish.union(get_japanese_stopwords())
  # Returning a set of the all stopwords
  return all_stopwords

In [8]:
# Load NLTK stopwords, stemmer, and lemmatizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('wordnet')

all_stopwords = get_all_stopwords()
porter_stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...


Stemming, lemmatizing

In [20]:
# Function to preprocess text
def preprocess_text(text):
    # Tokenize text
    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(text)))

    # Remove stopwords, apply stemming and lemmatization
    tokens = [porter_stemmer.stem(token) for token in tokens if token.lower() not in all_stopwords]
    tokens = [wordnet_lemmatizer.lemmatize(token) for token in tokens if token.lower() not in all_stopwords]

    # Concatenate tokens into a single string
    processed_text = ' '.join(tokens)

    return processed_text

In [21]:
!pip install transformers
from transformers import BertTokenizer, TFAutoModelForSequenceClassification



In [11]:
# Load the BERT tokenizer to get individual words
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

(…)cased/resolve/main/tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

(…)ultilingual-cased/resolve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

(…)ingual-cased/resolve/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

(…)tilingual-cased/resolve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Full preprocessing function

##Applying the above preprocessing function to train and test data

###Applying the preprocessing function to the product description and bullet point columns

In [22]:
df_examples_products_train['product_description'] = df_examples_products_train['product_description'].apply(preprocess_text)
df_examples_products_test['product_bullet_point'] = df_examples_products_test['product_bullet_point'].apply(preprocess_text)

Token indices sequence length is longer than the specified maximum sequence length for this model (624 > 512). Running this sequence through the model will result in indexing errors
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_examples_products_train['product_description'] = df_examples_products_train['product_description'].apply(preprocess_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_examples_products_test['product_bullet_point'] = df_examples_products_test['product_bullet_point'].apply(preprocess_text)


## Creating input sequences for the BERT model
Columns included: query, desc, bullet points, title, brand

In [23]:
# Create input sequences for the BERT model
def create_input_sequence(row: 'DataFrame_row'):
    # Concatenate relevant columns: query, desc, bullet points, title, brand
    input_text = f"[CLS] {row['query']} [SEP] {row['product_description']} [SEP] {row['product_bullet_point']} [SEP] {row['product_title']} [SEP] {row['product_brand']} [SEP]"

    # Tokenize and encode the input
    encoded_input = tokenizer(input_text, padding="max_length", truncation=True, return_tensors='tf')

    return encoded_input


In [25]:
import tensorflow as tf

In [27]:
# Creating input sequences for train and test data
df_examples_products_train['bert_input'] = df_examples_products_train.apply(create_input_sequence, axis=1)
df_examples_products_test['bert_input'] = df_examples_products_test.apply(create_input_sequence, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_examples_products_train['bert_input'] = df_examples_products_train.apply(create_input_sequence, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_examples_products_test['bert_input'] = df_examples_products_test.apply(create_input_sequence, axis=1)


In [32]:
print(df_examples_products_train['bert_input'])

1       [input_ids, token_type_ids, attention_mask]
2       [input_ids, token_type_ids, attention_mask]
3       [input_ids, token_type_ids, attention_mask]
4       [input_ids, token_type_ids, attention_mask]
5       [input_ids, token_type_ids, attention_mask]
                           ...                     
9994    [input_ids, token_type_ids, attention_mask]
9995    [input_ids, token_type_ids, attention_mask]
9996    [input_ids, token_type_ids, attention_mask]
9998    [input_ids, token_type_ids, attention_mask]
9999    [input_ids, token_type_ids, attention_mask]
Name: bert_input, Length: 7512, dtype: object


In [34]:
import ast

# Convert 'bert_input' from string to list
df_examples_products_train['bert_input'] = df_examples_products_train['bert_input'].apply(ast.literal_eval)

df_examples_products_train['bert_input'] = df_examples_products_train['bert_input'].apply(
    lambda x: {
        'input_ids': tf.concat([item['input_ids'] for item in x], axis=0),
        'token_type_ids': tf.concat([item['token_type_ids'] for item in x], axis=0),
        'attention_mask': tf.concat([item['attention_mask'] for item in x], axis=0),
    }
)

df_examples_products_test['bert_input'] = df_examples_products_test['bert_input'].apply(
    lambda x: {
        'input_ids': tf.concat([item['input_ids'] for item in x], axis=0),
        'token_type_ids': tf.concat([item['token_type_ids'] for item in x], axis=0),
        'attention_mask': tf.concat([item['attention_mask'] for item in x], axis=0),
    }
)

ValueError: ignored

##Training the model

In [36]:
# labels
id2label = {0: "E", 1: "S", 2: "C", 3: "I"}
label2id = {val: key for key, val in id2label.items()}

In [43]:
# Other model params
MODEL_CHECKPOINT = "bert-base-multilingual-cased"
BATCH_SIZE = 16
NUM_LABELS = 4

In [44]:
# Loading the model
model = TFAutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT, num_labels=NUM_LABELS, id2label=id2label, label2id=label2id
)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
# Getting the optimizer
import tensorflow as tf
from tensorflow.keras.optimizers import Adam as Adam

In [45]:
# Compile the model
model.compile(optimizer=Adam(learning_rate=2e-5), loss='categorical_crossentropy')

In [46]:
# Train the model
model.fit(df_examples_products_train['bert_input'].tolist(), epochs=1, batch_size=BATCH_SIZE)

Epoch 1/3


AttributeError: ignored

##Getting predictions

In [None]:
outputs = model(df_examples_products_test['bert_input']).logits

In [None]:
classifications = [model.config.id2label[output] for output in classifications]
print(classifications)