This notebook pre-processes the data used for the thesis, computes sentence embeddings of the item names with several pre-trained models, and saves those embeddings to CSV files.

It uses the data that was provided by Caspeco AB. First we calculate and save embeddings for completely unprocessed data ('original-1.csv'), and then we calculate and save embeddings for data which has been pre-processed ('processed-1.csv').

In [None]:
pip install -U sentence-transformers

The pre-processing (by hand) involved removing numerical quantities from certain items' names, e.g., the quantities in 'Andiamo White 18cl', 'Falcon Export 50 cl', 'Buffalo Wings 5 pcs', etc. Then the file (see code below) was read into a pandas dataframe and duplicate entries were removed. Lastly, all item names were stripped of surrounding whitespace and lowercased (since one of the models we will be using is cased, meaning it differentiates between, for example, "English" and "english").

In [None]:
import pandas as pd

# Load the two item lists, drop the category column, remove duplicate rows,
# and normalise the item names of the hand-processed file.
# Results: `df_original` (names left untouched) and `df_processed`
# (names stripped and lowercased).

# Change this line if you want to display more than 10 rows when printing
pd.set_option('display.max_rows', 10)

df_original = pd.read_csv('/content/original-1.csv')
df_processed = pd.read_csv('/content/processed-1.csv')

# Drop the category column
df_original = df_original.drop(columns=['ArticleGroupName'])
df_processed = df_processed.drop(columns=['ArticleGroupName'])

# Inspect the data
print("Df_orig value counts = \n", df_original.value_counts(), "\n")
print("Df_proc value counts = \n", df_processed.value_counts(), "\n")

# Remove duplicate rows based on all columns
df_original = df_original.drop_duplicates(ignore_index=True)
df_processed = df_processed.drop_duplicates(ignore_index=True)

# Inspect the data again
print("Df_original with duplicates removed = \n", df_original.value_counts(), "\n")
print("Df_processed with duplicates removed = \n", df_processed.value_counts(), "\n")

# Strip surrounding whitespace and lowercase the item names in the processed df
df_processed['ArticleName'] = df_processed['ArticleName'].str.strip().str.lower()

# Normalising can turn previously distinct names into the same string
# (e.g. 'Cola' and 'cola '), so de-duplicate once more; otherwise the same
# item would be embedded and saved several times.
df_processed = df_processed.drop_duplicates(ignore_index=True)

print("Processed df = \n", df_processed)
print("Original df = \n", df_original)


Let us calculate the embeddings for the items. We will be using three different pre-trained models, all of them multilingual, since our data includes English, Swedish and Spanish item names.

In [None]:
from sentence_transformers import SentenceTransformer

# Embed the item names with three pre-trained models:
#   - 'distiluse-base-multilingual-cased-v2'
#   - 'paraphrase-multilingual-MiniLM-L12-v2'
#   - 'paraphrase-multilingual-mpnet-base-v2'
# Each model encodes both the original and the pre-processed item names.

# Item names to embed, taken from the 'ArticleName' column of each dataframe
items_original = df_original['ArticleName'].tolist()
items_processed = df_processed['ArticleName'].tolist()

# distiluse-base-multilingual-cased-v2
model_distiluse = SentenceTransformer(
    'sentence-transformers/distiluse-base-multilingual-cased-v2'
)
orig_embeddings_distiluse = model_distiluse.encode(items_original)
proc_embeddings_distiluse = model_distiluse.encode(items_processed)

# paraphrase-multilingual-MiniLM-L12-v2
model_para_mini = SentenceTransformer(
    'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
)
orig_embeddings_para_mini = model_para_mini.encode(items_original)
proc_embeddings_para_mini = model_para_mini.encode(items_processed)

# paraphrase-multilingual-mpnet-base-v2
model_para_base = SentenceTransformer(
    'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
)
orig_embeddings_para_base = model_para_base.encode(items_original)
proc_embeddings_para_base = model_para_base.encode(items_processed)



We will now save the embeddings to CSV files so that we do not have to calculate them again in the future.

In [None]:
import csv

# Save the embeddings of the 'distiluse-base-multilingual-cased-v2' model,
# both with original and pre-processed data.
# Each CSV row holds the item name followed by its embedding components.

# Column names for the embedding components: f1, f2, ..., fN
features_names = ['f' + str(i + 1) for i in range(len(orig_embeddings_distiluse[0]))]

# Stringify the vector elements, so they can be concatenated with item names
orig_emb_distiluse_str = [[str(x) for x in emb] for emb in orig_embeddings_distiluse]
proc_emb_distiluse_str = [[str(x) for x in emb] for emb in proc_embeddings_distiluse]

# (output file, name of the item column, item names, stringified embeddings)
outputs = [
    ('orig_1_emb_distiluse.csv', 'item_orig', items_original, orig_emb_distiluse_str),
    ('proc_1_emb_distiluse.csv', 'item_proc', items_processed, proc_emb_distiluse_str),
]

# Write to csv files. The encoding is fixed to UTF-8 because the item names
# contain non-ASCII characters and the platform default encoding is not
# guaranteed to be able to represent them.
for path, item_column, items, emb_rows in outputs:
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([item_column] + features_names)
        for item, emb_row in zip(items, emb_rows):
            writer.writerow([item] + emb_row)


In [None]:
import csv

# Save the embeddings of the 'paraphrase-multilingual-MiniLM-L12-v2' model,
# both with original and pre-processed data.
# Each CSV row holds the item name followed by its embedding components.

# Column names for the embedding components: f1, f2, ..., fN
features_names = ['f' + str(i + 1) for i in range(len(orig_embeddings_para_mini[0]))]

# Stringify the vector elements, so they can be concatenated with item names
orig_emb_para_mini_str = [[str(x) for x in emb] for emb in orig_embeddings_para_mini]
proc_emb_para_mini_str = [[str(x) for x in emb] for emb in proc_embeddings_para_mini]

# (output file, name of the item column, item names, stringified embeddings)
outputs = [
    ('orig_1_emb_para_mini.csv', 'item_orig', items_original, orig_emb_para_mini_str),
    ('proc_1_emb_para_mini.csv', 'item_proc', items_processed, proc_emb_para_mini_str),
]

# Write to csv files. The encoding is fixed to UTF-8 because the item names
# contain non-ASCII characters and the platform default encoding is not
# guaranteed to be able to represent them.
for path, item_column, items, emb_rows in outputs:
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([item_column] + features_names)
        for item, emb_row in zip(items, emb_rows):
            writer.writerow([item] + emb_row)


In [None]:
import csv

# Save the embeddings of the 'paraphrase-multilingual-mpnet-base-v2' model,
# both with original and pre-processed data.
# Each CSV row holds the item name followed by its embedding components.

# Column names for the embedding components: f1, f2, ..., fN
features_names = ['f' + str(i + 1) for i in range(len(orig_embeddings_para_base[0]))]

# Stringify the vector elements, so they can be concatenated with item names
orig_emb_para_base_str = [[str(x) for x in emb] for emb in orig_embeddings_para_base]
proc_emb_para_base_str = [[str(x) for x in emb] for emb in proc_embeddings_para_base]

# (output file, name of the item column, item names, stringified embeddings)
outputs = [
    ('orig_1_emb_para_base.csv', 'item_orig', items_original, orig_emb_para_base_str),
    ('proc_1_emb_para_base.csv', 'item_proc', items_processed, proc_emb_para_base_str),
]

# Write to csv files. The encoding is fixed to UTF-8 because the item names
# contain non-ASCII characters and the platform default encoding is not
# guaranteed to be able to represent them.
for path, item_column, items, emb_rows in outputs:
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([item_column] + features_names)
        for item, emb_row in zip(items, emb_rows):
            writer.writerow([item] + emb_row)
