In this notebook we will implement a custom metric for measuring the quality of the classification based on embeddings.  

In [None]:
pip install -U sentence-transformers

In [15]:
import pandas as pd

# Let pandas print up to 70 rows before truncating; raise this value to see more.
pd.set_option('display.max_rows', 70)

# Load the two classified datasets (original and processed article names).
df_original, df_processed = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/original-1-classified.csv',
        '/content/processed-1-classified.csv',
    )
)

# Class distribution of the data exactly as loaded
counts_original = df_original['Class1'].value_counts()
counts_processed = df_processed['Class1'].value_counts()
print("Df_orig classes value counts = \n", counts_original, "\n")
print("Df_proc classes value counts = \n", counts_processed, "\n")

# Drop rows that are identical across every column and renumber the index from 0
df_orig = df_original.drop_duplicates(ignore_index=True)
df_proc = df_processed.drop_duplicates(ignore_index=True)

# Class distribution after de-duplication
counts_orig_dedup = df_orig['Class1'].value_counts()
counts_proc_dedup = df_proc['Class1'].value_counts()
print("Df_original with duplicates removed = \n", counts_orig_dedup, "\n")
print("Df_processed with duplicates removed = \n", counts_proc_dedup, "\n")

Df_orig classes value counts = 
 VariedFood       120
Beer             110
Spirits          101
Wine              72
SoftDrinks        50
CoffeeTeaMilk     32
Burger            21
Kids              14
Cider             14
Wings             12
Dessert           11
Other              7
Champagne          5
Salad              5
Name: Class1, dtype: int64 

Df_proc classes value counts = 
 VariedFood       119
Beer              95
Spirits           94
Wine              73
SoftDrinks        43
CoffeeTeaMilk     32
Burger            21
Kids              14
Cider             14
Wings             12
Dessert           11
Other              7
Champagne          5
Salad              5
Name: Class1, dtype: int64 

Df_original with duplicates removed = 
 VariedFood       120
Beer             105
Spirits          101
Wine              71
SoftDrinks        50
CoffeeTeaMilk     32
Burger            21
Kids              14
Cider             14
Wings             12
Dessert           11
Other            

Perform one-hot encoding of the `Class1` column and concatenate the resulting indicator columns with each de-duplicated dataframe (original and processed).

In [None]:
# Build one indicator column per Class1 value, separately for each dataset
one_hot_columns_orig = pd.get_dummies(df_orig.Class1)
one_hot_columns_proc = pd.get_dummies(df_proc.Class1)

# Append the indicator columns to the right of the original dataset and show it
df_orig_onehot = pd.concat((df_orig, one_hot_columns_orig), axis='columns')
print(df_orig_onehot)

# Same for the processed dataset
df_proc_onehot = pd.concat((df_proc, one_hot_columns_proc), axis='columns')
print(df_proc_onehot)

Create the embeddings of the items for all 3 models and save them with the one-hot encoded columns. 

In [10]:
# Create embeddings
from sentence_transformers import SentenceTransformer
# Create embeddings for the items with three sentence-transformers models:
#   - distiluse-base-multilingual-cased-v2
#   - paraphrase-multilingual-MiniLM-L12-v2
#   - paraphrase-multilingual-mpnet-base-v2

# Instantiate the models
model_distiluse = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')
model_para_mini = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
model_para_base = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

# Article names of each dataset as plain Python lists (the input to encode)
items_original = df_orig_onehot['ArticleName'].tolist()
items_processed = df_proc_onehot['ArticleName'].tolist()

# Encode the original items with every model (distiluse, para-mini, para-base)
orig_embeddings_distiluse, orig_embeddings_para_mini, orig_embeddings_para_base = (
    encoder.encode(items_original)
    for encoder in (model_distiluse, model_para_mini, model_para_base)
)

# Encode the processed items with every model, in the same order
proc_embeddings_distiluse, proc_embeddings_para_mini, proc_embeddings_para_base = (
    encoder.encode(items_processed)
    for encoder in (model_distiluse, model_para_mini, model_para_base)
)

In [12]:
# For the original dataset

# Column names f1..fN for each model, where N is the length of that model's embedding vectors
features_names_distiluse = [f'f{k}' for k in range(1, len(orig_embeddings_distiluse[0]) + 1)]
features_names_para_mini = [f'f{k}' for k in range(1, len(orig_embeddings_para_mini[0]) + 1)]
features_names_para_base = [f'f{k}' for k in range(1, len(orig_embeddings_para_base[0]) + 1)]

# One dataframe of embedding features per model
df_emb_distiluse_orig = pd.DataFrame(orig_embeddings_distiluse, columns=features_names_distiluse)
df_emb_para_mini_orig = pd.DataFrame(orig_embeddings_para_mini, columns=features_names_para_mini)
df_emb_para_base_orig = pd.DataFrame(orig_embeddings_para_base, columns=features_names_para_base)

# pd.concat(axis=1) aligns rows on the index; if the indexes differ it pads with NaN
# instead of failing, so check the alignment explicitly before concatenating.
assert all(
    df_orig_onehot.index.equals(emb_frame.index)
    for emb_frame in (df_emb_distiluse_orig, df_emb_para_mini_orig, df_emb_para_base_orig)
), "Embedding rows do not line up with df_orig_onehot; re-run the embedding cell."

# Append the embedding features to the one-hot encoded dataset
df_orig_distiluse_class_onehot = pd.concat([df_orig_onehot, df_emb_distiluse_orig], axis=1)
df_orig_para_mini_class_onehot = pd.concat([df_orig_onehot, df_emb_para_mini_orig], axis=1)
df_orig_para_base_class_onehot = pd.concat([df_orig_onehot, df_emb_para_base_orig], axis=1)

In [13]:
# For the processed dataset

# Column names f1..fN for each model, where N is the length of that model's embedding vectors
features_names_distiluse = [f'f{k}' for k in range(1, len(proc_embeddings_distiluse[0]) + 1)]
features_names_para_mini = [f'f{k}' for k in range(1, len(proc_embeddings_para_mini[0]) + 1)]
features_names_para_base = [f'f{k}' for k in range(1, len(proc_embeddings_para_base[0]) + 1)]

# One dataframe of embedding features per model
df_emb_distiluse_proc = pd.DataFrame(proc_embeddings_distiluse, columns=features_names_distiluse)
df_emb_para_mini_proc = pd.DataFrame(proc_embeddings_para_mini, columns=features_names_para_mini)
df_emb_para_base_proc = pd.DataFrame(proc_embeddings_para_base, columns=features_names_para_base)

# pd.concat(axis=1) aligns rows on the index; if the indexes differ it pads with NaN
# instead of failing, so check the alignment explicitly before concatenating.
assert all(
    df_proc_onehot.index.equals(emb_frame.index)
    for emb_frame in (df_emb_distiluse_proc, df_emb_para_mini_proc, df_emb_para_base_proc)
), "Embedding rows do not line up with df_proc_onehot; re-run the embedding cell."

# Append the embedding features to the one-hot encoded dataset
df_proc_distiluse_class_onehot = pd.concat([df_proc_onehot, df_emb_distiluse_proc], axis=1)
df_proc_para_mini_class_onehot = pd.concat([df_proc_onehot, df_emb_para_mini_proc], axis=1)
df_proc_para_base_class_onehot = pd.concat([df_proc_onehot, df_emb_para_base_proc], axis=1)

Let us save the one-hot encoded dataframes (with their embeddings) to CSV files so we can easily load them in the future.

In [14]:
import csv
# Write every one-hot + embedding dataset to its own CSV file, without the row index
for out_path, frame in (
    ('df-orig-distil-class-onehot.csv', df_orig_distiluse_class_onehot),
    ('df-orig-para-mini-class-onehot.csv', df_orig_para_mini_class_onehot),
    ('df-orig-para-base-class-onehot.csv', df_orig_para_base_class_onehot),
    ('df-proc-distil-class-onehot.csv', df_proc_distiluse_class_onehot),
    ('df-proc-para-mini-class-onehot.csv', df_proc_para_mini_class_onehot),
    ('df-proc-para-base-class-onehot.csv', df_proc_para_base_class_onehot),
):
    frame.to_csv(out_path, index=False)

Now let us do label encoding on the target variable. 

In [16]:
from sklearn import preprocessing

# Label encoding of the target variable
# Work on copies so df_orig / df_proc are left without the extra column.
df_orig_le = df_orig.copy()
df_proc_le = df_proc.copy()

# Fit ONE encoder on the classes of both datasets, then transform each dataset.
# Calling fit_transform once per dataset refits the encoder every time: a class
# that appears in only one dataset would shift the integer codes, making the
# ClassLabel columns incomparable, and `le` would only remember the last fit.
le = preprocessing.LabelEncoder()
le.fit(pd.concat([df_orig['Class1'], df_proc['Class1']], ignore_index=True))
df_orig_le['ClassLabel'] = le.transform(df_orig['Class1'])
df_proc_le['ClassLabel'] = le.transform(df_proc['Class1'])

print(df_orig_le)
print(df_proc_le)

                             ArticleName ArticleGroupName      Class1  \
0                 Carlsberg Export 50 cl               Öl        Beer   
1                             Drink 4 cl            Sprit     Spirits   
2                          Erdinger Hefe               Öl        Beer   
3                          Guinness 50cl               Öl        Beer   
4                      Staropramen 50 cl               Öl        Beer   
..                                   ...              ...         ...   
563                   EXTRA TAVO VEG 1ST              Mat  VariedFood   
564                Carlsberg Export 40cl    Fatöl Starköl        Beer   
565  Les Cardounettes Blanc EKO (FRA) GL              Vin        Wine   
566               Crispy Halloumi Burger              Mat      Burger   
567                      Öppen Aktivitet      Aktiviteter       Other   

     ClassLabel  
0             0  
1            10  
2             0  
3             0  
4             0  
..          ...

In [17]:
# For the original dataset
# Attach each model's embedding features to the label-encoded dataset.
# The embedding dataframes (df_emb_*_orig) already carry their f1..fN column
# names from when they were created, so the names are not rebuilt here.

# pd.concat(axis=1) aligns rows on the index; if the indexes differ it pads with NaN
# instead of failing, so check the alignment explicitly before concatenating.
assert all(
    df_orig_le.index.equals(emb_frame.index)
    for emb_frame in (df_emb_distiluse_orig, df_emb_para_mini_orig, df_emb_para_base_orig)
), "Embedding rows do not line up with df_orig_le; re-run the embedding cells."

# Concatenate the dataframes
df_orig_distiluse_class_le = pd.concat([df_orig_le, df_emb_distiluse_orig], axis=1)
df_orig_para_mini_class_le = pd.concat([df_orig_le, df_emb_para_mini_orig], axis=1)
df_orig_para_base_class_le = pd.concat([df_orig_le, df_emb_para_base_orig], axis=1)

In [18]:
# For the processed dataset
# Attach each model's embedding features to the label-encoded dataset.
# The embedding dataframes (df_emb_*_proc) already carry their f1..fN column
# names from when they were created, so the names are not rebuilt here.

# pd.concat(axis=1) aligns rows on the index; if the indexes differ it pads with NaN
# instead of failing, so check the alignment explicitly before concatenating.
assert all(
    df_proc_le.index.equals(emb_frame.index)
    for emb_frame in (df_emb_distiluse_proc, df_emb_para_mini_proc, df_emb_para_base_proc)
), "Embedding rows do not line up with df_proc_le; re-run the embedding cells."

# Concatenate the dataframes
df_proc_distiluse_class_le = pd.concat([df_proc_le, df_emb_distiluse_proc], axis=1)
df_proc_para_mini_class_le = pd.concat([df_proc_le, df_emb_para_mini_proc], axis=1)
df_proc_para_base_class_le = pd.concat([df_proc_le, df_emb_para_base_proc], axis=1)

In [19]:
import csv
# Write every label-encoded + embedding dataset to its own CSV file, without the row index
for out_path, frame in (
    ('df-orig-distil-class-le.csv', df_orig_distiluse_class_le),
    ('df-orig-para-mini-class-le.csv', df_orig_para_mini_class_le),
    ('df-orig-para-base-class-le.csv', df_orig_para_base_class_le),
    ('df-proc-distil-class-le.csv', df_proc_distiluse_class_le),
    ('df-proc-para-mini-class-le.csv', df_proc_para_mini_class_le),
    ('df-proc-para-base-class-le.csv', df_proc_para_base_class_le),
):
    frame.to_csv(out_path, index=False)