In [None]:
pip install -U sentence-transformers

In [None]:
from sentence_transformers import util
import numpy as np
import pandas as pd

# Class labels of the articles. The list order matters: the per-category
# result lists built from it are aligned with it by position.
categories = [
    'Beer',
    'Burger',
    'Champagne',
    'Cider',
    'CoffeeTeaMilk',
    'Dessert',
    'Kids',
    'Other',
    'Salad',
    'SoftDrinks',
    'Spirits',
    'VariedFood',
    'Wine',
    'Wings',
]

Let us load the training and test datasets.

In [None]:
# Each group below loads the train/test features (X) and labels (y) for one
# embedding model ('distil', 'para_mini', 'para_base') and one dataset variant
# ('orig' = original, 'proc' = processed). Files are read from the current
# working directory, in the same order as listed.

# distil / original
(X_train_distil_orig,
 X_test_distil_orig,
 y_train_distil_orig,
 y_test_distil_orig) = (
    pd.read_csv('x-train-distil-orig.csv'),
    pd.read_csv('x-test-distil-orig.csv'),
    pd.read_csv('y-train-distil-orig.csv'),
    pd.read_csv('y-test-distil-orig.csv'),
)

# distil / processed
(X_train_distil_proc,
 X_test_distil_proc,
 y_train_distil_proc,
 y_test_distil_proc) = (
    pd.read_csv('x-train-distil-proc.csv'),
    pd.read_csv('x-test-distil-proc.csv'),
    pd.read_csv('y-train-distil-proc.csv'),
    pd.read_csv('y-test-distil-proc.csv'),
)

# para-mini / original
(X_train_para_mini_orig,
 X_test_para_mini_orig,
 y_train_para_mini_orig,
 y_test_para_mini_orig) = (
    pd.read_csv('x-train-para-mini-orig.csv'),
    pd.read_csv('x-test-para-mini-orig.csv'),
    pd.read_csv('y-train-para-mini-orig.csv'),
    pd.read_csv('y-test-para-mini-orig.csv'),
)

# para-mini / processed
(X_train_para_mini_proc,
 X_test_para_mini_proc,
 y_train_para_mini_proc,
 y_test_para_mini_proc) = (
    pd.read_csv('x-train-para-mini-proc.csv'),
    pd.read_csv('x-test-para-mini-proc.csv'),
    pd.read_csv('y-train-para-mini-proc.csv'),
    pd.read_csv('y-test-para-mini-proc.csv'),
)

# para-base / original
(X_train_para_base_orig,
 X_test_para_base_orig,
 y_train_para_base_orig,
 y_test_para_base_orig) = (
    pd.read_csv('x-train-para-base-orig.csv'),
    pd.read_csv('x-test-para-base-orig.csv'),
    pd.read_csv('y-train-para-base-orig.csv'),
    pd.read_csv('y-test-para-base-orig.csv'),
)

# para-base / processed
(X_train_para_base_proc,
 X_test_para_base_proc,
 y_train_para_base_proc,
 y_test_para_base_proc) = (
    pd.read_csv('x-train-para-base-proc.csv'),
    pd.read_csv('x-test-para-base-proc.csv'),
    pd.read_csv('y-train-para-base-proc.csv'),
    pd.read_csv('y-test-para-base-proc.csv'),
)

Let us calculate the average embedding per category of all the training sets (essentially, our fit function).

In [None]:
# For the 'distiluse-base-multilingual-cased-v2' model
# "Fit" step: one mean embedding per category, stored in the order of `categories`.
# The superfluous columns are removed with a non-inplace drop; calling
# drop(..., inplace=True) on a boolean-mask selection makes pandas emit a
# SettingWithCopyWarning.

# Original dataset
average_emb_distiluse_orig = [
  X_train_distil_orig[X_train_distil_orig['Class1'] == category]
  .drop(columns=['ArticleName', 'Class1'])
  .mean()
  .values.tolist()
  for category in categories
]

# Processed dataset
average_emb_distiluse_proc = [
  X_train_distil_proc[X_train_distil_proc['Class1'] == category]
  .drop(columns=['ArticleName', 'Class1'])
  .mean()
  .values.tolist()
  for category in categories
]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
# For the 'paraphrase-multilingual-MiniLM-L12-v2' model
# "Fit" step: one mean embedding per category, stored in the order of `categories`.
# The superfluous columns are removed with a non-inplace drop; calling
# drop(..., inplace=True) on a boolean-mask selection makes pandas emit a
# SettingWithCopyWarning.

# Original dataset
average_emb_para_mini_orig = [
  X_train_para_mini_orig[X_train_para_mini_orig['Class1'] == category]
  .drop(columns=['ArticleName', 'Class1'])
  .mean()
  .values.tolist()
  for category in categories
]

# Processed dataset
average_emb_para_mini_proc = [
  X_train_para_mini_proc[X_train_para_mini_proc['Class1'] == category]
  .drop(columns=['ArticleName', 'Class1'])
  .mean()
  .values.tolist()
  for category in categories
]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
# For the 'paraphrase-multilingual-mpnet-base-v2' model
# "Fit" step: one mean embedding per category, stored in the order of `categories`.
# The superfluous columns are removed with a non-inplace drop; calling
# drop(..., inplace=True) on a boolean-mask selection makes pandas emit a
# SettingWithCopyWarning.

# Original dataset
average_emb_para_base_orig = [
  X_train_para_base_orig[X_train_para_base_orig['Class1'] == category]
  .drop(columns=['ArticleName', 'Class1'])
  .mean()
  .values.tolist()
  for category in categories
]

# Processed dataset
average_emb_para_base_proc = [
  X_train_para_base_proc[X_train_para_base_proc['Class1'] == category]
  .drop(columns=['ArticleName', 'Class1'])
  .mean()
  .values.tolist()
  for category in categories
]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Now we can calculate the average error for every category, where the error of an item is 1 - cos_sim(embedding, average_embedding_of_category), computed for every training embedding in that category. We also average these per-category averages to get a single number that gives a sense of how well each model performed.

In [None]:
# For the 'distiluse-base-multilingual-cased-v2' model
# Training error per category, where the error of one item is
# 1 - cos_sim(item embedding, average embedding of its category).
# The same computation runs for the original and the processed dataset.
errors_categories_distiluse_orig = []
errors_categories_distiluse_proc = []
avg_errors_distiluse = {}

for variant, X_train, avg_embs, errors_categories in (
    ('original', X_train_distil_orig, average_emb_distiluse_orig, errors_categories_distiluse_orig),
    ('processed', X_train_distil_proc, average_emb_distiluse_proc, errors_categories_distiluse_proc),
):
  # `avg_embs` holds one average embedding per category, in `categories` order.
  for category, avg_emb_cat in zip(categories, avg_embs):
    # Non-inplace drop: an inplace drop on a boolean-mask selection triggers
    # pandas' SettingWithCopyWarning.
    df = X_train[X_train['Class1'] == category].drop(columns=['ArticleName', 'Class1'])
    # Convert all rows to lists once instead of calling df.iloc[j] per row.
    err_cat = [
      1 - util.cos_sim(emb_item, avg_emb_cat)[0][0].numpy()
      for emb_item in df.values.tolist()
    ]

    print("Min error of " + str(category) + " = ", min(err_cat))
    print("Max error of " + str(category) + " = ", max(err_cat))
    avg_err_cat = np.mean(err_cat)
    print(str(category) + " average error = " + str(avg_err_cat))
    errors_categories.append(avg_err_cat)

  # Unweighted mean of the per-category averages.
  avg_errors_distiluse[variant] = np.mean(errors_categories)
  print("Average error of the distiluse model, " + variant + " data = ", avg_errors_distiluse[variant])

avg_error_distiluse_orig = avg_errors_distiluse['original']
avg_error_distiluse_proc = avg_errors_distiluse['processed']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Min error of Beer =  0.19261682033538818
Max error of Beer =  0.6862813830375671
Beer average error = 0.41318302390016154
Min error of Burger =  0.06570225954055786
Max error of Burger =  0.2606963515281677
Burger average error = 0.14985525608062744
Min error of Champagne =  0.16423487663269043
Max error of Champagne =  0.4418281316757202
Champagne average error = 0.30525710582733157
Min error of Cider =  0.16291290521621704
Max error of Cider =  0.4460292458534241
Cider average error = 0.3126724412043889
Min error of CoffeeTeaMilk =  0.09924942255020142
Max error of CoffeeTeaMilk =  0.6601623892784119
CoffeeTeaMilk average error = 0.2668534640608163
Min error of Dessert =  0.16840767860412598
Max error of Dessert =  0.42851102352142334
Dessert average error = 0.2570155594083998
Min error of Kids =  0.11792945861816406
Max error of Kids =  0.2729375958442688
Kids average error = 0.18736557364463807
Min error of Other =  0.11253875494003296
Max error of Other =  0.4348747134208679
Other

In [None]:
# For the 'paraphrase-multilingual-MiniLM-L12-v2' model
# Training error per category, where the error of one item is
# 1 - cos_sim(item embedding, average embedding of its category).
# The same computation runs for the original and the processed dataset.
errors_categories_para_mini_orig = []
errors_categories_para_mini_proc = []
avg_errors_para_mini = {}

for variant, X_train, avg_embs, errors_categories in (
    ('original', X_train_para_mini_orig, average_emb_para_mini_orig, errors_categories_para_mini_orig),
    ('processed', X_train_para_mini_proc, average_emb_para_mini_proc, errors_categories_para_mini_proc),
):
  # `avg_embs` holds one average embedding per category, in `categories` order.
  for category, avg_emb_cat in zip(categories, avg_embs):
    # Non-inplace drop: an inplace drop on a boolean-mask selection triggers
    # pandas' SettingWithCopyWarning.
    df = X_train[X_train['Class1'] == category].drop(columns=['ArticleName', 'Class1'])
    # Convert all rows to lists once instead of calling df.iloc[j] per row.
    err_cat = [
      1 - util.cos_sim(emb_item, avg_emb_cat)[0][0].numpy()
      for emb_item in df.values.tolist()
    ]

    print("Min error of " + str(category) + " = ", min(err_cat))
    print("Max error of " + str(category) + " = ", max(err_cat))
    avg_err_cat = np.mean(err_cat)
    print(str(category) + " average error = " + str(avg_err_cat))
    errors_categories.append(avg_err_cat)

  # Unweighted mean of the per-category averages.
  avg_errors_para_mini[variant] = np.mean(errors_categories)
  print("Average error of the para-mini model, " + variant + " data = ", avg_errors_para_mini[variant])

avg_error_para_mini_orig = avg_errors_para_mini['original']
avg_error_para_mini_proc = avg_errors_para_mini['processed']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Min error of Beer =  0.23558175563812256
Max error of Beer =  0.6840373277664185
Beer average error = 0.40376780504061854
Min error of Burger =  0.09418421983718872
Max error of Burger =  0.2766636610031128
Burger average error = 0.1639827427111174
Min error of Champagne =  0.18246746063232422
Max error of Champagne =  0.4652702212333679
Champagne average error = 0.280228328704834
Min error of Cider =  0.1321379542350769
Max error of Cider =  0.6577900648117065
Cider average error = 0.3516048913200696
Min error of CoffeeTeaMilk =  0.09399276971817017
Max error of CoffeeTeaMilk =  0.7042027711868286
CoffeeTeaMilk average error = 0.2865860914361888
Min error of Dessert =  0.14818495512008667
Max error of Dessert =  0.6104403734207153
Dessert average error = 0.28998398118548924
Min error of Kids =  0.23932147026062012
Max error of Kids =  0.32316839694976807
Kids average error = 0.273282915353775
Min error of Other =  0.09549790620803833
Max error of Other =  0.522315114736557
Other avera

In [None]:
# For the 'paraphrase-multilingual-mpnet-base-v2' model
# Training error per category, where the error of one item is
# 1 - cos_sim(item embedding, average embedding of its category).
# The same computation runs for the original and the processed dataset.
errors_categories_para_base_orig = []
errors_categories_para_base_proc = []
avg_errors_para_base = {}

for variant, X_train, avg_embs, errors_categories in (
    ('original', X_train_para_base_orig, average_emb_para_base_orig, errors_categories_para_base_orig),
    ('processed', X_train_para_base_proc, average_emb_para_base_proc, errors_categories_para_base_proc),
):
  # `avg_embs` holds one average embedding per category, in `categories` order.
  for category, avg_emb_cat in zip(categories, avg_embs):
    # Non-inplace drop: an inplace drop on a boolean-mask selection triggers
    # pandas' SettingWithCopyWarning.
    df = X_train[X_train['Class1'] == category].drop(columns=['ArticleName', 'Class1'])
    # Convert all rows to lists once instead of calling df.iloc[j] per row.
    err_cat = [
      1 - util.cos_sim(emb_item, avg_emb_cat)[0][0].numpy()
      for emb_item in df.values.tolist()
    ]

    print("Min error of " + str(category) + " = ", min(err_cat))
    print("Max error of " + str(category) + " = ", max(err_cat))
    avg_err_cat = np.mean(err_cat)
    print(str(category) + " average error = " + str(avg_err_cat))
    errors_categories.append(avg_err_cat)

  # Unweighted mean of the per-category averages.
  avg_errors_para_base[variant] = np.mean(errors_categories)
  print("Average error of the para base model, " + variant + " data = ", avg_errors_para_base[variant])

avg_error_para_base_orig = avg_errors_para_base['original']
avg_error_para_base_proc = avg_errors_para_base['processed']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Min error of Beer =  0.16959452629089355
Max error of Beer =  0.650503396987915
Beer average error = 0.37734068580615665
Min error of Burger =  0.08894449472427368
Max error of Burger =  0.20736217498779297
Burger average error = 0.14874367337477834
Min error of Champagne =  0.08596688508987427
Max error of Champagne =  0.34947502613067627
Champagne average error = 0.16510926485061644
Min error of Cider =  0.07396125793457031
Max error of Cider =  0.44820231199264526
Cider average error = 0.22893908619880676
Min error of CoffeeTeaMilk =  0.08104813098907471
Max error of CoffeeTeaMilk =  0.577402651309967
CoffeeTeaMilk average error = 0.23995323838858768
Min error of Dessert =  0.14569425582885742
Max error of Dessert =  0.40528857707977295
Dessert average error = 0.22085312339994642
Min error of Kids =  0.15630340576171875
Max error of Kids =  0.3148268461227417
Kids average error = 0.21792532801628112
Min error of Other =  0.08854258060455322
Max error of Other =  0.4910326600074768
O

Let us write the prediction step. For each test item, it computes the error between the item and the average embedding of every category, and assigns the item to the category with the smallest error.

In [None]:
# The prediction step for the 'distiluse-base-multilingual-cased-v2' model.
# Each test item gets the category whose average embedding yields the smallest
# error, with error = 1 - cos_sim(item embedding, category average).
df_orig = X_test_distil_orig.drop(columns=['ArticleName', 'Class1'])  # Original dataset
df_proc = X_test_distil_proc.drop(columns=['ArticleName', 'Class1'])  # Processed dataset
predictions_distil_orig = []
predictions_distil_proc = []

for df_test, avg_embs, predictions in (
    (df_orig, average_emb_distiluse_orig, predictions_distil_orig),
    (df_proc, average_emb_distiluse_proc, predictions_distil_proc),
):
  for emb_item in df_test.values.tolist():
    # Start from +inf (not 1) and no label: the cosine error ranges over
    # [0, 2], so a starting threshold of 1 would reject every category for an
    # item without any positive similarity and leave `prediction` stale from
    # the previous item.
    err_min = float('inf')
    prediction = None
    # `avg_embs` holds one average embedding per category, in `categories` order.
    for category, avg_emb_cat in zip(categories, avg_embs):
      err = 1 - util.cos_sim(emb_item, avg_emb_cat)[0][0].numpy()
      # Strict '<' keeps the first category on ties; NaN errors never win.
      if err < err_min:
        err_min = err
        prediction = category
    # `prediction` is None only when no category produced a comparable error.
    predictions.append(prediction)

In [None]:
# The prediction step for the 'paraphrase-multilingual-MiniLM-L12-v2' model.
# Each test item gets the category whose average embedding yields the smallest
# error, with error = 1 - cos_sim(item embedding, category average).
df_orig = X_test_para_mini_orig.drop(columns=['ArticleName', 'Class1'])  # Original dataset
df_proc = X_test_para_mini_proc.drop(columns=['ArticleName', 'Class1'])  # Processed dataset
predictions_para_mini_orig = []
predictions_para_mini_proc = []

for df_test, avg_embs, predictions in (
    (df_orig, average_emb_para_mini_orig, predictions_para_mini_orig),
    (df_proc, average_emb_para_mini_proc, predictions_para_mini_proc),
):
  for emb_item in df_test.values.tolist():
    # Start from +inf (not 1) and no label: the cosine error ranges over
    # [0, 2], so a starting threshold of 1 would reject every category for an
    # item without any positive similarity and leave `prediction` stale from
    # the previous item.
    err_min = float('inf')
    prediction = None
    # `avg_embs` holds one average embedding per category, in `categories` order.
    for category, avg_emb_cat in zip(categories, avg_embs):
      err = 1 - util.cos_sim(emb_item, avg_emb_cat)[0][0].numpy()
      # Strict '<' keeps the first category on ties; NaN errors never win.
      if err < err_min:
        err_min = err
        prediction = category
    # `prediction` is None only when no category produced a comparable error.
    predictions.append(prediction)

In [None]:
# The prediction step for the 'paraphrase-multilingual-mpnet-base-v2' model.
# Each test item gets the category whose average embedding yields the smallest
# error, with error = 1 - cos_sim(item embedding, category average).
df_orig = X_test_para_base_orig.drop(columns=['ArticleName', 'Class1'])  # Original dataset
df_proc = X_test_para_base_proc.drop(columns=['ArticleName', 'Class1'])  # Processed dataset
predictions_para_base_orig = []
predictions_para_base_proc = []

for df_test, avg_embs, predictions in (
    (df_orig, average_emb_para_base_orig, predictions_para_base_orig),
    (df_proc, average_emb_para_base_proc, predictions_para_base_proc),
):
  for emb_item in df_test.values.tolist():
    # Start from +inf (not 1) and no label: the cosine error ranges over
    # [0, 2], so a starting threshold of 1 would reject every category for an
    # item without any positive similarity and leave `prediction` stale from
    # the previous item.
    err_min = float('inf')
    prediction = None
    # `avg_embs` holds one average embedding per category, in `categories` order.
    for category, avg_emb_cat in zip(categories, avg_embs):
      err = 1 - util.cos_sim(emb_item, avg_emb_cat)[0][0].numpy()
      # Strict '<' keeps the first category on ties; NaN errors never win.
      if err < err_min:
        err_min = err
        prediction = category
    # `prediction` is None only when no category produced a comparable error.
    predictions.append(prediction)

Save the predictions to dataframes and to csv files.

In [None]:
# For every model / dataset combination, pair the true test label ('Class1'
# of the y-test frame) with the predicted category and write the resulting
# two-column table to its own CSV file, without the index column.

# Distiluse - original
d_distil_orig = {
  'truth': y_test_distil_orig['Class1'].values.tolist(),
  'prediction': predictions_distil_orig,
}
df_predict_distil_orig = pd.DataFrame(d_distil_orig)
df_predict_distil_orig.to_csv('df-predict-distil-orig.csv', index=False)

# Distiluse - processed
d_distil_proc = {
  'truth': y_test_distil_proc['Class1'].values.tolist(),
  'prediction': predictions_distil_proc,
}
df_predict_distil_proc = pd.DataFrame(d_distil_proc)
df_predict_distil_proc.to_csv('df-predict-distil-proc.csv', index=False)

# Para-mini - original
d_para_mini_orig = {
  'truth': y_test_para_mini_orig['Class1'].values.tolist(),
  'prediction': predictions_para_mini_orig,
}
df_predict_para_mini_orig = pd.DataFrame(d_para_mini_orig)
df_predict_para_mini_orig.to_csv('df-predict-para-mini-orig.csv', index=False)

# Para-mini - processed
d_para_mini_proc = {
  'truth': y_test_para_mini_proc['Class1'].values.tolist(),
  'prediction': predictions_para_mini_proc,
}
df_predict_para_mini_proc = pd.DataFrame(d_para_mini_proc)
df_predict_para_mini_proc.to_csv('df-predict-para-mini-proc.csv', index=False)

# Para-base - original
d_para_base_orig = {
  'truth': y_test_para_base_orig['Class1'].values.tolist(),
  'prediction': predictions_para_base_orig,
}
df_predict_para_base_orig = pd.DataFrame(d_para_base_orig)
df_predict_para_base_orig.to_csv('df-predict-para-base-orig.csv', index=False)

# Para-base - processed
d_para_base_proc = {
  'truth': y_test_para_base_proc['Class1'].values.tolist(),
  'prediction': predictions_para_base_proc,
}
df_predict_para_base_proc = pd.DataFrame(d_para_base_proc)
df_predict_para_base_proc.to_csv('df-predict-para-base-proc.csv', index=False)