In this notebook we will split the dataset into a training and a test set, define a function for the prediction algorithm and run it. Then we will visualise and compare the results.

In [None]:
pip install -U sentence-transformers

In [2]:
from sentence_transformers import util
from sklearn.model_selection import train_test_split
import numpy as np

Let us begin by loading the embeddings.

In [46]:
import pandas as pd

# Read the embedding tables from CSV, one frame per model/data variant.

# Embeddings of the original ("orig") data
(df_orig_distiluse_class_le,
 df_orig_para_mini_class_le,
 df_orig_para_base_class_le) = (
    pd.read_csv(csv_path) for csv_path in (
        '/content/df-orig-distil-class-le.csv',
        '/content/df-orig-para-mini-class-le.csv',
        '/content/df-orig-para-base-class-le.csv',
    )
)

# Embeddings of the processed ("proc") data
(df_proc_distiluse_class_le,
 df_proc_para_mini_class_le,
 df_proc_para_base_class_le) = (
    pd.read_csv(csv_path) for csv_path in (
        '/content/df-proc-distil-class-le.csv',
        '/content/df-proc-para-mini-class-le.csv',
        '/content/df-proc-para-base-class-le.csv',
    )
)

Let us split the datasets into training and test sets.

In [47]:
# Model 'distiluse-base-multilingual-cased-v2', unprocessed data.
# Drop the 'ArticleGroupName' and 'ClassLabel' columns. The frame is reassigned to
# its own name, so on a re-run the columns are already gone; errors='ignore' keeps
# the cell re-runnable instead of raising KeyError the second time.
df_orig_distiluse_class_le = df_orig_distiluse_class_le.drop(
    columns=['ArticleGroupName', 'ClassLabel'], errors='ignore')

# NOTE(review): X is the whole frame, so it still contains the label columns that
# form y (the first two columns, which include 'Class1') -- confirm that downstream
# code removes them before X is used as model input.
X_orig = df_orig_distiluse_class_le
y_orig = df_orig_distiluse_class_le.iloc[:, :2]

# 80/20 split, stratified on 'Class1'; the fixed random_state makes it reproducible.
(X_train_distil_orig, X_test_distil_orig,
 y_train_distil_orig, y_test_distil_orig) = train_test_split(
    X_orig, y_orig, test_size=0.2, random_state=42,
    stratify=df_orig_distiluse_class_le['Class1'])

# Inspect the 'Class1' distribution of the train and test sets
print(X_train_distil_orig['Class1'].value_counts())
print(X_test_distil_orig['Class1'].value_counts())
print(y_train_distil_orig['Class1'].value_counts())
print(y_test_distil_orig['Class1'].value_counts())


VariedFood       96
Beer             84
Spirits          81
Wine             57
SoftDrinks       40
CoffeeTeaMilk    25
Burger           17
Cider            11
Kids             11
Dessert           9
Wings             9
Other             6
Salad             4
Champagne         4
Name: Class1, dtype: int64
VariedFood       24
Beer             21
Spirits          20
Wine             14
SoftDrinks       10
CoffeeTeaMilk     7
Burger            4
Cider             3
Kids              3
Wings             3
Dessert           2
Salad             1
Other             1
Champagne         1
Name: Class1, dtype: int64
VariedFood       96
Beer             84
Spirits          81
Wine             57
SoftDrinks       40
CoffeeTeaMilk    25
Burger           17
Cider            11
Kids             11
Dessert           9
Wings             9
Other             6
Salad             4
Champagne         4
Name: Class1, dtype: int64
VariedFood       24
Beer             21
Spirits          20
Wine             14

In [48]:
# Model 'distiluse-base-multilingual-cased-v2', processed data.
# Drop the 'ArticleGroupName' and 'ClassLabel' columns. The frame is reassigned to
# its own name, so on a re-run the columns are already gone; errors='ignore' keeps
# the cell re-runnable instead of raising KeyError the second time.
df_proc_distiluse_class_le = df_proc_distiluse_class_le.drop(
    columns=['ArticleGroupName', 'ClassLabel'], errors='ignore')

# NOTE(review): X is the whole frame, so it still contains the label columns that
# form y (the first two columns, which include 'Class1') -- confirm that downstream
# code removes them before X is used as model input.
X_proc = df_proc_distiluse_class_le
y_proc = df_proc_distiluse_class_le.iloc[:, :2]

# 80/20 split, stratified on 'Class1'; the fixed random_state makes it reproducible.
(X_train_distil_proc, X_test_distil_proc,
 y_train_distil_proc, y_test_distil_proc) = train_test_split(
    X_proc, y_proc, test_size=0.2, random_state=42,
    stratify=df_proc_distiluse_class_le['Class1'])

# Inspect the 'Class1' distribution of the train and test sets
print(X_train_distil_proc['Class1'].value_counts())
print(X_test_distil_proc['Class1'].value_counts())
print(y_train_distil_proc['Class1'].value_counts())
print(y_test_distil_proc['Class1'].value_counts())

VariedFood       93
Spirits          63
Wine             53
Beer             48
SoftDrinks       26
CoffeeTeaMilk    23
Burger           17
Kids             11
Cider            11
Dessert           9
Other             6
Salad             4
Wings             4
Champagne         3
Name: Class1, dtype: int64
VariedFood       24
Spirits          16
Wine             13
Beer             12
SoftDrinks        6
CoffeeTeaMilk     6
Burger            4
Kids              3
Cider             3
Dessert           2
Salad             1
Champagne         1
Other             1
Wings             1
Name: Class1, dtype: int64
VariedFood       93
Spirits          63
Wine             53
Beer             48
SoftDrinks       26
CoffeeTeaMilk    23
Burger           17
Kids             11
Cider            11
Dessert           9
Other             6
Salad             4
Wings             4
Champagne         3
Name: Class1, dtype: int64
VariedFood       24
Spirits          16
Wine             13
Beer             12

In [49]:
# Model 'paraphrase-multilingual-MiniLM-L12-v2', unprocessed data.
# Drop the 'ArticleGroupName' and 'ClassLabel' columns. The frame is reassigned to
# its own name, so on a re-run the columns are already gone; errors='ignore' keeps
# the cell re-runnable instead of raising KeyError the second time.
df_orig_para_mini_class_le = df_orig_para_mini_class_le.drop(
    columns=['ArticleGroupName', 'ClassLabel'], errors='ignore')

# NOTE(review): X is the whole frame, so it still contains the label columns that
# form y (the first two columns, which include 'Class1') -- confirm that downstream
# code removes them before X is used as model input.
X_orig = df_orig_para_mini_class_le
y_orig = df_orig_para_mini_class_le.iloc[:, :2]

# 80/20 split, stratified on 'Class1'; the fixed random_state makes it reproducible.
(X_train_para_mini_orig, X_test_para_mini_orig,
 y_train_para_mini_orig, y_test_para_mini_orig) = train_test_split(
    X_orig, y_orig, test_size=0.2, random_state=42,
    stratify=df_orig_para_mini_class_le['Class1'])

# Inspect the 'Class1' distribution of the train and test sets
print(X_train_para_mini_orig['Class1'].value_counts())
print(X_test_para_mini_orig['Class1'].value_counts())
print(y_train_para_mini_orig['Class1'].value_counts())
print(y_test_para_mini_orig['Class1'].value_counts())

VariedFood       96
Beer             84
Spirits          81
Wine             57
SoftDrinks       40
CoffeeTeaMilk    25
Burger           17
Cider            11
Kids             11
Dessert           9
Wings             9
Other             6
Salad             4
Champagne         4
Name: Class1, dtype: int64
VariedFood       24
Beer             21
Spirits          20
Wine             14
SoftDrinks       10
CoffeeTeaMilk     7
Burger            4
Cider             3
Kids              3
Wings             3
Dessert           2
Salad             1
Other             1
Champagne         1
Name: Class1, dtype: int64
VariedFood       96
Beer             84
Spirits          81
Wine             57
SoftDrinks       40
CoffeeTeaMilk    25
Burger           17
Cider            11
Kids             11
Dessert           9
Wings             9
Other             6
Salad             4
Champagne         4
Name: Class1, dtype: int64
VariedFood       24
Beer             21
Spirits          20
Wine             14

In [50]:
# Model 'paraphrase-multilingual-MiniLM-L12-v2', processed data.
# Drop the 'ArticleGroupName' and 'ClassLabel' columns. The frame is reassigned to
# its own name, so on a re-run the columns are already gone; errors='ignore' keeps
# the cell re-runnable instead of raising KeyError the second time.
df_proc_para_mini_class_le = df_proc_para_mini_class_le.drop(
    columns=['ArticleGroupName', 'ClassLabel'], errors='ignore')

# NOTE(review): X is the whole frame, so it still contains the label columns that
# form y (the first two columns, which include 'Class1') -- confirm that downstream
# code removes them before X is used as model input.
X_proc = df_proc_para_mini_class_le
y_proc = df_proc_para_mini_class_le.iloc[:, :2]

# 80/20 split, stratified on 'Class1'; the fixed random_state makes it reproducible.
(X_train_para_mini_proc, X_test_para_mini_proc,
 y_train_para_mini_proc, y_test_para_mini_proc) = train_test_split(
    X_proc, y_proc, test_size=0.2, random_state=42,
    stratify=df_proc_para_mini_class_le['Class1'])

# Inspect the 'Class1' distribution of the train and test sets
print(X_train_para_mini_proc['Class1'].value_counts())
print(X_test_para_mini_proc['Class1'].value_counts())
print(y_train_para_mini_proc['Class1'].value_counts())
print(y_test_para_mini_proc['Class1'].value_counts())

VariedFood       93
Spirits          63
Wine             53
Beer             48
SoftDrinks       26
CoffeeTeaMilk    23
Burger           17
Kids             11
Cider            11
Dessert           9
Other             6
Salad             4
Wings             4
Champagne         3
Name: Class1, dtype: int64
VariedFood       24
Spirits          16
Wine             13
Beer             12
SoftDrinks        6
CoffeeTeaMilk     6
Burger            4
Kids              3
Cider             3
Dessert           2
Salad             1
Champagne         1
Other             1
Wings             1
Name: Class1, dtype: int64
VariedFood       93
Spirits          63
Wine             53
Beer             48
SoftDrinks       26
CoffeeTeaMilk    23
Burger           17
Kids             11
Cider            11
Dessert           9
Other             6
Salad             4
Wings             4
Champagne         3
Name: Class1, dtype: int64
VariedFood       24
Spirits          16
Wine             13
Beer             12

In [51]:
# Model 'paraphrase-multilingual-mpnet-base-v2', unprocessed data.
# Drop the 'ArticleGroupName' and 'ClassLabel' columns. The frame is reassigned to
# its own name, so on a re-run the columns are already gone; errors='ignore' keeps
# the cell re-runnable instead of raising KeyError the second time.
df_orig_para_base_class_le = df_orig_para_base_class_le.drop(
    columns=['ArticleGroupName', 'ClassLabel'], errors='ignore')

# NOTE(review): X is the whole frame, so it still contains the label columns that
# form y (the first two columns, which include 'Class1') -- confirm that downstream
# code removes them before X is used as model input.
X_orig = df_orig_para_base_class_le
y_orig = df_orig_para_base_class_le.iloc[:, :2]

# 80/20 split, stratified on 'Class1'; the fixed random_state makes it reproducible.
(X_train_para_base_orig, X_test_para_base_orig,
 y_train_para_base_orig, y_test_para_base_orig) = train_test_split(
    X_orig, y_orig, test_size=0.2, random_state=42,
    stratify=df_orig_para_base_class_le['Class1'])

# Inspect the 'Class1' distribution of the train and test sets
print(X_train_para_base_orig['Class1'].value_counts())
print(X_test_para_base_orig['Class1'].value_counts())
print(y_train_para_base_orig['Class1'].value_counts())
print(y_test_para_base_orig['Class1'].value_counts())

VariedFood       96
Beer             84
Spirits          81
Wine             57
SoftDrinks       40
CoffeeTeaMilk    25
Burger           17
Cider            11
Kids             11
Dessert           9
Wings             9
Other             6
Salad             4
Champagne         4
Name: Class1, dtype: int64
VariedFood       24
Beer             21
Spirits          20
Wine             14
SoftDrinks       10
CoffeeTeaMilk     7
Burger            4
Cider             3
Kids              3
Wings             3
Dessert           2
Salad             1
Other             1
Champagne         1
Name: Class1, dtype: int64
VariedFood       96
Beer             84
Spirits          81
Wine             57
SoftDrinks       40
CoffeeTeaMilk    25
Burger           17
Cider            11
Kids             11
Dessert           9
Wings             9
Other             6
Salad             4
Champagne         4
Name: Class1, dtype: int64
VariedFood       24
Beer             21
Spirits          20
Wine             14

In [52]:
# Model 'paraphrase-multilingual-mpnet-base-v2', processed data.
# Drop the 'ArticleGroupName' and 'ClassLabel' columns. The frame is reassigned to
# its own name, so on a re-run the columns are already gone; errors='ignore' keeps
# the cell re-runnable instead of raising KeyError the second time.
df_proc_para_base_class_le = df_proc_para_base_class_le.drop(
    columns=['ArticleGroupName', 'ClassLabel'], errors='ignore')

# NOTE(review): X is the whole frame, so it still contains the label columns that
# form y (the first two columns, which include 'Class1') -- confirm that downstream
# code removes them before X is used as model input.
X_proc = df_proc_para_base_class_le
y_proc = df_proc_para_base_class_le.iloc[:, :2]

# 80/20 split, stratified on 'Class1'; the fixed random_state makes it reproducible.
(X_train_para_base_proc, X_test_para_base_proc,
 y_train_para_base_proc, y_test_para_base_proc) = train_test_split(
    X_proc, y_proc, test_size=0.2, random_state=42,
    stratify=df_proc_para_base_class_le['Class1'])

# Inspect the 'Class1' distribution of the train and test sets
print(X_train_para_base_proc['Class1'].value_counts())
print(X_test_para_base_proc['Class1'].value_counts())
print(y_train_para_base_proc['Class1'].value_counts())
print(y_test_para_base_proc['Class1'].value_counts())

VariedFood       93
Spirits          63
Wine             53
Beer             48
SoftDrinks       26
CoffeeTeaMilk    23
Burger           17
Kids             11
Cider            11
Dessert           9
Other             6
Salad             4
Wings             4
Champagne         3
Name: Class1, dtype: int64
VariedFood       24
Spirits          16
Wine             13
Beer             12
SoftDrinks        6
CoffeeTeaMilk     6
Burger            4
Kids              3
Cider             3
Dessert           2
Salad             1
Champagne         1
Other             1
Wings             1
Name: Class1, dtype: int64
VariedFood       93
Spirits          63
Wine             53
Beer             48
SoftDrinks       26
CoffeeTeaMilk    23
Burger           17
Kids             11
Cider            11
Dessert           9
Other             6
Salad             4
Wings             4
Champagne         3
Name: Class1, dtype: int64
VariedFood       24
Spirits          16
Wine             13
Beer             12

Let us save the training and test sets as CSV files.

In [53]:
# Write every train/test split to its own CSV file (relative paths, i.e. the
# current working directory). index=False leaves the DataFrame index out of the file.
splits_to_save = {
    # distiluse, unprocessed data
    'x-train-distil-orig.csv': X_train_distil_orig,
    'x-test-distil-orig.csv': X_test_distil_orig,
    'y-train-distil-orig.csv': y_train_distil_orig,
    'y-test-distil-orig.csv': y_test_distil_orig,
    # distiluse, processed data
    'x-train-distil-proc.csv': X_train_distil_proc,
    'x-test-distil-proc.csv': X_test_distil_proc,
    'y-train-distil-proc.csv': y_train_distil_proc,
    'y-test-distil-proc.csv': y_test_distil_proc,
    # paraphrase MiniLM, unprocessed data
    'x-train-para-mini-orig.csv': X_train_para_mini_orig,
    'x-test-para-mini-orig.csv': X_test_para_mini_orig,
    'y-train-para-mini-orig.csv': y_train_para_mini_orig,
    'y-test-para-mini-orig.csv': y_test_para_mini_orig,
    # paraphrase MiniLM, processed data
    'x-train-para-mini-proc.csv': X_train_para_mini_proc,
    'x-test-para-mini-proc.csv': X_test_para_mini_proc,
    'y-train-para-mini-proc.csv': y_train_para_mini_proc,
    'y-test-para-mini-proc.csv': y_test_para_mini_proc,
    # paraphrase mpnet base, unprocessed data
    'x-train-para-base-orig.csv': X_train_para_base_orig,
    'x-test-para-base-orig.csv': X_test_para_base_orig,
    'y-train-para-base-orig.csv': y_train_para_base_orig,
    'y-test-para-base-orig.csv': y_test_para_base_orig,
    # paraphrase mpnet base, processed data
    'x-train-para-base-proc.csv': X_train_para_base_proc,
    'x-test-para-base-proc.csv': X_test_para_base_proc,
    'y-train-para-base-proc.csv': y_train_para_base_proc,
    'y-test-para-base-proc.csv': y_test_para_base_proc,
}

# Dicts preserve insertion order, so the files are written in the order listed above.
for csv_name, split_frame in splits_to_save.items():
    split_frame.to_csv(csv_name, index=False)
