#### Tecnologie dei dati e del linguaggio
# Significato e contesto
## *You Shall Know a Word by the Company It Keeps*
### Prof. Alfio Ferrara


In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [None]:
import matplotlib.pyplot as plt

## Dataset e nuovo obiettivo
Come esempio, useremo le ricette prese dal dataset [**Food.com Recipes with Search Terms and Tags**](https://www.kaggle.com/datasets/shuyangli94/foodcom-recipes-with-search-terms-and-tags).

Contrariamente al caso della classificazione, cercheremo in questo caso di predire, dato un ingrediente, quali altri ingredienti appaiono nel contesto dell'ingrediente dato.

## Contesto
Possiamo definire il contesto in due modi:
1. **Skip-gram**: ogni ingrediente ha come contesto gli ingredienti che compaiono nella stessa ricetta (eventualmente entro una certa finestra)
2. **Continuous Bag of Words (CBOW)**: a partire dagli ingredienti di una ricetta (entro una finestra), vogliamo predire l'ingrediente centrale

#### Esempio:

Supponiamo di avere una finestra di contesto pari a 2, quindi, per ogni ingrediente, osserviamo i 2 ingredienti prima e dopo.

![](./imgs/context.png)

In [None]:
from collections import defaultdict
import pymongo
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# MongoClient() is called without arguments, so the driver defaults are used.
db = pymongo.MongoClient()['cousine']
recipes = db['foodcom']
label_field = 'category'
labels = ['italian', 'indian', 'southern', 'mexican', 'chinese', 'greek', 'thai']

# Select the recipes whose search terms include at least one known label.
q = {'search_terms': {'$in': labels}}

# One record per recipe: its ingredients plus the first search term that is
# also a known label, stored under `label_field`.
data = [
    {
        'ingredients': recipe['ingredients'],
        label_field: [term for term in recipe['search_terms'] if term in labels][0],
    }
    for recipe in recipes.find(q)
]

### Creazione del dataset di training

Obiettivo: predire un ingrediente dato il contesto (**CBOW**)

In [None]:
from sklearn.preprocessing import OneHotEncoder
import nltk 
import nlp.wordbags as wb

In [None]:
db = pymongo.MongoClient()['cousine']
recipes = db['foodcom']
# An empty filter matches every recipe; only the first 15000 are read.
q = {}
# The corpus is a list with one ingredient collection per recipe.
corpus = [recipe['ingredients'] for recipe in recipes.find(q).limit(15000)]

In [None]:
# Build the bag-of-words helper over the recipe corpus.
# NOTE(review): `min_occurrences=10` presumably drops ingredients seen fewer
# than 10 times -- confirm against nlp.wordbags.
bow = wb.Bow(corpus=corpus, min_occurrences=10)
# NOTE(review): presumably yields one-hot (context, target) batches of size 4
# built with a context window of 4 -- confirm against nlp.wordbags.
dataloader, inputs, targets = bow.one_hot_cbow_dataloader(window=4, batch=4)

In [None]:
def _active_words(one_hot_row):
    """Return the vocabulary words at the positions where the row is positive."""
    return [bow.idx2word[i] for i, k in enumerate(one_hot_row) if k > 0]


# Inspect only the first batch produced by the dataloader.
for x, y in dataloader:
    print(f"Inputs: {x}")
    print(f"Target: {y}")
    print()
    print(f"Inputs shape: {x.shape}")
    print(f"Target shape: {y.shape}")
    # Decode every (input row, target row) pair back into ingredient names.
    for input_row, target_row in zip(x, y):
        input_ingredient = _active_words(input_row)
        target_ingredients = _active_words(target_row)
        print(f"\nIngrediente di input: {input_ingredient}")
        print(f"Ingredienti target: {target_ingredients}\n")
    break

### Rete neurale

In [None]:
import torch.nn as nn
import torch.nn.functional as F

In [None]:
class SimpleNet(nn.Module):
    """Single linear layer mapping an input vector to one score per output unit.

    The forward pass returns the raw layer output; no activation is applied.
    """

    def __init__(self, input_size, output_size):
        """Create the only layer: a linear map from input_size to output_size."""
        super().__init__()
        self.fc = nn.Linear(input_size, output_size)

    def forward(self, x):
        """Apply the linear layer to `x` and return the result."""
        return self.fc(x)

In [None]:
# Input and output layers both have one unit per entry of bow.idx2word.
vocab_size = len(bow.idx2word)
net = SimpleNet(input_size=vocab_size, output_size=vocab_size)

### Train

In [None]:
import time
import math
import torch

In [None]:
def train(document_tensor, label_tensor, criterion, learning_rate):
    """Run one plain gradient-descent step on the module-level `net`.

    Args:
        document_tensor: batch of inputs passed to `net`.
        label_tensor: targets passed to `criterion` together with the output.
        criterion: loss callable invoked as `criterion(output, label_tensor)`.
        learning_rate: step size of the parameter update.

    Returns:
        A tuple `(output, loss)` with the network output for the batch and
        the loss value as a Python number.
    """
    # Clear the gradients accumulated by the previous step.
    net.zero_grad()
    output = net(document_tensor)
    batch_loss = criterion(output, label_tensor)
    batch_loss.backward()

    # Manual SGD update outside autograd tracking: p <- p - lr * grad.
    with torch.no_grad():
        for param in net.parameters():
            param.add_(param.grad, alpha=-learning_rate)
    return output, batch_loss.item()

In [None]:
# Materialise the dataloader once so that batches can be picked by index.
batches = []
for doc, lab in dataloader:
    batches.append((doc, lab))

In [None]:
# Training configuration for the single-layer network.
n_iters = 30_000
print_every = 2000  # defined for parity with plot_every; not used in this cell
plot_every = 200
criterion = nn.CrossEntropyLoss()
learning_rate = 0.01

current_loss = 0
all_losses = []

def timeSince(since):
    """Return the time elapsed since `since` (a time.time() value) as 'Xm Ys'."""
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

start = time.time()

# Each iteration is one gradient step on a single randomly chosen batch.
epochs = list(range(1, n_iters + 1))
for it in tqdm(epochs):
    # np.random.randint excludes its upper bound, so the bound must be
    # len(batches): with len(batches) - 1 the last batch was never sampled
    # and a single-batch list raised ValueError.
    document_tensor, label_tensor = batches[np.random.randint(0, len(batches))]
    output, loss = train(document_tensor, label_tensor, criterion, learning_rate)
    current_loss += loss

    # Record the mean loss of the last `plot_every` steps, then reset.
    if it % plot_every == 0:
        all_losses.append(current_loss / plot_every)
        current_loss = 0

In [None]:
# Plot the averaged training losses, one point per `plot_every` iterations.
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(all_losses, 'g')
fig.tight_layout()
plt.show()

## Predittore

In [None]:
def vectorize(ingredients: list):
    """Encode a list of ingredient names as a single multi-hot row tensor.

    Args:
        ingredients: ingredient names. Names missing from `bow.word2idx`
            are skipped on purpose, so an unknown ingredient does not abort
            the query.

    Returns:
        A float32 tensor of shape (1, len(bow.vocabulary)) holding 1 at the
        index of every known ingredient and 0 elsewhere.
    """
    # Allocate the (1, vocabulary) batch as one array: handing torch.tensor a
    # Python list of ndarrays is slow and triggers a UserWarning.
    input_vector = np.zeros((1, len(bow.vocabulary)), dtype=np.float32)
    for name in ingredients:
        try:
            position = bow.word2idx[name]
        except KeyError:
            # Out-of-vocabulary ingredient: deliberately ignored.
            continue
        input_vector[0, position] = 1
    return torch.from_numpy(input_vector)

In [None]:
# Display bow.vocabulary, which the cells below use as the index of the
# prediction Series and of the weight/similarity DataFrames.
bow.vocabulary

In [None]:
# Encode the query ingredients as one multi-hot input row.
ingredients = vectorize(['zucchini', 'spaghetti', 'parmesan cheese'])
with torch.no_grad():
    logits = net(ingredients)
    # Normalise the scores of the single row into a probability distribution.
    y_pred = torch.softmax(logits, dim=1)
# Rank the vocabulary by predicted probability and show the ten best entries.
ranking = pd.Series(y_pred[0], index=bow.vocabulary).sort_values(ascending=False)
ranking.head(10)

In [None]:
# Predicted probabilities for the whole vocabulary, sorted in descending order.
# NOTE: despite its name, `top20` is not truncated to 20 entries.
top20 = pd.Series(y_pred[0], index=bow.vocabulary).sort_values(ascending=False)

In [None]:
# Plot the sorted predicted probabilities, with the y axis capped at 0.02.
fig, ax = plt.subplots(figsize=(8, 4))
sorted_probabilities = top20.values
ax.plot(sorted_probabilities, 'g')
ax.set_ylim((0, 0.02))
fig.tight_layout()
plt.show()

### Domanda difficile: perché la rete non impara nulla e attribuisce a tutti gli ingredienti una probabilità così alta?

Partiamo dall'idea che la rete calcola:

$$
\hat{y} = \mathrm{softmax}(\theta^T x)
$$

ma dato che $x$ è un vettore one-hot, il prodotto $\theta^T x$ non fa altro che selezionare una colonna di $\theta^T$ (quella corrispondente all'ingrediente attivo), e l'aggiornamento dei parametri riguarda solo quella. In pratica calcoliamo solo:

$$
\hat{y} = \mathrm{softmax}(\theta^T x_i)
$$

Quindi tutti i vettori di input si appiattiscono in modo simile, e softmax dà sempre una distribuzione simile. Tutte le probabilità si avvicinano e nessuna parola viene predetta correttamente.

## Alcune considerazioni sui parametri della rete.

In [None]:
# Print every (name, parameter) pair of the trained network.
print(list(net.named_parameters()))

In [None]:
# Map each parameter name to a NumPy array detached from the autograd graph.
params = {}
for name, param in net.named_parameters():
    params[name] = param.detach().numpy()

In [None]:
# Display the shape of the linear layer's weight matrix
# (nn.Linear stores it as (out_features, in_features)).
params['fc.weight'].shape

In [None]:

# Transpose the weight matrix so that each row belongs to an input unit and
# each column to an output unit, then label both axes with the vocabulary.
weights_by_input = params['fc.weight'].T
feature_vectors = pd.DataFrame(
    weights_by_input,
    index=bow.vocabulary,
    columns=bow.vocabulary,
)
feature_vectors

Ora, proviamo a calcolare la similarità tra ingredienti utilizzando questi vettori.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise cosine similarity between the rows of feature_vectors; with a
# single argument the rows are compared against themselves.
sigma = cosine_similarity(feature_vectors)
S = pd.DataFrame(data=sigma, index=bow.vocabulary, columns=bow.vocabulary)

In [None]:
# Show the first rows of the ingredient-by-ingredient similarity matrix.
S.head()

In [None]:
# Ten ingredients most similar to the query according to the matrix S.
query = 'soy sauce'
neighbours = S.loc[query].sort_values(ascending=False)
neighbours.head(10)

## Introduzione di un layer intermedio (hidden layer)
![](./imgs/hidden.png)

In [None]:
class HiddenNet(nn.Module):
    """Two stacked linear layers with a hidden projection in between.

    No activation is applied between or after the layers; the forward pass
    returns the raw output of the second layer.
    """

    def __init__(self, input_size, output_size, hidden_size):
        """Create the input->hidden and hidden->output linear layers."""
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Project `x` to the hidden size, then map it to the output size."""
        hidden = self.fc1(x)
        return self.fc2(hidden)

In [None]:
# Same input/output width as before (one unit per entry of bow.idx2word),
# with a 10-unit hidden layer in between.
vocab_size = len(bow.idx2word)
net = HiddenNet(input_size=vocab_size, output_size=vocab_size, hidden_size=10)

In [None]:
# Training configuration for the network with a hidden layer.
n_iters = 50_000
print_every = 2000  # defined for parity with plot_every; not used in this cell
plot_every = 200
criterion = nn.CrossEntropyLoss()
learning_rate = 0.1

current_loss = 0
all_losses = []

def timeSince(since):
    """Return the time elapsed since `since` (a time.time() value) as 'Xm Ys'."""
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

start = time.time()

# Each iteration is one gradient step on a single randomly chosen batch.
epochs = list(range(1, n_iters + 1))
for it in tqdm(epochs):
    # np.random.randint excludes its upper bound, so the bound must be
    # len(batches): with len(batches) - 1 the last batch was never sampled
    # and a single-batch list raised ValueError.
    document_tensor, label_tensor = batches[np.random.randint(0, len(batches))]
    output, loss = train(document_tensor, label_tensor, criterion, learning_rate)
    current_loss += loss

    # Record the mean loss of the last `plot_every` steps, then reset.
    if it % plot_every == 0:
        all_losses.append(current_loss / plot_every)
        current_loss = 0

In [None]:
# Plot the averaged training losses, one point per `plot_every` iterations.
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(all_losses, 'g')
fig.tight_layout()
plt.show()

In [None]:
# Encode the query ingredients as one multi-hot input row.
ingredients = vectorize(['zucchini', 'spaghetti', 'parmesan cheese'])
with torch.no_grad():
    logits = net(ingredients)
    # Normalise the scores of the single row into a probability distribution.
    y_pred = torch.softmax(logits, dim=1)
# Show the whole vocabulary ranked by predicted probability.
ranking = pd.Series(y_pred[0], index=bow.vocabulary)
ranking.sort_values(ascending=False)


In [None]:
# Predicted probabilities for the whole vocabulary in descending order
# (the series is not truncated, despite the variable name).
probabilities = pd.Series(y_pred[0], index=bow.vocabulary)
top20 = probabilities.sort_values(ascending=False)

# Plot the sorted probabilities, with the y axis capped at 0.02.
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(top20.values, 'g')
ax.set_ylim((0, 0.02))
fig.tight_layout()
plt.show()

## Prendiamo i primi valori per ottenere un embedding di dimensione pari all'hidden layer per ogni ingrediente

### Domanda: è chiaro perché la matrice ha questa dimensione?

In [None]:
# Map each parameter name to a NumPy array detached from the autograd graph.
params = {}
for name, param in net.named_parameters():
    params[name] = param.detach().numpy()

# Transposing the first layer's weights gives one row per vocabulary entry,
# with as many columns as hidden units.
first_layer_weights = params['fc1.weight'].T
feature_vectors = pd.DataFrame(first_layer_weights, index=bow.vocabulary)
feature_vectors

In [None]:
# Pairwise cosine similarity between the rows of feature_vectors; with a
# single argument the rows are compared against themselves.
sigma = cosine_similarity(feature_vectors)
S = pd.DataFrame(data=sigma, index=bow.vocabulary, columns=bow.vocabulary)
S

In [None]:
# Ten ingredients most similar to the query according to the matrix S.
query = 'soy sauce'
neighbours = S.loc[query].sort_values(ascending=False)
neighbours.head(10)