<a href="https://colab.research.google.com/github/Coding-Rod/NLP_project/blob/main/Extracting_NER_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

![cook](https://drive.google.com/uc?export=view&id=1HT2TdXwilP8ovSMFoyr4_n7yqo71Aj2W)

# **Final Group Project NLP**

* Abdelhamid Ahmed Mahmoud Abdelmoneim
* Noureldin Mohamed Abdelsalm Mohamed Hamedo
* Sergio Rodrigo Fernandez Testa
* Shehata, Ahmed Mohamed Elghamry

# **Downloading the Data**

In [1]:
!gdown 1lnoaa6tE2gGDQEEz0DW2hvOnjIMK9oTo

Downloading...
From (original): https://drive.google.com/uc?id=1lnoaa6tE2gGDQEEz0DW2hvOnjIMK9oTo
From (redirected): https://drive.google.com/uc?id=1lnoaa6tE2gGDQEEz0DW2hvOnjIMK9oTo&confirm=t&uuid=7074ec38-0569-40a3-a0ec-badea2471bd8
To: /content/receipeData.zip
100% 621M/621M [00:10<00:00, 59.3MB/s]


In [None]:
!unzip receipeData.zip

# **Imports**

In [20]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
import re
from tqdm import tqdm


# **Loading the Train Data**

In [22]:
df=pd.read_csv("dataset/full_dataset.csv", index_col=0)
df= df.iloc[:500000]
df = df[['ingredients', 'NER']].dropna()

In [23]:
df

Unnamed: 0,ingredients,NER
0,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""peanut butter"", ""graham cracker crumbs"", ""bu..."
...,...,...
499995,"[""1 bag fresh cranberries"", ""1 1/2 c. sugar"", ...","[""fresh cranberries"", ""sugar"", ""walnuts"", ""lem..."
499996,"[""4 c. diced apples"", ""1 c. whole cranberries""...","[""apples"", ""cranberries"", ""sugar"", ""oatmeal"", ..."
499997,"[""2 lb. hamburger"", ""sliced cheese"", ""1 can cr...","[""hamburger"", ""cheese"", ""cream of mushroom soup""]"
499998,"[""1 c. chopped onions"", ""1/4 c. green pepper"",...","[""onions"", ""green pepper"", ""ground beef"", ""tom..."


# **Preprocessing the Text**

Just Case-Folding and keeping only alpha-numeric.

In [None]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    return text.strip()

df['ingredients'] = df['ingredients'].apply(preprocess)
df['NER'] = df['NER'].apply(preprocess)

Tokenizing the Text

In [None]:
def tokenize(text):
    return text.split()

#**Building the Vocab**

In [None]:
class Vocab:
    def __init__(self, tokens, min_freq=1):
        counter = Counter(tokens)
        self.stoi = {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3}
        self.itos = ['<pad>', '<sos>', '<eos>', '<unk>']
        for word, freq in counter.items():
            if freq >= min_freq:
                self.stoi[word] = len(self.itos)
                self.itos.append(word)

    def numericalize(self, tokens):
        return [self.stoi.get(token, self.stoi['<unk>']) for token in tokens]

    def denumericalize(self, indices):
        return [self.itos[idx] for idx in indices]

all_ingredients = [token for row in df['ingredients'] for token in tokenize(row)]
all_ner = [token for row in df['NER'] for token in tokenize(row)]

ingredient_vocab = Vocab(all_ingredients, min_freq=2)
ner_vocab = Vocab(all_ner, min_freq=1)


# **Recipe Dataset Class**

![Model Idea](https://drive.google.com/uc?export=view&id=1BPfp7kSxnEX0xFttDFmVrHifLBYv6MWI)

In [None]:
class RecipeDataset(Dataset):
    def __init__(self, df):
        self.data = df

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        ingredients = tokenize(self.data.iloc[idx]['ingredients'])
        ner = tokenize(self.data.iloc[idx]['NER'])
        src = ingredient_vocab.numericalize(ingredients)
        tgt = [ner_vocab.stoi['<sos>']] + ner_vocab.numericalize(ner) + [ner_vocab.stoi['<eos>']]
        return torch.tensor(src), torch.tensor(tgt)

# Collate function
def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)
    src_batch = pad_sequence(src_batch, batch_first=True, padding_value=ingredient_vocab.stoi['<pad>'])
    tgt_batch = pad_sequence(tgt_batch, batch_first=True, padding_value=ner_vocab.stoi['<pad>'])
    return src_batch, tgt_batch

# DataLoader
train_df, val_df = train_test_split(df, test_size=0.1)
train_loader = DataLoader(RecipeDataset(train_df), batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(RecipeDataset(val_df), batch_size=32, shuffle=False, collate_fn=collate_fn)

# Encoder-Decoder model
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)

    def forward(self, src):
        embedded = self.embedding(src)
        outputs, (hidden, cell) = self.lstm(embedded)
        return hidden, cell

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, input, hidden, cell):
        input = input.unsqueeze(1)  # [batch_size, 1]
        embedded = self.embedding(input)
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        prediction = self.fc(output.squeeze(1))
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        batch_size = src.shape[0]
        tgt_len = tgt.shape[1]
        tgt_vocab_size = len(ner_vocab.itos)

        outputs = torch.zeros(batch_size, tgt_len, tgt_vocab_size).to(self.device)
        hidden, cell = self.encoder(src)
        input = tgt[:, 0]

        for t in range(1, tgt_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[:, t] = output
            top1 = output.argmax(1)
            input = tgt[:, t] if torch.rand(1).item() < teacher_forcing_ratio else top1
        return outputs

# **Training the Model**

In [24]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
INPUT_DIM = len(ingredient_vocab.itos)
OUTPUT_DIM = len(ner_vocab.itos)
EMB_DIM = 128
HID_DIM = 256

enc = Encoder(INPUT_DIM, EMB_DIM, HID_DIM)
dec = Decoder(OUTPUT_DIM, EMB_DIM, HID_DIM)
model = Seq2Seq(enc, dec, device).to(device)

optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=ner_vocab.stoi['<pad>'])

def train(model, loader, optimizer, criterion):
    model.train()
    epoch_loss = 0
    for src, tgt in tqdm(loader):
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        output = model(src, tgt)
        output = output[:, 1:].reshape(-1, output.shape[-1])
        tgt = tgt[:, 1:].reshape(-1)
        loss = criterion(output, tgt)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(loader)

for epoch in range(20):
    loss = train(model, train_loader, optimizer, criterion)
    print(f"Epoch {epoch + 1}, Loss: {loss:.4f}")


100%|██████████| 14063/14063 [12:05<00:00, 19.38it/s]


Epoch 1, Loss: 2.3372


100%|██████████| 14063/14063 [12:05<00:00, 19.40it/s]


Epoch 2, Loss: 1.2214


100%|██████████| 14063/14063 [12:01<00:00, 19.50it/s]


Epoch 3, Loss: 0.9688


100%|██████████| 14063/14063 [11:51<00:00, 19.77it/s]


Epoch 4, Loss: 0.8399


100%|██████████| 14063/14063 [11:53<00:00, 19.72it/s]


Epoch 5, Loss: 0.7546


100%|██████████| 14063/14063 [11:50<00:00, 19.80it/s]


Epoch 6, Loss: 0.6984


100%|██████████| 14063/14063 [11:43<00:00, 20.00it/s]


Epoch 7, Loss: 0.6454


100%|██████████| 14063/14063 [11:32<00:00, 20.30it/s]


Epoch 8, Loss: 0.6166


100%|██████████| 14063/14063 [11:23<00:00, 20.58it/s]


Epoch 9, Loss: 0.5855


 48%|████▊     | 6793/14063 [05:29<05:52, 20.63it/s]


KeyboardInterrupt: 

In [25]:
df

Unnamed: 0,ingredients,NER
0,1 c firmly packed brown sugar 12 c evaporated ...,brown sugar milk vanilla nuts butter bite size...
1,1 small jar chipped beef cut up 4 boned chicke...,beef chicken breasts cream of mushroom soup so...
2,2 16 oz pkg frozen corn 1 8 oz pkg cream chees...,frozen corn cream cheese butter garlic powder ...
3,1 large whole chicken 2 10 12 oz cans chicken ...,chicken chicken gravy cream of mushroom soup s...
4,1 c peanut butter 34 c graham cracker crumbs 1...,peanut butter graham cracker crumbs butter pow...
...,...,...
499995,1 bag fresh cranberries 1 12 c sugar 1 c chopp...,fresh cranberries sugar walnuts lemon
499996,4 c diced apples 1 c whole cranberries 1 c gra...,apples cranberries sugar oatmeal brown sugar m...
499997,2 lb hamburger sliced cheese 1 can cream of mu...,hamburger cheese cream of mushroom soup
499998,1 c chopped onions 14 c green pepper 2 lb grou...,onions green pepper ground beef tomato sauce c...


# **Testing the Model**

In [26]:
testdf=pd.read_csv("dataset/full_dataset.csv", index_col=0)
testdf=testdf.iloc[500000:600000]
testdf = testdf[['ingredients', 'NER']].dropna()
testdf

Unnamed: 0,ingredients,NER
500000,"[""1 egg white unbeaten"", ""1/4 tsp. cream of ta...","[""egg"", ""cream of tartar"", ""sugar"", ""vanilla"",..."
500001,"[""5 1/2 c. powdered milk"", ""3 oz. nondairy cre...","[""powdered milk"", ""nondairy creamer"", ""salt"", ..."
500002,"[""2 lb. beef tips, cut into 1-inch cubes"", ""1 ...","[""beef tips"", ""onion soup mix"", ""cream of mush..."
500003,"[""1/2 c. vegetable oil"", ""1/4 c. red wine vine...","[""vegetable oil"", ""red wine vinegar"", ""salt"", ..."
500004,"[""1 lb. ground beef"", ""40 oz. spaghetti sauce""...","[""ground beef"", ""spaghetti sauce"", ""lasagna no..."
...,...,...
599995,"[""1 1/2 c. sugar"", ""3 Tbsp. cornmeal"", ""2 tsp....","[""sugar"", ""cornmeal"", ""flour"", ""salt"", ""eggs"",..."
599996,"[""1 egg"", ""1 1/2 c. sugar"", ""1 c. melted short...","[""egg"", ""sugar"", ""shortening"", ""soda"", ""salt"",..."
599997,"[""3 qt. water"", ""1 can chicken soup"", ""2 bay l...","[""water"", ""chicken soup"", ""bay leaves"", ""farin..."
599998,"[""2 c. macaroni (very small pasta DaVinci Acin...","[""macaroni"", ""boiling water"", ""oil"", ""salt""]"


In [27]:
# Ensure same preprocessing
testdf['ingredients'] = testdf['ingredients'].apply(preprocess)
testdf['NER'] = testdf['NER'].apply(preprocess)

# Create a Dataset and DataLoader for test data
class TestRecipeDataset(Dataset):
    def __init__(self, df):
        self.data = df

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        ingredients = tokenize(self.data.iloc[idx]['ingredients'])
        src = ingredient_vocab.numericalize(ingredients)
        return torch.tensor(src), self.data.iloc[idx]['NER']  # keep original NER string for comparison

def test_collate_fn(batch):
    src_batch, original_ner = zip(*batch)
    src_batch = pad_sequence(src_batch, batch_first=True, padding_value=ingredient_vocab.stoi['<pad>'])
    return src_batch, original_ner

test_loader = DataLoader(TestRecipeDataset(testdf), batch_size=32, shuffle=False, collate_fn=test_collate_fn)

# Inference function
def infer(model, loader, max_len=50):
    model.eval()
    predictions = []
    with torch.no_grad():
        for src, original_ner in tqdm(loader):
            src = src.to(device)
            batch_size = src.size(0)
            hidden, cell = model.encoder(src)
            input = torch.tensor([ner_vocab.stoi['<sos>']] * batch_size).to(device)
            outputs = []

            for _ in range(max_len):
                output, hidden, cell = model.decoder(input, hidden, cell)
                top1 = output.argmax(1)
                outputs.append(top1)
                input = top1

            outputs = torch.stack(outputs, dim=1)  # [batch_size, max_len]
            decoded = [
                " ".join(ner_vocab.denumericalize([token.item() for token in seq if token.item() not in [
                    ner_vocab.stoi['<pad>'], ner_vocab.stoi['<sos>'], ner_vocab.stoi['<eos>']]]))
                for seq in outputs
            ]
            predictions.extend(decoded)
    return predictions

# Run inference
predicted_ner = infer(model, test_loader)

# Combine with original data for inspection
testdf = testdf.reset_index(drop=True)
testdf['Predicted_NER'] = predicted_ner

# Show a few results
print(testdf[['ingredients', 'NER', 'Predicted_NER']].head(10))


100%|██████████| 3125/3125 [02:26<00:00, 21.32it/s]

                                         ingredients  \
0  1 egg white unbeaten 14 tsp cream of tartar 34...   
1  5 12 c powdered milk 3 oz nondairy creamer 1 l...   
2  2 lb beef tips cut into 1inch cubes 1 pkg onio...   
3  12 c vegetable oil 14 c red wine vinegar 12 ts...   
4  1 lb ground beef 40 oz spaghetti sauce 8 oz la...   
5  23 c parsley leaves 1 small onion peeled 2 lb ...   
6  1 envelope unflavored gelatin 34 c firmly pack...   
7  2 lb hamburger 12 c onions 2 tbsp vinegar 2 tb...   
8  1 12 c crisco 2 12 c sugar 4 eggs 3 12 c plain...   
9  1 lb hamburger 12 c chopped onion 12 c chopped...   

                                                 NER  \
0    egg cream of tartar sugar vanilla boiling water   
1  powdered milk nondairy creamer salt powdered s...   
2  beef tips onion soup mix cream of mushroom sou...   
3  vegetable oil red wine vinegar salt pepper lem...   
4  ground beef spaghetti sauce lasagna noodles ri...   
5  parsley onion lean pork shoulder pork fat sa




In [28]:
testdf

Unnamed: 0,ingredients,NER,Predicted_NER
0,1 egg white unbeaten 14 tsp cream of tartar 34...,egg cream of tartar sugar vanilla boiling water,egg cream of tartar sugar vanilla boiling water
1,5 12 c powdered milk 3 oz nondairy creamer 1 l...,powdered milk nondairy creamer salt powdered s...,powdered milk nondairy creamer powdered sugar ...
2,2 lb beef tips cut into 1inch cubes 1 pkg onio...,beef tips onion soup mix cream of mushroom sou...,beef tips onion soup mix cream of mushroom sou...
3,12 c vegetable oil 14 c red wine vinegar 12 ts...,vegetable oil red wine vinegar salt pepper lem...,vegetable oil red wine vinegar salt pepper tar...
4,1 lb ground beef 40 oz spaghetti sauce 8 oz la...,ground beef spaghetti sauce lasagna noodles ri...,ground beef spaghetti sauce lasagna noodles ri...
...,...,...,...
99995,1 12 c sugar 3 tbsp cornmeal 2 tsp flour 14 ts...,sugar cornmeal flour salt eggs vanilla oil coc...,sugar cornmeal flour salt eggs vanilla oil coc...
99996,1 egg 1 12 c sugar 1 c melted shortening 1 tsp...,egg sugar shortening soda salt mincemeat,egg sugar shortening soda salt mincemeat
99997,3 qt water 1 can chicken soup 2 bay leaves 2 c...,water chicken soup bay leaves farina cornmeal ...,water water chicken bay leaves cornmeal cornme...
99998,2 c macaroni very small pasta davinci acini di...,macaroni boiling water oil salt,boiling boiling water oil salt


In [29]:
print(testdf['ingredients'].iloc[1])

5 12 c powdered milk 3 oz nondairy creamer 1 lb nestle quik 12 tsp salt 12 c powdered sugar


In [30]:
print(testdf['NER'].iloc[1])

powdered milk nondairy creamer salt powdered sugar


In [31]:
print(testdf['Predicted_NER'].iloc[1])

powdered milk nondairy creamer powdered sugar salt


# **Saving the Model**

In [32]:
import pickle
import os

# Save model weights
MODEL_PATH = 'seq2seq_model.pt'
torch.save(model.state_dict(), MODEL_PATH)

# Save vocab objects
with open('ingredient_vocab.pkl', 'wb') as f:
    pickle.dump(ingredient_vocab, f)

with open('ner_vocab.pkl', 'wb') as f:
    pickle.dump(ner_vocab, f)

print("Model and vocabularies saved.")

Model and vocabularies saved.


#**Loading the Model**

In [33]:
# Load vocabularies
with open('ingredient_vocab.pkl', 'rb') as f:
    ingredient_vocab = pickle.load(f)

with open('ner_vocab.pkl', 'rb') as f:
    ner_vocab = pickle.load(f)

# Re-create model architecture (must match original hyperparameters)
INPUT_DIM = len(ingredient_vocab.itos)
OUTPUT_DIM = len(ner_vocab.itos)
EMB_DIM = 128
HID_DIM = 256

enc = Encoder(INPUT_DIM, EMB_DIM, HID_DIM)
dec = Decoder(OUTPUT_DIM, EMB_DIM, HID_DIM)
loaded_model = Seq2Seq(enc, dec, device).to(device)

# Load saved weights
loaded_model.load_state_dict(torch.load('seq2seq_model.pt', map_location=device))
loaded_model.eval()

print("Model and vocabularies loaded successfully.")


Model and vocabularies loaded successfully.


In [34]:
# Ensure same preprocessing
testdf['ingredients'] = testdf['ingredients'].apply(preprocess)
testdf['NER'] = testdf['NER'].apply(preprocess)

# Create a Dataset and DataLoader for test data
class TestRecipeDataset(Dataset):
    def __init__(self, df):
        self.data = df

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        ingredients = tokenize(self.data.iloc[idx]['ingredients'])
        src = ingredient_vocab.numericalize(ingredients)
        return torch.tensor(src), self.data.iloc[idx]['NER']  # keep original NER string for comparison

def test_collate_fn(batch):
    src_batch, original_ner = zip(*batch)
    src_batch = pad_sequence(src_batch, batch_first=True, padding_value=ingredient_vocab.stoi['<pad>'])
    return src_batch, original_ner

test_loader = DataLoader(TestRecipeDataset(testdf), batch_size=32, shuffle=False, collate_fn=test_collate_fn)

# Inference function
def infer(model, loader, max_len=50):
    loaded_model.eval()
    predictions = []
    with torch.no_grad():
        for src, original_ner in tqdm(loader):
            src = src.to(device)
            batch_size = src.size(0)
            hidden, cell = loaded_model.encoder(src)
            input = torch.tensor([ner_vocab.stoi['<sos>']] * batch_size).to(device)
            outputs = []

            for _ in range(max_len):
                output, hidden, cell = loaded_model.decoder(input, hidden, cell)
                top1 = output.argmax(1)
                outputs.append(top1)
                input = top1

            outputs = torch.stack(outputs, dim=1)  # [batch_size, max_len]
            decoded = [
                " ".join(ner_vocab.denumericalize([token.item() for token in seq if token.item() not in [
                    ner_vocab.stoi['<pad>'], ner_vocab.stoi['<sos>'], ner_vocab.stoi['<eos>']]]))
                for seq in outputs
            ]
            predictions.extend(decoded)
    return predictions

# Run inference
predicted_ner = infer(loaded_model, test_loader)

# Combine with original data for inspection
testdf = testdf.reset_index(drop=True)
testdf['Predicted_NER'] = predicted_ner

# Show a few results
print(testdf[['ingredients', 'NER', 'Predicted_NER']].head(10))


100%|██████████| 3125/3125 [02:27<00:00, 21.18it/s]

                                         ingredients  \
0  1 egg white unbeaten 14 tsp cream of tartar 34...   
1  5 12 c powdered milk 3 oz nondairy creamer 1 l...   
2  2 lb beef tips cut into 1inch cubes 1 pkg onio...   
3  12 c vegetable oil 14 c red wine vinegar 12 ts...   
4  1 lb ground beef 40 oz spaghetti sauce 8 oz la...   
5  23 c parsley leaves 1 small onion peeled 2 lb ...   
6  1 envelope unflavored gelatin 34 c firmly pack...   
7  2 lb hamburger 12 c onions 2 tbsp vinegar 2 tb...   
8  1 12 c crisco 2 12 c sugar 4 eggs 3 12 c plain...   
9  1 lb hamburger 12 c chopped onion 12 c chopped...   

                                                 NER  \
0    egg cream of tartar sugar vanilla boiling water   
1  powdered milk nondairy creamer salt powdered s...   
2  beef tips onion soup mix cream of mushroom sou...   
3  vegetable oil red wine vinegar salt pepper lem...   
4  ground beef spaghetti sauce lasagna noodles ri...   
5  parsley onion lean pork shoulder pork fat sa


