## Import Required Libraries

In [26]:
import re
import pandas as pd
import random
import spacy
from spacy.util import minibatch, compounding
from sklearn.model_selection import train_test_split
from spacy.training import Example
from spacy.tokens import Doc
from time import time

## Tagging Data
Format: ("Ingredient_Phrase", {"entities": [(start, end, "Ingredient"), (start, end,"Quantity"), (start, end, "Unit")]})
where start and end are starting and ending indexes respectively. 

In [27]:
tagged_entities = []
def tag_input(input_str):
    # Define the regex pattern
    pattern = r'(\S+)\s*(\d+(?:\.\d+)?)\s*(\S+)'

    # Extract the matches using the pattern
    matches = re.findall(pattern, input_str)

    # Create a list of tuples with the tagged entities
    for match in matches:
        ingredient = match[0]
        quantity = match[1]
        unit = match[2]
        entity = (input_str, {"entities": [(0, re.search('\d', input_str).start()-1, "Ingredient"), 
                                           (re.search('\d', input_str).start(), re.search('\d', input_str).end(), "Quantity"), 
                                           (re.search('\d', input_str).end()+1, len(input_str), "Unit")]})
        tagged_entities.append(entity)
#     return tagged_entities

In [28]:
df = pd.read_excel("urdu-recipes.xlsx")
lst_ing_phrases = []
for index, row in df.iterrows():
    ing_name = row['ing_name']
    ing_quantity = row['Ing_quantity']
    ing_unit = row['Ing_unit']
    concatenated = str(ing_name) +" "+ str(ing_quantity) + " " +str(ing_unit)
#     print(concatenated)
    remove_nan = concatenated.replace("nan", "")
#     print(remove_nan)
    lst_ing_phrases.append(remove_nan)
for i in lst_ing_phrases:
    tag_input(i) 

### Let's look at "Tagged Entities"
tagged_entities = [
("ادرک لہسن کا پیسٹ 1 کھانے کا چمچہ", {"entities": [(0, 13, "Ingredient"), (14, 15, "Quantity"), (16, 23, "Unit")]}),
("ثابت لال مرچ 6 عدد", {"entities": [(0, 9, "Ingredient"), (10, 11, "Quantity"), (12, 15, "Unit")]}),
("ہری مرچ 3 عدد", {"entities": [(0, 7, "Ingredient"), (8, 9, "Quantity"), (10, 13, "Unit")]}),
("لہسن کے جوے آدھا کلو", {"entities": [(0, 11, "Ingredient"), (12, 13, "Quantity"), (14, 16, "Unit")]}),
("فِش فلے 10 عدد", {"entities": [(0, 7, "Ingredient"), (8, 10, "Quantity"), (11, 14, "Unit")]})]

## Shuffle and Split Data 

In [29]:
# Shuffle the data randomly
random.shuffle(tagged_entities)

# Split the data into train and test sets
train_data, test_data = train_test_split(tagged_entities, test_size=0.2, random_state=42)

In [30]:
len(train_data)


775

In [31]:
len(test_data)

194

## Initialize NER Model

In [32]:
# Initialize a blank Urdu model
nlp = spacy.blank("ur")

# Create a new NER component
ner = nlp.create_pipe("ner")

# Add the labels to the NER component
labels = ["Quantity", "Unit", "Ingredient"]
for label in labels:
    ner.add_label(label)    
    
# Add the NER component to the pipeline
nlp.add_pipe('ner', last=True)

<spacy.pipeline.ner.EntityRecognizer at 0x20420065cf0>

## Train the Model

In [33]:
# Train the NER model
n_iter = 200
batch_size = 4
optimizer = nlp.begin_training()

start_time = time()
for i in range(n_iter):
    losses = {}
    random.shuffle(train_data)
    batches = minibatch(train_data, size=compounding(batch_size, 32, 0.01))
    for batch in batches:
        text, annotations = batch[0]
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example], drop=0.35, sgd=optimizer, losses=losses)
        epoch_loss = losses["ner"]
    print(f"Epoch {i+1} - Time taken: {time()-start_time:.2f} seconds")




Epoch 1 - Time taken: 0.03 seconds
Epoch 2 - Time taken: 0.10 seconds
Epoch 3 - Time taken: 0.13 seconds
Epoch 4 - Time taken: 0.17 seconds
Epoch 5 - Time taken: 0.22 seconds




Epoch 6 - Time taken: 0.29 seconds
Epoch 7 - Time taken: 0.33 seconds
Epoch 8 - Time taken: 0.38 seconds
Epoch 9 - Time taken: 0.44 seconds
Epoch 10 - Time taken: 0.49 seconds




Epoch 11 - Time taken: 0.54 seconds
Epoch 12 - Time taken: 0.59 seconds
Epoch 13 - Time taken: 0.64 seconds
Epoch 14 - Time taken: 0.71 seconds




Epoch 15 - Time taken: 0.78 seconds
Epoch 16 - Time taken: 0.84 seconds
Epoch 17 - Time taken: 0.90 seconds
Epoch 18 - Time taken: 0.95 seconds
Epoch 19 - Time taken: 1.01 seconds
Epoch 20 - Time taken: 1.05 seconds
Epoch 21 - Time taken: 1.10 seconds
Epoch 22 - Time taken: 1.15 seconds
Epoch 23 - Time taken: 1.18 seconds




Epoch 24 - Time taken: 1.24 seconds
Epoch 25 - Time taken: 1.30 seconds
Epoch 26 - Time taken: 1.36 seconds
Epoch 27 - Time taken: 1.40 seconds
Epoch 28 - Time taken: 1.45 seconds
Epoch 29 - Time taken: 1.52 seconds
Epoch 30 - Time taken: 1.57 seconds
Epoch 31 - Time taken: 1.61 seconds
Epoch 32 - Time taken: 1.67 seconds
Epoch 33 - Time taken: 1.72 seconds
Epoch 34 - Time taken: 1.76 seconds
Epoch 35 - Time taken: 1.81 seconds
Epoch 36 - Time taken: 1.85 seconds
Epoch 37 - Time taken: 1.91 seconds
Epoch 38 - Time taken: 1.95 seconds
Epoch 39 - Time taken: 2.00 seconds
Epoch 40 - Time taken: 2.05 seconds
Epoch 41 - Time taken: 2.09 seconds
Epoch 42 - Time taken: 2.14 seconds




Epoch 43 - Time taken: 2.20 seconds
Epoch 44 - Time taken: 2.25 seconds
Epoch 45 - Time taken: 2.30 seconds
Epoch 46 - Time taken: 2.34 seconds
Epoch 47 - Time taken: 2.38 seconds




Epoch 48 - Time taken: 2.43 seconds
Epoch 49 - Time taken: 2.50 seconds
Epoch 50 - Time taken: 2.54 seconds
Epoch 51 - Time taken: 2.59 seconds




Epoch 52 - Time taken: 2.65 seconds
Epoch 53 - Time taken: 2.71 seconds
Epoch 54 - Time taken: 2.76 seconds
Epoch 55 - Time taken: 2.80 seconds
Epoch 56 - Time taken: 2.85 seconds
Epoch 57 - Time taken: 2.91 seconds
Epoch 58 - Time taken: 2.96 seconds
Epoch 59 - Time taken: 2.99 seconds
Epoch 60 - Time taken: 3.03 seconds
Epoch 61 - Time taken: 3.07 seconds
Epoch 62 - Time taken: 3.11 seconds




Epoch 63 - Time taken: 3.17 seconds
Epoch 64 - Time taken: 3.21 seconds
Epoch 65 - Time taken: 3.25 seconds
Epoch 66 - Time taken: 3.30 seconds
Epoch 67 - Time taken: 3.34 seconds
Epoch 68 - Time taken: 3.40 seconds
Epoch 69 - Time taken: 3.44 seconds
Epoch 70 - Time taken: 3.49 seconds
Epoch 71 - Time taken: 3.54 seconds
Epoch 72 - Time taken: 3.59 seconds




Epoch 73 - Time taken: 3.65 seconds
Epoch 74 - Time taken: 3.70 seconds
Epoch 75 - Time taken: 3.75 seconds
Epoch 76 - Time taken: 3.79 seconds
Epoch 77 - Time taken: 3.84 seconds




Epoch 78 - Time taken: 3.90 seconds
Epoch 79 - Time taken: 3.95 seconds
Epoch 80 - Time taken: 4.00 seconds
Epoch 81 - Time taken: 4.03 seconds
Epoch 82 - Time taken: 4.08 seconds




Epoch 83 - Time taken: 4.13 seconds
Epoch 84 - Time taken: 4.18 seconds
Epoch 85 - Time taken: 4.23 seconds
Epoch 86 - Time taken: 4.27 seconds
Epoch 87 - Time taken: 4.33 seconds




Epoch 88 - Time taken: 4.40 seconds
Epoch 89 - Time taken: 4.45 seconds
Epoch 90 - Time taken: 4.50 seconds
Epoch 91 - Time taken: 4.55 seconds
Epoch 92 - Time taken: 4.59 seconds




Epoch 93 - Time taken: 4.63 seconds
Epoch 94 - Time taken: 4.68 seconds
Epoch 95 - Time taken: 4.73 seconds
Epoch 96 - Time taken: 4.77 seconds
Epoch 97 - Time taken: 4.80 seconds




Epoch 98 - Time taken: 4.86 seconds
Epoch 99 - Time taken: 4.91 seconds
Epoch 100 - Time taken: 4.98 seconds
Epoch 101 - Time taken: 5.03 seconds
Epoch 102 - Time taken: 5.09 seconds
Epoch 103 - Time taken: 5.14 seconds
Epoch 104 - Time taken: 5.18 seconds
Epoch 105 - Time taken: 5.22 seconds
Epoch 106 - Time taken: 5.26 seconds




Epoch 107 - Time taken: 5.32 seconds
Epoch 108 - Time taken: 5.39 seconds
Epoch 109 - Time taken: 5.44 seconds
Epoch 110 - Time taken: 5.50 seconds
Epoch 111 - Time taken: 5.55 seconds
Epoch 112 - Time taken: 5.60 seconds
Epoch 113 - Time taken: 5.65 seconds
Epoch 114 - Time taken: 5.70 seconds
Epoch 115 - Time taken: 5.75 seconds




Epoch 116 - Time taken: 5.79 seconds
Epoch 117 - Time taken: 5.84 seconds
Epoch 118 - Time taken: 5.88 seconds
Epoch 119 - Time taken: 5.93 seconds
Epoch 120 - Time taken: 5.99 seconds
Epoch 121 - Time taken: 6.05 seconds
Epoch 122 - Time taken: 6.09 seconds
Epoch 123 - Time taken: 6.15 seconds
Epoch 124 - Time taken: 6.19 seconds
Epoch 125 - Time taken: 6.24 seconds




Epoch 126 - Time taken: 6.30 seconds
Epoch 127 - Time taken: 6.35 seconds
Epoch 128 - Time taken: 6.41 seconds
Epoch 129 - Time taken: 6.48 seconds
Epoch 130 - Time taken: 6.53 seconds
Epoch 131 - Time taken: 6.59 seconds
Epoch 132 - Time taken: 6.65 seconds
Epoch 133 - Time taken: 6.71 seconds
Epoch 134 - Time taken: 6.77 seconds
Epoch 135 - Time taken: 6.82 seconds
Epoch 136 - Time taken: 6.86 seconds
Epoch 137 - Time taken: 6.92 seconds
Epoch 138 - Time taken: 6.96 seconds
Epoch 139 - Time taken: 7.01 seconds
Epoch 140 - Time taken: 7.07 seconds
Epoch 141 - Time taken: 7.12 seconds
Epoch 142 - Time taken: 7.15 seconds
Epoch 143 - Time taken: 7.20 seconds
Epoch 144 - Time taken: 7.25 seconds
Epoch 145 - Time taken: 7.29 seconds
Epoch 146 - Time taken: 7.36 seconds
Epoch 147 - Time taken: 7.40 seconds
Epoch 148 - Time taken: 7.45 seconds




Epoch 149 - Time taken: 7.52 seconds
Epoch 150 - Time taken: 7.60 seconds
Epoch 151 - Time taken: 7.67 seconds
Epoch 152 - Time taken: 7.73 seconds
Epoch 153 - Time taken: 7.79 seconds
Epoch 154 - Time taken: 7.88 seconds
Epoch 155 - Time taken: 7.93 seconds
Epoch 156 - Time taken: 8.00 seconds
Epoch 157 - Time taken: 8.05 seconds
Epoch 158 - Time taken: 8.10 seconds
Epoch 159 - Time taken: 8.16 seconds
Epoch 160 - Time taken: 8.21 seconds




Epoch 161 - Time taken: 8.27 seconds
Epoch 162 - Time taken: 8.33 seconds
Epoch 163 - Time taken: 8.38 seconds
Epoch 164 - Time taken: 8.42 seconds
Epoch 165 - Time taken: 8.50 seconds
Epoch 166 - Time taken: 8.55 seconds
Epoch 167 - Time taken: 8.62 seconds
Epoch 168 - Time taken: 8.67 seconds




Epoch 169 - Time taken: 8.74 seconds
Epoch 170 - Time taken: 8.80 seconds
Epoch 171 - Time taken: 8.85 seconds
Epoch 172 - Time taken: 8.90 seconds
Epoch 173 - Time taken: 8.95 seconds




Epoch 174 - Time taken: 9.03 seconds
Epoch 175 - Time taken: 9.08 seconds
Epoch 176 - Time taken: 9.14 seconds
Epoch 177 - Time taken: 9.19 seconds
Epoch 178 - Time taken: 9.24 seconds
Epoch 179 - Time taken: 9.32 seconds
Epoch 180 - Time taken: 9.37 seconds
Epoch 181 - Time taken: 9.43 seconds
Epoch 182 - Time taken: 9.49 seconds
Epoch 183 - Time taken: 9.55 seconds
Epoch 184 - Time taken: 9.62 seconds
Epoch 185 - Time taken: 9.67 seconds
Epoch 186 - Time taken: 9.72 seconds
Epoch 187 - Time taken: 9.76 seconds
Epoch 188 - Time taken: 9.83 seconds
Epoch 189 - Time taken: 9.88 seconds
Epoch 190 - Time taken: 9.93 seconds
Epoch 191 - Time taken: 9.98 seconds




Epoch 192 - Time taken: 10.05 seconds
Epoch 193 - Time taken: 10.09 seconds
Epoch 194 - Time taken: 10.18 seconds
Epoch 195 - Time taken: 10.25 seconds




Epoch 196 - Time taken: 10.32 seconds
Epoch 197 - Time taken: 10.39 seconds
Epoch 198 - Time taken: 10.46 seconds
Epoch 199 - Time taken: 10.54 seconds
Epoch 200 - Time taken: 10.59 seconds


## Test the Model 

In [34]:
# Test the NER model on the test data
for text, annotations in test_data:
    doc = nlp(text)
    print("Text:", text)
    print("Entities:", [(ent.text, ent.label_) for ent in doc.ents])
    print("")
# Save the model to disk
nlp.to_disk("urdu_ner_model")

Text: کٹی کالی مرچ 1 چائے کا چمچہ
Entities: [('کٹی کالی مرچ', 'Ingredient'), ('1', 'Quantity'), ('چائے کا چمچہ', 'Unit')]

Text: ہلدی 1 چائے کا چمچہ
Entities: [('ہلدی', 'Ingredient'), ('1', 'Quantity'), ('چائے کا چمچہ', 'Unit')]

Text: مکھن 2  کھانے کا چمچہ
Entities: [('مکھن', 'Ingredient'), ('2', 'Quantity')]

Text: کٹا زیرہ 1 چائے کا چمچہ
Entities: [('کٹا زیرہ', 'Ingredient'), ('1', 'Quantity'), ('چائے کا چمچہ', 'Unit')]

Text: بھنا ہوا زیرہ 2 چائے کا چمچہ
Entities: [('بھنا ہوا زیرہ', 'Ingredient'), ('2', 'Quantity'), ('چائے کا چمچہ', 'Unit')]

Text: لال مرچ 2 چائے کا چمچہ
Entities: [('لال مرچ', 'Ingredient'), ('2', 'Quantity'), ('چائے کا چمچہ', 'Unit')]

Text: دہی 250 گرام
Entities: [('دہی', 'Ingredient'), ('250', 'Quantity'), ('گرام', 'Ingredient')]

Text: ثابت لال مرچ  5 عدد
Entities: [('ثابت لال مرچ', 'Ingredient'), ('5', 'Quantity'), ('عدد', 'Unit')]

Text: ہرا دھنیا 0.25 گٹھی
Entities: [('ہرا دھنیا', 'Ingredient'), ('0.25', 'Quantity'), ('گٹھی', 'Unit')]

Text: بھنی پسی ثابت گو

Text: پانی 2 جگ
Entities: [('پانی', 'Ingredient'), ('2', 'Quantity'), ('جگ', 'Unit')]

Text: چھلے اور اُبلے مٹر 1 پیالی
Entities: [('چھلے اور', 'Ingredient'), ('اُبلے مٹر', 'Ingredient'), ('1', 'Quantity'), ('پیالی', 'Unit')]

Text: پساگرم مصالحہ 1 چائے کا چمچہ
Entities: [('پساگرم مصالحہ', 'Ingredient'), ('1', 'Quantity'), ('چائے کا چمچہ', 'Unit')]

Text: چاول  1 کلو
Entities: [('چاول  ', 'Ingredient'), ('1', 'Quantity'), ('کلو', 'Unit')]

Text: بسکٹ 1 پیکٹ
Entities: [('بسکٹ', 'Ingredient'), ('1', 'Quantity'), ('پیکٹ', 'Unit')]

Text: مرغی کا قیمہ 125 گرام
Entities: [('مرغی کا قیمہ', 'Ingredient'), ('125', 'Quantity'), ('گرام', 'Unit')]

Text: ہرا دھنیا 1 گٹھی
Entities: [('ہرا دھنیا', 'Ingredient'), ('1', 'Quantity'), ('گٹھی', 'Unit')]

Text: گائے کے گوشت کی بوٹیاں 1 کلو
Entities: [('گائے کے گوشت کی بوٹیاں', 'Ingredient'), ('1', 'Quantity'), ('کلو', 'Unit')]

Text: پسی لال مرچ 1 کھانے کا چمچہ
Entities: [('پسی لال مرچ', 'Ingredient'), ('1', 'Quantity'), ('کھانے کا چمچہ', 'Unit')]

Text:

### Let's look at a random test case.

In [35]:
doc = nlp('چنار کے پھول 1 کپ')
print("Text:", 'چنار کے پھول 1 کپ')
print("Entities:", [(ent.text, ent.label_) for ent in doc.ents])
print("")


Text: چنار کے پھول 1 کپ
Entities: [('چنار کے پھول', 'Ingredient'), ('1', 'Quantity'), ('کپ', 'Unit')]



## Evalution
### Metrics
* Precision
* Recall
* F1Score


In [36]:
# Function to evaluate the model on a test set
def evaluate_model(test_data):
    # Initialize counters
    total_true = 0
    total_predicted = 0
    total_correct = 0
    
    # Iterate over the test data
    for text, annotations in test_data:
        # Get the true entities
        true_entities = set([(ent[0], ent[1], ent[2]) for ent in annotations["entities"]])
        
        
        # Get the predicted entities
        doc = nlp(text)
        predicted_entities = set([(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents])
        
        # Update the counters
        total_true += len(true_entities)
        total_predicted += len(predicted_entities)
        total_correct += len(true_entities.intersection(predicted_entities))
    
    # Calculate precision, recall, and F1 score
    precision = total_correct / total_predicted if total_predicted > 0 else 0.0
    recall = total_correct / total_true if total_true > 0 else 0.0
    f1_score = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
    
    # Print the evaluation metrics
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 score: {f1_score:.2f}")

# Evaluate the model on the test data
evaluate_model(test_data)

Precision: 0.86
Recall: 0.84
F1 score: 0.85


### Conclusion: 
Above are the evaluation metrics of our NER model. The precision of the model is 0.86, which means that out of all the predicted entities, 86% were correct. The recall of the model is 0.84, which means that out of all the actual entities, the model was able to correctly identify 84% of them. The F1 score is 0.85, which is the harmonic mean of precision and recall and is a measure of the overall performance of the model which is indicating better performance.