In [1]:
import pandas as pd
import numpy as np
import spacy
import re

from sklearn.model_selection import train_test_split
from spacy.training.example import Example
from spacy.tokens import Doc
from tqdm import tqdm

In [2]:
# Load the labelled training rows and the unlabelled test tokens.
df, test_df = (
    pd.read_csv("data/train.csv"),
    pd.read_csv("data/test.csv"),
)

In [3]:
# Preview the first rows: each row holds a space-separated token string and a
# space-separated label string.
df.head()

Unnamed: 0,tokens,labels
0,4 cloves garlic 2 cups cooked corned beef -LRB...,3 7 1 3 7 5 5 1 2 2 2 2
1,"2 tablespoons vegetable oil , divided 1 1/2 cu...",3 7 1 1 2 5 3 7 5 1
2,2 tablespoons dried marjoram 3 tablespoons pac...,3 7 0 1 3 7 5 1 1
3,"1 large red onion , 1/4-inch slices pulled int...",3 4 1 1 2 2 2 2 2 2 3 7 1 2 5
4,"2 jalapeno peppers , seeded and minced 1/2 - 3...",3 1 1 2 5 2 5 3 2 2 7 5 1 1


There are some non-printable characters (non-breaking spaces, `\xa0`) in the text. We have to remove them.

In [4]:
# Show the raw token string of row 1 so that hidden characters become visible.
df.loc[1, "tokens"]

'2 tablespoons vegetable oil , divided 1\xa01/2 cups chopped pecans'

In [5]:
# Select every row whose token string contains a non-breaking space.
df.loc[df["tokens"].str.contains("\xa0")]

Unnamed: 0,tokens,labels
1,"2 tablespoons vegetable oil , divided 1 1/2 cu...",3 7 1 1 2 5 3 7 5 1
43,1 1/2 teaspoons red pepper flakes 2 teaspoons ...,3 7 1 1 1 3 7 0 1 2 5
48,8 ounces sliced Swiss cheese 1 1/2 teaspoons c...,3 7 5 1 1 3 7 5 0 1 2 2 2 2 2 2
62,1 1/2 pounds lean ground beef 1 cup fresh gold...,3 7 5 5 1 3 7 0 1 1 2 2 2 2 1
67,"1 1/2 tablespoons arrowroot powder , or as des...",3 7 1 1 2 2 2 2 3 1 1 2 5 2 2 2
...,...,...
5111,1 1/2 cups boiling water,3 7 5 1
5115,2 1/2 teaspoons baking powder,3 7 1 1
5118,1 1/4 ounces old el paso taco spicy taco seaso...,3 7 2 2 2 1 2 1 1
5134,1 1/2 teaspoons baking soda,3 7 1 1


The number in front of this character is also unnecessary, so it must be removed as well.

In [6]:
def clear_num_char(__text: str):
    """Drop every number that is directly followed by a non-breaking space (U+00A0).

    The number and the non-breaking space are removed together, so
    "1<nbsp>1/2 cups" becomes "1/2 cups". Non-breaking spaces that do not follow
    a digit are left untouched.

    Args:
        __text: Token string to clean.

    Returns:
        The cleaned string.
    """
    # `\d+` rather than `\d`: a single-digit pattern would strip only the last
    # digit of a multi-digit number and turn "12<nbsp>1/2" into "11/2".
    return re.sub(r"\d+\xa0", "", __text)

In [7]:
# Clean every token string element-wise.
df["tokens"] = df["tokens"].map(clear_num_char)

In [8]:
# Preview the first rows again to confirm the effect of the cleaning step.
df.head()

Unnamed: 0,tokens,labels
0,4 cloves garlic 2 cups cooked corned beef -LRB...,3 7 1 3 7 5 5 1 2 2 2 2
1,"2 tablespoons vegetable oil , divided 1/2 cups...",3 7 1 1 2 5 3 7 5 1
2,2 tablespoons dried marjoram 3 tablespoons pac...,3 7 0 1 3 7 5 1 1
3,"1 large red onion , 1/4-inch slices pulled int...",3 4 1 1 2 2 2 2 2 2 3 7 1 2 5
4,"2 jalapeno peppers , seeded and minced 1/2 - 3...",3 1 1 2 5 2 5 3 2 2 7 5 1 1


In [9]:
# List the rows that still contain a non-breaking space.
df.loc[df["tokens"].str.contains("\xa0")]

Unnamed: 0,tokens,labels
3823,"saffron , alioli see ""<a href=""""https://www.ge...",1 2 1 2 2 2 2 2
4842,"1 teaspoon ras el hanout spice mix , ""<a href=...",3 7 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2
4847,"1/2 cup enchilada sauce , I use Texas Red Ench...",3 7 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2


In [10]:
# Compare the token count of row 3823 with its label count: first exactly,
# then allowing for one surplus token.
print(len(df.loc[3823, "tokens"].split()) == len(df.loc[3823, "labels"].split()))
print(len(df.loc[3823, "tokens"].split()) - 1 == len(df.loc[3823, "labels"].split()))

False
True


In [11]:
# Show how row 3823 splits on whitespace.
df.loc[3823, "tokens"].split()

['saffron',
 ',',
 'alioli',
 'see',
 '"<a',
 'href=""https://www.geniuskitchen.com/recipe/saffron-alioli-369693"">"',
 'Saffron',
 'Alioli',
 '</a>']

Here the `<a` and `href` parts of the link must end up in a single token (`<ahref`), so I'll simply remove the character.

In [12]:
def fix_links(__text: str):
    """Return ``__text`` with every non-breaking space (U+00A0) removed."""
    # Splitting on the character and re-joining the pieces deletes each occurrence.
    return "".join(__text.split("\xa0"))
# Strip the remaining non-breaking spaces from every token string.
df["tokens"] = df["tokens"].map(fix_links)

In [13]:
# Preview the first rows once more after the second cleaning pass.
df.head()

Unnamed: 0,tokens,labels
0,4 cloves garlic 2 cups cooked corned beef -LRB...,3 7 1 3 7 5 5 1 2 2 2 2
1,"2 tablespoons vegetable oil , divided 1/2 cups...",3 7 1 1 2 5 3 7 5 1
2,2 tablespoons dried marjoram 3 tablespoons pac...,3 7 0 1 3 7 5 1 1
3,"1 large red onion , 1/4-inch slices pulled int...",3 4 1 1 2 2 2 2 2 2 3 7 1 2 5
4,"2 jalapeno peppers , seeded and minced 1/2 - 3...",3 1 1 2 5 2 5 3 2 2 7 5 1 1


In [14]:
# Sanity check: this selection should now be empty.
df.loc[df["tokens"].str.contains("\xa0")]

Unnamed: 0,tokens,labels


Now I'm going to preprocess the data

In [15]:
# Hold out 20% of the rows for validation. The fixed seed puts the same rows in
# each split on every run, so results stay comparable after a kernel restart.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [16]:
# Flat, token-level sequences: one entry per token (x) and one per label (y).
train_x, train_y, val_x, val_y = [], [], [], []


def train_connect_text(__text: str):
    """Append the whitespace-separated tokens of ``__text`` to ``train_x``."""
    train_x.extend(__text.split())


def train_connect_labels(__text: str):
    """Append the whitespace-separated labels of ``__text`` to ``train_y``."""
    train_y.extend(__text.split())


def val_connect_text(__text: str):
    """Append the whitespace-separated tokens of ``__text`` to ``val_x``."""
    val_x.extend(__text.split())


def val_connect_labels(__text: str):
    """Append the whitespace-separated labels of ``__text`` to ``val_y``."""
    val_y.extend(__text.split())


def _misaligned_rows(frame):
    """Return the index labels of rows whose token and label counts differ."""
    return [
        row_id
        for row_id, tokens, labels in zip(frame.index, frame["tokens"], frame["labels"])
        if len(tokens.split()) != len(labels.split())
    ]


# The flat lists are paired up by position later on, so a single row with a
# surplus or missing token would shift every label that follows it. Fail loudly
# instead of training on silently misaligned data.
for _split_name, _split in (("train", train_df), ("validation", val_df)):
    _bad_rows = _misaligned_rows(_split)
    assert not _bad_rows, (
        f"{_split_name} rows with a token/label count mismatch: {_bad_rows[:10]}"
    )

# Plain loops rather than ``Series.apply``: the helpers are called only for
# their side effect, so there is no result Series worth building.
for _text in train_df["tokens"]:
    train_connect_text(_text)
for _text in train_df["labels"]:
    train_connect_labels(_text)
for _text in val_df["tokens"]:
    val_connect_text(_text)
for _text in val_df["labels"]:
    val_connect_labels(_text)

print(train_x[:5])
print(train_y[:5])
print(val_x[:5])
print(val_y[:5])

['2', 'cans', 'vegetable', 'soup', '-LRB-']
['3', '7', '1', '1', '2']
['1', 'pinch', 'powdered', 'sugar', '2']
['3', '7', '5', '1', '3']


In [17]:
def label_to_annotation(x: list[str], y: list[str]):
    """Convert the labels in ``y`` in place into ``(start, end, label)`` tuples.

    Each label ``y[i]`` is replaced by ``(0, len(x[i]), y[i])``, i.e. a span that
    covers the whole token ``x[i]``.

    Entries that are already tuples are left untouched, so running the
    conversion a second time (for example by re-executing the notebook cell)
    does not wrap annotations inside annotations.

    Args:
        x: Tokens, one string per entry.
        y: Labels aligned with ``x``; modified in place.

    Returns:
        None. ``y`` is updated in place.

    Raises:
        IndexError: If ``x`` is shorter than ``y``.
    """
    for i, label in enumerate(y):
        if isinstance(label, tuple):
            # Already converted by an earlier call.
            continue
        y[i] = (0, len(x[i]), label)

# Turn the labels of both splits into span annotations (in place).
for _tokens, _labels in ((train_x, train_y), (val_x, val_y)):
    label_to_annotation(_tokens, _labels)

# Preview the first five entries of each sequence.
for _preview in (train_x, train_y, val_x, val_y):
    print(_preview[:5])

['2', 'cans', 'vegetable', 'soup', '-LRB-']
[(0, 1, '3'), (0, 4, '7'), (0, 9, '1'), (0, 4, '1'), (0, 5, '2')]
['1', 'pinch', 'powdered', 'sugar', '2']
[(0, 1, '3'), (0, 5, '7'), (0, 8, '5'), (0, 5, '1'), (0, 1, '3')]


Creating batch function and then I'll create the model

In [18]:
def batch(__x: list[str], __y: list, __batch_size):
    """Yield shuffled mini-batches of ``(x, y)`` pairs.

    The two sequences are zipped into pairs, the pairs are shuffled with
    ``np.random.shuffle`` (so the order depends on NumPy's global random
    state), and consecutive slices of at most ``__batch_size`` pairs are
    yielded. The last batch may be shorter.

    Args:
        __x: Inputs.
        __y: Targets aligned with ``__x``; pairs beyond the shorter sequence
            are dropped by ``zip``.
        __batch_size: Maximum number of pairs per batch; must be at least 1.

    Yields:
        Lists of ``(x, y)`` tuples.

    Raises:
        ValueError: If ``__batch_size`` is smaller than 1 (raised when the
            generator is first advanced).
    """
    if __batch_size < 1:
        raise ValueError(f"batch size must be a positive integer, got {__batch_size!r}")
    _data = list(zip(__x, __y))
    np.random.shuffle(_data)
    for _start in range(0, len(_data), __batch_size):
        yield _data[_start:_start + __batch_size]

Time to create a model

In [41]:
# Start from a blank English pipeline and add a fresh "ner" component to it.
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")

# Register the label names "0" through "7" with the NER component.
for label in range(8):
    ner.add_label(str(label))

In [20]:
def train(model, epochs: int, batch_size: int, drop: float = 0.5, output_dir="models"):
    """Train ``model`` on the global ``train_x`` / ``train_y`` sequences and save it.

    For every epoch the training pairs are reshuffled and split into batches by
    ``batch``; each ``(text, annotation)`` pair becomes one ``Example`` whose
    only entity is ``annotation``. The losses collected by ``model.update``
    during the epoch are printed once the epoch is finished.

    Args:
        model: Pipeline to train; it is (re-)initialised by this function.
        epochs: Number of passes over the training data.
        batch_size: Number of examples per update step.
        drop: Dropout rate forwarded to ``model.update``.
        output_dir: Directory the trained pipeline is written to after the
            last epoch. Pass ``None`` to skip saving.

    Returns:
        None.
    """
    optimizer = model.initialize()
    for epoch in range(epochs):
        print(f"Epoch {epoch}:")
        losses = {}
        for data in batch(train_x, train_y, batch_size):
            examples = [
                Example.from_dict(model.make_doc(text), {"entities": [annotation]})
                for text, annotation in data
            ]
            model.update(examples, drop=drop, losses=losses, sgd=optimizer)
        print("Losses:", losses)
    # Save only after training. A module-level ``to_disk`` call next to the
    # definition would write the still-untrained pipeline when the cell is run.
    if output_dir is not None:
        model.to_disk(output_dir)

In [21]:
def evaluate_model(model, tokens=None, annotations=None):
    """Return the fraction of tokens whose first predicted entity label is correct.

    Args:
        model: Callable that maps a text to an object with an ``ents`` sequence
            whose items expose ``label_``.
        tokens: Texts to evaluate; defaults to the global ``val_x``.
        annotations: ``(start, end, label)`` tuples aligned with ``tokens``;
            defaults to the global ``val_y``.

    Returns:
        Accuracy in ``[0, 1]``. A token for which the model predicts no entity
        counts as wrong, and an empty evaluation set yields ``0.0``.
    """
    if tokens is None:
        tokens = val_x
    if annotations is None:
        annotations = val_y

    correct = 0
    total = 0
    # Accuracy does not depend on order, so iterate directly; no shuffling needed.
    for text, annotation in zip(tokens, annotations):
        ents = model(text).ents
        # Guard against an empty prediction instead of raising IndexError.
        if ents and ents[0].label_ == annotation[2]:
            correct += 1
        total += 1
    return correct / total if total else 0.0

In [42]:
# Train the pipeline for 20 epochs with a batch size of 64.
train(nlp, 20, 64)

Epoch 0:
Losses: {'ner': 11067.769801578968}
Epoch 1:
Losses: {'ner': 5665.237514648402}
Epoch 2:
Losses: {'ner': 4833.058099232272}
Epoch 3:
Losses: {'ner': 4456.385325006955}
Epoch 4:
Losses: {'ner': 4225.9804881538585}
Epoch 5:
Losses: {'ner': 4023.7435695889008}
Epoch 6:
Losses: {'ner': 3884.497302424301}
Epoch 7:
Losses: {'ner': 3638.856255480705}
Epoch 8:
Losses: {'ner': 3676.1573499465394}
Epoch 9:
Losses: {'ner': 3593.01719938979}
Epoch 10:
Losses: {'ner': 3423.6866722714294}
Epoch 11:
Losses: {'ner': 3422.1707415789206}
Epoch 12:
Losses: {'ner': 3331.6046294342345}
Epoch 13:
Losses: {'ner': 3291.706777409881}
Epoch 14:
Losses: {'ner': 3248.8432998341123}
Epoch 15:
Losses: {'ner': 3244.9766602895043}
Epoch 16:
Losses: {'ner': 3227.500167608355}
Epoch 17:
Losses: {'ner': 3163.457614370689}
Epoch 18:
Losses: {'ner': 3131.942563111891}
Epoch 19:
Losses: {'ner': 3114.943016557646}


In [43]:
# Score the trained pipeline on the validation tokens and report the result
# as a percentage rounded to two decimals.
accuracy = evaluate_model(nlp)
print(f"Model Accuracy: {round(accuracy * 100, 2)}%")

Model Accuracy: 92.47%


In [44]:
# Preview the test set: one token per row, identified by `id`.
test_df.head()

Unnamed: 0,id,token
0,0,1/2
1,1,large
2,2,sweet
3,3,red
4,4,onion


In [45]:
# Test tokens as a plain list, plus an empty container for the predictions.
test_x, test_y = test_df["token"].tolist(), []

In [46]:
# Rebuild the predictions from scratch so that re-running this cell can never
# append a second copy of them. `nlp.pipe` processes the tokens in batches and
# yields the docs in input order.
# NOTE: a token for which the model predicts no entity raises IndexError here.
test_y = [doc.ents[0].label_ for doc in nlp.pipe(test_x)]

In [48]:
# Pair every prediction with the id shipped in the test set instead of assuming
# that ids equal the row position. Building the frame also fails loudly when
# the number of predictions does not match the number of test rows.
submission = pd.DataFrame({"id": test_df["id"], "count": test_y})
submission.to_csv("submission.csv", index=False)