In [506]:
# Model name/path handed to `AutoTokenizer.from_pretrained` further down in this notebook.
TRAINING_MODEL_PATH = "microsoft/deberta-v3-base"

import json

# Load the training documents. The context manager closes the file handle as
# soon as parsing finishes, and the explicit encoding keeps the read independent
# of the platform's default locale.
with open("data/train.json", encoding="utf-8") as train_file:
    data = json.load(train_file)

print(len(data))
print(data[0].keys())

# Peek at the first document: its first ten tokens, their labels, and the
# per-token trailing-whitespace flags.
x = data[0]

print(x["tokens"][:10])
print(x["labels"][:10])
print(x["trailing_whitespace"][:10])

from itertools import chain

# Gather every distinct label that appears in any document, sorted so the
# label -> id assignment is stable from run to run.
all_labels = sorted(set(chain.from_iterable(doc["labels"] for doc in data)))

# Forward (label -> id) and reverse (id -> label) lookup tables.
label2id = {label: idx for idx, label in enumerate(all_labels)}
id2label = {idx: label for label, idx in label2id.items()}

id2label

6807
dict_keys(['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels'])
['Design', 'Thinking', 'for', 'innovation', 'reflexion', '-', 'Avril', '2021', '-', 'Nathalie']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NAME_STUDENT']
[True, True, True, True, False, False, True, False, False, True]


{0: 'B-EMAIL',
 1: 'B-ID_NUM',
 2: 'B-NAME_STUDENT',
 3: 'B-PHONE_NUM',
 4: 'B-STREET_ADDRESS',
 5: 'B-URL_PERSONAL',
 6: 'B-USERNAME',
 7: 'I-ID_NUM',
 8: 'I-NAME_STUDENT',
 9: 'I-PHONE_NUM',
 10: 'I-STREET_ADDRESS',
 11: 'I-URL_PERSONAL',
 12: 'O'}

In [495]:
# Number of records in the loaded training data.
len(data)

6807

In [496]:
# Labels that `tokenize` counts as targets (every tag listed here is non-"O").
target = [
    # "B-" tags
    "B-EMAIL",
    "B-ID_NUM",
    "B-NAME_STUDENT",
    "B-PHONE_NUM",
    "B-STREET_ADDRESS",
    "B-URL_PERSONAL",
    "B-USERNAME",
    # "I-" tags
    "I-ID_NUM",
    "I-NAME_STUDENT",
    "I-PHONE_NUM",
    "I-STREET_ADDRESS",
    "I-URL_PERSONAL",
]

In [497]:

from transformers import AutoTokenizer
import numpy as np

# Tokenizer loaded for TRAINING_MODEL_PATH; later cells pass it to `tokenize`.
tokenizer = AutoTokenizer.from_pretrained(TRAINING_MODEL_PATH)


def tokenize(example, tokenizer, label2id, max_length, target_labels=None):
    """Tokenize one document and map its per-token labels onto the tokenizer's tokens.

    The document text is rebuilt from ``example["tokens"]`` (adding a single
    space after each token whose ``trailing_whitespace`` flag is true), a label
    is recorded for every character, and each tokenizer token then takes the
    label of the character at its start offset.

    Args:
        example: Mapping with parallel sequences under ``"tokens"``,
            ``"labels"`` and ``"trailing_whitespace"``.
        tokenizer: Callable invoked as ``tokenizer(text,
            return_offsets_mapping=True, truncation=True, max_length=...)``.
            Its result must expose ``offset_mapping`` and ``input_ids``
            attributes and support ``**`` unpacking.
        label2id: Mapping from label string to integer id; must contain ``"O"``.
        max_length: Forwarded to the tokenizer as ``max_length``.
        target_labels: Optional container of labels counted in ``target_num``.
            When omitted, the notebook-level ``target`` list is used, which is
            what the function did before this parameter existed.

    Returns:
        A dict holding everything the tokenizer returned plus:
        ``"labels"`` (one label id per tokenizer token), ``"length"`` (number
        of input ids), ``"target_num"`` (how many source tokens carry a label in
        ``target_labels``) and ``"group"`` (1 if ``target_num > 0`` else 0).
    """
    if target_labels is None:
        # Backward-compatible default: fall back to the notebook global.
        target_labels = target

    pieces = []
    char_labels = []  # one label per character of the rebuilt text
    target_num = 0

    for t, l, ws in zip(example["tokens"], example["labels"], example["trailing_whitespace"]):
        pieces.append(t)
        char_labels.extend([l] * len(t))

        if l in target_labels:
            target_num += 1

        # The separating space after a token is always labelled "O".
        if ws:
            pieces.append(" ")
            char_labels.append("O")

    # Join once and reuse the same string for tokenization and offset lookups.
    text = "".join(pieces)
    tokenized = tokenizer(text, return_offsets_mapping=True, truncation=True, max_length=max_length)

    token_labels = []
    for start_idx, end_idx in tokenized.offset_mapping:

        # Tokens with an empty (0, 0) offset carry no source text; label them "O".
        # The original comment named [CLS]; presumably other special tokens
        # (e.g. [SEP]) land here as well.
        if start_idx == 0 and end_idx == 0:
            token_labels.append(label2id["O"])
            continue

        # When the token's span starts on whitespace, use the following
        # character's label instead. The bounds check keeps a whitespace
        # character at the very end of the text from indexing past the end.
        if text[start_idx].isspace() and start_idx + 1 < len(text):
            start_idx += 1

        token_labels.append(label2id[char_labels[start_idx]])

    length = len(tokenized.input_ids)

    return {
        **tokenized,
        "labels": token_labels,
        "length": length,
        "target_num": target_num,
        "group": 1 if target_num > 0 else 0,
    }



#id=0
#for entry in ds:
#    if entry['document'] == 13315:
#        print(id)
#        break
#    id+=1

# Tokenize one hand-picked document (index 2790) and show the source text, the
# decoded token ids and the per-token labels one after another.
sample = data[2790]
sample_tokenized = tokenize(sample, tokenizer, label2id, max_length=512)

print(sample_tokenized.keys())

# Turn the input ids back into a single string.
text = tokenizer.decode(sample_tokenized["input_ids"])

# Newlines are replaced by spaces so the source text prints on one line.
flattened_original = sample["full_text"].replace("\n", " ")
label_names = [id2label[label_id] for label_id in sample_tokenized["labels"]]

print("original: [CLS] " + flattened_original)
print("tokenize: " + text)
print("labels: " + " ".join(label_names))

# tokenizer.convert_ids_to_tokens(sample_tokenized["input_ids"])


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'labels', 'length', 'target_num', 'group'])
original: [CLS] Design Thinking for Innovation  Reflection  1. Challenge: To expand the distribution of Indian Sindhi papad, globally, to the areas  where the Indian diaspora lives.  A papad basically is a crisp, round flatbread from India. It is typically based on a  seasoned dough usually made from peeled black gram flour, either fried or cooked  with dry heat. And Sindhis are an ethno-linguistic group who speak the Sindhi  language. After the partition of India, most of them migrated to other parts of the  world. And Sindhi papad has a unique taste of its own. But it is usually restricted to  some of the Indian cities itself. Not everyone has the access to have the food items  in their households, especially those who stay thousands of kilometres away from  India.  2. Selection: The tool I’ve selected is Learning Launch. I selected it for the challenge  because I

In [498]:
#for i in range(100):
#    x=data[i]
#    print(i, sum([1 if x["labels"][i] != "O" else 0 for i in range(len(x["labels"]))]))

In [499]:
# Confirm that alignment is good

# run multiple times to see different rows
import random
import pandas as pd

# Build a table that puts the dataset's own tokens next to the tokenizer's
# tokens for one document, together with both sets of labels.
# NOTE(review): rows are paired purely by position. The tokenizer prepends
# [CLS] and may split words differently, so 'original' and 'decoded' on the
# same row are not necessarily the same piece of text.
max_length = 60

x = data[29]
print("keys", x.keys())
print("document id", x["document"])

original_token_list = x["tokens"][:max_length]

# Tokenize once and reuse the result for both the ids and the labels.
encoded = tokenize(x, tokenizer, label2id, max_length=max_length)
input_ids = encoded["input_ids"]
labels = encoded["labels"]

decoded_token_list = list(tokenizer.convert_ids_to_tokens(input_ids))

df = pd.DataFrame(
    list(zip(original_token_list, decoded_token_list)),
    columns=["original", "decoded"],
)

# zip() stops at the shorter of the two token lists, so trim the label columns
# to the frame's row count; otherwise the column assignment raises ValueError
# whenever the two lists differ in length.
n_rows = len(df)
df["tokenized_labels"] = [id2label[label_id] for label_id in labels[:n_rows]]
df["original_labels"] = x["labels"][:n_rows]

print(df["original_labels"].unique())
df

keys dict_keys(['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels'])
document id 609
['O' 'B-NAME_STUDENT' 'I-NAME_STUDENT' 'B-ID_NUM']


Unnamed: 0,original,decoded,tokenized_labels,original_labels
0,Date:14,[CLS],O,O
1,-,▁Date,O,O
2,09,:,O,O
3,-,14,O,O
4,2021,-,O,O
5,\n\n,09,O,O
6,NEWS,-,O,O
7,PAPER,2021,O,O
8,\n\n,▁NEWS,O,O
9,Project,▁PAPER,O,O


In [500]:
# Display the comparison frame built in the previous cell again.
df

Unnamed: 0,original,decoded,tokenized_labels,original_labels
0,Date:14,[CLS],O,O
1,-,▁Date,O,O
2,09,:,O,O
3,-,14,O,O
4,2021,-,O,O
5,\n\n,09,O,O
6,NEWS,-,O,O
7,PAPER,2021,O,O
8,\n\n,▁NEWS,O,O
9,Project,▁PAPER,O,O


In [501]:
# Confirm that alignment is good

# run multiple times to see different rows
import random

import pandas as pd

# Work on a slice under its own name. Reassigning `data` here made the cell
# non-idempotent (a second run sliced the 10-element slice down to an empty
# list) and broke earlier cells that index into the full dataset.
alignment_docs = data[25:35]

for x in alignment_docs:
    print("keys", x.keys())
    print("document id", x["document"])

    # Source view: every dataset token whose provided label is not "O".
    for single_token, single_label in zip(x["tokens"], x["labels"]):
        if single_label != "O":
            print((single_token, single_label))

    print("*"*100)

    # Tokenize once and reuse the result for both the ids and the labels.
    encoded = tokenize(x, tokenizer, label2id, max_length=512)
    input_ids = encoded['input_ids']
    labels = encoded['labels']
    print("input_ids", input_ids)

    # Tokenizer view: every tokenizer token whose mapped label is not "O".
    for single_token, single_label in zip(tokenizer.convert_ids_to_tokens(input_ids), labels):
        if id2label[single_label] != "O":
            print((single_token, id2label[single_label]))

    print("\n"*5)

keys dict_keys(['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels'])
document id 470
('Daniel', 'B-NAME_STUDENT')
****************************************************************************************************
input_ids [1, 4113, 45730, 377, 4735, 16508, 6738, 5365, 77529, 268, 300, 273, 276, 358, 266, 1034, 265, 312, 1416, 276, 268, 5950, 2049, 2307, 263, 316, 1719, 269, 264, 1300, 316, 2155, 265, 1117, 2422, 267, 556, 264, 286, 1761, 380, 263, 5645, 270, 1779, 264, 262, 1099, 260, 581, 1117, 2475, 303, 266, 813, 265, 3248, 261, 399, 301, 3355, 275, 266, 3856, 265, 1157, 277, 266, 12962, 1599, 261, 275, 2407, 475, 6100, 260, 279, 374, 269, 1466, 261, 301, 516, 676, 266, 1327, 264, 993, 316, 1117, 5384, 583, 266, 996, 326, 261, 385, 262, 513, 265, 262, 3983, 301, 286, 637, 2784, 270, 3275, 267, 262, 410, 628, 384, 260, 14700, 581, 511, 303, 17699, 656, 1028, 478, 347, 587, 1291, 1415, 260, 345, 281, 615, 264, 2062, 266, 310, 15788, 263, 1287, 1779, 270, 462, 2510, 

In [502]:

import matplotlib.pyplot as plt


# Histogram (100 bins) of the "length" values in `ds`.
# NOTE(review): `ds` is not defined in any cell visible in this notebook, and this
# cell raised NameError when it was run. Presumably it is a dataset of `tokenize`
# outputs (which include a "length" field) - confirm and define it before this cell.
plt.hist(ds["length"], bins=100)

NameError: name 'ds' is not defined

In [None]:

import pandas as pd
import plotly.express as px
from collections import Counter


# Count how often each non-"O" label occurs within each token-position range
# and plot the counts per range, one colour per label.
# NOTE(review): `ds` (with a "provided_labels" column) is not defined in any
# cell visible in this notebook - confirm where it is built before running.

# Consecutive thresholds form half-open position ranges [lower, upper).
# Positions at or beyond the last threshold fall into no range and are skipped.
group_thresholds = [0, 50, 100, 200, 500, 1000, 2000, 10000]

counts = Counter()

for sample_labels in ds["provided_labels"]:
    for i, label in enumerate(sample_labels):
        if label == "O":
            continue
        for lower, upper in zip(group_thresholds, group_thresholds[1:]):
            if lower <= i < upper:
                counts[(label, f"{lower}-{upper}")] += 1
                break

# Column-oriented records for the DataFrame. A dedicated name is used so the
# loaded dataset in `data` is not overwritten by this plotting cell.
label_position_counts = {
    "label": [],
    "count": [],
    "range": [],
}

for (label, range_), count in counts.items():
    label_position_counts["label"].append(label)
    label_position_counts["range"].append(range_)
    label_position_counts["count"].append(count)

df = pd.DataFrame(label_position_counts)

# Log-scaled y axis. Per the error recorded under this cell, inline rendering
# requires nbformat>=4.2.0 to be installed in the kernel environment.
px.scatter(df, x="range", y="count", color="label", log_y=True, height=1000)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed