In [2]:
import os
import gc
import sys
import json
import time
import torch
import joblib
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from pathlib import Path
import plotly.express as px
import matplotlib.pyplot as plt

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', lambda x: '%.4f' % x)

# Params

In [3]:
data_path = Path(r"/database/kaggle/PII/data")
os.listdir(data_path)

['train.json', 'test.json', 'sample_submission.csv']

In [11]:
sample_df = pd.read_csv(data_path/'sample_submission.csv')
sample_df.shape

(26, 4)

In [12]:
sample_df.head(5)

Unnamed: 0,row_id,document,token,label
0,0,7,9,B-NAME_STUDENT
1,1,7,10,I-NAME_STUDENT
2,2,7,482,B-NAME_STUDENT
3,3,7,483,I-NAME_STUDENT
4,4,7,741,B-NAME_STUDENT


In [36]:
sample_df.label.unique()

array(['B-NAME_STUDENT', 'I-NAME_STUDENT'], dtype=object)

In [30]:
df = pd.read_json(data_path/'train.json')
df.shape

(6807, 5)

In [31]:
df.head(5)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."


In [271]:
df['nb_labels'] = df['labels'].transform(lambda x:len([i for i in x if i!="O" ]))

In [272]:
df['nb_labels'].value_counts() 

0     5862
2      599
4      108
1       86
3       52
6       46
8       14
5       10
12       6
10       5
11       3
9        3
15       2
14       2
21       2
7        1
23       1
18       1
17       1
26       1
34       1
22       1
Name: nb_labels, dtype: int64

In [None]:
def get_text_start_end(txt,s,search_from=0):
    txt = txt[int(search_from):]
    try:
        idx = txt.find(s)
        if idx>=0:
            st=idx
            ed = st+len(s)
        else:
            raise ValueError('Error')
    except:                
        res = [(m.start(0), m.end(0)) for m in re.finditer(s, txt)]
        if len(res):
            st,ed = res[0][0],res[0][1]
        else:
            m = SequenceMatcher(None, s,txt).get_opcodes()
            for tag,i1,i2,j1,j2 in m:
                if tag=='replace':
                    s = s[:i1]+txt[j1:j2]+s[i2:]
                if tag=="delete":
                    s = s[:i1]+s[i2:]
            
            res = [(m.start(0), m.end(0)) for m in re.finditer(s,txt)]
            if len(res):
                st,ed = res[0][0],res[0][1]
            else:
                idx = txt.find(s)
                if idx>=0:
                    st=idx
                    ed = st+len(s)
                else:
                    st,ed = 0,0
    return st+search_from,ed+search_from

In [420]:
def get_offset_mapping(full_text, tokens, trailing_whitespace):
    offset_mapping = []
    current_offset = 0

    for token, has_whitespace in zip(tokens, trailing_whitespace):
        start = current_offset
        end = current_offset + len(token)
        
        
        
        # Adjust end offset if there is trailing whitespace
        if has_whitespace:
            while end < len(full_text) and full_text[end].isspace():
                end += 1
        
        offset_mapping.append((start, end))
        current_offset = end
        

    return offset_mapping

# idx = random.choice(df[df.nb_labels>6].index)
# Example usage:
full_text = df.iloc[idx]['full_text']
tokens = df.iloc[idx]['tokens']
trailing_whitespace = df.iloc[idx]['trailing_whitespace']
labels = df.iloc[idx]['labels']

offset_mapping = get_offset_mapping(full_text, tokens, trailing_whitespace)
for token, offset,l in zip(tokens, offset_mapping,labels):
    if l!="O":
        print(f"{token}: {offset} : {l}")

Rhiannon: (40, 48) : B-NAME_STUDENT
Karim: (48, 53) : I-NAME_STUDENT
Rhiannon: (129, 138) : B-NAME_STUDENT
Karim: (138, 143) : I-NAME_STUDENT
Rhiannon: (2417, 2425) : B-NAME_STUDENT
Karim: (2425, 2430) : I-NAME_STUDENT
Rhiannon: (5355, 5363) : B-NAME_STUDENT
Karim: (5363, 5368) : I-NAME_STUDENT
Rhiannon: (8081, 8089) : B-NAME_STUDENT
Karim: (8089, 8094) : I-NAME_STUDENT


In [425]:
import re
from difflib import SequenceMatcher

def get_text_start_end(txt, s, search_from=0):
    txt = txt[int(search_from):]
    try:
        idx = txt.find(s)
        if idx >= 0:
            st = idx
            ed = st + len(s)
        else:
            raise ValueError('Error')
    except:
        res = [(m.start(0), m.end(0)) for m in re.finditer(s, txt)]
        if len(res):
            st, ed = res[0][0], res[0][1]
        else:
            m = SequenceMatcher(None, s, txt).get_opcodes()
            for tag, i1, i2, j1, j2 in m:
                if tag == 'replace':
                    s = s[:i1] + txt[j1:j2] + s[i2:]
                if tag == "delete":
                    s = s[:i1] + s[i2:]

            res = [(m.start(0), m.end(0)) for m in re.finditer(s, txt)]
            if len(res):
                st, ed = res[0][0], res[0][1]
            else:
                idx = txt.find(s)
                if idx >= 0:
                    st = idx
                    ed = st + len(s)
                else:
                    st, ed = 0, 0
    return st + search_from, ed + search_from


def get_offset_mapping(full_text, tokens):
    offset_mapping = []

    current_offset = 0
    for token in tokens:
        start, end = get_text_start_end(full_text, token, search_from=current_offset)
        offset_mapping.append((start, end))
        current_offset = end

    return offset_mapping


# Example usage:
# full_text = "Design Thinking for innovation reflexion-Avril 2021-Nathalie Sylla\n\nChallenge & selection\n\nThe tool I use..."
# tokens = ['Design', 'Thinking', 'for', 'innovation', 'reflexion', '-', 'Avril', '2021', '-',
#           'Nathalie', 'Sylla', '\n\n', 'Challenge', '&', 'selection', '\n\n', 'The', 'tool', 'I', 'use', 'to', 'help', 'all', 'stakeholders', 'finding', 'their', 'way', 'through', 'the', 'complexity', 'of', 'a', 'project', 'is', 'the', ' ', 'mind', 'map', '.', '\n\n', ...]
idx = random.choice(df[df.nb_labels>6].index)
# Example usage:
full_text = df.iloc[idx]['full_text']
tokens = df.iloc[idx]['tokens']
trailing_whitespace = df.iloc[idx]['trailing_whitespace']
labels = df.iloc[idx]['labels']


offset_mapping = get_offset_mapping(full_text, tokens)
for token, offset,l in zip(tokens, offset_mapping,labels):
    if l!="O":
        print(f"{token}: {offset} : {l}")


Md: (15, 17) : B-NAME_STUDENT
Ahamad: (18, 24) : I-NAME_STUDENT
369615882777: (26, 38) : B-ID_NUM
Taher: (1528, 1533) : B-NAME_STUDENT
014674070485: (1536, 1548) : B-ID_NUM
Pintu: (1760, 1765) : B-NAME_STUDENT
Kumar: (1766, 1771) : I-NAME_STUDENT
Ajay: (1772, 1776) : I-NAME_STUDENT
Rana: (1777, 1781) : I-NAME_STUDENT
264945858442: (1783, 1795) : B-ID_NUM
Chiara: (1936, 1942) : B-NAME_STUDENT
320622779078: (1944, 1956) : B-ID_NUM


In [424]:
full_text[1863:1869]

'Zahida'

In [422]:
full_text[8089:8094], 

('rim\n\n',)

In [412]:
print(full_text)

Reflection-Storytelling

Challenge

My friend Teresa was very anxious and unhappy recently and it was because of her son Naser. Naser is  10 years old and turned to be a little rebellious now. Due to the school closure in covid-19, Naser  has classes online and he then formed a habit of spending much on computers playing games or  watching videos he likes. Teresa was at work and can not monitor her son during working hours,  and she worried about Naser’s health as he spent too much time on computers so he completed his  homework very late, so he slept very late and his eyesight is poor as well. Naser used to be a good  student and obedient son, and he agreed to lessen time on computers or cellphones, but he can  not always keep his words. That’s why Teresa worried. She is afraid he turned to a bad boy.

Selection

Naser is ten years old and now he began to seek himself and have the desire to be independent.  Teresa can not just control and monitor him but let him to take responsibility

In [413]:
# pip install spacy

In [414]:
import spacy
from spacy import displacy
from pylab import cm, matplotlib
import os

colors = {
            'NAME_STUDENT': '#8000ff',
            'EMAIL': '#2b7ff6',
            'USERNAME': '#2adddd',
            'ID_NUM': '#80ffb4',
            'PHONE_NUM': 'd4dd80',
            'URL_PERSONAL': '#ff8042',
            'STREET_ADDRESS': '#ff0000'
         }


def visualize(full_text,offset_mapping,labels):
    
    ents = []
    for offset,lab in zip(offset_mapping,labels):
        ents.append({
                        'start': int(offset[0]), 
                         'end': int(offset[1]), 
                         'label': str(lab.split('-')[1]) #+ ' - ' + str(row['discourse_effectiveness'])
                    })

    doc2 = {
        "text": full_text,
        "ents": ents,
#         "title": "idx"
    }

    options = {"ents": list(colors.keys()), "colors": colors}
    displacy.render(doc2, style="ent", options=options, manual=True, jupyter=True)

In [415]:
offset_mapping_ = [x for (x,y) in zip(offset_mapping,labels) if y!="O"]
labels_ = [x for x in labels if x!="O"]

In [416]:
visualize(full_text,offset_mapping_,labels_)