In [2]:
import os
import gc
import sys
import json
import time
import torch
import joblib
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from pathlib import Path
import plotly.express as px
import matplotlib.pyplot as plt

# Notebook-wide pandas display settings: show up to 500 rows / 500 columns,
# allow 1000-character wide output, and print floats with four decimals.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
    ('display.float_format', lambda x: '%.4f' % x),
):
    pd.set_option(option_name, option_value)

# Params

In [3]:
# Directory holding the competition files (train.json, test.json,
# sample_submission.csv, ...) plus a few extra datasets.
# NOTE(review): this is a machine-specific absolute path; point it at your own
# data directory before re-running the notebook.
data_path = Path(r"/database/kaggle/PII/data")
# Show what is available in the data directory.
os.listdir(data_path)

['train.json',
 'dubai-ar.zip',
 'pii_dataset_fixed.csv',
 'mixtral-8x7b-v1.json',
 '.~lock.lecture2.pptx#',
 'openaddr-collected-global.zip',
 'lecture2.pptx',
 'openaddr-collected-us_west-sa.zip',
 'test.json',
 'moredata_dataset_fixed.csv',
 'sample_submission.csv']

In [4]:
# Load the sample submission file and display its (rows, columns) shape.
sample_df = pd.read_csv(data_path/'sample_submission.csv')
sample_df.shape

(26, 4)

In [5]:
sample_df.head(5)

Unnamed: 0,row_id,document,token,label
0,0,7,9,B-NAME_STUDENT
1,1,7,10,I-NAME_STUDENT
2,2,7,482,B-NAME_STUDENT
3,3,7,483,I-NAME_STUDENT
4,4,7,741,B-NAME_STUDENT


In [6]:
sample_df.label.unique()

array(['B-NAME_STUDENT', 'I-NAME_STUDENT'], dtype=object)

In [7]:
# Load the training documents (one row per document, with its full text,
# tokens, trailing-whitespace flags and labels) and display the shape.
df = pd.read_json(data_path/'train.json')
df.shape

(6807, 5)

In [17]:
df.head(5)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,NAME_STUDENT,EMAIL,USERNAME,ID_NUM,PHONE_NUM,URL_PERSONAL,STREET_ADDRESS,nb_labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...",6,0,0,0,0,0,0,6
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...",4,0,0,0,0,0,0,4
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O...",2,0,0,0,0,0,0,2
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT...",2,0,0,0,0,0,0,2
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST...",2,0,0,0,0,0,0,2


In [9]:
# Entity types that can follow the "B-"/"I-" prefix of a label; the final
# entry 'O' is the "no entity" tag and is sliced off wherever only real
# entity types are needed.
LABEL2TYPE = (
    'NAME_STUDENT',
    'EMAIL',
    'USERNAME',
    'ID_NUM',
    'PHONE_NUM',
    'URL_PERSONAL',
    'STREET_ADDRESS',
    'O',
)
len(LABEL2TYPE)

8

In [10]:
# Add one column per entity type (everything in LABEL2TYPE except the trailing
# 'O') holding how many tokens of each document carry that type, regardless of
# the B-/I- prefix.
for name in LABEL2TYPE[:-1]:
    tokens_of_type = df['labels'].transform(
        lambda tags: sum(1 for tag in tags if tag.split('-')[-1] == name)
    )
    df[name] = tokens_of_type * 1

In [11]:
# Number of tokens per document whose label is anything other than "O".
df['nb_labels'] = df['labels'].transform(
    lambda tags: sum(1 for tag in tags if tag != "O")
)

In [12]:
df['nb_labels'].value_counts() 

0     5862
2      599
4      108
1       86
3       52
6       46
8       14
5       10
12       6
10       5
11       3
9        3
15       2
14       2
21       2
7        1
23       1
18       1
17       1
26       1
34       1
22       1
Name: nb_labels, dtype: int64

In [13]:
df[list(LABEL2TYPE)[:-1]+['nb_labels']].sum()

NAME_STUDENT      2461
EMAIL               39
USERNAME             6
ID_NUM              79
PHONE_NUM           21
URL_PERSONAL       111
STREET_ADDRESS      22
nb_labels         2739
dtype: int64

In [14]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [26]:
# Assign every document to a validation fold with multilabel stratification on
# the per-type token counts, one fold column per (K, seed) combination.
seeds = [42]
folds_names = []  # names of the fold columns created below
for K in [5]:
    for seed in seeds:
        mskf = MultilabelStratifiedKFold(n_splits=K,shuffle=True,random_state=seed)
        name = f"fold_msk_{K}_seed_{seed}"
        df[name] = -1  # -1 marks "not assigned yet"
        fold_col = df.columns.get_loc(name)
        for fold, (trn_, val_) in enumerate(mskf.split(df,df[list(LABEL2TYPE)[:-1]])):
            # The splitter yields row *positions*, so write them back
            # positionally; label-based `.loc` is only equivalent while `df`
            # still has its default RangeIndex.
            df.iloc[val_, fold_col] = fold
        folds_names.append(name)

In [1]:
# Sum of each entity-type count per fold, presumably to check that the split is
# balanced. `name` is the fold column most recently set by the split loop.
# NOTE(review): the saved output of this cell is a NameError from running it on
# a fresh kernel (execution count 1) before `df` existed; re-run it in order.
df.groupby(name)[list(LABEL2TYPE)[:-1]].sum()

NameError: name 'df' is not defined

In [19]:
df.NAME_STUDENT.value_counts()

0     5916
2      626
4      112
1       52
6       42
3       23
8       11
5        5
7        3
10       3
9        3
12       2
15       2
17       2
21       1
18       1
14       1
34       1
22       1
Name: NAME_STUDENT, dtype: int64

In [56]:
import re
from difflib import SequenceMatcher

def _first_regex_span(pattern, txt):
    """Return (start, end) of the first regex match of `pattern` in `txt`, or None.

    A pattern that is not a valid regular expression (e.g. a token such as "(")
    is treated as "no match" instead of raising `re.error`.
    """
    try:
        match = next(re.finditer(pattern, txt), None)
    except re.error:
        return None
    if match is None:
        return None
    return match.start(0), match.end(0)


def get_text_start_end(txt, s, search_from=0):
    """Locate `s` inside `txt`, looking only at `txt[search_from:]`.

    Search strategy, in order:
      1. plain substring search;
      2. `s` interpreted as a regular expression;
      3. `s` repaired towards the text with `difflib.SequenceMatcher`
         ('replace' and 'delete' opcodes applied), then searched again as a
         regular expression and finally as a plain substring.

    Parameters
    ----------
    txt : str
        Text to search in.
    s : str
        Token / snippet to locate.
    search_from : int, default 0
        Character position in `txt` where the search starts.

    Returns
    -------
    tuple
        `(start, end)` character positions relative to the full `txt`.
        When nothing can be located the result is `(search_from, search_from)`.
    """
    txt = txt[int(search_from):]

    idx = txt.find(s)
    if idx >= 0:
        st, ed = idx, idx + len(s)
    else:
        span = _first_regex_span(s, txt)
        if span is None:
            opcodes = SequenceMatcher(None, s, txt).get_opcodes()
            # Opcode indices refer to the *original* `s`; applying them from
            # right to left keeps the not-yet-processed indices valid even
            # when an edit changes the length of `s`.
            for tag, i1, i2, j1, j2 in reversed(opcodes):
                if tag == 'replace':
                    s = s[:i1] + txt[j1:j2] + s[i2:]
                elif tag == "delete":
                    s = s[:i1] + s[i2:]

            span = _first_regex_span(s, txt)
            if span is None:
                idx = txt.find(s)
                span = (idx, idx + len(s)) if idx >= 0 else (0, 0)
        st, ed = span
    return st + search_from, ed + search_from


def get_offset_mapping(full_text, tokens):
    """Return one (start, end) character span in `full_text` per token.

    Tokens are located in order with `get_text_start_end`; the search for each
    token begins at the end position found for the previous one.
    """
    spans = []
    cursor = 0
    for tok in tokens:
        span = get_text_start_end(full_text, tok, search_from=cursor)
        spans.append((span[0], span[1]))
        cursor = span[1]
    return spans


# Example usage: pick at random one document with exactly 15 NAME_STUDENT
# tokens and print the character span recovered for each labelled token.
idx = random.choice(df[df.NAME_STUDENT==15].index)
# `idx` is an index *label*, so the row is looked up with `.loc`; positional
# `.iloc` only gives the same row while `df` has its default RangeIndex.
row = df.loc[idx]
full_text = row['full_text']
tokens = row['tokens']
trailing_whitespace = row['trailing_whitespace']
labels = row['labels']


offset_mapping = get_offset_mapping(full_text, tokens)
for token, offset,l in zip(tokens, offset_mapping,labels):
    if l!="O":
        print(f"{token}: {offset} : {l}")


Zohaib: (34, 40) : B-NAME_STUDENT
Ashraf: (41, 47) : I-NAME_STUDENT
Matheus: (877, 884) : B-NAME_STUDENT
Kumari: (885, 891) : I-NAME_STUDENT
Matheus: (995, 1002) : B-NAME_STUDENT
Kumari: (1003, 1009) : I-NAME_STUDENT
Matheus: (1073, 1080) : B-NAME_STUDENT
Kumari: (1081, 1087) : I-NAME_STUDENT
Matheus: (1200, 1207) : B-NAME_STUDENT
Kumari: (1208, 1214) : I-NAME_STUDENT
Matheus: (1323, 1330) : B-NAME_STUDENT
Kumari: (1331, 1337) : I-NAME_STUDENT
Matheus: (2212, 2219) : B-NAME_STUDENT
Kumari: (2220, 2226) : I-NAME_STUDENT
Matheus: (3572, 3579) : B-NAME_STUDENT


In [57]:
full_text[1863:1869]

'from m'

In [58]:
full_text

"REFLECTION – VIZUALIZATION\n\n-  By Zohaib Ashraf\n\nBeing a design enthusiast, I had an opportunity to participate in the Design thinking workshop  held by Odisha Design Council chairman, Akshay Kumari. It was a 3-day workshop and  included 18 participants. During this workshop, we were divided into teams of 2 and given a  challenge to solve using design thinking processes.     1. Challenge  During the post-lockdown phase, many citizens have started returning to their native and are  in a dilemma to find a suitable place to self-quarantine for 14 days. We were asked to provide  a simple solution to help these members to curb the spread of COVID-19 by taking a specific  case into consideration. This included a client, user, and the area which needs to be  considered to find a solution. Self-quarantine measures had to be taken into consideration.   I was teamed up with Matheus Kumari, a graphic designer from Odisha, India. Based on the  problem statement, I had to design solutions for 

In [59]:
print(df.full_text.sample(1).values[0])

Student: Maria Amin  Coursera: Design Thinking for Innovation, Maria Amin, UVA    Challenge:  The design thinking tool I chose from the Design Thinking course was storytelling.  This is a  skill I have wanted to improve as some of the most successful consultants I work with are great  storytellers.  Traditionally, the way we sell consulting within our service line is to identify the client's  current issues and demonstrate our group’s capabilities to address them.  We needed to change the way  we told our group's consulting story.  A story that moves from a capabilities to character-driven  approach that all team members could tell and customers wanted to hear.     Selection: Our consulting sales practice is based on developing relationships based on high value and  trust.  As a result, I chose storytelling as a tool to build on connecting with clients in the way Design  Thinking week 2 lecture described it “as a way to make a strategy relate-able, to give it color, life,  through char

In [60]:
print(full_text)

REFLECTION – VIZUALIZATION

-  By Zohaib Ashraf

Being a design enthusiast, I had an opportunity to participate in the Design thinking workshop  held by Odisha Design Council chairman, Akshay Kumari. It was a 3-day workshop and  included 18 participants. During this workshop, we were divided into teams of 2 and given a  challenge to solve using design thinking processes.     1. Challenge  During the post-lockdown phase, many citizens have started returning to their native and are  in a dilemma to find a suitable place to self-quarantine for 14 days. We were asked to provide  a simple solution to help these members to curb the spread of COVID-19 by taking a specific  case into consideration. This included a client, user, and the area which needs to be  considered to find a solution. Self-quarantine measures had to be taken into consideration.   I was teamed up with Matheus Kumari, a graphic designer from Odisha, India. Based on the  problem statement, I had to design solutions for Mathe

In [61]:
# pip install spacy

In [62]:
import spacy
from spacy import displacy
from pylab import cm, matplotlib
import os

# Highlight colour (CSS hex string) for each entity type shown by `visualize`.
colors = {
            'NAME_STUDENT': '#8000ff',
            'EMAIL': '#2b7ff6',
            'USERNAME': '#2adddd',
            'ID_NUM': '#80ffb4',
            'PHONE_NUM': '#d4dd80',  # was missing the leading '#'
            'URL_PERSONAL': '#ff8042',
            'STREET_ADDRESS': '#ff0000'
         }


def visualize(full_text,offset_mapping,labels):
    """Render `full_text` in the notebook with labelled spans highlighted.

    Parameters
    ----------
    full_text : str
        Text of the document.
    offset_mapping : sequence of (start, end)
        Character span of each token, paired positionally with `labels`.
    labels : sequence of str
        Labels of the form "<prefix>-<TYPE>" (e.g. "B-NAME_STUDENT"). Labels
        without a "-" (such as "O") are skipped, so both pre-filtered and
        unfiltered label lists can be passed.

    The entity type after the prefix is used as the displayed label and is
    coloured according to the module-level `colors` mapping.
    """
    ents = []
    for offset,lab in zip(offset_mapping,labels):
        parts = lab.split('-')
        if len(parts) < 2:
            # No "<prefix>-" part (e.g. "O"): nothing to highlight. Previously
            # this raised IndexError.
            continue
        ents.append({
                        'start': int(offset[0]),
                        'end': int(offset[1]),
                        'label': str(parts[1])
                    })

    doc2 = {
        "text": full_text,
        "ents": ents,
    }

    options = {"ents": list(colors.keys()), "colors": colors}
    displacy.render(doc2, style="ent", options=options, manual=True, jupyter=True)

In [63]:
# Keep only the tokens that carry a real label (anything but "O"), together
# with their character spans, ready to be passed to `visualize`.
labels_ = [lab for lab in labels if lab != "O"]
offset_mapping_ = [
    span
    for span, lab in zip(offset_mapping, labels)
    if lab != "O"
]

In [64]:
visualize(full_text,offset_mapping_,labels_)