In [2]:
import os
import gc
import sys
import json
import time
import torch
import joblib
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from pathlib import Path
import plotly.express as px
import matplotlib.pyplot as plt

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', lambda x: '%.4f' % x)

# Params

In [3]:
data_path = Path(r"/database/kaggle/PII/data")
os.listdir(data_path)

['train.json', 'test.json', 'sample_submission.csv']

In [11]:
sample_df = pd.read_csv(data_path/'sample_submission.csv')
sample_df.shape

(26, 4)

In [12]:
sample_df.head(5)

Unnamed: 0,row_id,document,token,label
0,0,7,9,B-NAME_STUDENT
1,1,7,10,I-NAME_STUDENT
2,2,7,482,B-NAME_STUDENT
3,3,7,483,I-NAME_STUDENT
4,4,7,741,B-NAME_STUDENT


In [30]:
df = pd.read_json(data_path/'train.json')
df.shape

(6807, 5)

In [31]:
df.head(5)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."


In [32]:
def get_offset_mapping(full_text, tokens, trailing_whitespace):
    offset_mapping = []
    current_offset = 0

    for token, has_whitespace in zip(tokens, trailing_whitespace):
        start = current_offset
        end = current_offset + len(token)
        
        
        offset_mapping.append((start, end))
        current_offset = end
        
        # Adjust end offset if there is trailing whitespace
        if has_whitespace:
            while end < len(full_text) and full_text[end].isspace():
                end += 1

        

    return offset_mapping

# Example usage:
full_text = df.iloc[0]['full_text']
tokens = df.iloc[0]['tokens']
trailing_whitespace = df.iloc[0]['trailing_whitespace']
labels = df.iloc[0]['labels']

offset_mapping = get_offset_mapping(full_text, tokens, trailing_whitespace)
for token, offset,l in zip(tokens, offset_mapping,labels):
    if l!="O":
        print(f"{token}: {offset}")

Nathalie: (47, 55)
Sylla: (55, 60)
Nathalie: (1930, 1938)
Sylla: (1938, 1943)
Nathalie: (3088, 3096)
Sylla: (3096, 3101)


In [35]:
# labels

In [33]:
print(full_text)

Design Thinking for innovation reflexion-Avril 2021-Nathalie Sylla

Challenge & selection

The tool I use to help all stakeholders finding their way through the complexity of a project is the  mind map.

What exactly is a mind map? According to the definition of Buzan T. and Buzan B. (1999, Dessine-moi  l'intelligence. Paris: Les Éditions d'Organisation.), the mind map (or heuristic diagram) is a graphic  representation technique that follows the natural functioning of the mind and allows the brain's  potential to be released. Cf Annex1

This tool has many advantages:

•  It is accessible to all and does not require significant material investment and can be done  quickly

•  It is scalable

•  It allows categorization and linking of information

•  It can be applied to any type of situation: notetaking, problem solving, analysis, creation of  new ideas

•  It is suitable for all people and is easy to learn

•  It is fun and encourages exchanges

•  It makes visible the dimension of pr

In [28]:
df.iloc[0]['trailing_whitespace']

[True,
 True,
 True,
 True,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 True,
 True,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 False,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 True,
 True,
 True,
 False,
 False,
 False,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 False,
 False,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 Fal

In [14]:
df['text_len'] = df['full_text'].transform(lambda x:len(x.split()))

In [16]:
df['tokenz_len'] = df['tokens'].transform(lambda x:len((x)))

In [16]:
df['discourse_effectiveness'] = df['discourse_effectiveness'].fillna('NoEffectiveness')

In [17]:
# pip install spacy

In [26]:
import spacy
from spacy import displacy
from pylab import cm, matplotlib
import os

colors = {
            'Lead': '#8000ff',
            'Position': '#2b7ff6',
            'Evidence': '#2adddd',
            'Claim': '#80ffb4',
            'Concluding Statement': 'd4dd80',
            'Counterclaim': '#ff8042',
            'Rebuttal': '#ff0000'
         }

colors_effectiveness = {
            'Adequate': '#8000ff',
            'Effective': '#2b7ff6',
            'Ineffective': '#2adddd',
         }

def visualize(idx,full_text,offset_mapping,labels):
    
    ents = []
    for offset,lab in zip(offset_mapping,labels):
        ents.append({
                        'start': int(offset[0]), 
                         'end': int(offset[1]), 
                         'label': str(lab) #+ ' - ' + str(row['discourse_effectiveness'])
                    })

    doc2 = {
        "text": full_text,
        "ents": ents,
        "title": idx
    }

    options = {"ents": train.discourse_type.unique().tolist(), "colors": colors}
    displacy.render(doc2, style="ent", options=options, manual=True, jupyter=True)
    
def visualize_effectiveness(idx,train):
    
#     print(train[train['essay_id_comp'] == idx].assignment.values[0])
    ents = []
    for i, row in train[train['essay_id_comp'] == idx].iterrows():
        ents.append({
                        'start': int(row['discourse_start']), 
                         'end': int(row['discourse_end']), 
                         'label': str(row['discourse_effectiveness']) #+ ' - ' + str(row['discourse_effectiveness'])
                    })

    data = train[train['essay_id_comp'] == idx].full_text.str.strip().values[0]

    doc2 = {
        "text": data,
        "ents": ents,
        "title": idx
    }

    options = {"ents": train.discourse_effectiveness.unique().tolist(), "colors": colors_effectiveness}
    displacy.render(doc2, style="ent", options=options, manual=True, jupyter=True)

In [36]:
idx = random.choice(df.essay_id_comp.unique())
idx

'2B8374B25171'

In [37]:
visualize(idx,df)

In [38]:
visualize_effectiveness(idx,df)

In [39]:
for i, row in df[df['essay_id_comp'] == idx].iterrows():
    print(row['discourse_text'])
    print('---------------------------\n')

Dear Senator,


---------------------------

I think the Electoral Collage is unfair to the citizens of the United States of America. 
---------------------------

The Electoral collage resembles too much control by the United States government 
---------------------------

As said in Source 2: The Indefensible Electoral Collage: Why even the best-laid defenses of the system are wrong, "Back in 1960, segregationists in the Louisiana legislature nearly succeeded in replacing the Democratic electors with new electors who would oppose John F. Kennedy. This dosnt even sound right does it... This is one example of United States government trying to rail  the presidential elections, and i can asure you that this is not the first time this has been attempted by the government. The U.S Constitution delcares that

ALL

citizens of United States of America have the right to vote. What this looks like is the government trying to vote for us 
---------------------------

Another thing that i do no

In [5]:
df.competition_set.value_counts()

competition_set
train    173266
test     112117
Name: count, dtype: int64

In [32]:
df.discourse_type.value_counts()

discourse_type
Claim                   82584
Evidence                75588
Unannotated             47410
Position                25669
Concluding Statement    22283
Lead                    15098
Counterclaim             9534
Rebuttal                 7217
Name: count, dtype: int64

In [29]:
df['in_feedback2.0'].value_counts()

in_feedback2.0
0    214928
1     70455
Name: count, dtype: int64

In [6]:
df.isna().sum()

essay_id                           0
essay_id_comp                      0
competition_set                    0
full_text                          0
holistic_essay_score               0
discourse_id                       0
discourse_start                    0
discourse_end                      0
discourse_text                     0
discourse_type                     0
discourse_type_num                 0
discourse_effectiveness        47417
hierarchical_id               128704
hierarchical_text             128704
hierarchical_label            128704
provider                           0
task                               0
source_text                   159240
prompt_name                        0
assignment                         0
gender                             0
grade_level                    12442
ell_status                     12759
race_ethnicity                     0
economically_disadvantaged     56222
student_disability_status      55582
essay_word_count                   0
i

In [33]:
mask = (df.discourse_effectiveness.notna()) & (df.discourse_type.notna()) & (df.discourse_type!="Unannotated")

pd.crosstab(df[mask].competition_set,
            df[mask].discourse_effectiveness)

discourse_effectiveness,Adequate,Effective,Ineffective
competition_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
test,75363,14089,4225
train,113244,24583,6462


In [35]:
mask = (df['in_feedback2.0']==0)

pd.crosstab(df[mask].competition_set,
            df[mask].discourse_effectiveness)

discourse_effectiveness,Adequate,Effective
competition_set,Unnamed: 1_level_1,Unnamed: 2_level_1
test,61747,8948
train,92267,15257


In [1]:
# df[df['in_feedback2.0']==0]