# Imports and data loading

In [29]:

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import re

from tqdm.notebook import tqdm
from IPython.display import HTML

In [10]:
# Read just the first 300 rows of the training CSV: a small sample is enough for testing.
df = pd.read_csv('../raw_data/train.csv', nrows=300)

# Build "None" discourses for the unannotated gaps

<font size=4 color='red'>**Lovely, something *is off***</font>

In [23]:
def flag_gaps(df,return_filtered=False,min_gap=3):
    """
    Flag the rows of df that are followed by an unannotated gap.

    Adds 3 new columns to the df passed (the frame is modified in place) :

    next_start : discourse_start of the next row (NaN on the last row)
    next_predictionstring : predictionstring of the next row
    gap_flag : 1 if next_start - discourse_end is strictly greater than min_gap,
    0 otherwise (the last row is always 0 because its next_start is NaN)

    Rows are compared with their immediate successor in the current row order,
    so df is expected to hold the discourses of a single essay sorted by
    position (assumption - this function does not check it).

    min_gap : number of characters tolerated between two consecutive discourses
    before the space between them counts as a gap (default 3, the value that
    was previously hard-coded)

    if return_filtered = True, returns only the rows where gap_flag=1,
    otherwise returns None (the new columns are still written to df)
    """
    df['next_start']=df['discourse_start'].shift(-1)
    df['next_predictionstring']=df['predictionstring'].shift(-1)

    # NaN on the last row compares False, so that row is never flagged
    gap_size=df['next_start']-df['discourse_end']
    df['gap_flag']=np.where(gap_size>min_gap,1,0)

    if return_filtered:
        return df[df['gap_flag']==1]

    return None

In [24]:
## Create missing discourses for a single essay
def row_creator(df_filtered):
    """
    Build one 'None' row per flagged gap of a single essay.

    df_filtered must contain only rows flagged as preceding a gap, with the
    columns id, discourse_end, next_start, predictionstring and
    next_predictionstring. The returned DF has the same columns as
    df_filtered, a fresh 0..n-1 index, and NaN in every column that is not
    filled here.
    """

    def _word_span(first, last):
        # space separated word indices from first to last, both included
        return ' '.join(str(word) for word in range(first, last + 1))

    span_builder = np.vectorize(_word_span, otypes=[str])

    # character span of the gap : just after this discourse, just before the next one
    gap_start = (df_filtered['discourse_end'] + 1).reset_index(drop=True)
    gap_end = (df_filtered['next_start'] - 1).reset_index(drop=True)

    # word span of the gap : after the last word of this discourse,
    # before the first word of the next one
    first_word = df_filtered['predictionstring'].apply(
        lambda words: int(words.split()[-1]) + 1
    )
    last_word = df_filtered['next_predictionstring'].apply(
        lambda words: int(words.split()[0]) - 1
    )

    n_gaps = len(df_filtered)

    gap_rows = pd.DataFrame(
        {
            'id': df_filtered['id'].copy().reset_index(drop=True),
            'discourse_start': gap_start,
            'discourse_end': gap_end,
            'predictionstring': span_builder(first_word, last_word),
            'discourse_type': 'None',
            'discourse_type_num': [f'None {rank}' for rank in range(1, n_gaps + 1)],
        },
        columns=df_filtered.columns,
    )

    return gap_rows

In [25]:
## Slicer from full text to discourses
def text_to_slices(full_text,start,end):
    """
    Cut full_text[start:end] (start and end are cast to int) and replace a
    newline sitting at the end or at the beginning of the cut by a space.
    """
    chunk = full_text[int(start):int(end)]
    # same order as before : trailing newline first, then leading newline
    for edge_newline in ('\n$', '^\n'):
        chunk = re.sub(edge_newline, ' ', chunk)
    return chunk

## element-wise version over start / end, returning an array of str
slicer_vectorized = np.vectorize(
    text_to_slices,
    excluded=['full_text'],
    otypes=[str],
)

In [26]:
## Get text of an essay id
def get_essay(id_,data_dir='../raw_data/train'):
    """
    Returns the text from the .txt file of an essay.

    id_ : essay id, the file read is {data_dir}/{id_}.txt
    data_dir : directory holding the essay files (defaults to the
    raw_data/train directory used so far)

    Raises FileNotFoundError if the file does not exist.
    """
    # Explicit encoding : the locale default would make the decoded text, and so
    # the character offsets used to slice it, depend on the machine.
    # NOTE(review): assumes the essay files are UTF-8 - confirm against the dataset.
    with open(f'{data_dir}/{id_}.txt','r',encoding='utf-8') as file :
        return file.read()

In [27]:
def gap_filler(df):

    """
    Returns an augmented dataframe with the missing discourses labelled None, for all the essays.
    This function makes use of precedent functions working at single essay level
    (get_essay, flag_gaps, row_creator, slicer_vectorized).

    df is left untouched. The new rows are stacked after the original ones and
    keep their own 0..n-1 index per essay, so the index of the result is not unique.

    Rows without a discourse_id (the generated ones) receive a random id drawn
    above the current maximum; the draw uses numpy's global random state, so
    seed it beforehand if reproducible ids are needed.
    """

    gap_frames=[]

    for essay_id in tqdm(df['id'].unique()):
        full_text = get_essay(essay_id)
        essay_rows = df[df['id']==essay_id].copy()
        gaps = flag_gaps(essay_rows,return_filtered=True)
        none_rows = row_creator(gaps)
        none_rows['discourse_text'] = slicer_vectorized(full_text,
                                                        none_rows['discourse_start'],
                                                        none_rows['discourse_end'])
        gap_frames.append(none_rows)

    # Single concat instead of growing the frame inside the loop :
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame at each step.
    df_augm = pd.concat([df]+gap_frames)

    # Helper columns only exist if at least one essay was processed
    df_augm = df_augm.drop(columns=['next_start','next_predictionstring','gap_flag'],
                           errors='ignore')

    missing_id = df_augm['discourse_id'].isna()
    n_none = int(missing_id.sum())

    if n_none:
        id_max = df_augm['discourse_id'].max()
        # Start strictly above the existing ids and draw without replacement so that
        # the new ids can collide neither with each other nor with id_max.
        candidates = np.arange(id_max+1,id_max+1+n_none*1000)
        new_ids = np.random.choice(candidates,n_none,replace=False)
        df_augm.loc[missing_id,'discourse_id'] = new_ids

    return df_augm
    

In [30]:
# Add the None discourses to the sample loaded above
df_augm = gap_filler(df)

  0%|          | 0/34 [00:00<?, ?it/s]

In [47]:
# Display the frame returned by gap_filler
df_augm

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring,html
0,423A1CA112E2,1622627660524.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...,<Lead style='padding: 2px'>Modern humans today...
1,423A1CA112E2,1622627653021.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59,<Position style='padding: 2px'>They are some r...
2,423A1CA112E2,1622627671020.0,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75,<Evidence style='padding: 2px'>Some certain ar...
3,423A1CA112E2,1622627696365.0,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...,<Evidence style='padding: 2px'>When people hav...
4,423A1CA112E2,1622627759780.0,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...,<Claim style='padding: 2px'>Driving is one of ...
...,...,...,...,...,...,...,...,...,...
0,D0CBBD43827C,1623081274381.0,406.0,409.0,an,,None 1,72,<None style='padding: 2px'> an <strong> [None]...
0,62644C50869C,1623081273049.0,356.0,358.0,or,,None 1,61,<None style='padding: 2px'>or <strong> [None] ...
0,7FF6281EC288,1623081284926.0,131.0,143.0,For instance,,None 1,21 22,<None style='padding: 2px'>For instance <stron...
1,7FF6281EC288,1623081272125.0,171.0,176.0,"Also,",,None 2,27,"<None style='padding: 2px'>Also, <strong> [Non..."


# Showing ground truth and prediction in text

In [1]:
#import custom.css into this notebook

def css():
    """
    Returns the content of ./styles/custom.css wrapped in a <style> tag, as an
    IPython HTML object, so that displaying it applies the stylesheet to this notebook.

    HTML comes from IPython.display : the imports cell must have been run first.
    """
    # context manager so the file handle is closed once the stylesheet is read
    with open("./styles/custom.css", "r") as stylesheet:
        styles = stylesheet.read()
    return HTML('<style>'+styles+'</style>')
css()

NameError: name 'HTML' is not defined

In [34]:
def render_html(df):
    """
    Wrap one discourse in an HTML tag named after its type.

    df : a row (anything indexable by 'discourse_type' and 'discourse_text').
    The text is followed by the type in bold between square brackets.
    Nothing is escaped, the values are inserted as they are.
    """
    tag = df['discourse_type']
    text = df['discourse_text']
    return f"<{tag} style='padding: 2px'>{text} <strong> [{tag}] </strong></{tag}>"

In [35]:
def comparison_text(prediction, ground_truth):
    """
    Returns an IPython HTML object showing a legend of the discourse types
    followed by two columns : the prediction and the ground truth.

    prediction, ground_truth : HTML strings, inserted as they are (no escaping)
    inside the <p> of their column.

    The legend and the texts rely on custom tags (Lead, Position, Claim, ...)
    and on the classes content / row / column / title ; presumably these are
    styled by the stylesheet loaded with css() - confirm in custom.css.
    """
    html = f"""
    <div class="content">
     <span style="font-size:16px">Legend --></span>
      <lead>Lead</lead>
      <Position>Position</Position>
      <Claim>Claim</Claim>
      <Counterclaim>Counterclaim</Counterclaim>
      <Rebuttal>Rebuttal</Rebuttal>
      <Evidence>Evidence</Evidence>
      <Concluding_Statement>Concluding_Statement</Concluding_Statement>
    </div>

    <div class="row">
      <div class="column">
        <h2 class="title">Prediction</h2>
        <p style="text-align:justify">{prediction}</p>
      </div>
      <div class="column">
        <h2 class="title">Ground Truth</h2>
        <p style="text-align:justify">{ground_truth}</p>
      </div>
    </div>
    
    """
    
    return HTML(html)

In [55]:
# Demo only : two annotated essays stand in for a "prediction" and a "ground truth",
# as if a model output had already been post-processed.

# one HTML snippet per discourse (adds an 'html' column to df)
df['html'] = df.apply(render_html, axis=1)

# one row per essay, its snippets joined with a space
df_essays = (
    df.groupby('id')
      .agg({'html': ' '.join})
      .reset_index()
)

# the type 'Concluding Statement' contains a space, the tag name uses an underscore
true, pred = (
    df_essays.loc[essay_row, 'html'].replace('Concluding Statement', 'Concluding_Statement')
    for essay_row in (4, 19)
)

In [56]:
comparison_text(prediction=pred, ground_truth=true)