# Parse annotations

In [1]:
# imports

import pandas as pd
import ast
import collections
import re
import spacy
import scispacy
import time
import numpy as np
import matplotlib.pyplot as plot
from collections import Counter

# `tqdm._tqdm_notebook` is a deprecated private module (importing it emits the
# "Please use `tqdm.notebook.*`" warning); import from the public module instead.
from tqdm.notebook import tqdm_notebook

# Register the `progress_apply` family on pandas objects; used on df_test below.
tqdm_notebook.pandas()

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  from tqdm._tqdm_notebook import tqdm_notebook


In [2]:
# Load tokenizers

# Load the scispaCy pipeline once; it is shared (as the module-level `nlp`)
# by the token/POS/lemma/offset helpers applied to the test set further down.
nlp = spacy.load('en_core_sci_sm')

In [3]:
# open file

# NOTE(review): absolute, machine-specific paths; consider deriving them from
# a configurable data directory so the notebook can run elsewhere.
file_path_train = '/mnt/nas2/data/systematicReview/semeval2023/data/st2_train_inc_text.csv'
file_path_test = '/mnt/nas2/data/systematicReview/semeval2023/data/st2_test_inc_text.csv'

# Each split is kept in two forms: the DataFrame (df_*) and a list of per-row
# dicts (*_df). get_char_labels / get_token_labels iterate over the dict form.
df_train = pd.read_csv(file_path_train, sep=',')
train_df = df_train.to_dict('records')

df_test = pd.read_csv(file_path_test, sep=',')
test_df = df_test.to_dict('records')

In [4]:
# Number of rows (posts) in the train and test splits.
print(len(train_df))
print(len(test_df))

597
150


In [5]:
# Preview the training split (includes the `stage2_labels` annotation column).
df_train.head()

Unnamed: 0,post_id,subreddit_id,claim,stage2_labels,text
0,sn9u41,t5_2s23e,I read an old thread on here that someone said...,"[{""crowd-entity-annotation"":{""entities"":[{""end...",Tysabri experiences\nHi all\n\nI just had my 3...
1,p7j49y,t5_2syer,"I have read that gout can't be cured, that it'...","[{""crowd-entity-annotation"":{""entities"":[{""end...",Crazy amount of sardines caused gout (possibly...
2,smgy0q,t5_2s3g1,I always read stories of people who suffer fro...,"[{""crowd-entity-annotation"":{""entities"":[{""end...",Im sorry for intruding but I just want to say ...
3,sxglhl,t5_2s3g1,Our results indicate that the addition of prob...,"[{""crowd-entity-annotation"":{""entities"":[{""end...",Is Bacillus coagulans supplementation plus low...
4,rxyk1d,t5_2s1h9,Ive read that amnesia always followed a tonic ...,"[{""crowd-entity-annotation"":{""entities"":[{""end...",[deleted by user]\n[removed]


In [6]:
# Preview the test split (same columns as train, minus `stage2_labels`).
df_test.head()

Unnamed: 0,post_id,subreddit_id,claim,text
0,s8qa1c,t5_2tyg2,[Structural dissociation](https://did-research...,Can structural dissociation lead to psychosis?...
1,opt7bh,t5_2syer,I though that the gout was for old whiskey dri...,"First time gout flare\nHi guys,\n\nJust turned..."
2,rfj6tf,t5_2qlaa,I've read that a liquid alginate suspension (G...,Can I buy liquid alginate suspension (Gaviscon...
3,qyfer0,t5_2syer,This will reduce the effects of Lisinopril pre...,[deleted by user]\n[removed]
4,rr99la,t5_2qlaa,all the gerd/lpr stuff online says we should a...,I seem to reflux more and sleep worse when I d...


In [7]:
# Placeholder strings that mark a post whose body was deleted or removed;
# used for substring tests (get_char_labels) and whole-token tests (preprocess_text).
deleted_list = [ '[deleted]', '[removed]', 'deleted by user' ]

In [8]:
# Annotation label -> integer class id (0 is used for unlabelled characters).
# When annotations overlap, get_char_labels keeps the larger id.
picos_mapping = {'population': 1, 'intervention':2, 'outcome':3}

In [9]:
def get_char_labels(df, deleted_markers=None, label_mapping=None):
    """Build one integer label per character of each post, plus the claim span.

    Parameters
    ----------
    df : iterable of dict
        Row records (e.g. ``DataFrame.to_dict('records')``) with the keys
        ``'claim'``, ``'text'`` and, for posts that are not removed,
        ``'stage2_labels'`` (a string holding a Python-literal list whose first
        element contains ``['crowd-entity-annotation']['entities']``; each
        entity has ``'startOffset'``, ``'endOffset'`` and ``'label'``).
    deleted_markers : iterable of str, optional
        Substrings that mark a removed post. Defaults to the module-level
        ``deleted_list``.
    label_mapping : dict, optional
        Maps an entity label to its integer id. Defaults to the module-level
        ``picos_mapping``.

    Returns
    -------
    labels : list
        Per row, either a list of ints (one per character of the text, 0 for
        unlabelled characters) or the string ``'N.A.'`` for removed posts.
    claim_offsets : list
        Per row, either ``(claim_start, claim_end)`` (end exclusive) or the
        string ``'N.A.'`` for removed posts.

    Raises
    ------
    ValueError
        If the claim does not occur verbatim in the text of a kept post.
    KeyError
        If an entity with a non-empty span carries a label missing from the
        mapping.
    """
    markers = deleted_list if deleted_markers is None else deleted_markers
    mapping = picos_mapping if label_mapping is None else label_mapping

    labels = []
    claim_offsets = []

    for row in df:
        claim = row['claim']
        full_text = row['text']

        # A missing body (e.g. NaN read from an empty CSV cell) is handled like a
        # removed post instead of failing on the substring test below.
        if not isinstance(full_text, str) or any(marker in full_text for marker in markers):
            labels.append('N.A.')
            claim_offsets.append('N.A.')
            continue

        entities = ast.literal_eval(row['stage2_labels'])[0]['crowd-entity-annotation']['entities']

        text_length = len(full_text)
        label_each_char = [0] * text_length  # 0 = no annotation on this character

        # Claim span in character offsets; str.index raises if the claim is absent.
        claim_start = full_text.index(claim)
        claim_end = claim_start + len(claim)

        for entity in entities:
            start = entity['startOffset']

            # Skip annotations that start outside the text (a bounds check
            # instead of scanning a materialised list of every index).
            if not isinstance(start, int) or not 0 <= start < text_length:
                continue

            # Slicing clamps an endOffset that runs past the end of the text.
            end = start + len(full_text[start:entity['endOffset']])
            if end <= start:
                continue

            new_label = mapping[entity['label']]
            for position in range(start, end):
                # On overlap the larger class id wins.
                if new_label > label_each_char[position]:
                    label_each_char[position] = new_label

        assert len(label_each_char) == len(full_text)

        labels.append(label_each_char)
        claim_offsets.append((claim_start, claim_end))

    return labels, claim_offsets

In [10]:
# Get the labels for train dataframe
# Both returned lists hold one entry per row, in row order; removed posts
# carry the string 'N.A.' in both columns.
labels_train, claim_offsets = get_char_labels(train_df)
df_train['labels_char'] = labels_train
df_train['claim_offsets'] = claim_offsets

In [11]:
def _label_runs(char_labels, start, end):
    """Split the character span [start, end) into maximal runs sharing one label.

    Returns a list of ``(run_start, run_end)`` pairs (end exclusive) covering
    the span in order; a span with a single label yields exactly one pair.
    """
    cuts = [start]
    for position in range(start + 1, end):
        if char_labels[position] != char_labels[position - 1]:
            cuts.append(position)
    return list(zip(cuts, cuts[1:] + [end]))


def get_token_labels(df, verbose=True):
    """Convert character-level labels into token-level labels and claim markers.

    Each text is split on whitespace. A whitespace token whose characters carry
    more than one label is split further at every label change, so that each
    emitted token has exactly one label.

    Parameters
    ----------
    df : iterable of dict
        Row records with the keys ``'text'``, ``'labels_char'`` and
        ``'claim_offsets'`` (as produced by ``get_char_labels``).
    verbose : bool, optional
        When True (default, the original behaviour) print a separator per row
        and the claim start/end offsets as they are matched.

    Returns
    -------
    tokens_series, labels_series, claim_token_offset_series : list
        Three parallel lists with one entry per row. For a kept post each entry
        is a flat list with one item per emitted token: the token text, its
        integer label, and one of ``'claim_starts'`` / ``'claim_ends'`` /
        ``'N.A.'``. For a removed post (``labels_char`` contains ``'N.A.'``)
        each entry is the nested placeholder ``[['N.A.']]``.

    Notes
    -----
    A token is marked when the claim offset falls inside the token's character
    span or on the character immediately after it. The start test takes
    precedence, so a token containing both offsets is marked ``'claim_starts'``
    only.
    """
    tokens_series = []
    labels_series = []
    claim_token_offset_series = []

    for row in df:
        full_text = row['text']
        char_labels = row['labels_char']
        claim_offsets = row['claim_offsets']
        if verbose:
            print('--------------------------------------------------------')

        if 'N.A.' in char_labels:
            # Removed post: keep the nested placeholder shape the original emitted.
            tokens_series.append([['N.A.']])
            labels_series.append([['N.A.']])
            claim_token_offset_series.append([['N.A.']])
            continue

        assert len(full_text) == len(char_labels)

        tokens = []
        token_claim = []
        token_labels = []

        for match in re.finditer(r'\S+', full_text):
            runs = _label_runs(char_labels, match.start(), match.end())
            # The log prefix records whether the whitespace token had to be split.
            end_prefix = 'end: ' if len(runs) == 1 else 'end special: '

            for run_start, run_end in runs:
                tokens.append(full_text[run_start:run_end])
                token_labels.append(char_labels[run_start])

                # Span of the token plus the one character that follows it;
                # membership in a range needs no materialised list.
                reach = range(run_start, run_end + 1)
                if claim_offsets[0] in reach:
                    if verbose:
                        print('start: ', claim_offsets[0])
                    token_claim.append('claim_starts')
                elif claim_offsets[1] in reach:
                    if verbose:
                        print(end_prefix, claim_offsets[1])
                    token_claim.append('claim_ends')
                else:
                    token_claim.append('N.A.')

        tokens_series.append(tokens)
        labels_series.append(token_labels)
        claim_token_offset_series.append(token_claim)

    assert len(tokens_series) == len(labels_series) == len(claim_token_offset_series)

    return tokens_series, labels_series, claim_token_offset_series

In [12]:
# Get the labels for train dataframe
# Re-export the records so each row dict includes the `labels_char` and
# `claim_offsets` columns added to df_train above.
train_df = df_train.to_dict('records')
text_tokens, token_labels, token_claim_offsets = get_token_labels(train_df)

--------------------------------------------------------
start:  508
end:  724
--------------------------------------------------------
start:  1401
end:  1654
--------------------------------------------------------
start:  302
end:  419
--------------------------------------------------------
start:  1442
end:  1617
--------------------------------------------------------
--------------------------------------------------------
--------------------------------------------------------
start:  1568
end:  1597
--------------------------------------------------------
--------------------------------------------------------
start:  72
end:  122
--------------------------------------------------------
start:  287
end:  517
--------------------------------------------------------
--------------------------------------------------------
start:  438
end:  539
--------------------------------------------------------
start:  70
end:  174
--------------------------------------------------------


end:  298
--------------------------------------------------------
start:  119
end:  190
--------------------------------------------------------
start:  112
end:  149
--------------------------------------------------------
--------------------------------------------------------
start:  230
end special:  336
--------------------------------------------------------
start:  0
--------------------------------------------------------
start:  428
end:  534
--------------------------------------------------------
start:  192
end special:  303
--------------------------------------------------------
start:  5127
end:  5234
--------------------------------------------------------
--------------------------------------------------------
start:  1406
end:  1499
--------------------------------------------------------
start:  124
end special:  198
--------------------------------------------------------
start:  732
end:  834
--------------------------------------------------------
start:  951
e

In [13]:
# Attach the token-level outputs of get_token_labels (one list per row).
df_train['tokens'] = text_tokens
df_train['labels'] = token_labels
df_train['token_claim_offsets'] = token_claim_offsets

In [14]:
# Sanity check: print the marker set of every row that has exactly two distinct
# marker values (i.e. a row that lacks one of the three possible values).
for i in df_train.token_claim_offsets:
    try:
        if len( set(i) ) == 2:
            print(set(i))
    except TypeError:
        # Removed posts are stored as [['N.A.']]; the inner list is unhashable,
        # so set() raises TypeError. Catch only that, not every exception.
        print(i)

[['N.A.']]
[['N.A.']]
[['N.A.']]
[['N.A.']]
{'N.A.', 'claim_starts'}
[['N.A.']]
{'N.A.', 'claim_starts'}
[['N.A.']]
{'N.A.', 'claim_starts'}
{'N.A.', 'claim_starts'}
[['N.A.']]
[['N.A.']]
{'N.A.', 'claim_starts'}
[['N.A.']]
[['N.A.']]
{'N.A.', 'claim_starts'}
[['N.A.']]
[['N.A.']]
[['N.A.']]
[['N.A.']]
{'N.A.', 'claim_starts'}
[['N.A.']]
{'N.A.', 'claim_starts'}
{'N.A.', 'claim_starts'}
[['N.A.']]
[['N.A.']]
[['N.A.']]
{'N.A.', 'claim_starts'}
[['N.A.']]
{'N.A.', 'claim_starts'}
{'N.A.', 'claim_starts'}
{'N.A.', 'claim_ends'}
[['N.A.']]
{'N.A.', 'claim_starts'}
[['N.A.']]
{'N.A.', 'claim_starts'}
[['N.A.']]
[['N.A.']]
[['N.A.']]
{'N.A.', 'claim_starts'}
[['N.A.']]
{'N.A.', 'claim_starts'}
[['N.A.']]
{'N.A.', 'claim_starts'}
[['N.A.']]
[['N.A.']]
{'N.A.', 'claim_starts'}
[['N.A.']]
[['N.A.']]
[['N.A.']]
{'N.A.', 'claim_starts'}
[['N.A.']]
[['N.A.']]
[['N.A.']]
[['N.A.']]
[['N.A.']]
[['N.A.']]
{'N.A.', 'claim_starts'}
[['N.A.']]
[['N.A.']]
[['N.A.']]
[['N.A.']]
[['N.A.']]
[['N.A.']]
[['N

In [15]:
# dump the dataframe to a csv file
# The write itself is commented out, so running this cell only defines the
# destination path.
# NOTE(review): absolute, machine-specific path.

write_parsed = '/mnt/nas2/data/systematicReview/semeval2023/data/parsed/st2_train_parsed.tsv'
#df_train.to_csv(write_parsed, encoding='utf-8', sep='\t')

## Tokenize and prepare test_df

In [16]:
def getClaimIndex(full_text, claim, deleted_markers=None):
    """Return the character span of ``claim`` inside ``full_text``.

    Parameters
    ----------
    full_text : str
        Body of the post.
    claim : str
        Claim text expected to occur verbatim in ``full_text``.
    deleted_markers : iterable of str, optional
        Substrings that mark a removed post. Defaults to the module-level
        ``deleted_list``, the same test ``get_char_labels`` uses.

    Returns
    -------
    tuple
        ``(claim_start, claim_end)`` with an exclusive end, or
        ``('N.A.', 'N.A.')`` when the post is removed, when either argument is
        not a string, or when the claim does not occur in the text.
    """
    markers = deleted_list if deleted_markers is None else deleted_markers
    not_available = ('N.A.', 'N.A.')

    # Missing values (e.g. NaN from an empty CSV cell) cannot be searched.
    if not isinstance(full_text, str) or not isinstance(claim, str):
        return not_available

    # Match the actual deletion placeholders. Testing for the bare words
    # 'removed' / 'deleted' also discarded ordinary posts that use those words.
    if any(marker in full_text for marker in markers):
        return not_available

    claim_start = full_text.find(claim)
    if claim_start == -1:
        # Claim is not a verbatim substring; report it as unavailable rather
        # than aborting the whole apply() with a ValueError.
        return not_available

    return (claim_start, claim_start + len(claim))

# Row-wise: unpack (text, claim) into getClaimIndex and store the resulting
# (start, end) tuple. Runs on the text as loaded, before preprocess_text.
df_test['claim_offsets'] = df_test[['text', 'claim']].apply(lambda x: getClaimIndex(*x), axis=1)

In [17]:
# Display the deletion placeholders for reference.
deleted_list

['[deleted]', '[removed]', 'deleted by user']

In [18]:
# Matches any of the three deletion placeholders from `deleted_list`.
# Note that 'deleted by user' is matched without its surrounding brackets.
_RE_DELETED_REPLACE = re.compile(r"(\[deleted\]|\[removed\]|deleted by user)")

In [19]:
# Matches any run of whitespace; used to collapse it to a single space.
_RE_COMBINE_WHITESPACE = re.compile(r"\s+")

In [20]:
def preprocess_text(text):
    """Normalise whitespace and strip deletion placeholders from a post body.

    Newlines and every run of whitespace are collapsed to a single space and
    the result is trimmed. The placeholders matched by ``_RE_DELETED_REPLACE``
    are removed only when at least one space-separated token of the normalised
    text is exactly equal to an entry of ``deleted_list``; otherwise the
    normalised text is returned as is.
    """
    collapsed = _RE_COMBINE_WHITESPACE.sub(" ", text.replace("\n", " ")).strip()

    # Whole-token test: '[deleted]' and '[removed]' can trigger it; the
    # multi-word entry 'deleted by user' can never equal a single token.
    has_placeholder_token = any(piece in deleted_list for piece in collapsed.split(' '))
    if not has_placeholder_token:
        return collapsed

    return _RE_DELETED_REPLACE.sub("", collapsed).strip()

# Overwrites the raw text in place. From here on, df_test['text'] is the
# whitespace-normalised version, so re-running this cell is not a no-op check
# against the originally loaded data.
df_test['text'] = df_test.text.progress_apply(preprocess_text)

  0%|          | 0/150 [00:00<?, ?it/s]

In [21]:
def _token_attributes(value, extract):
    """Run the shared ``nlp`` pipeline on ``value`` and map ``extract`` over its tokens.

    Returns ``['N.A.']`` when ``value`` is not a string or has two characters
    or fewer; in that case the pipeline is not invoked.
    """
    if isinstance(value, str) and len(value) > 2:
        return [extract(token) for token in nlp(value)]
    return ['N.A.']


def getPOStags(value):
    """Return ``token.pos_`` for every token of ``value`` (``['N.A.']`` if unusable)."""
    return _token_attributes(value, lambda token: token.pos_)


def getPOSfinetags(value):
    """Return ``token.tag_`` for every token of ``value`` (``['N.A.']`` if unusable)."""
    return _token_attributes(value, lambda token: token.tag_)


def getLemma(value):
    """Return ``token.lemma_`` for every token of ``value`` (``['N.A.']`` if unusable)."""
    return _token_attributes(value, lambda token: token.lemma_)


def getTokens(value):
    """Return ``token.text`` for every token of ``value`` (``['N.A.']`` if unusable)."""
    return _token_attributes(value, lambda token: token.text)


def getPseudolabels(tokens):
    """Return a list of zeros with one entry per item of ``tokens``."""
    return [0] * len(tokens)


def getOffsets(value):
    """Return ``token.idx`` (start character offset) for every token of ``value``.

    Returns ``['N.A.']`` if ``value`` is not a string or is too short.
    """
    return _token_attributes(value, lambda token: token.idx)

In [22]:
# Token-level features for the test texts. Each helper calls the `nlp`
# pipeline itself, so every text is parsed once per column built here.
df_test['pos'] = df_test.text.progress_apply(getPOStags)
df_test['pos_fine'] = df_test.text.progress_apply(getPOSfinetags)
df_test['tokens'] = df_test.text.progress_apply(getTokens)
df_test['lemma'] = df_test.text.progress_apply(getLemma)
# The test split has no annotations: one 0 label per token.
df_test['labels'] = df_test.tokens.progress_apply(getPseudolabels)

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

In [23]:
# Start character offset of every token, measured in the preprocessed text.
df_test['char_offsets'] = df_test.text.progress_apply(getOffsets)

  0%|          | 0/150 [00:00<?, ?it/s]

In [24]:
# Shape before dropping the removed posts.
df_test.shape

(150, 11)

In [25]:
# Drop rows whose text is exactly '[]'. That is what preprocess_text leaves
# behind for a body such as '[deleted by user]\n[removed]': both placeholders
# are stripped and only the brackets around 'deleted by user' remain.
df_test = df_test.drop( df_test[ df_test.text == '[]' ].index, inplace=False ) 

In [26]:
# Shape after dropping the removed posts.
df_test.shape

(138, 11)

In [35]:
# convert char offsets to token_claim_offsets

def gettoken_claim_offsets(char_offsets, claim_offsets):
    """Mark which tokens sit at the start and at the end of the claim.

    Parameters
    ----------
    char_offsets : list
        Start character offset of each token (as produced by ``getOffsets``);
        may be the placeholder list ``['N.A.']``.
    claim_offsets : tuple
        ``(start, end)`` character offsets of the claim, or
        ``('N.A.', 'N.A.')`` when no claim span is available.

    Returns
    -------
    list of str
        One marker per entry of ``char_offsets``: ``'claim_starts'`` when the
        token offset is within one character of ``start``, otherwise
        ``'claim_ends'`` when it is within one character of ``end``, otherwise
        ``'N.A.'``. When the claim span is unavailable, the single-element list
        ``['N.A.']`` is returned regardless of the number of tokens.
    """
    start = claim_offsets[0]
    end = claim_offsets[1]

    if start == 'N.A.' or end == 'N.A.':
        return ['N.A.']

    token_claim_offsets = []

    for offset in char_offsets:
        try:
            position = int(offset)
        except (TypeError, ValueError):
            # Placeholder entries such as 'N.A.' cannot be compared numerically;
            # previously int('N.A.') aborted the whole apply() with ValueError.
            token_claim_offsets.append('N.A.')
            continue

        # One-character tolerance on either side; the start test takes precedence.
        if abs(position - start) <= 1:
            token_claim_offsets.append('claim_starts')
        elif abs(position - end) <= 1:
            token_claim_offsets.append('claim_ends')
        else:
            token_claim_offsets.append('N.A.')

    return token_claim_offsets


# Row-wise: unpack (char_offsets, claim_offsets) into gettoken_claim_offsets.
# NOTE(review): claim_offsets were computed on the text as loaded (before
# preprocess_text collapsed whitespace), whereas char_offsets were measured on
# the preprocessed text — confirm the two offset systems actually line up.
df_test['token_claim_offsets'] = df_test[['char_offsets', 'claim_offsets']].apply(lambda x: gettoken_claim_offsets(*x), axis=1)

In [36]:
# Print the distinct claim markers present in each test row.
# NOTE(review): nothing visible in this notebook emits the value 'not there',
# so this filter appears to pass for every row — confirm whether it is a
# leftover from an earlier version.
for i in df_test.token_claim_offsets:
    if 'not there' not in i:
        print(set(i))

{'N.A.', 'claim_starts', 'claim_ends'}
{'N.A.', 'claim_ends'}
{'N.A.', 'claim_starts', 'claim_ends'}
{'N.A.', 'claim_starts', 'claim_ends'}
{'N.A.', 'claim_starts', 'claim_ends'}
{'N.A.', 'claim_starts', 'claim_ends'}
{'N.A.', 'claim_starts', 'claim_ends'}
{'N.A.', 'claim_ends'}
{'N.A.', 'claim_starts', 'claim_ends'}
{'N.A.', 'claim_starts', 'claim_ends'}
{'N.A.'}
{'N.A.', 'claim_starts', 'claim_ends'}
{'N.A.', 'claim_ends'}
{'N.A.', 'claim_starts', 'claim_ends'}
{'N.A.', 'claim_starts', 'claim_ends'}
{'N.A.'}
{'N.A.', 'claim_starts', 'claim_ends'}
{'N.A.', 'claim_starts', 'claim_ends'}
{'N.A.', 'claim_starts', 'claim_ends'}
{'N.A.', 'claim_starts', 'claim_ends'}
{'N.A.', 'claim_starts', 'claim_ends'}
{'N.A.', 'claim_starts'}
{'N.A.', 'claim_starts', 'claim_ends'}
{'N.A.', 'claim_starts', 'claim_ends'}
{'N.A.', 'claim_starts', 'claim_ends'}
{'N.A.', 'claim_starts', 'claim_ends'}
{'N.A.', 'claim_starts', 'claim_ends'}
{'N.A.'}
{'N.A.', 'claim_starts', 'claim_ends'}
{'N.A.', 'claim_start

In [58]:
from unidecode import unidecode 

def remove_nonascii(t):
    """Transliterate text to ASCII with ``unidecode``.

    Parameters
    ----------
    t : str, list, or any other value
        A string is transliterated directly; for a list, every string element
        is transliterated and other elements are kept as they are.

    Returns
    -------
    The transliterated string or list. Any other input (e.g. a NaN from a
    missing cell) is returned unchanged instead of being turned into ``None``.
    """
    if isinstance(t, str):
        return unidecode(t)

    if isinstance(t, list):
        return [ unidecode(t_i) if isinstance(t_i, str) else t_i for t_i in t ]

    return t
    
# Transliterate the test text and tokens to ASCII.
# NOTE(review): this runs after char_offsets / token_claim_offsets were
# computed; if unidecode changes the length of any string, those offsets no
# longer refer to the same characters — confirm.
df_test['text'] = df_test.text.progress_apply(remove_nonascii)
df_test['tokens'] = df_test.tokens.progress_apply(remove_nonascii)

  0%|          | 0/138 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

In [65]:
# Destination of the preprocessed test split. The write itself is commented
# out, so running this cell only defines the path components.
# NOTE(review): absolute, machine-specific directory.
write_dir = '/mnt/nas2/data/systematicReview/semeval2023/data/preprocessed'
write_test = 'st2_test_preprocessed.tsv'

#df_test.to_csv(f"{write_dir}/{write_test}", sep = "\t")

In [61]:
# Every character of every test text, lower-cased; used to inspect which
# characters remain after the ASCII transliteration.
letters = [ char.lower() for text in df_test.text for char in text ]

In [64]:
# Distinct characters occurring in the test texts.
set(letters)

{' ',
 '!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '[',
 '\\',
 ']',
 '_',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '~'}

In [45]:
# Length in characters of every test text.
text_lengths = [ len(text) for text in df_test.text ]

In [47]:
# Longest test text, in characters.
max(text_lengths)

3365