# Exploratory Data Analysis

In [1]:
# imports

import pandas as pd
import ast
import collections
import re
import spacy
import scispacy
import time
import numpy as np
import matplotlib.pyplot as plot
from collections import Counter

In [2]:
# Load tokenizers

# Load the scispaCy 'en_core_sci_sm' pipeline through spaCy.
# NOTE(review): `nlp` is not referenced by any of the cells shown here — confirm it is still needed.
nlp = spacy.load('en_core_sci_sm')

In [4]:
# Read the train and test CSVs.
# Each split is kept twice: as a DataFrame (df_*) and as a list of per-row
# dicts (*_df) that the row-wise parsing functions below iterate over.

file_path_train = '/mnt/nas2/data/systematicReview/semeval2023/data/st2_train_inc_text.csv'
file_path_test = '/mnt/nas2/data/systematicReview/semeval2023/data/st2_test_inc_text.csv'

df_train, df_test = (
    pd.read_csv(csv_path, sep=',') for csv_path in (file_path_train, file_path_test)
)
train_df, test_df = (
    frame.to_dict('records') for frame in (df_train, df_test)
)

In [5]:
# Number of records in each split: train on the first line, test on the second
print(len(train_df), len(test_df), sep='\n')

597
150


In [6]:
# Preview the first rows of the training frame
df_train.head()

Unnamed: 0,post_id,subreddit_id,claim,stage2_labels,text
0,sn9u41,t5_2s23e,I read an old thread on here that someone said...,"[{""crowd-entity-annotation"":{""entities"":[{""end...",Tysabri experiences\nHi all\n\nI just had my 3...
1,p7j49y,t5_2syer,"I have read that gout can't be cured, that it'...","[{""crowd-entity-annotation"":{""entities"":[{""end...",Crazy amount of sardines caused gout (possibly...
2,smgy0q,t5_2s3g1,I always read stories of people who suffer fro...,"[{""crowd-entity-annotation"":{""entities"":[{""end...",Im sorry for intruding but I just want to say ...
3,sxglhl,t5_2s3g1,Our results indicate that the addition of prob...,"[{""crowd-entity-annotation"":{""entities"":[{""end...",Is Bacillus coagulans supplementation plus low...
4,rxyk1d,t5_2s1h9,Ive read that amnesia always followed a tonic ...,"[{""crowd-entity-annotation"":{""entities"":[{""end...",[deleted by user]\n[removed]


In [7]:
# Preview the first rows of the test frame
df_test.head()

Unnamed: 0,post_id,subreddit_id,claim,text
0,s8qa1c,t5_2tyg2,[Structural dissociation](https://did-research...,Can structural dissociation lead to psychosis?...
1,opt7bh,t5_2syer,I though that the gout was for old whiskey dri...,"First time gout flare\nHi guys,\n\nJust turned..."
2,rfj6tf,t5_2qlaa,I've read that a liquid alginate suspension (G...,Can I buy liquid alginate suspension (Gaviscon...
3,qyfer0,t5_2syer,This will reduce the effects of Lisinopril pre...,[deleted by user]\n[removed]
4,rr99la,t5_2qlaa,all the gerd/lpr stuff online says we should a...,I seem to reflux more and sleep worse when I d...


In [8]:
# Marker substrings: get_char_labels treats any post whose text contains one of
# these as deleted/removed and emits 'N.A.' for it instead of labels.
deleted_list = ['[deleted]', '[removed]', 'deleted by user']

In [9]:
# Integer id for each annotated entity type. 0 is used by get_char_labels for
# unlabelled characters, and where annotations overlap the larger id is kept.
picos_mapping = {'population': 1, 'intervention':2, 'outcome':3}

In [10]:
def get_char_labels(df, deleted_markers=None, label_mapping=None):
    """Build one integer label per character of each post from its span annotations.

    Parameters
    ----------
    df : iterable of dict
        Row records (e.g. ``DataFrame.to_dict('records')``). Each row must
        provide ``'claim'``, ``'text'`` and ``'stage2_labels'``; the latter is a
        string that ``ast.literal_eval`` can parse into
        ``[{'crowd-entity-annotation': {'entities': [...]}}]`` where every entity
        has ``'startOffset'``, ``'endOffset'`` and ``'label'``.
    deleted_markers : iterable of str, optional
        Substrings that mark a post as deleted/removed. Defaults to the
        notebook-level ``deleted_list``.
    label_mapping : dict, optional
        Maps an entity label name to its integer id. Defaults to the
        notebook-level ``picos_mapping``.

    Returns
    -------
    labels : list
        One entry per row: a list of ints with ``len(text)`` items (0 for
        unlabelled characters), or the string ``'N.A.'`` for deleted posts.
    claim_offsets : list
        One entry per row: ``(start, end)`` character offsets (end exclusive) of
        the first occurrence of the claim in the text, or ``'N.A.'``.

    Raises
    ------
    ValueError
        If the claim does not occur verbatim in a non-deleted post's text.
    KeyError
        If an in-range entity carries a label missing from ``label_mapping``.
    """
    # Fall back to the notebook-level configuration when not given explicitly.
    if deleted_markers is None:
        deleted_markers = deleted_list
    if label_mapping is None:
        label_mapping = picos_mapping

    labels = []
    claim_offsets = []

    for row in df:
        claim = row['claim']
        full_text = row['text']

        # NOTE(review): this is a plain substring test, so a live post that merely
        # quotes one of the markers is also treated as deleted — confirm intended.
        if any(marker in full_text for marker in deleted_markers):
            labels.append('N.A.')
            claim_offsets.append('N.A.')
            continue

        # The annotation column is a serialized list; the entities sit under its first item.
        annotations = ast.literal_eval(row['stage2_labels'])
        entities = annotations[0]['crowd-entity-annotation']['entities']

        text_length = len(full_text)
        label_each_char = [0] * text_length  # 0 = no entity on this character

        claim_start = full_text.index(claim)
        claim_end = claim_start + len(claim)

        for entity in entities:
            start = entity['startOffset']

            # Skip annotations that start outside the text (O(1) bounds check
            # instead of searching a materialized list of every index).
            if not 0 <= start < text_length:
                continue

            # Slicing clips an end offset that runs past the text, so `end`
            # never exceeds text_length and the indexing below stays in range.
            end = start + len(full_text[start:entity['endOffset']])
            new_label = label_mapping[entity['label']]

            for position in range(start, end):
                # On overlap the larger label id wins.
                if new_label > label_each_char[position]:
                    label_each_char[position] = new_label

        labels.append(label_each_char)
        claim_offsets.append((claim_start, claim_end))

    return labels, claim_offsets

In [11]:
# Get the character-level labels and claim offsets for the train records,
# then store both as new columns on df_train (one entry per row).
labels_train, claim_offsets = get_char_labels(train_df)
df_train['labels_char'] = labels_train
df_train['claim_offsets'] = claim_offsets

In [12]:
# Inspect the newly added 'labels_char' and 'claim_offsets' columns
df_train.head()

Unnamed: 0,post_id,subreddit_id,claim,stage2_labels,text,labels_char,claim_offsets
0,sn9u41,t5_2s23e,I read an old thread on here that someone said...,"[{""crowd-entity-annotation"":{""entities"":[{""end...",Tysabri experiences\nHi all\n\nI just had my 3...,"[2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, ...","(508, 724)"
1,p7j49y,t5_2syer,"I have read that gout can't be cured, that it'...","[{""crowd-entity-annotation"":{""entities"":[{""end...",Crazy amount of sardines caused gout (possibly...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","(1401, 1654)"
2,smgy0q,t5_2s3g1,I always read stories of people who suffer fro...,"[{""crowd-entity-annotation"":{""entities"":[{""end...",Im sorry for intruding but I just want to say ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","(302, 419)"
3,sxglhl,t5_2s3g1,Our results indicate that the addition of prob...,"[{""crowd-entity-annotation"":{""entities"":[{""end...",Is Bacillus coagulans supplementation plus low...,"[0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","(1442, 1617)"
4,rxyk1d,t5_2s1h9,Ive read that amnesia always followed a tonic ...,"[{""crowd-entity-annotation"":{""entities"":[{""end...",[deleted by user]\n[removed],N.A.,N.A.


In [13]:
def get_token_labels(df):
    """Turn per-character labels into whitespace tokens with one label each.

    Every run of non-whitespace characters in ``row['text']`` is a candidate
    token. If all of its characters share a label it is emitted as is;
    otherwise it is cut at each point where the character label changes, so
    every emitted piece carries exactly one label.

    Parameters
    ----------
    df : iterable of dict
        Row records providing ``'text'`` and ``'labels_char'``.
        ``'labels_char'`` is either a list with one label per character of the
        text, or the ``'N.A.'`` sentinel produced for deleted posts.

    Returns
    -------
    tokens_series : list
        One list of token strings per row; ``[['N.A.']]`` for sentinel rows.
    labels_series : list
        One list of labels per row, aligned with ``tokens_series``;
        ``[['N.A.']]`` for sentinel rows.

    Raises
    ------
    AssertionError
        If a non-sentinel row's text and label list differ in length.
    """
    tokens_series = []
    labels_series = []

    for row in df:
        full_text = row['text']
        char_labels = row['labels_char']

        tokens = []
        token_labels = []

        if 'N.A.' in char_labels:
            # Sentinel row. The extra list nesting is kept on purpose so the
            # output shape matches what earlier runs of this notebook produced.
            tokens.append(['N.A.'])
            token_labels.append(['N.A.'])
        else:
            assert len(full_text) == len(char_labels)

            for match in re.finditer(r'\S+', full_text):
                word = match.group(0)
                word_labels = char_labels[match.start():match.end()]

                # Emit one piece per maximal run of identical labels; a word
                # with a single label yields exactly one piece (the word itself).
                run_start = 0
                for position in range(1, len(word_labels) + 1):
                    run_ended = (
                        position == len(word_labels)
                        or word_labels[position] != word_labels[run_start]
                    )
                    if run_ended:
                        tokens.append(word[run_start:position])
                        token_labels.append(word_labels[run_start])
                        run_start = position

        tokens_series.append(tokens)
        labels_series.append(token_labels)

    return tokens_series, labels_series

In [14]:
# Get the token-level labels for the train dataframe.
# The records are re-exported first so each row dict carries the 'labels_char'
# column, which get_token_labels reads.
train_df = df_train.to_dict('records')
text_tokens, token_labels = get_token_labels(train_df)

In [15]:
# Store the token strings and their aligned per-token labels as new columns
df_train['tokens'] = text_tokens
df_train['labels'] = token_labels

In [16]:
# Inspect the newly added 'tokens' and 'labels' columns
df_train.head()

Unnamed: 0,post_id,subreddit_id,claim,stage2_labels,text,labels_char,claim_offsets,tokens,labels
0,sn9u41,t5_2s23e,I read an old thread on here that someone said...,"[{""crowd-entity-annotation"":{""entities"":[{""end...",Tysabri experiences\nHi all\n\nI just had my 3...,"[2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, ...","(508, 724)","[Tysabri, experiences, Hi, all, I, just, had, ...","[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,p7j49y,t5_2syer,"I have read that gout can't be cured, that it'...","[{""crowd-entity-annotation"":{""entities"":[{""end...",Crazy amount of sardines caused gout (possibly...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","(1401, 1654)","[Crazy, amount, of, sardines, caused, gout, (p...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,smgy0q,t5_2s3g1,I always read stories of people who suffer fro...,"[{""crowd-entity-annotation"":{""entities"":[{""end...",Im sorry for intruding but I just want to say ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","(302, 419)","[Im, sorry, for, intruding, but, I, just, want...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,sxglhl,t5_2s3g1,Our results indicate that the addition of prob...,"[{""crowd-entity-annotation"":{""entities"":[{""end...",Is Bacillus coagulans supplementation plus low...,"[0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","(1442, 1617)","[Is, Bacillus, coagulans, supplementation, plu...","[0, 2, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, ..."
4,rxyk1d,t5_2s1h9,Ive read that amnesia always followed a tonic ...,"[{""crowd-entity-annotation"":{""entities"":[{""end...",[deleted by user]\n[removed],N.A.,N.A.,[[N.A.]],[[N.A.]]


In [17]:
# Destination for the parsed training frame (tab-separated, UTF-8).
# The write itself is commented out, so running this cell only defines the path.

write_parsed = '/mnt/nas2/data/systematicReview/semeval2023/data/parsed/st2_train_parsed.tsv'
#df_train.to_csv(write_parsed, encoding='utf-8', sep='\t')

In [18]:
# Show the label-name -> integer id mapping for reference
picos_mapping

{'population': 1, 'intervention': 2, 'outcome': 3}