In [150]:
from sklearn.model_selection import train_test_split

In [100]:
import pandas as pd
import glob
import spacy 
from tqdm.auto import tqdm
import random
import numpy as np
# Load the spaCy pipeline once and keep it in `nlp` for the rest of the notebook.
SPACY_MODEL_NAME = "en_core_web_lg"
nlp = spacy.load(SPACY_MODEL_NAME)

In [13]:
# Paths of every annotated editorial file (one level of sub-directories below the corpus folder).
# glob returns matches in arbitrary, filesystem-dependent order; sorting makes the
# document order, and anything derived from it, stable across runs and machines.
texts = sorted(glob.glob('../experiments/corpus-webis-editorials-16/annotated-txt/split-by-portal-final/*/*'))

In [105]:
# Build one sentence-level DataFrame per annotated file.
# Every line of a file is read as tab-separated `label_idx`, `label`, `sent_frag`.
# The fragments are concatenated into one string, spaCy splits that string into
# sentences, and each sentence collects the labels of all fragments whose
# character span overlaps the sentence's span.
all_sent_dat_dfs = []
for text in tqdm(texts):
    # Context manager so the file handle is closed as soon as the file is read.
    with open(text) as fh:
        flines = fh.read().strip().split('\n')

    lines_df = pd.Series(flines).str.split('\t', expand=True).iloc[:, :3]
    lines_df.columns = ['label_idx', 'label', 'sent_frag']
    lines_df = (
        lines_df
             # A line without a text column yields a missing value here; treat it as an
             # empty fragment so the join below does not fail on a non-string.
             # The trailing space separates consecutive fragments in the joined text.
             .assign(sent_frag=lambda df: df['sent_frag'].fillna('') + ' ')
             .assign(str_len=lambda df: df['sent_frag'].str.len())
             # Fragment i occupies [start_idx, end_idx) in the joined text.
             .assign(end_idx=lambda df: df['str_len'].cumsum())
             .assign(start_idx=lambda df: df['end_idx'] - df['str_len'])
             [['label_idx', 'label', 'sent_frag', 'str_len', 'start_idx', 'end_idx']]
    )

    # Join everything into a single string.
    # Only trailing whitespace is removed: stripping leading whitespace would shift
    # every spaCy character offset relative to start_idx/end_idx.
    full_text = "".join(lines_df['sent_frag'].tolist()).rstrip()

    # Pull the fragment columns out once per document instead of re-iterating the
    # DataFrame rows for every sentence.
    frag_starts = lines_df['start_idx'].to_numpy()
    frag_ends = lines_df['end_idx'].to_numpy()
    frag_labels = lines_df['label'].to_numpy()

    doc = nlp(full_text)
    sentence_data = []
    for sent in doc.sents:
        sent_start = sent.start_char
        sent_end = sent.end_char
        # Half-open interval overlap test between the sentence and every fragment.
        overlaps = (frag_starts < sent_end) & (frag_ends > sent_start)
        overlapping_labels = frag_labels[overlaps].tolist()

        sentence_data.append({
            "sentence_text": sent.text,
            "sentence_char_span": (sent_start, sent_end),
            "overlapping_fragment_labels": overlapping_labels
        })

    sent_dat_df = pd.DataFrame(sentence_data)
    # The source file path doubles as the document identifier.
    sent_dat_df['doc_index'] = text
    all_sent_dat_dfs.append(sent_dat_df)

  0%|          | 0/300 [00:00<?, ?it/s]

In [107]:
# Stack the per-document frames into one table. The index of each input frame is
# kept as-is (ignore_index=False is the default, spelled out here on purpose).
full_sent_dat_df = pd.concat(all_sent_dat_dfs, ignore_index=False)

In [110]:
# One row per (sentence, overlapping label) pair, then count how often each label occurs.
fragment_labels = full_sent_dat_df['overlapping_fragment_labels'].explode()
fragment_labels.value_counts()

overlapping_fragment_labels
continued        25950
no-unit          16401
assumption        9811
par-sep           4664
anecdote          2611
testimony         1093
statistics         421
title              328
common-ground      243
other              171
Name: count, dtype: int64

In [None]:
# Collapse each sentence's list of overlapping fragment labels to a single label.
# Labels in `to_exclude` are dropped first; if several distinct labels remain, one
# is drawn at random; if none remain, the sentence gets NaN.
#
# NOTE: this cell overwrites the list column and drops `sentence_char_span`, so
# `full_sent_dat_df` must be rebuilt before running it a second time.
LABEL_CHOICE_SEED = 42
to_exclude = set(['continued', 'no-unit', 'par-sep'])
# Private, seeded generator so the draw is reproducible and independent of any
# other use of the global `random` state.
label_rng = random.Random(LABEL_CHOICE_SEED)


def _pick_label(labels):
    """Return one non-excluded label from `labels`, or NaN when none is left."""
    # dict.fromkeys de-duplicates while keeping first-occurrence order; a plain
    # set() would iterate in a hash-dependent order that can change between runs,
    # which would change the draw even with a fixed seed.
    candidates = list(dict.fromkeys(label for label in labels if label not in to_exclude))
    return label_rng.choice(candidates) if candidates else np.nan


full_sent_dat_df['overlapping_fragment_labels'] = (
    full_sent_dat_df['overlapping_fragment_labels'].apply(_pick_label)
)

full_sent_dat_df = (
    full_sent_dat_df
         .drop(columns=['sentence_char_span'])
         # Turn the existing index into a regular column named `sent_index`.
         .reset_index()
         .rename(columns={'index': 'sent_index'})
)

In [163]:
# Split at document level so all sentences of one document land in the same split.
# A fixed random_state makes the assignment reproducible across runs.
SPLIT_SEED = 42
doc_ids = full_sent_dat_df['doc_index'].drop_duplicates().tolist()

# 75% of the documents go to train; the remaining 25% are held out.
train_doc_ids, holdout_doc_ids = train_test_split(doc_ids, test_size=.25, random_state=SPLIT_SEED)
# The held-out documents are divided 80/20 into validation and test.
# NOTE(review): this makes val about 20% and test about 5% of all documents; confirm
# that a validation set larger than the test set is intended.
val_doc_ids, test_doc_ids = train_test_split(holdout_doc_ids, test_size=.2, random_state=SPLIT_SEED)

# Sets give constant-time membership checks when labelling rows later.
train_doc_ids, val_doc_ids, test_doc_ids = set(train_doc_ids), set(val_doc_ids), set(test_doc_ids)

In [169]:
def _split_name(doc_id):
    """Return 'train', 'val' or 'test' for one document id ('test' is the fallback)."""
    if doc_id in train_doc_ids:
        return 'train'
    if doc_id in val_doc_ids:
        return 'val'
    return 'test'


# Tag every sentence row with the split of the document it belongs to.
full_sent_dat_df = full_sent_dat_df.assign(
    split=full_sent_dat_df['doc_index'].map(_split_name)
)

In [170]:
# Persist the labelled sentence table (the DataFrame index is written as the first column).
output_csv_path = '../experiments/editorial-discourse.csv'
full_sent_dat_df.to_csv(output_csv_path)

In [173]:
# Group the sentence rows by the document they came from.
g = full_sent_dat_df.groupby('doc_index')

In [176]:
# Exhaust the groupby so that `doc_index` and `g_i` are left bound to the last group.
for doc_index, g_i in g:
    pass

In [178]:
# Rebuild the text of the group held in `g_i` by joining its sentences with single spaces.
last_doc_sentences = g_i['sentence_text']
last_doc_sentences.str.cat(sep=' ')

'Zero suicides is an admirable aim but it requires all-out change.   On Monday, Nick Clegg pledged to sign up the NHS to a national "zero suicide" campaign . This comes in a week when I have spent a lot of time talking to the media about depression in an attempt to get people to support Calm and the other charities that are doing incredible work to help people coping with mental illness . I have appeared on national radio, in the press and on television to talk about my story , but the more coverage I get , the more I ask : why me?   Let me make this clear . I am not famous , I am not trying to be famous . I am a normal lad who made mistakes, lived a lie, and eventually attempted to take my own life . I am just another statistic . Someone who suffered from an illness that took over my life and made me a person I didn\'t recognise . There is absolutely nothing special or unique about me . Yet I am getting calls from people asking me to share my story . The fact that I am newsworthy , si