In [None]:
import pandas as pd
import numpy as np
import sys

In [None]:
# Mount Google Drive inside the Colab runtime so the project files are reachable.
from google.colab import drive
drive.mount('/content/drive')
# Work from the project's Script folder; the relative './Data/...' paths used by the
# following cells resolve against this directory.
%cd '/content/drive/Shared drives/CSCI 544 Project/Script'

Mounted at /content/drive
/content/drive/Shared drives/CSCI 544 Project/Script


# Read Data

In [None]:
import json

# Load the raw limerick dataset (a JSON document) into `data`.
# The encoding is passed explicitly: JSON text is UTF-8, whereas open() without an
# `encoding` argument falls back to the locale's preferred encoding, which is not
# guaranteed to be UTF-8 on every runtime.
with open('./Data/limerick_dataset_oedilf_v3.json', encoding='utf-8') as f:
  data = json.load(f)

In [None]:
# Collect the value of every entry's "limerick" field into one flat list.
limerick_data = list(map(lambda entry: entry["limerick"], data))

## Line Based

In [None]:
# Flatten the limericks into their individual lines (each limerick is split on "\n").
line_data = [line for lim in limerick_data for line in lim.split("\n")]

In [None]:
# Write every limerick line on its own row of the line-based input file.
with open("./Data/line_based_input.txt","w") as file:
  file.writelines(line + "\n" for line in line_data)

## Limerick Based

In [None]:
# Keep only well-formed limericks, lower-cased and split into their lines:
#   * the raw text must be longer than 20 characters, and
#   * it must consist of exactly 5 lines.
lim_data = []
for lim in limerick_data:
  if len(lim) <= 20:
    continue  # too short to be a real limerick
  lim_lines = lim.lower().split("\n")
  if len(lim_lines) != 5:
    continue  # not exactly five lines
  lim_data.append(lim_lines)

NameError: ignored

In [None]:
# Number of limericks currently held in lim_data.
len(lim_data)

98228

## Read Cleaned

In [None]:
# Re-load lim_data from the cleaned file: one limerick per row, with the limerick's
# lines separated by tabs.
lim_data = []
with open('./Data/limerick_clean_v2.txt') as f:
  lim_data.extend(row.strip().split("\t") for row in f)

In [None]:
# Peek at the first cleaned limerick (a list of its lines).
lim_data[0]

["`` i 'm really hon 'rably intentioned , `` ",
 'he said to a lass well dimensioned ,',
 "but she said , `` no , i 'll leave",
 'for i do not believe',
 'the intentions that were aforementioned . ``']

# Process

## Split

In [None]:
# End indices of the train / validation / test slices of lim_data: the first 60% is
# train, the next 20% validation and the remainder test. The data is sliced in its
# existing order (no shuffling happens here).
n_limericks = len(lim_data)
train_idx = int(n_limericks*0.6)
val_idx = int(n_limericks*0.8)
test_idx = n_limericks

In [None]:
# Cut lim_data into three contiguous, non-overlapping splits.
train_corp = lim_data[:train_idx]
val_corp = lim_data[train_idx:val_idx]
test_corp = lim_data[val_idx:test_idx]

In [None]:
# Write each split to ./Data/lim_cleaned_v3/<split>.txt, one limerick per row.
# Every line of a limerick is *terminated* (not just separated) by "[SEP]"; the
# sbertscore/order generation step depends on that trailing marker.
for prefix, corp in (("train", train_corp), ("val", val_corp), ("test", test_corp)):
  with open(f"./Data/lim_cleaned_v3/{prefix}.txt","w") as file:
    file.writelines("[SEP]".join(lim) + "[SEP]" + "\n" for lim in corp)

## Source & Target Extract

In [None]:
!pip install transformers
!pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m47.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m96.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.3 transformers-4.27.4
Looking in indexes: https://pypi.org/simple, https://us

In [None]:
import sys
import re
import nltk.data
import copy
import numpy as np
np.random.seed(520)
import scipy
from transformers import AutoTokenizer, AutoModel
import torch

In [None]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings along dimension 1, ignoring masked-out tokens.

    Args:
        model_output: model result whose first element holds the token embeddings.
        attention_mask: mask tensor; it is expanded with a trailing axis to the size
            of the token embeddings, so positions with 0 contribute nothing.

    Returns:
        The mask-weighted sum of the token embeddings divided by the mask sum. The
        divisor is clamped to at least 1e-9 so a fully masked row does not divide by zero.
    """
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_total = (token_embeddings * mask).sum(1)
    mask_total = mask.sum(1).clamp(min=1e-9)
    return masked_total / mask_total

In [None]:
def get_sbertscore(tmp_sen_list):
    """Score every ordered pair of sentences by cosine similarity of their embeddings.

    Uses the notebook-level `tokenizer`, `model` and `device`. Sentences are tokenized
    together (padded, truncated to 128 tokens), embedded without gradient tracking and
    mean-pooled with `mean_pooling`.

    Args:
        tmp_sen_list: list of sentences to compare with each other.

    Returns:
        A list of strings "i j score" for every pair (i, j) in row-major order, where
        score is 1 minus the cosine distance between sentence i and sentence j,
        formatted with 8 decimals.
    """
    batch = tokenizer(tmp_sen_list, padding=True, truncation=True, max_length=128, return_tensors='pt').to(device)
    # Inference only: no gradients are needed for the embeddings.
    with torch.no_grad():
        outputs = model(**batch)
        pooled = mean_pooling(outputs, batch['attention_mask'])
        embeddings = pooled.cpu().numpy()
    # cdist returns cosine *distance*; 1 - distance turns it into a similarity.
    similarities = 1 - scipy.spatial.distance.cdist(embeddings, embeddings, "cosine")

    count = len(tmp_sen_list)
    return [
        "%d %d %.8f" % (row, col, similarities[row][col])
        for row in range(count)
        for col in range(count)
    ]

In [None]:
# Run on the first GPU when one is available; otherwise fall back to the CPU so the
# notebook still works on a CPU-only runtime instead of failing in `.to(device)`.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Name of the dataset folder (under tgt_dir) that holds the train/val/test text files.
data_name='lim_cleaned_v3'
tgt_dir='/content/drive/Shareddrives/CSCI 544 Project/Script/Data'
# Sentence-embedding checkpoint used to score sentence similarity.
model_type='sentence-transformers/all-mpnet-base-v2'
#Load AutoModel from huggingface model repository
tokenizer = AutoTokenizer.from_pretrained(model_type)
model = AutoModel.from_pretrained(model_type).to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt: 0.00B [00:00, ?B/s]

Downloading (…)/main/tokenizer.json: 0.00B [00:00, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [None]:
# Output folders for the two downstream input formats; both live under the
# lim_cleaned_v3 data folder.
_lim_cleaned_root = '/content/drive/Shareddrives/CSCI 544 Project/Script/Data/lim_cleaned_v3'
etrica_folder = _lim_cleaned_root + '/Etrica_Input'
HINT_folder = _lim_cleaned_root + '/HINT_Input'

In [None]:
# Build the HINT input files for every split.
#
# For each limerick the first line is the source and the remaining lines are the
# target. Every limerick produces two consecutive examples:
#   * the original lines, with order label "0,<true order>", and
#   * one perturbed copy, chosen at random:
#       1 = lines shuffled (label carries the permutation),
#       2 = one line overwritten by an earlier line (a repetition),
#       3 = one line replaced by a line taken from a randomly chosen row of the split.
# Four files are written per split: <name>.source, <name>.target,
# <name>_order.target and <name>_sbertscore.target.
for name in ["train","val","test"]: #
    sources = []
    targets = []
    ord_list = []
    score_list = []
    with open("%s/%s/%s.txt"%(tgt_dir, data_name, name)) as fin:
        # Each row ends with "[SEP]", so the last split field is empty and is dropped.
        data = [line.strip().split("[SEP]")[:-1] for line in fin]
    print(np.shape(data))
    # A 2-D shape means every row has the same number of lines.
    assert len(list(np.shape(data))) == 2
    for i, d in enumerate(data):
        if i % 100 == 0:
            print("processing %d lines"%i)
        ipt = d[0]
        sen_list = d[1:]

        # --- original example -------------------------------------------------
        true_order = " ".join([str(iii) for iii in range(len(sen_list))])
        score = get_sbertscore(sen_list)

        sources.append(ipt)
        targets.append(sen_list)
        ord_list.append("0,%s\n"%true_order)
        score_list.append(score)

        # --- perturbed example ------------------------------------------------
        random_num = np.random.random()
        sen_list_changed = sen_list.copy()
        if random_num < 0.33:
            # Shuffle: reorder all target lines with a random permutation.
            random_idx = np.random.permutation(range(len(sen_list)))
            sen_list_changed = np.take(sen_list, random_idx).tolist()
            ord_list.append("1,%s\n"%(" ".join(list(map(str, random_idx)))))
        elif random_num < 0.66:
            # Repeat: copy the earlier of two random lines over the later one.
            random_idx = sorted(np.random.choice(range(len(sen_list)), 2, replace=False))
            sen_list_changed[random_idx[1]] = sen_list[random_idx[0]]
            ord_list.append("2,%s\n"%true_order)
        else:
            # Substitute: replace one random line with a line from a random row.
            # NOTE(review): `data[...]` rows still include the source line at index 0
            # while random_idx indexes sen_list (= row[1:]), so the replacement comes
            # from position random_idx of the full row, and the row may be the current
            # limerick itself — confirm this is intended.
            random_idx = np.random.choice(range(len(sen_list)), 1)[0]
            other_data_idx = np.random.choice(range(len(data)), 1)[0]
            sen_list_changed[random_idx] = data[other_data_idx][random_idx]
            ord_list.append("3,%s\n"%true_order)

        # Score the *perturbed* lines, so the similarity matrix stored for this
        # example matches the target that is written for it. (Scoring sen_list again
        # would just duplicate the original example's scores.)
        score = get_sbertscore(sen_list_changed)

        sources.append(ipt)
        targets.append(sen_list_changed)
        score_list.append(score)
    with open("%s/%s.source"%(HINT_folder, name), "w") as fout1:
        for line in sources:
            fout1.write(line + "\n")
    with open("%s/%s.target"%(HINT_folder, name), "w") as fout2:
        for line in targets:
            # Every target line is followed by the "<mask><s>" marker.
            fout2.write("".join(["%s<mask><s>"%s for s in line]) + "\n")
    with open("%s/%s_order.target"%(HINT_folder, name), "w") as fout3:
        for line in ord_list:
            fout3.write(line)
    with open("%s/%s_sbertscore.target"%(HINT_folder, name), "w") as fout4:
        for s in score_list:
            fout4.write(",".join(s)+"\n")

(48756, 5)
processing 0 lines
processing 100 lines
processing 200 lines
processing 300 lines
processing 400 lines
processing 500 lines
processing 600 lines
processing 700 lines
processing 800 lines
processing 900 lines
processing 1000 lines
processing 1100 lines
processing 1200 lines
processing 1300 lines
processing 1400 lines
processing 1500 lines
processing 1600 lines
processing 1700 lines
processing 1800 lines
processing 1900 lines
processing 2000 lines
processing 2100 lines
processing 2200 lines
processing 2300 lines
processing 2400 lines
processing 2500 lines
processing 2600 lines
processing 2700 lines
processing 2800 lines
processing 2900 lines
processing 3000 lines
processing 3100 lines
processing 3200 lines
processing 3300 lines
processing 3400 lines
processing 3500 lines
processing 3600 lines
processing 3700 lines
processing 3800 lines
processing 3900 lines
processing 4000 lines
processing 4100 lines
processing 4200 lines
processing 4300 lines
processing 4400 lines
processing 

## De-double the Source and Target files (keep every other line)

In [None]:
import os

In [None]:
# Copy every file from HINT_folder into etrica_folder, keeping only the 1st, 3rd,
# 5th, ... line of each file and dropping the lines in between.
for filename in os.listdir(HINT_folder):
    f = os.path.join(HINT_folder, filename)

    with open(f) as file:
        content = file.readlines()
    with open(os.path.join(etrica_folder, filename),'w') as file:
        file.writelines(content[::2])
        

In [None]:
# Put the EtriCA event-trigger preprocessing folder first on the import path so the
# modules inside it can be imported by the following cells.
extractor_path = "EtriCA-storygeneration/preprocessing/event_trigger/"
sys.path.insert(0, extractor_path)

In [None]:
from hint_roc_stories_helper import *
from pathlib import Path

In [None]:
# Path objects for the HINT input folder and the EtriCA output folder.
input_dir = Path(HINT_folder)
output_dir = Path(etrica_folder)

In [None]:
# For every split, run the helper cleaners on the HINT .source/.target files and write
# the results as <split>.source.txt / <split>.target.txt in the EtriCA folder, then
# build two corpus files from the EtriCA folder: corpus.txt (train + val) and
# all_data.txt (train + val + test).
# NOTE(review): src_clean, target_clean and write_to_corpus come from the star import
# of hint_roc_stories_helper; their exact behavior is not visible in this notebook.
# NOTE(review): these calls read from HINT_folder, not from the every-other-line copies
# written into etrica_folder by the previous step — confirm which input is intended.
for prefix in ["train", "val", "test"]:
        src_clean(f"{HINT_folder}/{prefix}.source", f"{etrica_folder}/{prefix}.source.txt")
        target_clean(f"{HINT_folder}/{prefix}.target", f"{etrica_folder}/{prefix}.target.txt")
write_to_corpus(output_dir, output_dir, corpus_file_name="corpus.txt", splits=["train", "val"])
write_to_corpus(output_dir, output_dir, corpus_file_name="all_data.txt", splits=["train", "val", "test"])

data to /content/drive/Shareddrives/CSCI 544 Project/Script/Data/lim_cleaned_v3/Etrica_Input/corpus.txt
data to /content/drive/Shareddrives/CSCI 544 Project/Script/Data/lim_cleaned_v3/Etrica_Input/all_data.txt


## Etrica input event extract

In [None]:
import locale
# Override locale.getpreferredencoding so that it always reports "UTF-8" for the rest
# of the session.
# NOTE(review): presumably a workaround so the `!` shell command in the next cell runs
# on Colab — confirm it is still needed.
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
# Download and install spaCy's en_core_web_lg model into the runtime.
# NOTE(review): presumably required by the event annotator imported below — confirm.
!python3 -m spacy download en_core_web_lg

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-lg==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [None]:
from event_trigger.event_annotator import *

In [None]:
# Annotate events for every split of each dataset, starting one process per split
# file and then waiting for all of them.
# NOTE(review): `EventAnnotator` and `Process` come from the star import of
# event_trigger.event_annotator; `Process` is presumably multiprocessing.Process — confirm.
dataset_list = ["lim_cleaned_v3/Etrica_Input"]
process_list = []
for dataset_name in dataset_list:
    # Data, cache and output all live under Data/<dataset_name>.
    dataset_dir = f"Data/{dataset_name}"
    event_annotator = EventAnnotator(name=dataset_name,
                                     cache_dir=f"{dataset_dir}/cache",
                                     data_dir=dataset_dir,
                                     output_dir=dataset_dir)
    for prefix in ["train", "val", "test"]:
        # Reads <split>.target.txt and writes <split>_event.source.txt.
        ps = Process(target=event_annotator.annotate_file,
                     args=(f"{prefix}.target.txt", f"{prefix}_event.source.txt"))
        ps.start()
        process_list.append(ps)

# Block until every annotation process has finished.
for ps in process_list:
    ps.join()

annotating file: train.target.txt, total: 48756, already finished: 0, rest: 48756annotating file: val.target.txt, total: 16252, already finished: 0, rest: 16252



annotating file val.target.txt, and output to val_event.source.txt:   0%|          | 0/16252 [00:00<?, ?it/s]

annotating file: test.target.txt, total: 16253, already finished: 0, rest: 16253


annotating file val.target.txt, and output to val_event.source.txt: 100%|██████████| 16252/16252 [27:44<00:00,  9.77it/s]
annotating file test.target.txt, and output to test_event.source.txt: 100%|██████████| 16253/16253 [28:32<00:00,  9.49it/s]
annotating file train.target.txt, and output to train_event.source.txt: 100%|██████████| 48756/48756 [46:11<00:00, 17.59it/s]
