# Finetune Model for Measuring Sentence Relatedness

In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import pandas as pd 

######
# TODO: Download dataset: https://github.com/Priya22/semantic-textual-relatedness/blob/master/sem_text_rel_ranked.csv
#       Store CSV file in data directory

# Load the sentence-relatedness data. Each row's 'Text' is split on newlines
# into the texts of one training example and 'Score' is used as its label.
df = pd.read_csv('data/sem_text_rel_ranked.csv')

# Build the examples exactly once, as a plain list: DataLoader indexes its
# dataset by position, which a list guarantees regardless of the frame's index.
train_examples = [
    InputExample(texts=text.split('\n'), label=float(score))
    for text, score in zip(df['Text'], df['Score'])
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

# Fine-tune bert-base-cased so that the cosine similarity of the two sentence
# embeddings regresses onto the relatedness label.
model = SentenceTransformer('bert-base-cased')
train_loss = losses.CosineSimilarityLoss(model)

model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)

model.save("models/sbert-relatedness")

# Score Relatedness of Endpoints in Training Data Set

In [None]:
# Set up ROCStories
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

def get_similarity(a, b):
    """Return the cosine similarity of the vectors *a* and *b*.

    Both arguments must support ``**`` element-wise (e.g. NumPy arrays).
    The result is ``dot(a, b) / (|a| * |b|)``; no guard is applied for
    zero-length vectors.
    """
    lengths = [np.sqrt(np.sum(vec**2)) for vec in (a, b)]
    return np.dot(a, b) / (lengths[0] * lengths[1])

# Load the relatedness model saved by the fine-tuning cell.
model = SentenceTransformer('models/sbert-relatedness')

#####
# TODO: Complete form to download dataset: https://cs.rochester.edu/nlp/rocstories/
#       Store all CSV files in data directory

df16 = pd.read_csv('data/ROCStories__spring2016 - ROCStories_spring2016.csv')
df17 = pd.read_csv('data/ROCStories_winter2017 - ROCStories_winter2017.csv')

# Stack both releases row-wise and keep the sentence columns for encoding.
df = pd.concat([df16, df17], axis=0)
endpoints = df.loc[:, 'sentence1':'sentence5']

start = endpoints["sentence1"].values.tolist()
stop = endpoints["sentence5"].values.tolist()

start_embeddings = model.encode(start)
stop_embeddings = model.encode(stop)

# One similarity per story: first-sentence embedding vs. last-sentence embedding.
scores = []
for first_vec, last_vec in zip(start_embeddings, stop_embeddings):
    scores.append(get_similarity(first_vec, last_vec))

df['score'] = scores
# Least related endpoints first; the index is renumbered after sorting.
df = df.sort_values(by='score', ascending=True).reset_index(drop=True)
# Written with the default index, so the CSV gains an unnamed leading column.
df.to_csv('data/roc_scored.csv')
df

In [None]:
# Size of sub-datasets for each score (higher score indicates endpoints are more related)
import pandas as pd

df = pd.read_csv('data/roc_scored.csv')

# Thresholds 1.0, 0.9, ..., 0.0; for each one report how many stories score at
# least that much.
score_min = [tenths / 10 for tenths in range(10, -1, -1)]
for m in score_min:
    at_least_m = df['score'] >= m
    story_count = at_least_m.sum()
    print("# stories scoring >= {}:  {}".format(m, story_count))

# Fine-tune GPT Baseline 3x

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import os
  
# Load the scored stories, keeping only the five sentences and the score.
df = pd.read_csv('data/roc_scored.csv', usecols=['sentence1', 'sentence2', 'sentence3', 'sentence4', 'sentence5', 'score'])

# Full story text: the five sentences joined by single spaces, terminated by
# the GPT-2 end-of-text marker.
df['rocstory'] = df.sentence1 + ' ' + df.sentence2 + ' ' + df.sentence3 + ' ' + df.sentence4 + ' ' + df.sentence5 + '<|endoftext|>'

df = df.rename(columns={'sentence1':'start'})
df = df.drop(columns=['sentence2', 'sentence3', 'sentence4', 'sentence5'])
# Deterministic shuffle so every round below sees the same row order.
df = df.sample(frac=1, random_state=104).reset_index(drop=True)

# Sanity check
print(df.at[5, 'rocstory'])


# Build datasets
# Originally train_test_ratio = 0.9; train_valid_ratio = 7/9
# Now using story cloze corpus for testing!

# The split itself is done per round in the loop below, on each score-filtered
# subset; splitting the unfiltered frame here would only be overwritten.
train_valid_ratio = 0.8     # 80% training; 20% validation

def build_dataset_csv(df, dest_path):
    """Write *df* to ``dest_path + '.csv'`` without the index column."""
    csv_path = dest_path + '.csv'
    df.to_csv(csv_path, index=False)

def build_dataset_txt(df, dest_path):
    """Write the 'rocstory' column of *df* to ``dest_path + '.txt'``, one story per line.

    Each story is stripped and every whitespace character (including embedded
    newlines and tabs) is replaced by a single space so that one line is
    exactly one story.

    Args:
        df: DataFrame with a 'rocstory' column.
        dest_path: Output path without the '.txt' extension.
    """
    # The context manager guarantees the file is flushed and closed; UTF-8 is
    # set explicitly so the output does not depend on the platform default.
    with open(dest_path + '.txt', 'w', encoding='utf-8') as f:
        for s in df['rocstory'].tolist():
            s = re.sub(r"\s", " ", str(s).strip())
            f.write(s + '\n')

# Minimum relatedness score a story needs to be included in each round.
round_min_score = {'F1': 0.0, 'F2': 0.3, 'F3': 0.5}
rounds = list(round_min_score)

for r in rounds:
    score = round_min_score[r]

    # Create new directory
    os.makedirs('data', exist_ok=True)
    new_dirs_path = os.path.join('data', f'baseline_{r}')
    os.makedirs(new_dirs_path, exist_ok=True)

    # Keep the stories at or above this round's threshold, then split them.
    roc = df[df['score'] >= score].reset_index(drop=True)
    df_train, df_valid = train_test_split(roc, train_size=train_valid_ratio, random_state=1)

    # CSV copies first, then the plain-text files.
    for write_split in (build_dataset_csv, build_dataset_txt):
        write_split(df_train, f'data/baseline_{r}/train')
        write_split(df_valid, f'data/baseline_{r}/valid')

In [None]:
# Round F1: fine-tune gpt2 on data/baseline_F1 and save to models/baseline.
# note: test data is joint cloze2016 testing and validation datasets
# NOTE(review): --test_file below points at the validation split, not at a
#   cloze file — confirm which the note above refers to.
# NOTE(review): the stock Hugging Face run_clm_no_trainer.py may not accept
#   --do_train / --test_file; presumably a locally adapted copy is used — verify.

!CUDA_VISIBLE_DEVICES=0 python3 run_clm_no_trainer.py \
        --train_file            data/baseline_F1/train.txt \
        --validation_file       data/baseline_F1/valid.txt \
        --test_file             data/baseline_F1/valid.txt \
        --do_train \
        --model_name_or_path    gpt2 \
        --output_dir            models/baseline \
        --gradient_accumulation_steps 8

In [None]:
# Round F2 (2x): continue from models/baseline on data/baseline_F2 and save
# the result to models/baseline_F2.
!CUDA_VISIBLE_DEVICES=0 python3 run_clm_no_trainer.py \
        --train_file            data/baseline_F2/train.txt \
        --validation_file       data/baseline_F2/valid.txt \
        --test_file             data/baseline_F2/valid.txt \
        --do_train \
        --model_name_or_path    models/baseline \
        --output_dir            models/baseline_F2 \
        --gradient_accumulation_steps 8

In [None]:
# Round F3 (3x): continue from models/baseline_F2 on data/baseline_F3 and save
# the result to models/baseline_F3.
!CUDA_VISIBLE_DEVICES=0 python3 run_clm_no_trainer.py \
        --train_file            data/baseline_F3/train.txt \
        --validation_file       data/baseline_F3/valid.txt \
        --test_file             data/baseline_F3/valid.txt \
        --do_train \
        --model_name_or_path    models/baseline_F2 \
        --output_dir            models/baseline_F3 \
        --gradient_accumulation_steps 8

# Fine-tune Stop Generators

### Stop Baseline: Given start, generate stop

In [None]:
# Preprocessing
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# Load only the first sentence, the last sentence and the score of each story.
rocs = pd.read_csv('data/roc_scored.csv', usecols=["sentence1", "sentence5", "score"])

def build_dataset(df, dest_path):
    """Write the 'target' column of *df* to *dest_path*, one example per line.

    Each target is stripped, has every whitespace character replaced by a
    single space, and is terminated with ``<|endoftext|>`` before the newline.

    Args:
        df: DataFrame with a 'target' column.
        dest_path: Full path of the text file to create (extension included).
    """
    # Local import: this cell does not import ``re`` itself, so relying on an
    # earlier cell having done so makes the function fail when run in isolation.
    import re

    eos_token = '<|endoftext|>'
    lines = [
        re.sub(r"\s", " ", str(s).strip()) + eos_token + '\n'
        for s in df['target'].tolist()
    ]
    # Context manager closes the handle; explicit UTF-8 avoids the platform default.
    with open(dest_path, 'w', encoding='utf-8') as f:
        f.writelines(lines)


# Set up for 3-way fine-tuning: each round keeps only the stories whose
# endpoint relatedness reaches that round's minimum score.
round_min_score = {'F1': 0.0, 'F2': 0.3, 'F3': 0.7}
rounds = list(round_min_score)

# Split ratios: 90% train+valid vs. 10% test, then 7/9 of the former for training.
train_test_ratio = 0.9
train_valid_ratio = 7/9

for r in rounds:
    score = round_min_score[r]

    rocs_subset = rocs[rocs['score'] >= score].reset_index(drop=True)
    # Training text is the start sentence followed by the stop sentence.
    rocs_subset['target'] = rocs_subset['sentence1'] + ' ' + rocs_subset['sentence5']
    rocs_subset = rocs_subset.rename(columns={'sentence1': 'start', 'sentence5': 'stop'})

    # Build datasets
    df_full_train, df_test = train_test_split(rocs_subset, train_size=train_test_ratio, random_state=1)
    df_train, df_valid = train_test_split(df_full_train, train_size=train_valid_ratio, random_state=1)

    os.makedirs('data', exist_ok=True)                              # check 'data' directory exists
    new_dirs_path = os.path.join('data', f'stop_baseline_{r}')      # define new directory within
    os.makedirs(new_dirs_path, exist_ok=True)                       # create new directory

    # CSV copies of the splits are not written; only the text files are.
    build_dataset(df_train, f'data/stop_baseline_{r}/train.txt')
    build_dataset(df_valid, f'data/stop_baseline_{r}/valid.txt')
    build_dataset(df_test, f'data/stop_baseline_{r}/test.txt')

In [None]:
#####
# TODO: download run_clm_no_trainer.py from https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm_no_trainer.py
#       place in renargen_lm directory
# NOTE(review): the stock script may not accept --do_train / --test_file;
#   presumably a locally adapted copy is used — verify before running.
# Train F1: start from models/baseline, save to models/stop_baseline_F1.
!CUDA_VISIBLE_DEVICES=0 python3 run_clm_no_trainer.py \
        --train_file            data/stop_baseline_F1/train.txt \
        --validation_file       data/stop_baseline_F1/valid.txt \
        --test_file             data/stop_baseline_F1/test.txt \
        --do_train \
        --model_name_or_path    models/baseline \
        --output_dir            models/stop_baseline_F1 \
        --gradient_accumulation_steps 8 

In [None]:
# Train F2: continue from models/stop_baseline_F1 on the F2 split and save to
# models/stop_baseline_F2.
!CUDA_VISIBLE_DEVICES=0 python3 run_clm_no_trainer.py \
        --train_file            data/stop_baseline_F2/train.txt \
        --validation_file       data/stop_baseline_F2/valid.txt \
        --test_file             data/stop_baseline_F2/test.txt \
        --do_train \
        --model_name_or_path    models/stop_baseline_F1 \
        --output_dir            models/stop_baseline_F2 \
        --gradient_accumulation_steps 8 

In [None]:
# Train F3: continue from models/stop_baseline_F2 on the F3 split and save to
# models/stop_baseline_F3.
!CUDA_VISIBLE_DEVICES=0 python3 run_clm_no_trainer.py \
        --train_file            data/stop_baseline_F3/train.txt \
        --validation_file       data/stop_baseline_F3/valid.txt \
        --test_file             data/stop_baseline_F3/test.txt \
        --do_train \
        --model_name_or_path    models/stop_baseline_F2 \
        --output_dir            models/stop_baseline_F3 \
        --gradient_accumulation_steps 8 

### Phrase Generator: Given start, generate list of phrases

In [None]:
import pandas as pd
from nltk import word_tokenize
import os

# Read the scored stories once; only the endpoints and the score are needed.
df_o = pd.read_csv('data/roc_scored.csv', usecols=['sentence1', 'sentence5', 'score'])

# Tokenize both endpoints column by column (Series.apply replaces the
# deprecated DataFrame.applymap and yields the same token lists).
start_tokens = df_o['sentence1'].apply(word_tokenize)
stop_tokens = df_o['sentence5'].apply(word_tokenize)


def get_overlap(x, y):
    """Return the tokens of *x* that also occur in *y*, in the order of *x*."""
    return [i for i in x if i in y]


# Find overlap between start and stop tokens.
# NOTE(review): the result follows the order of the START sentence (x is the
#   start token list); if stop-sentence order is wanted the arguments must be
#   swapped — confirm the intent.
df = df_o[['sentence1', 'sentence5']].copy()
df['rep'] = [
    '[' + ", ".join(get_overlap(first, last)) + ']'
    for first, last in zip(start_tokens, stop_tokens)
]
df['score'] = df_o['score']
df.rename(columns = {'sentence1':'start', 'sentence5':'stop'}, inplace = True)

# Training text: start sentence, stop sentence, then the bracketed phrase list.
df['target'] = df['start'] + ' ' + df['stop'] + ' ' + df['rep'] + '<|endoftext|>'

# Sanity check (requires the dataset to have a row labelled 90000)
print(df.at[90000,'target'])

def build_dataset(df, dest_path):
    """Write *df* as ``dest_path + '.txt'`` (targets only) and ``dest_path + '.csv'`` (all columns).

    In the text file each 'target' value is stripped, has every whitespace
    character replaced by a single space, and occupies one line.

    Args:
        df: DataFrame with a 'target' column.
        dest_path: Output path without extension.
    """
    # Local import: this cell does not import ``re`` itself.
    import re

    lines = [re.sub(r"\s", " ", str(s).strip()) + '\n' for s in df.target.tolist()]
    # Context manager closes the handle; explicit UTF-8 avoids the platform default.
    with open(dest_path + '.txt', 'w', encoding='utf-8') as f:
        f.writelines(lines)
    df.to_csv(dest_path + '.csv', index=False)

# Build datasets
# Imported here because this cell does not otherwise bring train_test_split
# into scope; without it the cell only works after an earlier cell has run.
from sklearn.model_selection import train_test_split

train_test_ratio = 0.9
train_valid_ratio = 7/9

# 90% / 10% (train+valid vs. test), then 7/9 of the remainder for training,
# i.e. roughly 70 / 20 / 10 overall.
df_full_train, df_test = train_test_split(df, train_size = train_test_ratio, random_state = 1)
df_train, df_valid = train_test_split(df_full_train, train_size = train_valid_ratio, random_state = 1)

os.makedirs('data', exist_ok=True)
new_dirs_path = os.path.join('data', 'phrase_generator')
os.makedirs(new_dirs_path, exist_ok=True)

build_dataset(df_train, 'data/phrase_generator/train')
build_dataset(df_valid, 'data/phrase_generator/valid')
build_dataset(df_test, 'data/phrase_generator/test')

In [None]:
# Fine-tune the phrase generator: start from models/stop_baseline_F3, train on
# data/phrase_generator and save to models/phrase_generator.
!CUDA_VISIBLE_DEVICES=0 python3 run_clm_no_trainer.py \
                --train_file        data/phrase_generator/train.txt \
                --validation_file   data/phrase_generator/valid.txt \
                --test_file         data/phrase_generator/test.txt \
                --do_train \
                --model_name_or_path models/stop_baseline_F3 \
                --output_dir         models/phrase_generator \
                --gradient_accumulation_steps 8

### Stop Generator: Given start + rep phrases, generate stop

In [None]:
import pandas as pd
from nltk import word_tokenize
import os

# Read the scored stories once; only the endpoints and the score are needed.
df_o = pd.read_csv('data/roc_scored.csv', usecols=['sentence1', 'sentence5', 'score'])

# Tokenize both endpoints column by column (Series.apply replaces the
# deprecated DataFrame.applymap and yields the same token lists).
start_tokens = df_o['sentence1'].apply(word_tokenize)
stop_tokens = df_o['sentence5'].apply(word_tokenize)


def get_overlap(x, y):
    """Return the tokens of *x* that also occur in *y*, in the order of *x*."""
    return [i for i in x if i in y]


# Find overlap between start and stop tokens.
# NOTE(review): the result follows the order of the START sentence (x is the
#   start token list); if stop-sentence order is wanted the arguments must be
#   swapped — confirm the intent.
df = df_o[['sentence1', 'sentence5']].copy()
df['rep'] = [
    '[' + ", ".join(get_overlap(first, last)) + ']'
    for first, last in zip(start_tokens, stop_tokens)
]
df['score'] = df_o['score']
df.rename(columns = {'sentence1':'start', 'sentence5':'stop'}, inplace = True)

# Training text: start sentence, bracketed phrase list, then the stop sentence.
# NOTE(review): there is no space between the start sentence and '[' and there
#   is a space before '<|endoftext|>', unlike the phrase-generator format —
#   left unchanged because inference prompts must match; confirm it is intended.
df['target'] = df['start'] + df['rep'] + ' ' + df['stop'] + ' ' + '<|endoftext|>'

# Sanity check (requires the dataset to have a row labelled 90000)
print(df.at[90000,'target'])

def build_dataset(df, dest_path):
    """Write *df* as ``dest_path + '.txt'`` (targets only) and ``dest_path + '.csv'`` (all columns).

    In the text file each 'target' value is stripped, has every whitespace
    character replaced by a single space, and occupies one line.

    Args:
        df: DataFrame with a 'target' column.
        dest_path: Output path without extension.
    """
    # Local import: this cell does not import ``re`` itself.
    import re

    lines = [re.sub(r"\s", " ", str(s).strip()) + '\n' for s in df.target.tolist()]
    # Context manager closes the handle; explicit UTF-8 avoids the platform default.
    with open(dest_path + '.txt', 'w', encoding='utf-8') as f:
        f.writelines(lines)
    df.to_csv(dest_path + '.csv', index=False)

# Build datasets
# Imported here because this cell does not otherwise bring train_test_split
# into scope; without it the cell only works after an earlier cell has run.
from sklearn.model_selection import train_test_split

train_test_ratio = 0.9
train_valid_ratio = 7/9

# 90% / 10% (train+valid vs. test), then 7/9 of the remainder for training,
# i.e. roughly 70 / 20 / 10 overall.
df_full_train, df_test = train_test_split(df, train_size = train_test_ratio, random_state = 1)
df_train, df_valid = train_test_split(df_full_train, train_size = train_valid_ratio, random_state = 1)

os.makedirs('data', exist_ok=True)
new_dirs_path = os.path.join('data', 'stop_generator')
os.makedirs(new_dirs_path, exist_ok=True)

build_dataset(df_train, 'data/stop_generator/train')
build_dataset(df_valid, 'data/stop_generator/valid')
build_dataset(df_test, 'data/stop_generator/test')

In [None]:
# Fine-tune the stop generator: start from models/stop_baseline_F3, train on
# data/stop_generator and save to models/stop_generator.
!CUDA_VISIBLE_DEVICES=0 python3 run_clm_no_trainer.py \
                --train_file        data/stop_generator/train.txt \
                --validation_file   data/stop_generator/valid.txt \
                --test_file         data/stop_generator/test.txt \
                --do_train \
                --model_name_or_path models/stop_baseline_F3 \
                --output_dir         models/stop_generator \
                --gradient_accumulation_steps 8

# Fine-tune Story Infillers
### Position Classifier: Determine where to infill next sentence

In [None]:
import pandas as pd
import random

rocs_path = 'data/roc_scored.csv'
rocs = pd.read_csv(rocs_path, usecols=['sentence1', 'sentence2', 'sentence3', 'sentence4', 'sentence5'])


### Generate TRUE samples
# A TRUE sample is a partial story in which "<mask>" stands exactly where the
# chosen middle sentence was removed.

true_sample_list = []
# Highest valid row label, taken from the data instead of a hard-coded size so
# the cell keeps working if the dataset changes.
last_story_index = len(rocs) - 1

for i in range(50000):

    # Choose middle sentence
    index_m = random.randint(2,4)

    # Choose LC and RC: candidate sentence numbers left and right of the mask.
    potentials_lc = list(range(1, index_m))
    potentials_rc = list(range(index_m+1, 6))

    # random.choice picks a candidate number that is reused as the sample
    # size: 1..index_m-1 on the left and, after subtracting index_m,
    # 1..5-index_m on the right, so at least one sentence is kept per side.
    index_lc = sorted(random.sample(potentials_lc, random.choice(potentials_lc)))
    index_rc = sorted(random.sample(potentials_rc, random.choice(potentials_rc)-index_m))

    # Sample story from ROCStories dataset
    index_story = random.randint(0, last_story_index)

    # Left context sentences each end with a space; right context sentences
    # each start with one, so the mask is always space-separated.
    lc = "".join(rocs.at[index_story, f'sentence{k}'] + " " for k in index_lc)
    rc = "".join(" " + rocs.at[index_story, f'sentence{k}'] for k in index_rc)

    true_sample_list.append(lc + "<mask>" + rc)


### Generate FALSE samples
# A FALSE sample keeps the chosen middle sentence and puts "<mask>" directly
# before or after it.

false_sample_list = []

for i in range(50000):

    # Choose middle sentence
    index_m = random.randint(2,4)

    # Choose LC and RC: candidate sentence numbers left and right of the middle.
    potentials_lc = list(range(1, index_m))
    potentials_rc = list(range(index_m+1, 6))

    # random.choice picks a candidate number that is reused as the sample size
    # (shifted by index_m on the right), so each side keeps at least one sentence.
    index_lc = sorted(random.sample(potentials_lc, random.choice(potentials_lc)))
    index_rc = sorted(random.sample(potentials_rc, random.choice(potentials_rc)-index_m))

    # Only choose lc+rc < 5
    while len(index_lc) + len(index_rc) == 4:       # 4 since m is appended below
        index_lc = sorted(random.sample(potentials_lc, random.choice(potentials_lc)))
        index_rc = sorted(random.sample(potentials_rc, random.choice(potentials_rc)-index_m))

    # Sample story from ROCStories dataset; the upper bound comes from the
    # data rather than a hard-coded size.
    index_story = random.randint(0, len(rocs) - 1)

    lc = "".join(rocs.at[index_story, f'sentence{k}'] + " " for k in index_lc)
    rc = "".join(" " + rocs.at[index_story, f'sentence{k}'] for k in index_rc)

    # Get masked sentence from sampled story
    m = rocs.at[index_story, f'sentence{index_m}']

    # Place the mask on a random side of the middle sentence.
    add_left = random.choice([True, False])
    if add_left:
        false_sample_list.append(lc + m + " <mask>" + rc)
    else:
        false_sample_list.append(lc + "<mask> " + m + rc)

In [None]:
import os

# All samples, TRUE ones first, with matching labels ("1" = TRUE, "0" = FALSE).
joint_list = true_sample_list + false_sample_list

# Label counts follow the actual list lengths so text and label can never get
# out of step if the number of generated samples changes.
targets = ["1"] * len(true_sample_list) + ["0"] * len(false_sample_list)

df_infill = pd.DataFrame()
df_infill['text'] = joint_list
df_infill['label']  = targets
df_infill = df_infill.sample(frac = 1).reset_index(drop=True)   # Shuffle

os.makedirs('data', exist_ok=True)
new_dirs_path = os.path.join('data', 'position_classifier')
os.makedirs(new_dirs_path, exist_ok=True)

df_infill.to_csv('data/position_classifier/position_classifier.csv', index=False)

In [None]:
from sklearn.model_selection import train_test_split
from datasets import load_dataset

# Build datasets
# 90% / 10% (train+valid vs. test), then 7/9 of the remainder for training.
train_test_ratio = 0.9
train_valid_ratio = 7/9
df_full_train, df_test = train_test_split(df_infill, train_size = train_test_ratio, random_state = 1)
df_train, df_valid = train_test_split(df_full_train, train_size = train_valid_ratio, random_state = 1)

df_train.to_csv('data/position_classifier/train.csv', index=False)
df_valid.to_csv('data/position_classifier/valid.csv', index=False)
df_test.to_csv('data/position_classifier/test.csv', index=False)

# Reload each split from the file of the same name. Previously `test` read
# valid.csv and `valid` read test.csv, so evaluation during training ran on
# the held-out test split.
train = load_dataset("csv", data_files='data/position_classifier/train.csv')
valid = load_dataset("csv", data_files='data/position_classifier/valid.csv')
test = load_dataset("csv", data_files='data/position_classifier/test.csv')


In [None]:
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
import evaluate

# BERT tokenizer for the classifier (distilbert-base-uncased is a possible
# alternative checkpoint).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def preprocess_function(examples):
    """Tokenize the "text" field of a batch, truncating over-long inputs."""
    texts = examples["text"]
    return tokenizer(texts, truncation=True)


# Tokenize the three splits in the same way.
tokenized_train, tokenized_test, tokenized_valid = (
    split.map(preprocess_function, batched=True) for split in (train, test, valid)
)

# Pads each batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
accuracy = evaluate.load("accuracy")

In [None]:
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max class over the given predictions.

    *eval_pred* unpacks into ``(predictions, labels)``; the predicted class is
    the index of the largest value along axis 1.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

# Class names for the two labels; the reverse mapping is derived from the
# forward one so the two cannot disagree.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {name: class_id for class_id, name in id2label.items()}

# Two-class sequence classifier on top of bert-base-uncased.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Training configuration: two epochs, evaluating and checkpointing after each.
training_args = TrainingArguments(
    output_dir='models/position_classifier',
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

# Each loaded CSV is accessed through its "train" key.
train_split = tokenized_train["train"]
eval_split = tokenized_valid["train"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

### Infill Generator: Generate new infill sentence

Input data format:

- Training input: 

    **LC** \<INFILL_LOC\> **RC** \<SEP\> **INFILL** <|endoftext|>

- Inference time input: 

    **LC** \<INFILL_LOC\> **RC** \<SEP\>

- Inference time output: 

    **LC** \<INFILL_LOC\> **RC** \<SEP\> **INFILL** <|endoftext|>

In [None]:
import pandas as pd
from tqdm import tqdm
import random

DATA_PATH = 'data/roc_scored.csv'
rocs = pd.read_csv(DATA_PATH, usecols=['sentence1', 'sentence2', 'sentence3', 'sentence4', 'sentence5'])

# Column holding each middle sentence, keyed by its position in the story.
middle_column = {2: 'sentence2', 3: 'sentence3', 4: 'sentence4'}

story_list = []

for i in tqdm(range(len(rocs))):
    # Choose middle sentence (the one the model must learn to infill)
    index_m = random.randint(2,4)

    # Candidate sentence numbers on each side of the infill position.
    potentials_lc = list(range(1, index_m))
    potentials_rc = list(range(index_m + 1, 6))
    # Randomly choose LC, RC sentences; the value drawn by random.choice is
    # reused as the sample size (shifted by index_m on the right).
    index_lc = sorted(random.sample(potentials_lc, random.choice(potentials_lc)))
    index_rc = sorted(random.sample(potentials_rc, random.choice(potentials_rc) - index_m))

    # Left context: always the start sentence, plus any chosen middle sentences.
    left_parts = [rocs.at[i, 'sentence1']]
    left_parts += [rocs.at[i, middle_column[k]] for k in (2, 3) if k in index_lc]

    # Right context: any chosen middle sentences, then always the stop sentence.
    right_parts = [rocs.at[i, middle_column[k]] for k in (3, 4) if k in index_rc]
    right_parts.append(rocs.at[i, 'sentence5'])

    # Target: the removed middle sentence, placed after the separator.
    infill_sentence = middle_column[index_m]

    story = ' '.join(left_parts) + ' <INFILL_LOC> ' + ' '.join(right_parts)
    story += ' <SEP> ' + rocs.at[i, infill_sentence]
    story_list.append(story)

In [None]:
from sklearn.model_selection import train_test_split
import re
import os

# 80% of the stories for training, the rest for validation. Despite the df_
# prefix, both results are plain lists of strings.
train_valid_ratio = 0.8
df_train, df_valid = train_test_split(
    story_list,
    train_size=train_valid_ratio,
    random_state=1,
)

def build_dataset(story_list, dest_path):
    """Write every story in *story_list* to *dest_path*, one per line.

    Each story is stripped, has every whitespace character replaced by a
    single space, and is terminated with ``<|endoftext|>`` before the newline.

    Args:
        story_list: Iterable of story strings.
        dest_path: Full path of the text file to create.
    """
    # Context manager closes the handle; lines are streamed instead of being
    # accumulated into one large string first.
    with open(dest_path, 'w', encoding='utf-8') as f:
        f.writelines(
            re.sub(r"\s", " ", str(s).strip()) + '<|endoftext|>\n'
            for s in story_list
        )

# Make sure data/infill_generator exists, then write both splits into it.
os.makedirs('data', exist_ok=True)
new_dirs_path = os.path.join('data', 'infill_generator')
os.makedirs(new_dirs_path, exist_ok=True)

for split_stories, split_file in (
    (df_train, 'data/infill_generator/train.txt'),
    (df_valid, 'data/infill_generator/valid.txt'),
):
    build_dataset(split_stories, split_file)

In [None]:
!CUDA_VISIBLE_DEVICES=0 python3 run_clm_no_trainer.py \
        --train_file        data/infill_generator/train.txt \
        --validation_file   data/infill_generator/valid.txt \
        --test_file         data/infill_generator/valid.txt \
        --do_train \
        --model_name_or_path    models/baseline_F1 \
        --output_dir            models/infill_generator \
        --gradient_accumulation_steps 8

In [None]:
# Run this block multiple times (e.g. 4x) for best results
# Each run resumes from models/infill_generator and writes back to the same
# directory, so repeated runs keep training the same model.

!CUDA_VISIBLE_DEVICES=0 python3 run_clm_no_trainer.py \
        --train_file        data/infill_generator/train.txt \
        --validation_file   data/infill_generator/valid.txt \
        --test_file         data/infill_generator/valid.txt \
        --do_train \
        --model_name_or_path    models/infill_generator \
        --output_dir            models/infill_generator \
        --gradient_accumulation_steps 8

# Fine-tune Models for Ablation Experiments
### a2_1: Remove position classifier, infill 3 sentences

TODO: Change ablation models here to show ablations in paper

In [None]:
import pandas as pd

data_path = 'data/roc_scored.csv'
cols = ["sentence1", "sentence2", "sentence3", "sentence4", "sentence5", "score"]
rocs = pd.read_csv(data_path, usecols=cols)

# Keep only stories whose endpoints score at least 0.7.
is_related = rocs['score'] >= 0.7
rocs = rocs[is_related].reset_index(drop=True)

# Prompt is start + stop followed by <MASK>; the three middle sentences are
# the continuation the model learns to produce.
prompt = rocs["sentence1"] + " " + rocs["sentence5"] + " <MASK> "
middle = rocs["sentence2"] + " " + rocs["sentence3"] + " " + rocs["sentence4"]
rocs["text"] = prompt + middle + "<|endoftext|>"

# Drop every source column, leaving just "text".
rocs = rocs.drop(columns=cols)
print(f"Number of samples: {len(rocs)}")
print(f"Sample: {rocs.at[1, 'text']}")
rocs

In [None]:
from sklearn.model_selection import train_test_split
import re

def build_dataset(df, dest_path):
    """Write the 'text' column of *df* to *dest_path*, one example per line.

    Each value is stripped and has every whitespace character replaced by a
    single space before being written.

    Args:
        df: DataFrame with a 'text' column.
        dest_path: Full path of the text file to create.
    """
    lines = [re.sub(r"\s", " ", str(s).strip()) + '\n' for s in df['text'].tolist()]
    # Context manager closes the handle; explicit UTF-8 avoids the platform default.
    with open(dest_path, 'w', encoding='utf-8') as f:
        f.writelines(lines)

# Build training, validation datasets
# Despite its name, train_test_ratio is the train/validation fraction here.
train_test_ratio = 0.8
df_train, df_valid = train_test_split(
    rocs,
    train_size=train_test_ratio,
    random_state=1,
)

for split_frame, split_file in (
    (df_train, "data/train_ablation2_1.txt"),
    (df_valid, "data/valid_ablation2_1.txt"),
):
    build_dataset(split_frame, split_file)

In [None]:
# Fine-tune the a2_1 ablation model and save it to models/ablation2_1.
# NOTE(review): e2e_baseline is not produced by any step visible in this
#   notebook — presumably a pre-existing checkpoint; confirm its location.
!CUDA_VISIBLE_DEVICES=0 python3 run_clm_no_trainer.py \
        --train_file data/train_ablation2_1.txt \
        --validation_file data/valid_ablation2_1.txt \
        --test_file data/valid_ablation2_1.txt \
        --do_train \
        --model_name_or_path e2e_baseline \
        --output_dir models/ablation2_1 \
        --gradient_accumulation_steps 8

In [None]:
# Test a2_1 model

from transformers import GPT2Tokenizer, GPT2LMHeadModel
from tqdm import tqdm
from nltk.tokenize import sent_tokenize


# Load the fine-tuned a2_1 model; the end-of-text token doubles as padding.
tokenizer = GPT2Tokenizer.from_pretrained('models/ablation2_1')
model = GPT2LMHeadModel.from_pretrained('models/ablation2_1', pad_token_id=tokenizer.eos_token_id)

# Sample prompt: start sentence, stop sentence, then the <MASK> marker.
starts = ["Rick grew up in a troubled household. Rick was glad his family was no longer troubled. <MASK>"]

# Beam search settings shared by every prompt.
generation_options = dict(
    max_length=256,
    num_beams=5,
    early_stopping=True,
    no_repeat_ngram_size=2,
)

for s in tqdm(starts):
    encoded_input = tokenizer.encode(s, return_tensors='pt')
    output = model.generate(encoded_input, **generation_options)
    # Only the first returned sequence is decoded and shown.
    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
    print(decoded_output)

### a2_2: Remove phrase generator

In [None]:
### Generate samples (missing sentence)

# Each sample is stored as a list of sentences, so during training/inference
#   we predict likelihood of sentence infilling between each sentence.
# Temporary marker for 1 missing (target) sentence
# Temporary markers for alternative missing sentences

import random
import pandas as pd
from random import choice
from tqdm import tqdm

# Scored stories: the five sentences plus the endpoint relatedness score.
DATA_PATH = './data/roc_scored.csv'
cols = [
    "sentence1",
    "sentence2",
    "sentence3",
    "sentence4",
    "sentence5",
    "score",
]
rocs = pd.read_csv(DATA_PATH, usecols=cols)

def gen_samples(num_stories):
    """Build one infill training sample for each of the first *num_stories* stories.

    For every story, zero, one or two of the middle sentences (S2-S4) are kept
    at random; one of the dropped middle sentences is picked as the infill
    target. The sample text is the kept sentences (start and stop always
    included) followed by ``' <SEP> '``, the target and ``'<|endoftext|>'``.

    Reads stories from the module-level ``rocs`` frame and prints how many
    samples ended up with 2, 3 and 4 context sentences.

    Returns:
        list[str]: the generated sample strings, in story order.
    """
    middle_column = {2: 'sentence2', 3: 'sentence3', 4: 'sentence4'}
    # kept_counts[k] = number of samples that kept k middle sentences,
    # i.e. samples with k + 2 context sentences.
    kept_counts = [0, 0, 0]
    all_samples = []

    for index in tqdm(range(num_stories)):

        # From S2, S3, S4 keep 0, 1 or 2 sentences: at least one must stay
        # missing so there is always something to infill.
        sample_indices = sorted(random.sample([2, 3, 4], random.choice([0, 1, 2])))
        kept_counts[len(sample_indices)] += 1

        # Kept sentences go into the sample, dropped ones become candidates
        # for the infill target.
        sample = [rocs.at[index, 'sentence1']]        # start always included
        missing = []
        for position, column in middle_column.items():
            sentence = rocs.at[index, column]
            if position in sample_indices:
                sample.append(sentence)
            else:
                missing.append(sentence)

        infill = choice(missing)
        sample.append(rocs.at[index, 'sentence5'])    # stop always included

        all_samples.append(' '.join(sample) + ' <SEP> ' + infill + '<|endoftext|>')

    two_s, three_s, four_s = kept_counts
    print('-'*40)
    print('samples with 2 sentences: {}'.format(two_s))
    print('samples with 3 sentences: {}'.format(three_s))
    print('samples with 4 sentences: {}'.format(four_s))
    print('-'*40)
    return all_samples

# One sample per story, collected in a single-column frame.
samples = gen_samples(len(rocs))
df = pd.DataFrame({'text': samples})

In [None]:
from sklearn.model_selection import train_test_split
import re

def build_dataset(df, dest_path):
    """Write the 'text' column of *df* to *dest_path*, one example per line.

    Each value is stripped and has every whitespace character replaced by a
    single space before being written.

    Args:
        df: DataFrame with a 'text' column.
        dest_path: Full path of the text file to create.
    """
    lines = [re.sub(r"\s", " ", str(s).strip()) + '\n' for s in df['text'].tolist()]
    # Context manager closes the handle; explicit UTF-8 avoids the platform default.
    with open(dest_path, 'w', encoding='utf-8') as f:
        f.writelines(lines)

# Build training, validation datasets
# Despite its name, train_test_ratio is the train/validation fraction here.
train_test_ratio = 0.8
df_train, df_valid = train_test_split(
    df,
    train_size=train_test_ratio,
    random_state=1,
)

for split_frame, split_file in (
    (df_train, "data/train_ablation2_3.txt"),
    (df_valid, "data/valid_ablation2_3.txt"),
):
    build_dataset(split_frame, split_file)

In [None]:
# Fine-tune the ablation model on the ablation2_3 files and save it to
# models/ablation2_3.
# NOTE(review): e2e_baseline is not produced by any step visible in this
#   notebook — presumably a pre-existing checkpoint; confirm its location.
!CUDA_VISIBLE_DEVICES=0 python3 run_clm_no_trainer.py \
        --train_file data/train_ablation2_3.txt \
        --validation_file data/valid_ablation2_3.txt \
        --test_file data/valid_ablation2_3.txt \
        --do_train \
        --model_name_or_path e2e_baseline \
        --output_dir models/ablation2_3 \
        --gradient_accumulation_steps 8