In [1]:
# Load IPython's autoreload extension; mode 2 re-imports edited modules
# before each cell runs, so changes to the project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import torch
import pprint
import glob
import random

import numpy as np

import sys
# Extra import roots, relative to the notebook's working directory. Each one is
# appended to sys.path only if it is not already there, so re-running this cell
# does not add duplicates.
add_paths = [
    '..',
    'style_transfer_paraphrase',
    'style_transfer_paraphrase/style_paraphrase',
]
for add_path in add_paths:
    if add_path in sys.path:
        continue
    sys.path.append(add_path)
        
from style_paraphrase.inference_utils import GPT2Generator
import data_preprocessing

# Prep work (shell commands to run once before using this notebook):
# conda env create --file /home/svcl-oowl/brandon/misc/dockerfiles/A40_para.yaml
# conda activate A40_para
#? export ROBERTA_LARGE=/home/svcl-oowl/brandon/classes/ECE_229/project/ece_229_group_9_nlp_suite/nlp_suite/chatbot/style_transfer_paraphrase/fairseq/weights/roberta.large


# pip install transformers
# cd /home/svcl-oowl/brandon/classes/ECE_229/project/chatbot/fairseq
# pip install --editable .
# cd /home/svcl-oowl/brandon/classes/ECE_229/project/chatbot/fairseq/weights
# export ROBERTA_LARGE=$PWD/roberta.large
# cd /home/svcl-oowl/brandon/classes/ECE_229/project/chatbot

# Model

In [None]:
# Checkpoint directories, given relative to the notebook's working directory.
# paraphraser_model_dir is only referenced by the commented-out paraphraser
# constructor at the bottom of this cell.
paraphraser_model_dir = os.path.join("style_transfer_paraphrase/models", "paraphraser_gpt2_large")
#inverse_paraphraser_model_dir = os.path.join("style_transfer_paraphrase/models", "bible")
# Active checkpoint: this is the directory handed to GPT2Generator below.
inverse_paraphraser_model_dir = "style_transfer_paraphrase/style_paraphrase/saved_models/298954459172700181_muffins/checkpoint-9294"

# 6795
# NOTE(review): the bare number above is unexplained; possibly another
# checkpoint step -- confirm or remove.
with torch.cuda.device(0):
    # Only the style model is constructed here. The paraphraser line is
    # disabled, so this cell does not define `paraphraser`.
    model = GPT2Generator(inverse_paraphraser_model_dir)
    #paraphraser = GPT2Generator(paraphraser_model_dir, upper_length="same_5")

In [None]:
# Sampling settings: each value is forwarded as `top_p` to the matching
# generate_batch call below.
top_p_paraphrase = 0.0 # 0.0
top_p_style = 0.7 # 0.7
# When True, the input is first passed through `paraphraser`. That object is
# not created by the model cell as it stands (its constructor line is commented
# out there), so it must be loaded before enabling this flag.
use_paraphraser = False

# Sample prompts; exactly one assignment is left active.
#input_text = "I should have created all the animals first."
#input_text = "Hey, how was your day?"
#input_text = "That's really nice of you."
#input_text = "My name is sarah."
#input_text = "Do you have any pets?"
#input_text = "I have a pet cat."
#input_text = "I love cats, they're so cute."
input_text = "What other hobbies do you have?"

print([input_text])
# Run on CUDA device 0 with gradient tracking disabled.
with torch.cuda.device(0), torch.no_grad():
    # Either the paraphraser's first return element or the raw input wrapped
    # in a one-element list.
    output_paraphrase = (
        paraphraser.generate_batch([input_text], top_p=top_p_paraphrase)[0]
        if use_paraphraser
        else [input_text]
    )
    print(output_paraphrase)

    transferred_output = model.generate_batch(output_paraphrase, top_p=top_p_style)[0]
    print(transferred_output)
    

# Data Prep

In [3]:
# Prepares a custom dataset following the instructions at https://github.com/martiansideofthemoon/style-transfer-paraphrase#custom-datasets

def prep_discord_data_for_style_transfer(user, user_messages, output_dir, dev_ratio=0.10, test_ratio=0.02, seed=None):
    """Write one user's messages as a train/dev/test dataset and print the next pipeline commands.

    The messages are shuffled and split into three partitions. For each
    partition, ``<output_dir>/<user>/<partition>.txt`` receives one message per
    line and ``<partition>.label`` receives one line containing ``user`` for
    every message written. The shell commands for the remaining steps (BPE
    conversion, binarization, paraphrasing, fine-tuning) are then printed; they
    are not executed.

    Args:
        user: Name used as the dataset sub-directory, as the label on every
            line of the ``.label`` files, and as the job id / model directory
            in the printed fine-tuning command.
        user_messages: Iterable of messages; each is converted with ``str``.
            The caller's object is not modified.
        output_dir: Directory under which ``<user>/`` is created. The printed
            commands drop the first ``/``-separated component of this path,
            because they are meant to be run after ``cd style_transfer_paraphrase``.
        dev_ratio: Fraction of the messages assigned to the dev partition.
        test_ratio: Fraction of the messages assigned to the test partition.
        seed: Optional seed for the shuffle. ``None`` (the default) uses the
            global ``random`` state, as before; any other value makes the
            split reproducible.

    Returns:
        None. Results are the files on disk and the printed commands.
    """
    user_output_dir = os.path.join(output_dir, user)
    os.makedirs(user_output_dir, exist_ok=True)

    # Shuffle a copy so the caller's collection is left untouched.
    rng = random if seed is None else random.Random(seed)
    pool = list(user_messages)
    shuffled = rng.sample(pool, len(pool))

    # Same cut points as before, but with plain slicing instead of np.split,
    # which avoids converting the messages to a NumPy array.
    total = len(shuffled)
    train_end = int(total * (1 - test_ratio - dev_ratio))
    dev_end = int(total * (1 - test_ratio))
    partitions = {
        "train": shuffled[:train_end],
        "dev": shuffled[train_end:dev_end],
        "test": shuffled[dev_end:],
    }

    for partition, messages in partitions.items():
        data_path = os.path.join(user_output_dir, partition + ".txt")
        label_path = os.path.join(user_output_dir, partition + ".label")
        num_written = 0
        num_skipped = 0

        # Explicit UTF-8 so emoji / non-ASCII text does not depend on the locale.
        with open(data_path, "w", encoding="utf-8") as f:
            for msg in messages:
                # The format is one instance per line; flatten embedded line
                # breaks so data lines stay aligned with label lines.
                line = str(msg).replace("\r\n", " ").replace("\r", " ").replace("\n", " ")
                try:
                    f.write("{}\n".format(line))
                except UnicodeEncodeError:
                    # Best effort: text that cannot be encoded (e.g. lone
                    # surrogates) is skipped, but no longer silently.
                    num_skipped += 1
                    continue
                num_written += 1

        if num_skipped:
            print("warning: skipped {} unencodable message(s) in {}".format(num_skipped, data_path))

        # One label line per message actually written.
        with open(label_path, "w", encoding="utf-8") as f:
            for _ in range(num_written):
                f.write("{}\n".format(user))

    # Dataset path as seen from inside style_transfer_paraphrase/ (first path
    # component of user_output_dir removed).
    dataset_path = '/'.join(user_output_dir.split('/')[1:])

    # fine tune inverse paraphraser (run in style_transfer_paraphrase)
    finetune_cmd = """python style_paraphrase/run_lm_finetuning.py \
    --output_dir=style_paraphrase/saved_models/{} \
    --model_type=gpt2 \
    --model_name_or_path=gpt2-large \
    --data_dir=datasets/{} \
    --do_train \
    --save_steps 1000 \
    --evaluate_during_training \
    --logging_steps 400 \
    --save_total_limit -1 \
    --num_train_epochs 3 \
    --gradient_accumulation_steps 1 \
    --per_gpu_train_batch_size 16 \
    --job_id {} \
    --learning_rate 5e-5 \
    --prefix_input_type paraphrase_250 \
    --global_dense_feature_list none \
    --specific_style_train -1 \
    --optimizer adam""".format(user, user, user)

    commands = [
        "cd style_transfer_paraphrase",
        # convert plaintext to BPE
        "python datasets/dataset2bpe.py --dataset {}".format(dataset_path),
        # convert BPE to binaries
        "datasets/bpe2binary.sh {}".format(dataset_path),
        "cp datasets/{}-bin/dict.txt datasets/{}".format(user, user),
        # paraphrase the dataset
        "python datasets/paraphrase_splits.py --dataset {} --model_dir {}".format(dataset_path, "models/paraphraser_gpt2_large"),
        finetune_cmd,
    ]
    # Commands are separated by one blank line, matching the previous output.
    print("\n\n".join(commands))

In [4]:
# Which Discord channel export to read, where the exports live (relative to
# the notebook's working directory), and whose messages to extract.
channel_id = "689963458967765003"
discord_data_dir = "../../discord_dataset/discord-v3-detox-antispam"
user_name = "NoPoint"

# Every path in the dataset directory whose name contains the channel id.
chat_log_paths = glob.glob(os.path.join(discord_data_dir, "*{}*".format(channel_id)))
# NOTE(review): the meaning of the positional 3 is defined in
# data_preprocessing.process_discord_data and is not visible here -- confirm
# and consider naming it.
channel_messages, message_counts = data_preprocessing.process_discord_data(chat_log_paths, 3)
# channel_messages is indexed by user name; keep only the selected user's entry.
user_messages = channel_messages[user_name]

In [5]:
# Write the train/dev/test text and label files for `user_name` under
# style_transfer_paraphrase/datasets/<user_name>/ and print the shell commands
# for the remaining steps (the commands are printed, not executed).
prep_discord_data_for_style_transfer(user_name, user_messages, "style_transfer_paraphrase/datasets")

cd style_transfer_paraphrase

python datasets/dataset2bpe.py --dataset datasets/NoPoint

datasets/bpe2binary.sh datasets/NoPoint

cp datasets/NoPoint-bin/dict.txt datasets/NoPoint

python datasets/paraphrase_splits.py --dataset datasets/NoPoint --model_dir models/paraphraser_gpt2_large

python style_paraphrase/run_lm_finetuning.py     --output_dir=style_paraphrase/saved_models/NoPoint     --model_type=gpt2     --model_name_or_path=gpt2-large     --data_dir=datasets/NoPoint     --do_train     --save_steps 1000     --evaluate_during_training     --logging_steps 400     --save_total_limit -1     --num_train_epochs 3     --gradient_accumulation_steps 1     --per_gpu_train_batch_size 16     --job_id NoPoint     --learning_rate 5e-5     --prefix_input_type paraphrase_250     --global_dense_feature_list none     --specific_style_train -1     --optimizer adam
