In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell executes, so edits to the project's .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import torch
import pprint

import sys
# Directories appended to sys.path so that the `style_paraphrase` package
# imported below can be resolved from this notebook's working directory.
add_paths = [
    'style_transfer_paraphrase',
    'style_transfer_paraphrase/style_paraphrase',
]
for add_path in add_paths:
    if add_path in sys.path:
        # Already registered (e.g. the cell was re-run); do not add a duplicate.
        continue
    sys.path.append(add_path)
from style_paraphrase.inference_utils import GPT2Generator
import data_preprocessing

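# Environment setup, run manually in a shell (not executed by this notebook).
# The absolute paths below are specific to the original author's machine; adjust them to your checkout.
# pip install transformers
# cd /home/svcl-oowl/brandon/classes/ECE_229/project/chatbot/fairseq
# pip install --editable .
# cd /home/svcl-oowl/brandon/classes/ECE_229/project/chatbot/fairseq/weights
# export ROBERTA_LARGE=$PWD/roberta.large
# cd /home/svcl-oowl/brandon/classes/ECE_229/project/chatbot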

# Model

In [3]:
# Directory of the paraphraser weights. Only consumed by the commented-out
# `paraphraser` construction at the bottom of this cell.
paraphraser_model_dir = os.path.join("style_transfer_paraphrase/models", "paraphraser_gpt2_large")
# Alternative inverse-paraphraser directory ("bible"), currently disabled.
#inverse_paraphraser_model_dir = os.path.join("style_transfer_paraphrase/models", "bible")
# Checkpoint directory under saved_models/298954459172700181_muffins that is
# loaded as `model` below.
inverse_paraphraser_model_dir = "style_transfer_paraphrase/style_paraphrase/saved_models/298954459172700181_muffins/checkpoint-9294"

# 6795  -- NOTE(review): meaning undocumented; possibly an earlier checkpoint step -- confirm.
# Construct the generator with CUDA device 0 selected as the current device.
with torch.cuda.device(0):
    model = GPT2Generator(inverse_paraphraser_model_dir)
    # NOTE: while the next line stays commented out, `paraphraser` is undefined,
    # so any cell that uses it must keep `use_paraphraser = False`.
    #paraphraser = GPT2Generator(paraphraser_model_dir, upper_length="same_5")

In [14]:
# `top_p` values forwarded to generate_batch for each stage.
top_p_paraphrase = 0.0
top_p_style = 0.7
# When True, the input is first passed through `paraphraser`, which must have
# been constructed in the model cell; when False the raw input is used as-is.
use_paraphraser = False

# Prompts tried previously; uncomment one (and comment the active one) to switch.
#input_text = "I should have created all the animals first."
#input_text = "Hey, how was your day?"
#input_text = "That's really nice of you."
#input_text = "My name is sarah."
#input_text = "Do you have any pets?"
#input_text = "I have a pet cat."
#input_text = "I love cats, they're so cute."
input_text = "What other hobbies do you have?"

print([input_text])
# Select CUDA device 0 and disable gradient tracking for the whole generation pass.
with torch.cuda.device(0), torch.no_grad():
    # Stage 1 (optional): paraphrase the input; otherwise wrap it in a one-item batch.
    output_paraphrase = (
        paraphraser.generate_batch([input_text], top_p=top_p_paraphrase)[0]
        if use_paraphraser
        else [input_text]
    )
    print(output_paraphrase)

    # Stage 2: run the batch through `model` and keep the first element of its result.
    transferred_output = model.generate_batch(output_paraphrase, top_p=top_p_style)[0]
    print(transferred_output)
    

['What other hobbies do you have?']
['What other hobbies do you have?']
['What other hobbies are you into?']


# Data Prep

In [2]:
# preps based on instructions from https://github.com/martiansideofthemoon/style-transfer-paraphrase#custom-datasets

def _as_single_line(msg):
    """Render `msg` as text with any embedded line break replaced by a space.

    The dataset files hold one message per line, so a message that itself
    contains a line break would otherwise occupy several lines and no longer
    line up with its single entry in the label file.
    """
    return "{}".format(msg).replace("\r\n", " ").replace("\r", " ").replace("\n", " ")


def _write_lines(path, lines):
    """Write each item of `lines` on its own line as UTF-8; return the number written.

    Items that cannot be encoded (e.g. strings holding lone surrogates) are
    skipped rather than aborting the export. Other errors, such as OSError,
    propagate to the caller.
    """
    written = 0
    with open(path, "w", encoding="utf-8") as f:
        for line in lines:
            try:
                f.write(line + "\n")
            except UnicodeEncodeError:
                continue
            written += 1
    return written


def prep_discord_data_for_style_transfer(channel_dict, output_dir, max_users=1):
    """Write per-user dataset files and print the follow-up preprocessing commands.

    For every processed user a directory ``<output_dir>/<user>`` is created
    containing ``train.txt``, ``dev.txt`` and ``test.txt`` (one message per
    line; the same messages are written to all three files) and the matching
    ``train.label``, ``dev.label`` and ``test.label`` files, which repeat the
    user key once per written message. The BPE-conversion, binarisation and
    paraphrasing commands are only printed; nothing is executed.

    Args:
        channel_dict: Mapping of user key to an iterable of messages. The key
            is used both as the sub-directory name and as the label text.
        output_dir: Directory under which the per-user directories are created.
        max_users: Maximum number of users to process, in the mapping's
            iteration order. The default of 1 processes only the first user;
            pass ``None`` to process every user.

    Returns:
        None.
    """
    for user_index, user in enumerate(channel_dict):
        if max_users is not None and user_index >= max_users:
            break

        print(user)
        user_output_dir = os.path.join(output_dir, user)
        os.makedirs(user_output_dir, exist_ok=True)

        # Materialise once so one-shot iterables work and every split gets the
        # exact same content.
        lines = [_as_single_line(msg) for msg in channel_dict[user]]

        num_msgs = 0
        for data_file in ["train.txt", "dev.txt", "test.txt"]:
            num_msgs = _write_lines(os.path.join(user_output_dir, data_file), lines)
        if num_msgs < len(lines):
            # Surface dropped messages instead of discarding them silently.
            print("skipped {} message(s) that could not be encoded".format(len(lines) - num_msgs))

        # One label per message that was actually written keeps *.txt and
        # *.label aligned line-for-line.
        labels = [user] * num_msgs
        for label_file in ["train.label", "dev.label", "test.label"]:
            _write_lines(os.path.join(user_output_dir, label_file), labels)

        # convert plaintext to BPE
        cmd = "python style_transfer_paraphrase/datasets/dataset2bpe.py --dataset {}".format(user_output_dir)
        print(cmd)
        print("")

        # convert BPE to binaries
        cmd = "style_transfer_paraphrase/datasets/bpe2binary.sh {}".format(user_output_dir)
        print(cmd)
        print("")

        # paraphrase the dataset
        # NOTE(review): the checkout location is hard-coded to the original
        # author's machine, and the --dataset argument is built by dropping the
        # first '/'-separated component of user_output_dir, i.e. it expects
        # output_dir to start with "style_transfer_paraphrase/".
        cmd = "cd /home/svcl-oowl/brandon/classes/ECE_229/project/chatbot/style_transfer_paraphrase && python datasets/paraphrase_splits.py --dataset {} --model_dir {}".format('/'.join(user_output_dir.split('/')[1:]), "models/paraphraser_gpt2_large" )
        print(cmd)
        print("")

In [3]:
# Channel identifier passed to the preprocessing step; an alternative is kept
# commented out below.
channel = "298954459172700181"
#channel = "390037306230177803"
# Directory handed to process_discord_data as the data location.
discord_data_dir = "../data/test_data_small"

# NOTE(review): the meaning of the third argument (3) is defined inside
# data_preprocessing.process_discord_data -- confirm and name it.
channel_messages, message_counts = data_preprocessing.process_discord_data(discord_data_dir, channel, 3)
# Show the first ten entries of message_counts ([user, count] pairs in the recorded output).
print(message_counts[:10])

[['muffins', 12391], ['LenKagamine', 11978], ['Sezzy', 11083], ['Dich', 9058], ['Fabrin', 9001], ['GR88', 8337], ['Victoria', 7341], ['DanniKawai', 7322], ['Neysa', 6949], ['Stalker', 6860]]


In [4]:
# Keep the users whose message count (second field of each entry) exceeds 1000.
valid_users = [entry[0] for entry in message_counts if entry[1] > 1000]
print(valid_users)

# Re-key each selected user's messages as "<channel>_<user>"; that key becomes
# the dataset directory name and the label text.
style_transfer_msgs = {
    f"{channel}_{name}": channel_messages[name]
    for name in valid_users
}

prep_discord_data_for_style_transfer(
    style_transfer_msgs,
    "style_transfer_paraphrase/datasets",
)

['muffins', 'LenKagamine', 'Sezzy', 'Dich', 'Fabrin', 'GR88', 'Victoria', 'DanniKawai', 'Neysa', 'Stalker', 'Mkie', 'TPetraT', 'ScytheOfTheUnholy', 'iza', 'boredmuziekmaster', 'BlackCadillac', 'hafts', 'Cliffu', 'SmallFan', 'skrr', 'Lou', 'AiOhto', 'CPBBAE', 'DimitriosPagourtzis', 'yodad', 'Exynos', 'LucidCapture', 'Erkis', 'Sylvbutold', 'panic', 'LucasPhelma', 'Dx8pi', 'soul', 'Cinder', 'akame.', 'ImmortaL', 'kan2', 'silent', 'CCChesterC', 'Jess', 'Dabbu', 'Marlene', 'LordofHollows', 'WhiteBoy', 'Dr.Senpips', 'AdrYuu', 'Aiz']
298954459172700181_muffins
python style_transfer_paraphrase/datasets/dataset2bpe.py --dataset style_transfer_paraphrase/datasets/298954459172700181_muffins

style_transfer_paraphrase/datasets/bpe2binary.sh style_transfer_paraphrase/datasets/298954459172700181_muffins

cd /home/svcl-oowl/brandon/classes/ECE_229/project/chatbot/style_transfer_paraphrase && python datasets/paraphrase_splits.py --dataset datasets/298954459172700181_muffins --model_dir models/paraphra

In [3]:
# Print the shell commands for the fine-tuning runs (printed only, not executed).
for cmd in (
    "cd /home/svcl-oowl/brandon/classes/ECE_229/project/chatbot/style_transfer_paraphrase",
    "style_paraphrase/examples/298954459172700181_muffins/run_finetune_298954459172700181_muffins.sh",
    "style_paraphrase/examples/298954459172700181_muffins/run_finetune_298954459172700181_muffins_0.sh",
    "style_paraphrase/examples/298954459172700181_muffins/run_finetune_298954459172700181_muffins_1.sh",
):
    print(cmd)

cd /home/svcl-oowl/brandon/classes/ECE_229/project/chatbot/style_transfer_paraphrase
style_paraphrase/examples/298954459172700181_muffins/run_finetune_298954459172700181_muffins.sh
style_paraphrase/examples/298954459172700181_muffins/run_finetune_298954459172700181_muffins_0.sh
style_paraphrase/examples/298954459172700181_muffins/run_finetune_298954459172700181_muffins_1.sh
