The GPT-2 implementation is adapted from the HuggingFace library: https://huggingface.co/gpt2

In [1]:
import random
import shlex

import pandas as pd
import wandb
from rake_nltk import Rake

# Authenticate this session with Weights & Biases (wandb) up front.
# NOTE(review): presumably so the training script run later can sync its run
# to wandb without prompting -- confirm.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33madityagaydhani[0m (use `wandb login --relogin` to force relogin)


True

In [2]:
# Twitter handle whose tweets are modelled. It selects the input CSV and names
# the training text file and the model output directory used by later cells.
handle = 'realDonaldTrump' # Change handle to JoeBiden for training the model on Joe Biden's data

In [3]:
# Load the CSV for the selected handle; it must provide a 'tweet' column.
df = pd.read_csv(f'../../data/{handle}.csv')
# Keep only the tweet column; this is what gets passed to make_dataset().
my_tweets = df['tweet']

In [4]:
def make_dataset(dataset, epochs, rake=None):
    """Build the context-conditioned training corpus as one string.

    Every tweet becomes one example of the form
    ``<keywords><|endofcontext|><tweet><|endoftext|>`` where ``<keywords>`` is
    the space-joined list of ranked phrases produced by the keyword extractor.
    The corpus starts with ``<|endoftext|>`` and contains the full set of
    examples ``epochs`` times, reshuffled before each repetition.

    Args:
        dataset: Iterable of tweet texts. Entries that are not ``str``
            (for example NaN for an empty CSV cell) are skipped, because the
            keyword extractor only accepts text.
        epochs: Number of shuffled repetitions of the examples to emit.
        rake: Optional keyword extractor exposing
            ``extract_keywords_from_text(text)`` and ``get_ranked_phrases()``.
            Defaults to ``Rake(max_length=2)``.

    Returns:
        The whole corpus as a single string; just ``'<|endoftext|>'`` when
        there are no usable tweets or ``epochs`` is 0.
    """
    if rake is None:
        rake = Rake(max_length=2)

    # The keywords depend only on the tweet text, so build every example once
    # instead of re-running the extractor for every tweet on every epoch.
    examples = []
    for tweet in dataset:
        if not isinstance(tweet, str):
            continue
        rake.extract_keywords_from_text(tweet)
        context = ' '.join(rake.get_ranked_phrases())
        examples.append(context + '<|endofcontext|>' + tweet + '<|endoftext|>')

    # Collect the pieces and join once rather than growing a string with +=.
    pieces = ['<|endoftext|>']
    for _ in range(epochs):
        # In-place shuffle: each repetition reorders the previous ordering.
        random.shuffle(examples)
        pieces.extend(examples)
    return ''.join(pieces)

In [5]:
# Number of shuffled repetitions of the tweets written into the training file.
EPOCHS = 4

# Build the corpus before opening the file: opening with 'w' truncates it, so
# a failure inside make_dataset() must not leave an empty training file behind.
data = make_dataset(my_tweets, EPOCHS)

# Tweets can contain non-ASCII characters (curly quotes, emoji, ...). Write
# UTF-8 explicitly instead of relying on the platform's default encoding.
# NOTE(review): presumably the training script reads the file as UTF-8 -- confirm.
with open(f'../../data/{handle}_context_train.txt', 'w', encoding='utf-8') as f:
    f.write(data)

## Training

In [6]:
!python ../../scripts/run_language_modeling.py \
    --output_dir=output/$handle\_context \
    --overwrite_output_dir \
    --overwrite_cache \
    --model_type=gpt2 \
    --model_name_or_path=gpt2 \
    --do_train --train_data_file=../../data/$handle\_context_train.txt \
    --logging_steps 20 \
    --per_gpu_train_batch_size 1 \
    --num_train_epochs 1

2020-12-20 15:15:48.883703: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
12/20/2020 15:15:51 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(output_dir='output/realDonaldTrump_context', overwrite_output_dir=True, do_train=True, do_eval=False, do_predict=False, model_parallel=False, evaluation_strategy=<EvaluationStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=8, per_device_eval_batch_size=8, per_gpu_train_batch_size=1, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=1.0, max_steps=-1, warmup_steps=0, logging_dir='runs/Dec20_15-15-51_aditya-XPS-15-9570', logging_first_step=False, logging_steps=20, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level='O1', loca

[34m[1mwandb[0m: Currently logged in as: [33madityagaydhani[0m (use `wandb login --relogin` to force relogin)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Tracking run with wandb version 0.10.12
[34m[1mwandb[0m: Syncing run [33moutput/realDonaldTrump_context[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/adityagaydhani/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/adityagaydhani/huggingface/runs

## Generate new tweets

In [7]:
# Keyword extractor configured like the one in make_dataset() (max_length=2),
# so the generation prompt is built the same way as the training contexts.
r = Rake(max_length=2)

# Replace this tweet with any other tweet.
tweet_context = "The drug companies are going crazy putting up nasty ads against me asking to “withdraw my Favored Nation’s Executive Order”. They don’t want the U.S. to have the lowest drug prices in the world, but we now will. Big reductions coming. No other politician would do this!!!"

# Reduce the tweet to its ranked key phrases, joined with spaces.
r.extract_keywords_from_text(tweet_context)
# One prompt context per entry; the generation cell loops over this list.
SENTENCES = [' '.join(r.get_ranked_phrases())]

In [12]:
seed = random.randint(0, 2**32-1)
examples = []
num_return_sequences = 3

for start in SENTENCES:
    val = !python ../../scripts/run_generation.py \
        --model_type gpt2 \
        --model_name_or_path output/$handle\_context \
        --length 160 \
        --num_return_sequences $num_return_sequences \
        --temperature 1 \
        --p 0.95 \
        --seed $seed \
        --prompt {'"<|endoftext|>' + start + '<|endofcontext|>"'}
    generated = [val[-2*(k+1)] for k in range(num_return_sequences)[::-1]]
    print(f'\nContext: {tweet_context}')
    for i, g in enumerate(generated):
        g = g.split('<|endofcontext|>', 1)[1]
        g = g.replace('<|endoftext|>', '')
        g = g.replace('<|endofcontext|>', '')
        print(f'* Generated #{i+1}: {g}')


Context: The drug companies are going crazy putting up nasty ads against me asking to “withdraw my Favored Nation’s Executive Order”. They don’t want the U.S. to have the lowest drug prices in the world, but we now will. Big reductions coming. No other politician would do this!!!
* Generated #1: “Even today, I heard some nasty attacks from a corrupt politician who wants to withdraw the whole “Volunteer” support..”
* Generated #2: I “ve received much lower prices and dangerous ads. The American people want “take care of their congressman”, asking for U.S. drug companies to turn over their “dide to drug companies. Doing nothing was nasty for our Country! — Senator Joe Biden (@SenatorBiden) November 4, 2016
* Generated #3: “deport our corrupt politician the Drug Companies—“to Drain The Swamp!
