The GPT-2 implementation is adapted from the HuggingFace library: https://huggingface.co/gpt2

In [3]:
import random
import pandas as pd
import wandb

# Authenticate this notebook session with Weights & Biases.
# NOTE(review): presumably so that the training script run further down can
# report its metrics to W&B — confirm against the script.
wandb.login()

In [6]:
# Twitter handle to work on. It selects the input CSV (../../data/<handle>.csv),
# the generated training text file and the model output directory.
# Change handle to 'JoeBiden' to train the model on Joe Biden's data.
handle = 'realDonaldTrump'

In [7]:
# Load the tweets CSV for `handle` and keep only the tweet text column.
df = pd.read_csv(f'../../data/{handle}.csv')
# Empty 'tweet' cells are read by pandas as NaN (a float). The dataset builder
# joins the values with str.join, which raises TypeError on non-string items,
# so drop missing tweets here. Rows with text are unaffected.
my_tweets = df['tweet'].dropna()

In [10]:
def make_dataset(dataset, epochs, seed=None):
    """Build the training text: `epochs` shuffled copies of the tweets.

    The result starts with one '<|endoftext|>' marker; every epoch then
    appends all tweets in a freshly shuffled order, separated by and
    terminated with '<|endoftext|>'.

    Args:
        dataset: Iterable of tweet strings (e.g. a pandas Series). Items that
            are not `str` (such as NaN or None from empty CSV cells) are
            skipped, because `str.join` would raise TypeError on them.
        epochs: Number of shuffled copies of the tweets to concatenate.
        seed: Optional seed for a private random generator, making the
            shuffle order reproducible. With the default `None` the global
            `random` module state is used, as before.

    Returns:
        The concatenated training text as a single string.
    """
    separator = '<|endoftext|>'
    tweets = [t for t in dataset if isinstance(t, str)]
    # A dedicated Random instance keeps seeded runs from disturbing (or being
    # disturbed by) the global random state.
    rng = random if seed is None else random.Random(seed)
    # Collect the pieces and join once instead of growing a string in a loop.
    chunks = [separator]
    for _ in range(epochs):
        # The list is shuffled in place, so each epoch reshuffles the previous order.
        rng.shuffle(tweets)
        chunks.append(separator.join(tweets) + separator)
    return ''.join(chunks)

In [11]:
# Number of shuffled copies of the tweets concatenated into the training file.
EPOCHS = 4

# Build the text before opening the file, so that a failure in make_dataset()
# does not leave behind a truncated training file.
data = make_dataset(my_tweets, EPOCHS)

# Write explicitly as UTF-8: the tweets contain non-ASCII characters (e.g. ’),
# which the platform default encoding may be unable to represent.
with open(f'../../data/{handle}_train.txt', 'w', encoding='utf-8') as f:
    f.write(data)

## Training

In [13]:
!python ../../scripts/run_language_modeling.py \
    --output_dir=output/$handle \
    --overwrite_output_dir \
    --overwrite_cache \
    --model_type=gpt2 \
    --model_name_or_path=gpt2 \
    --do_train --train_data_file=../../data/$handle\_train.txt \
    --logging_steps 20 \
    --per_gpu_train_batch_size 1 \
    --num_train_epochs 1

12/20/2020 13:20:18 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(output_dir='output/realDonaldTrump', overwrite_output_dir=True, do_train=True, do_eval=False, do_predict=False, evaluate_during_training=False, evaluation_strategy=<EvaluationStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=8, per_device_eval_batch_size=8, per_gpu_train_batch_size=1, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=1.0, max_steps=-1, warmup_steps=0, logging_dir='runs/Dec20_13-20-18_aditya-XPS-15-9570', logging_first_step=False, logging_steps=20, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level='O1', local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=20, dataloader_num_workers=0, past_i

  8%|▊         | 20/258 [01:40<21:05,  5.32s/it]{'loss': 3.987057113647461, 'learning_rate': 4.6124031007751936e-05, 'epoch': 0.07751937984496124}
 16%|█▌        | 40/258 [03:29<19:55,  5.48s/it]{'loss': 3.6806331634521485, 'learning_rate': 4.2248062015503877e-05, 'epoch': 0.15503875968992248}
                                                {'loss': 3.5538658142089843, 'learning_rate': 3.837209302325582e-05, 'epoch': 0.23255813953488372}
 31%|███       | 80/258 [07:10<15:04,  5.08s/it]{'loss': 3.464324951171875, 'learning_rate': 3.449612403100775e-05, 'epoch': 0.31007751937984496}
 39%|███▉      | 100/258 [08:53<14:49,  5.63s/it]{'loss': 3.373027038574219, 'learning_rate': 3.062015503875969e-05, 'epoch': 0.3875968992248062}
 47%|████▋     | 120/258 [10:31<11:05,  4.82s/it]{'loss': 3.301316833496094, 'learning_rate': 2.674418604651163e-05, 'epoch': 0.46511627906976744}
 54%|█████▍    | 140/258 [12:12<10:17,  5.24s/it]{'loss': 3.21280517578125, 'learning_rate': 2.2868217054263565e-05, 'e

## Generate new tweets

In [14]:
# Sentence starts used as generation prompts: each entry is passed to the
# generation script as the beginning of the text to complete.
SENTENCES = ["I think that",
             "I like",
             "I don't like",
             "I want",
             "My dream is"]

In [17]:
# Generate `num_return_sequences` completions for every prompt in SENTENCES
# with the fine-tuned model stored in output/<handle>, and print them.

# A single random seed is drawn each time this cell runs and is passed to every
# invocation of the generation script below.
seed = random.randint(0, 2**32-1)
# NOTE(review): `examples` is not used anywhere in this cell — possibly a leftover.
examples = []
num_return_sequences = 3

for start in SENTENCES:
    # `val = !cmd` captures the command's output as a list of lines.
    # The prompt is '<|endoftext|>' followed by the sentence start, wrapped in
    # double quotes so the shell passes it as one argument.
    # NOTE(review): the training cell runs ../../scripts/run_language_modeling.py,
    # whereas this script is looked up at ../../run_generation.py — confirm the path.
    val = !python ../../run_generation.py \
        --model_type gpt2 \
        --model_name_or_path output/$handle \
        --length 160 \
        --num_return_sequences $num_return_sequences \
        --temperature 1 \
        --p 0.95 \
        --seed $seed \
        --prompt {'"<|endoftext|>' + start + '"'}
    # Pick every second line counting back from the end of the output
    # (indices -2*n, ..., -4, -2), keeping the order in which they were printed.
    # NOTE(review): assumes the script prints each generated sequence on a single
    # line followed by exactly one other line — confirm against the script's
    # output; a sequence containing a newline would break this indexing.
    generated = [val[-2*(k+1)] for k in range(num_return_sequences)[::-1]]
    print(f'\nStart of sentence: {start}')
    for i, g in enumerate(generated):
        # Strip the end-of-text markers (including the one prepended to the prompt).
        g = g.replace('<|endoftext|>', '')
        print(f'* Generated #{i+1}: {g}')


Start of sentence: I think that
* Generated #1: I think that politics is a match made in heaven. I don't want this country to look like the UK or France, I want it to look like Italy, Ireland, and the USA. There are TWO major parties.
* Generated #2: I think that makes for a very good case for fixing the system: @Limbaugh didn’t fake up the case, but tried, and failed, to say he did so incorrectly (by hiding the facts). Thanks to Brooks for the great remarks on Biden. I wonder whether I’ll see the same level of enthusiasm we got from all the major sports – and Trump. See you in Wisconsin!
* Generated #3: I think that the total number of terrorists and terrorist want to be factored in, however well prepared and willing they may be to slaughter Americans, will always be MUCH higher than what we get at the Donald Trump level. I would take more power or funding from @siriano. Thank you!

Start of sentence: I like
* Generated #1: I like politics. I enjoy it. I see it as our best chance to 


Start of sentence: My dream is
* Generated #1: My dream is that Asiana, plus Canada, see those failed states we win and they look on our state-by-state polling numbers of VOTER FORCE. We will never forgive China for allowing your vote.
* Generated #2: My dream is to raise 800,000 dollars for a successful business with one of America's World Famous Listens!
* Generated #3: My dream is Free Europe!
