In [1]:
import os
import io
import requests
import numpy as np
import pandas as pd
import re
import zipfile
import random
import time
import csv
import datetime
from itertools import compress
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from transformers import AutoTokenizer, AutoConfig, AutoModelForPreTraining, \
                         AdamW, get_linear_schedule_with_warmup, \
                         TrainingArguments, BeamScorer, Trainer

import torch
from torch.utils.data import Dataset, random_split, DataLoader, \
                             RandomSampler, SequentialSampler

from IPython.display import clear_output

print(f"PyTorch version: {torch.__version__}")

PyTorch version: 2.0.0


In [2]:
DEBUG           = False

INPUT_DIR       = 'articles'

USE_APEX        = True
APEX_OPT_LEVEL  = 'O1'

MODEL           = 'gpt2' #{gpt2, gpt2-medium, gpt2-large, gpt2-xl}

UNFREEZE_LAST_N = 6 #The last N layers to unfreeze for training

SPECIAL_TOKENS  = { "bos_token": "<|BOS|>",
                    "eos_token": "<|EOS|>",
                    "unk_token": "<|UNK|>",                    
                    "pad_token": "<|PAD|>",
                    "sep_token": "<|SEP|>"}
                    
MAXLEN          = 256  #{768, 1024, 1280, 1600}

TRAIN_SIZE      = 0.8

if USE_APEX:
    TRAIN_BATCHSIZE = 16
    BATCH_UPDATE    = 128
else:
    TRAIN_BATCHSIZE = 8
    BATCH_UPDATE    = 256

EPOCHS          = 3
LR              = 5e-4
EPS             = 1e-8
WARMUP_STEPS    = 1e2

SEED            = 2020


DEVIDE_BY = 16

os.environ['WANDB_DISABLED'] = 'true'

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(SEED)

In [4]:
train_df = pd.read_csv('D:/amazon_1gb/train.csv',header=None)
test_df = pd.read_csv('D:/amazon_1gb/test.csv',header=None)

In [5]:
train_df = train_df.dropna()
train_df = train_df.astype('str')
test_df = test_df.dropna()
test_df = test_df.astype('str')

In [6]:
sum = 0
sample_num = 1000
for review in train_df.sample(sample_num).iloc[:, 2]:
    sum += len(review.split(' '))
print(sum/sample_num)

74.244


In [7]:
# For debug
train_df = train_df.sample(int(len(train_df) / DEVIDE_BY))
test_df = test_df.sample(int(len(test_df) / DEVIDE_BY / 5))
f'There are {len(train_df) :,} samples for training, and {len(test_df) :,} samples for validation testing'

'There are 224,995 samples for training, and 4,999 samples for validation testing'

In [8]:
class myDataset(Dataset):

    def __init__(self, data, tokenizer, randomize=True):
        self.randomize = randomize
        self.tokenizer = tokenizer 
        self.title     = data.iloc[:, 1].tolist()
        self.text      = data.iloc[:, 2].tolist()


    #---------------------------------------------#

    def __len__(self):
        return len(self.text)

    #---------------------------------------------#
    
    def __getitem__(self, i):
        input = SPECIAL_TOKENS['bos_token'] + self.title[i] + SPECIAL_TOKENS['sep_token'] + self.text[i] + SPECIAL_TOKENS['eos_token']

        encodings_dict = tokenizer(input,                                   
                                   truncation=True, 
                                   max_length=MAXLEN, 
                                   padding="max_length")   
        
        input_ids = encodings_dict['input_ids']
        attention_mask = encodings_dict['attention_mask']
        
        return {'label': torch.tensor(input_ids),
                'input_ids': torch.tensor(input_ids), 
                'attention_mask': torch.tensor(attention_mask)}

In [9]:
def split_data(data, S=TRAIN_SIZE):
    train_data = data.sample(frac = TRAIN_SIZE)
    val_data = data.drop(train_data.index)

    return train_data, val_data

In [10]:
def get_tokenizer(special_tokens=None):
    tokenizer = AutoTokenizer.from_pretrained(MODEL) #GPT2Tokenizer

    # Set eos_token as the padding token
    tokenizer.pad_token = tokenizer.eos_token

    if special_tokens:
        tokenizer.add_special_tokens(special_tokens)
        print("Special tokens added")
    return tokenizer

def get_model(tokenizer, special_tokens=None, load_model_path=None):

    #GPT2LMHeadModel
    if special_tokens:
        config = AutoConfig.from_pretrained(MODEL, 
                                            bos_token_id=tokenizer.bos_token_id,
                                            eos_token_id=tokenizer.eos_token_id,
                                            sep_token_id=tokenizer.sep_token_id,
                                            pad_token_id=tokenizer.pad_token_id,
                                            output_hidden_states=False)
    else: 
        config = AutoConfig.from_pretrained(MODEL,                                     
                                            pad_token_id=tokenizer.eos_token_id,
                                            output_hidden_states=False)    

    #----------------------------------------------------------------#
    model = AutoModelForPreTraining.from_pretrained(MODEL, config=config, from_tf=True)
    if special_tokens:
        #Special tokens added, model needs to be resized accordingly
        model.resize_token_embeddings(len(tokenizer))

    if load_model_path:
        model.load_state_dict(torch.load(load_model_path))

    model.cuda()
    return model


In [11]:
%%time

tokenizer = get_tokenizer(special_tokens=SPECIAL_TOKENS)
model = get_model(tokenizer, 
                  special_tokens=SPECIAL_TOKENS,
                #   load_model_path='pytorch_model.bin'
                 )

Special tokens added


All TF 2.0 model weights were used when initializing GPT2LMHeadModel.

All the weights of GPT2LMHeadModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.


CPU times: total: 7.98 s
Wall time: 4.56 s


In [12]:
# - Freeze selective layers:
# - Freeze all layers except last n:
for parameter in model.parameters():
    parameter.requires_grad = False

for i, m in enumerate(model.transformer.h):        
    #Only un-freeze the last n transformer blocks
    if i+1 > 12 - UNFREEZE_LAST_N:
        for parameter in m.parameters():
            parameter.requires_grad = True 

for parameter in model.transformer.ln_f.parameters():        
    parameter.requires_grad = True

for parameter in model.lm_head.parameters():        
    parameter.requires_grad = True

In [13]:
train_dataset = myDataset(train_df, tokenizer)
val_dataset = myDataset(test_df, tokenizer, randomize=False)

In [14]:
#UNCOMMENT THESE TO TRAIN THE GPT MODEL (ETA: 7 HRS)
#%%time

training_args = TrainingArguments(
    output_dir="./",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    gradient_accumulation_steps=BATCH_UPDATE,
    evaluation_strategy="epoch",
    save_strategy = 'epoch',
    fp16=True,
    fp16_opt_level=APEX_OPT_LEVEL,
    warmup_steps=WARMUP_STEPS,    
    learning_rate=LR,
    adam_epsilon=EPS,
    weight_decay=0.01,        
    save_total_limit=1,
    load_best_model_at_end=True,
    report_to = None,
)

#---------------------------------------------------#
trainer = Trainer(
    model=model,
    args=training_args,    
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

#---------------------------------------------------#
trainer.train()
trainer.save_model()    

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


  0%|          | 0/1755 [00:00<?, ?it/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 2.2619, 'learning_rate': 0.00037915407854984895, 'epoch': 0.85}


  0%|          | 0/1667 [00:00<?, ?it/s]

{'eval_loss': 1.346152424812317, 'eval_runtime': 83.9003, 'eval_samples_per_second': 59.583, 'eval_steps_per_second': 19.869, 'epoch': 1.0}
{'loss': 1.388, 'learning_rate': 0.0002280966767371601, 'epoch': 1.71}


  0%|          | 0/1667 [00:00<?, ?it/s]

{'eval_loss': 1.330527901649475, 'eval_runtime': 83.9503, 'eval_samples_per_second': 59.547, 'eval_steps_per_second': 19.857, 'epoch': 2.0}
{'loss': 1.359, 'learning_rate': 7.70392749244713e-05, 'epoch': 2.56}


  0%|          | 0/1667 [00:00<?, ?it/s]

{'eval_loss': 1.325377345085144, 'eval_runtime': 83.2588, 'eval_samples_per_second': 60.042, 'eval_steps_per_second': 20.022, 'epoch': 3.0}
{'train_runtime': 48920.2208, 'train_samples_per_second': 13.798, 'train_steps_per_second': 0.036, 'train_loss': 1.6227318744713763, 'epoch': 3.0}


In [15]:
tokenizer = get_tokenizer(special_tokens=SPECIAL_TOKENS)
model = get_model(tokenizer, 
                  special_tokens=SPECIAL_TOKENS,
                  load_model_path='pytorch_model.bin')


Special tokens added


All TF 2.0 model weights were used when initializing GPT2LMHeadModel.

All the weights of GPT2LMHeadModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.


In [22]:
title = "Wings of Fire"
prompt = SPECIAL_TOKENS['bos_token'] + title + SPECIAL_TOKENS['sep_token'] 
         
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
device = torch.device("cuda")
generated = generated.to(device)

model.eval();

In [28]:
# Top-p (nucleus) text generation (10 samples):
sample_outputs = model.generate(generated, 
                                do_sample=True,   
                                min_length=50, 
                                max_length=MAXLEN,
                                top_k=30,                                 
                                top_p=0.7,        
                                temperature=0.9,
                                repetition_penalty=2.0,
                                num_return_sequences=10
                                )

for i, sample_output in enumerate(sample_outputs):
    text = tokenizer.decode(sample_output, skip_special_tokens=True)
    a = len(title)  
    print("{}: {}\n\n".format(i+1,  text[a:]))

1: of Fire<|SEP|>Lance(S)^[C]{3}H*EJX'I,6O\R9KQ5Y-F2G4ZU?V8D.A7N0@M!T+#$1 M=&_`;%',/:~"^^'/",/- '?''; - I'-':,,/,,-:- "'".:' (i-'.'..''...), ;■• ■£.-/. :,-. i'.-,j:, /,. a.: e.* Jn,'.; >-- = ou *., ua 1 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
—–»« » « ——— | ^r r nf


2: of Fire<|SEP|>Lords Of War</b><i>"War"/"~^[a]$'(.&=\@+#%/S,0T;?4I_9Z1X5V6Y3C-J2MQ8KG)7A`E*RHUO!D}F:NKK MIXED "** WATER](\\.*)/\/



3: of Fire<|SEP|>Achievements[0-19]
 (1) "Fire" Level 1, Skill Points: 100% | [100%] Lv. 4 - 9 XP Required to level up the skill in any game mode or by using a special ability that grants you additional points from skills and abilities for your own purposes! <~=The first time I ever played with it on my 3DS was at E3 2014 when we were playing Supe

In [29]:
# Beam-search text generation:
sample_outputs = model.generate(generated, 
                                do_sample=True,   
                                max_length=MAXLEN,                                                      
                                num_beams=5,
                                repetition_penalty=5.0,
                                early_stopping=True,      
                                num_return_sequences=1
                                )

for i, sample_output in enumerate(sample_outputs):
    text = tokenizer.decode(sample_output, skip_special_tokens=True)
    a = len(title) 
    print("{}: {}\n\n".format(i+1,  text[a:]))

1: of Fire<|SEP|>Razor-Wraith</b><br /> <a href="http://www.netflix.com/browse/genre/1,539" rel="nofollow">Fantastic Beasts and Where to Find Them</a></p><p align='Left'><font face='Strat2Medium' color='#FFFFFF' size='14'></font></p><p align='Left'><font face='Strat2Medium' color='#CCCCCC' size='14'>Locked<br>Next Rank Cost: 1 points</font></p> STRIPPED TEXT = FANTASTIC BEINGS AND WHERE TO Find Them Locked Next Rank Cost: 1 points RANKUP HTML = <p align='Left'><font face='Strat2Medium' color='#CCCCCC' size='14'>Next Rank Cost: 1 points</font></p><p align='Left'><font face='Strat2Medium' color='#28AA00' size='14'>+10% Base Damage Rating when you take damage from a Critical Strike.</font></p> RANKUP STRIPPED TEXT = Next




In [30]:
#Generating raw text with GPT-2
tokenizer = get_tokenizer()
model = get_model(tokenizer)

All TF 2.0 model weights were used when initializing GPT2LMHeadModel.

All the weights of GPT2LMHeadModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.


In [32]:
prompt = title

generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
device = torch.device("cuda")
generated = generated.to(device)

model.eval()
sample_outputs = model.generate(generated, 
                                do_sample=True,   
                                max_length=MAXLEN,                                                      
                                num_beams=5,
                                repetition_penalty=5.0,
                                early_stopping=True,      
                                num_return_sequences=1
                                )

for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

0: Wings of Fire

In addition to being a great way to get your feet wet, it can also help reduce the amount of time you'll have to clean up after yourself. Here are some tips on how to keep your hands dry:

Clean Your Hands After Cleaning Up

Don't just wash your hands with soap and water. If you don't want to do that, try using an old toothbrush or scrubber instead. In fact, if you've ever washed your hands before, this is probably one of the best ways to make sure they're getting used to washing them properly.


