In [1]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
# from fastai.text.all import *
# from fastai.text.all import get_text_files

In [4]:
import transformers
transformers.__version__

'3.4.0'

In [2]:
pretrained_weights = 'gpt2'
tokenizer = GPT2TokenizerFast.from_pretrained(pretrained_weights)
model = GPT2LMHeadModel.from_pretrained(pretrained_weights)

In [14]:
from nltk.corpus import words

In [15]:
'in' in words.words()

True

In [4]:
positive_train_path = '/home/advaitmb/.fastai/data/imdb/train/pos/'
positive_train_files = get_text_files(positive_train_path)

negative_train_path = '/home/advaitmb/.fastai/data/imdb/train/neg/'
negative_train_files = get_text_files(negative_train_path)


positive_test_path = '/home/advaitmb/.fastai/data/imdb/test/pos/'
positive_test_files = get_text_files(positive_test_path)

negative_test_path = '/home/advaitmb/.fastai/data/imdb/test/neg/'
negative_test_files = get_text_files(negative_test_path)

In [5]:
def files_to_list(files):
    ls = []
    for file in files:
        with open(file, 'r') as f:
            ls.append(f.read())
    return ls


In [6]:
positive_train_text_list = files_to_list(positive_train_files)
negative_train_text_list = files_to_list(negative_train_files)
positive_test_text_list = files_to_list(positive_test_files)
negative_test_text_list = files_to_list(negative_test_files)

In [9]:
import re

def clean_html(raw_html):
    cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext

In [10]:
def clean_newlines(raw_text):
    return raw_text.replace('\n', '')

In [11]:
for i in range(len(positive_train_text_list)):
    positive_train_text_list[i] = clean_newlines(clean_html(positive_train_text_list[i]))
    
for i in range(len(negative_train_text_list)):
    negative_train_text_list[i] = clean_newlines(clean_html(negative_train_text_list[i]))
    
for i in range(len(positive_test_text_list)):
    positive_test_text_list[i] = clean_newlines(clean_html(positive_test_text_list[i]))
    
for i in range(len(negative_test_text_list)):
    negative_test_text_list[i] = clean_newlines(clean_html(negative_test_text_list[i]))

In [13]:
with open('positive_train_text', 'w') as f:
    for text in positive_train_text_list:
        f.write(text + '\n')
        
with open('negative_train_text', 'w') as f:
    for text in negative_train_text_list:
        f.write(text + '\n')
        
with open('positive_test_text', 'w') as f:
    for text in positive_test_text_list:
        f.write(text + '\n')
        
with open('negative_test_text', 'w') as f:
    for text in negative_test_text_list:
        f.write(text + '\n')

In [15]:
with open('positive_train_text', 'r') as f:
    print(f.read()[:5000])

Steven Speilberg's adaptation of Alice Walkers popular novel is not without its share of controversy. When first released members of the black community criticised its treatment of black men, while others questioned why a white man was directing this film about black women.This is the story of a young black woman named Celie, growing up in rural America after the turn of the century. She has two children by her abusive father which are snatched from her arms at birth. Her only solace in her miserable life comes from her sister.Celie (played in later years by newcomer Whoopie Goldberg) is married off to an abusive husband (Danny Glover). The husband is humiliated by the sister and so she is quickly removed from Celie's life.The story is often heartbreaking as Celie keeps up hope that she may one day be reunited with her sister and with her children. Throughout her life she meets an assortment of characters, including Sophia, a tough as nails wife to her step son, and Shug, a loud and lu

In [4]:
train_path = 'negative_train_text'
test_path = 'negative_test_text'

In [6]:
from transformers import TextDataset,DataCollatorForLanguageModeling

def load_dataset(train_path,test_path,tokenizer):
    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=128)
     
    test_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=128)   
    
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset,test_dataset,data_collator

train_dataset,test_dataset,data_collator = load_dataset(train_path,test_path,tokenizer)

In [9]:
from transformers import Trainer, TrainingArguments


training_args = TrainingArguments(
    output_dir="./gpt2-imdb-negative-sentiment", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=3, # number of training epochs
    per_device_train_batch_size=8, # batch size for training
    per_device_eval_batch_size=8,  # batch size for evaluation
    eval_steps = 400, # Number of update steps between two evaluations.
    save_steps=800, # after # steps model is saved 
    warmup_steps=500,# number of warmup steps for learning rate scheduler
    )


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    prediction_loss_only=True,
)

In [10]:
trainer.train()

Step,Training Loss
500,4.020727
1000,3.946644
1500,3.902403
2000,3.869069
2500,3.872636
3000,3.834459
3500,3.822205
4000,3.684422
4500,3.703885
5000,3.686973


TrainOutput(global_step=10485, training_loss=3.724238122913686)

In [11]:
trainer.save_model()

In [10]:
prompt = "acomplete"
prompt_ids = tokenizer.encode(prompt)
inp = tensor(prompt_ids)[None].cuda()

In [11]:
inp

tensor([[ 330,  296, 6677]], device='cuda:0')

In [19]:
beam_outputs = trainer.model.generate(
    inp, 
    max_length=20, 
    num_beams=10, 
    repeatition_penalty=2.,  
    early_stopping=True,
    do_sample=True
)

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


In [20]:
for i, beam_output in enumerate(beam_outputs):
    output = tokenizer.decode(beam_output.cpu().numpy(), skip_special_tokens=True)
#     sentiment = sentiment_model.forward(sentiment_tokenizer.encode(prompt[len(prompt):], return_tensors="pt"))
    
    print("{}: \n{}".format(i, output))
    

0: 
this movie is the worst movie I have ever seen in my life. The acting is terrible, the
1: 
this movie is a waste of your time and money. If you want to see a good movie,
2: 
this movie is so bad it's good, but it's not even good enough to be good.
3: 
this movie is so bad, it's hard to believe that it was made in the first place.
4: 
this movie is so bad, it's hard to believe that it was made in the first place.
5: 
this movie is a waste of time and money.
This movie is a waste of time and money
6: 
this movie is so bad that it's not even funny. The acting is so bad that it's
7: 
this movie is a waste of time and money.
This is the worst movie I have ever seen
8: 
this movie is a waste of time. If you want to watch a bad movie, don't watch
9: 
this movie is a waste of time. It's not funny, it's not funny, it's
