In [None]:
# Import libraries
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from string import punctuation
import re
import nltk
import torch
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, Dataset
from keras.preprocessing.text import Tokenizer
from tqdm import tqdm
# Register `progress_apply` on pandas objects (used by the preprocessing cell).
tqdm.pandas()
# 'stopwords' feeds the Russian stop-word list, 'punkt' the tokenizer models.
# Recent NLTK releases look up 'punkt_tab' instead of 'punkt' for
# word_tokenize, so fetch both to keep the notebook runnable on a fresh env.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
  nltk.download(nltk_resource)


In [None]:
# Read both joke datasets.
nekdoru = pd.read_csv('datasets/nekdoru.csv')
# This read was commented out while `anekdot` was still used below, which
# raised a NameError on a fresh kernel.
anekdot = pd.read_csv('datasets/anekdotyru.csv')
# Stack the two frames; ignore_index avoids duplicated row labels.
df = pd.concat([nekdoru, anekdot], ignore_index=True)
# Keep only rows that actually contain a joke text.
df = df[df['text'].notna()]
df.head()

In [None]:
# ref: Lab5_More_on_LSTM.ipynb
# ref: https://www.kaggle.com/alxmamaev/how-to-easy-preprocess-russian-text
# Preprocess data
# Separator-like characters that are replaced by a single space.
# Raw string: the pattern contains backslash escapes meant for the regex engine.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Everything that is NOT a digit, a lowercase Cyrillic letter, a space, '#',
# '+' or '_' is deleted. "ё" is listed explicitly because it lies outside the
# а-я range and was previously stripped out of every word containing it.
BAD_SYMBOLS_RE = re.compile(r'[^0-9а-яё #+_]')
# Russian stop words shipped with NLTK.
STOPWORDS = set(stopwords.words('russian'))

def preprocessing(text):
  """Normalize one raw joke text into a space-separated string of tokens.

  Steps: lowercase, strip HTML tags, turn separator characters into spaces,
  delete every character matched by BAD_SYMBOLS_RE, drop stand-alone single
  Cyrillic letters, then remove stop words and punctuation-only tokens.

  Args:
    text: the raw text (str).

  Returns:
    The cleaned text with tokens joined by single spaces.
  """
  # lowercase text
  text = text.lower()
  # Strip HTML tags first: this used to run after BAD_SYMBOLS_RE had already
  # deleted '<' and '>', so the tag pattern could never match.
  text = re.sub(r'<[^>]+>', ' ', text)
  # replace REPLACE_BY_SPACE_RE symbols by space in text
  text = REPLACE_BY_SPACE_RE.sub(' ', text)
  # remove symbols which are in BAD_SYMBOLS_RE from text
  text = BAD_SYMBOLS_RE.sub('', text)

  # Single character removal (a lone Cyrillic letter surrounded by whitespace)
  text = re.sub(r"\s+[а-яА-Я]\s+", ' ', text)

  # remove stopwords and punctuation-only tokens; split() never yields
  # whitespace tokens, so no extra blank check is needed
  tokens = [token for token in text.split()
            if token not in STOPWORDS and token not in punctuation]
  return ' '.join(tokens)

df['text'] = df['text'].progress_apply(preprocessing)

In [None]:
# Persist the cleaned texts. index=False keeps the row labels (which carry no
# information after concatenation/filtering) from being written as an extra
# unnamed column.
df.to_csv('preprocessed.csv', index=False)

In [None]:
# tokenize
# `word_tokenize` was never imported as a bare name, so the call goes through
# the already-imported `nltk` package, which exposes it at top level.
sent_tokens = [nltk.word_tokenize(sent) for sent in df['text']]
# Total number of tokens over the whole corpus.
total_words = sum(len(tokens) for tokens in sent_tokens)

In [None]:
# ref: Lab5_More_on_LSTM.ipynb
# Convert all words into numbers using keras tokenizer
# `sent_tokens` is a ragged list of token lists; np.ravel() cannot build an
# array from it on current NumPy. The keras Tokenizer accepts a list of token
# lists directly, so the nested list is passed as-is.
all_words = sent_tokens  # name kept for any later cell that refers to it
tokenizer = Tokenizer(num_words=50000)
tokenizer.fit_on_texts(sent_tokens)
# X: one list of word indices per text; Y: the matching ratings.
X = tokenizer.texts_to_sequences(sent_tokens)
Y = df['rate'].values

In [None]:
# Convert csv files to txt by adding end of text tokens
import csv
csv_file1 = 'datasets/anekdotyru.csv'
csv_file2 = 'datasets/nekdoru.csv'
txt_file = 'input.txt'
# Explicit utf-8 so the Cyrillic text does not depend on the platform's default
# encoding; newline='' is what the csv module expects for files it parses.
# The `with` block closes the output file, so no explicit close() is needed.
with open(txt_file, "w", encoding="utf-8") as my_output_file:
  for csv_path in (csv_file1, csv_file2):
    with open(csv_path, "r", encoding="utf-8", newline="") as my_input_file:
      for row in csv.reader(my_input_file):
        # Column 1 holds the joke text; skip short rows and the header row.
        if len(row) > 1 and row[1] != 'text':
          my_output_file.write(row[1] + '<|endoftext|>')

In [None]:
%pip install transformers

In [None]:
import csv
import logging
logging.getLogger().setLevel(logging.CRITICAL)
from torch.utils.data import Dataset
from torch.utils.data import Dataset, DataLoader
import os
import json
import torch
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup
# --- Fine-tuning configuration ---
BATCH_SIZE = 16        # sequences whose gradients are accumulated per optimizer step
EPOCHS = 5             # full passes over the joke loader
LEARNING_RATE = 3e-5   # peak learning rate handed to the optimizer
WARMUP_STEPS = 5000    # warm-up steps handed to the LR scheduler
MAX_SEQ_LEN = 400      # maximum number of tokens in one training sequence

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Pretrained GPT-2 (medium) tokenizer and language model, moved to `device`.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium').to(device)

class JokesDataset(Dataset):
  """Dataset of joke strings read from one or more CSV files.

  Every row with at least two columns contributes its second column
  (index 1), unless that cell equals the header name 'text'. Each joke is
  stored as "JOKE:<text><|endoftext|>".
  """

  def __init__(self, jokes_dataset_paths):
    """Load all jokes from the given CSV paths.

    Args:
      jokes_dataset_paths: iterable of paths to comma-separated files.
    """
    super().__init__()
    self.joke_list = []
    self.end_of_text_token = "<|endoftext|>"
    for path in jokes_dataset_paths:
      # utf-8 keeps reading independent of the platform's default encoding;
      # newline='' is what the csv module expects for files it parses.
      with open(path, encoding='utf-8', newline='') as csv_file:
        for row in csv.reader(csv_file, delimiter=','):
          # Skip rows without a text column and the header row.
          if len(row) > 1 and row[1] != 'text':
            self.joke_list.append(f"JOKE:{row[1]}{self.end_of_text_token}")

  def __getitem__(self, item):
    """Return the formatted joke string at position `item`."""
    return self.joke_list[item]

  def __len__(self):
    """Return the number of loaded jokes."""
    return len(self.joke_list)

# The CSV files live under datasets/ (the location every other cell reads them
# from); the bare file names pointed at files that are not referenced anywhere else.
dataset = JokesDataset(jokes_dataset_paths=['datasets/anekdotyru.csv', 'datasets/nekdoru.csv'])
# One joke per batch, in random order.
joke_loader = DataLoader(dataset, batch_size=1, shuffle=True)

def choose_from_top(probs, n=5):
  """Sample one token id from the `n` most probable entries of `probs`.

  The `n` largest probabilities are renormalized to sum to 1 and one of them
  is drawn at random with those weights.

  Args:
    probs: 1-D array-like of probabilities, indexed by token id.
    n: how many of the most probable tokens to sample from. Values larger
      than len(probs) are clamped to len(probs).

  Returns:
    The chosen token id as a Python int.

  Raises:
    ValueError: if `probs` is empty or `n` is smaller than 1.
  """
  probs = np.asarray(probs)
  # Asking for more candidates than exist used to make argpartition raise.
  n = min(n, probs.size)
  if n < 1:
    raise ValueError("choose_from_top needs a non-empty probs and n >= 1")
  # Indices of the n largest probabilities (in no particular order).
  ind = np.argpartition(probs, -n)[-n:]
  top_prob = probs[ind]
  top_prob = top_prob / np.sum(top_prob) # Normalize
  choice = np.random.choice(n, 1, p = top_prob)
  return int(ind[choice][0])

def generate_some_text(input_str, text_len = 250):
  """Generate `text_len` tokens after `input_str`, then print and return the text.

  Puts the model in eval mode and, at each step, samples the next token from
  the 10 most probable candidates.

  Args:
    input_str: prompt that is encoded with the tokenizer and extended.
    text_len: number of tokens to append to the prompt.

  Returns:
    The decoded prompt plus generated continuation (also printed).
  """
  cur_ids = torch.tensor(tokenizer.encode(input_str)).unsqueeze(0).long().to(device)
  model.eval()
  with torch.no_grad():
    for _ in range(text_len):
      # No labels are passed: only the logits are needed here, and without
      # labels they are the first element of the model output.
      logits = model(cur_ids)[0]
      softmax_logits = torch.softmax(logits[0,-1], dim=0) #Take the first(only one) batch and the last predicted embedding
      next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=10) #Randomly(from the given probability distribution) choose the next word from the top n words
      next_token = torch.tensor([[next_token_id]], dtype=torch.long, device=device)
      cur_ids = torch.cat([cur_ids, next_token], dim = 1) # Add the last word

  # Index the single batch row instead of squeeze(), which would collapse a
  # one-token sequence to a scalar.
  output_text = tokenizer.decode(cur_ids[0].tolist())
  print(output_text)
  return output_text
    
# Fine-tune the model on packed joke sequences with gradient accumulation.
model.train()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
# num_training_steps=-1 made the linear schedule's multiplier drop to 0 as soon
# as warm-up ended, so nothing was learned afterwards. Use an upper bound on the
# number of optimizer steps instead: at most one sequence per joke, and one
# optimizer step per BATCH_SIZE sequences.
total_optimizer_steps = max(1, EPOCHS * len(joke_loader) // BATCH_SIZE)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=total_optimizer_steps)
proc_seq_count = 0   # sequences back-propagated since the last optimizer step
sum_loss = 0.0       # loss accumulated since the last report
batch_count = 0      # optimizer steps since the last report

tmp_jokes_tens = None  # packed sequence currently being filled
models_folder = "trained_models"
# makedirs also handles nested paths and an already existing folder.
os.makedirs(models_folder, exist_ok=True)

for epoch in range(EPOCHS):
  print(f"EPOCH {epoch} started" + '=' * 30)

  for joke in joke_loader:
    joke_tens = torch.tensor(tokenizer.encode(joke[0])).unsqueeze(0).to(device)
    # Skip sample from dataset if it is longer than MAX_SEQ_LEN
    if joke_tens.size()[1] > MAX_SEQ_LEN:
      continue

    # The first joke of a new packed sequence
    if not torch.is_tensor(tmp_jokes_tens):
      tmp_jokes_tens = joke_tens
      continue

    if tmp_jokes_tens.size()[1] + joke_tens.size()[1] <= MAX_SEQ_LEN:
      # The joke still fits: append it and try to add more.
      # The whole joke is appended. The previous `joke_tens[:,1:]` slice cut
      # off the first real token of every appended joke, because encode() with
      # the default GPT-2 tokenizer settings adds no leading special token.
      tmp_jokes_tens = torch.cat([tmp_jokes_tens, joke_tens], dim=1)
      continue

    # The next joke does not fit, so train on the packed sequence and keep
    # this joke as the start of the next one.
    work_jokes_tens = tmp_jokes_tens
    tmp_jokes_tens = joke_tens

    outputs = model(work_jokes_tens, labels=work_jokes_tens)
    loss = outputs[0]
    loss.backward()
    sum_loss = sum_loss + loss.detach()

    proc_seq_count = proc_seq_count + 1
    if proc_seq_count == BATCH_SIZE:
      # Gradients of BATCH_SIZE sequences are accumulated: apply them now.
      proc_seq_count = 0
      batch_count += 1
      optimizer.step()
      scheduler.step()
      # The optimizer holds all model parameters, so this clears every gradient.
      optimizer.zero_grad()

      # Report the accumulated loss every 100 optimizer steps.
      if batch_count == 100:
        print(f"sum loss {float(sum_loss)}")
        batch_count = 0
        sum_loss = 0.0

  # Store the model after each epoch to compare the performance of them
  torch.save(model.state_dict(), os.path.join(models_folder, f"gpt2_medium_joker_{epoch}.pt"))

In [None]:
# Generate jokes with the checkpoint of one epoch and write them to a file.
MODEL_EPOCH = 4

models_folder = "trained_models"

model_path = os.path.join(models_folder, f"gpt2_medium_joker_{MODEL_EPOCH}.pt")
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine too.
model.load_state_dict(torch.load(model_path, map_location=device))

jokes_output_file_path = f'generated_{MODEL_EPOCH}.jokes'

model.eval()

# Loop-invariant encodings, computed once instead of on every step.
end_of_text_ids = set(tokenizer.encode('<|endoftext|>'))
prompt_ids = tokenizer.encode("JOKE:")

joke_num = 0
# Mode 'w' replaces any previous output file. Explicit utf-8 keeps writing
# non-ASCII text independent of the platform's default encoding.
with torch.no_grad(), open(jokes_output_file_path, 'w', encoding='utf-8') as f:
  for joke_idx in range(1000):
    joke_finished = False
    cur_ids = torch.tensor(prompt_ids).unsqueeze(0).to(device)

    # Generate at most 100 tokens per joke.
    for i in range(100):
      # No labels are passed: only the logits are needed here, and without
      # labels they are the first element of the model output.
      logits = model(cur_ids)[0]
      softmax_logits = torch.softmax(logits[0,-1], dim=0) #Take the first(from only one in this case) batch and the last predicted embedding
      # Sample from a wide candidate set for the first 3 tokens, a narrow one afterwards.
      n = 20 if i < 3 else 3
      next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=n) #Randomly(from the topN probability distribution) select the next word
      next_token = torch.tensor([[next_token_id]], dtype=torch.long, device=device)
      cur_ids = torch.cat([cur_ids, next_token], dim = 1) # Add the last word to the running sequence

      if next_token_id in end_of_text_ids:
        joke_finished = True
        break

    # Only jokes that reached the end-of-text token are kept.
    if joke_finished:
      joke_num = joke_num + 1
      output_text = tokenizer.decode(cur_ids[0].tolist())
      f.write(f"{output_text} \n\n")