## Lab 4

### Imports

In [15]:
from bs4 import BeautifulSoup
import requests
import json
import string
import pandas as pd
import pyarabic.araby as araby
from nltk.corpus import stopwords
from tashaphyne.stemming import ArabicLightStemmer
import qalsadi.lemmatizer as lem
from gensim.models import KeyedVectors
from torch import nn
import torch
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup, BertModel, BertTokenizer
import logging
import warnings
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler, SequentialSampler
import os
import json
import csv
import gzip
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import time

## Part 1: Classification Regression

### Web Scraping

In [88]:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0'}
result = requests.get(f'https://ar.wikipedia.org/w/index.php?limit=500&offset=0&profile=default&search=%D8%A7%D9%84%D8%AD%D8%B1%D8%A8+%D8%A7%D9%84%D8%B9%D8%A7%D9%84%D9%85%D9%8A%D8%A9+%D8%A7%D9%84%D8%A3%D9%88%D9%84%D9%89&title=%D8%AE%D8%A7%D8%B5:%D8%A8%D8%AD%D8%AB&ns0=1', headers=headers)
doc = BeautifulSoup(result.text, 'html.parser')

### Retrieving titles and article links

In [89]:
main_div = doc.find_all(name='div',attrs={'class':'mw-search-results-container'})
posts = main_div[0].find_all(name='li',attrs={'class':'mw-search-result mw-search-result-ns-0'})
link_div = main_div[0].find_all(name='div',attrs={'class':'searchResultImage-text'})
articles = []
for article in posts:
    # links
    link = article.find(name='a').get('href')
    # article titles
    title = article.find(name='a').get('title')
    articles.append({'title' : title, 'link': 'https://ar.wikipedia.org'+ link})

### Removing punctuations and storing the articles' contents into a JSON file

In [90]:
data = []
ar_punct = ''')(+`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”،.”…“–ـ”.\n'''
en_punct = string.punctuation
punct_lst = ar_punct + en_punct
for article in articles:
    art_result = requests.get(article['link'], headers=headers)
    art_doc = BeautifulSoup(art_result.text, 'html.parser')
    art_main_div = art_doc.find_all(name='div',attrs={'class':'mw-content-rtl'})
    art_posts = art_main_div[0].find_all(name='p')
    article_paragraphs = []
    for art_post in art_posts:
        article_paragraphs.append(art_post.text)
    full_article = ''.join(article_paragraphs).translate(str.maketrans('', '', punct_lst))
    data.append({'title': article['title'], 'article': full_article})
    
with open('data/articles.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

In [8]:
with open('data/articles_scores.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
data[:5]

[{'title': 'الحرب العالمية الأولى',
  'article': '\xa0فرنسا \xa0الإمبراطورية البريطانية\xa0الإمبراطورية الروسية 191417\xa0المملكة الإيطالية191518\xa0الولايات المتحدة 191718\xa0اليابان\xa0مملكة صربيا\xa0مملكة الجبل الأسود\xa0رومانيا191618\xa0بلجيكا\xa0اليونان191718 البرتغال 191618وآخرون\xa0الإمبراطورية الألمانية\xa0الإمبراطورية النمساوية المجرية\xa0الدولة العثمانية\xa0مملكة بلغاريا191518المتحاربين المشاركين دارفور 191416 دولة الدراويش إمارة جبل شمروآخرون جورج كليمانصو ريمون بوانكاريه فرديناند فوش هربرت أسكويث ديفيد لويد جورج دوغلاس هيج إيرل هيج الأول نيقولا الثاني إمبراطور روسيا نيكولاي نيكولايفيتش ألكسي بروسيلوف فيكتور عمانويل الثالث فيتوريو إمانويلي أورلاندو لويجي كادورنا وودرو ويلسون جون بيرشنغ فرديناند الأول ملك رومانيا كونستانتين بريزان الإمبراطور تايشو بيتر الأول ملك صربيا رادومير يوتنيك ألبرت الأولوآخرون فيلهلم الثاني باول فون هيندنبورغ إريك لودندورف إريش فون فالكنهاين هيلموت فون مولتكه الأصغر فرانتس يوزف الأول 191416 كارل الأول إمبراطور النمسا 191618 فرانز كونراد فون هوتزيندروف 

In [9]:
data[-5:]

[{'title': 'آر إم إس لوسيتينيا',
  'article': '51°25′N 8°33′W\ufeff  \ufeff51417°N 8550°W\ufeff  51417 8550آر إم إس لوسيتانيا كانت سفينةً بريطانيةً عابرةً للمحيطات غرقت في السابع من مايو من عام 1915 بواسطة غواصة ألمانية على بعد 11 ميلًا 18 كم من الساحل الجنوبي لأيرلندا مما أسفر عن مقتل 1198 شخصًا من الركاب وطاقم السفينة تنبأ غرق السفينة بإعلان الولايات المتحدة الأمريكية الحرب على ألمانيا يُعتبر غرق لوسيتانيا عاملًا رئيسيًا في حشد الدعم لشنّ الحرب إلا أن الحرب لم تبدأ إلا بعد مرور سنتين من غرقها أي بعد تكرار الهجمات الألمانية ولجوء الجانب الألماني للحرب المفتوحة ضد النقل البحري الأمريكيحملت لوسيتانيا لقب بلو ريباند المخصص لأسرع عبور للمحيط الأطلسي بالإضافة إلى تسميتها أكبر سفينة ركاب في العالم قبل أن تخسر هذه التسمية لصالح شقيقتها سفينة موريتانيا بعد ثلاثة أشهر أطلقت شركة كونارد لاين للنقل البحري سفينة لوسيتانيا في عام 1906 في الفترة التي اشتدت فيها المنافسة بشأن التبادلات التجارية في شمالي الأطلسي غرقت السفينة في أثناء رحلتها ال 202 عبر المحيط الأطلسي1اعتُبرت خطوط النقل البحري الألماني

### Stemming

In [11]:
ArListem = ArabicLightStemmer()
for article in data:
    article['article'] = araby.strip_tashkeel(article['article'])

In [20]:
stop_words = set(stopwords.words('arabic'))
for article in data:
    article['tokens'] = araby.tokenize(article['article'])
    article['tokens'] = [word for word in article['tokens'] if word not in stop_words]
    article['tokens'] = [token for token in article['tokens'] if not token.isdigit()]

### Lemmatization

In [23]:
lemmer = lem.Lemmatizer()
for article in data:
    article['stem_tokens'] = [ArListem.light_stem(token) for token in article['tokens']]
    article['lemm_tokens'] = [lemmer.lemmatize(token) for token in article['tokens']]

In [24]:
dataset = pd.DataFrame(data)

In [None]:
dataset.to_csv('data/articles_lem_stem.csv')

In [4]:
dataset = pd.read_csv('data/articles_lem_stem.csv')

In [5]:
dataset['tokens'] = dataset['tokens'].apply(ast.literal_eval)
dataset['lemm_tokens'] = dataset['lemm_tokens'].apply(ast.literal_eval)
dataset['stem_tokens'] = dataset['stem_tokens'].apply(ast.literal_eval)

### Loading pretrained Arabic GloVe model vectors

In [6]:
glove_model = KeyedVectors.load_word2vec_format('../Lab2/.vector_cache/vectors.txt', encoding='utf-8', no_header=True)

In [15]:
glove_model.vector_size

256

### creating a vector for each article

In [17]:
def vectorize_answer(answer_tokens, glove_model):
    answer_vector = np.mean([glove_model[word] for word in answer_tokens if word in glove_model.index_to_key] \
        or [np.zeros(glove_model.vector_size)], axis=0)
    return answer_vector

glv_df = pd.DataFrame(dataset['lemm_tokens'].apply(lambda x: vectorize_answer(x, glove_model)).values.tolist(), index=dataset.index)

In [18]:
glv_df.to_csv('data/glv_df.csv')

In [4]:
glv_df = pd.read_csv('data/glv_df.csv')
glv_df.drop(columns='Unnamed: 0', inplace=True)
glv_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
0,-0.045712,-0.069546,0.059009,0.119636,-0.035486,0.034761,-0.001638,0.115784,0.022254,0.035903,...,-0.075161,-0.037112,0.012052,0.000604,-0.045405,-0.025346,0.172854,-0.047291,0.065166,-0.053155
1,-0.108380,-0.125604,0.113100,0.118680,-0.103005,0.026697,0.065201,0.191656,0.073542,-0.080996,...,-0.068041,-0.037716,-0.010979,-0.034415,-0.039049,-0.056449,0.128181,-0.000969,0.103187,-0.119481
2,-0.035257,-0.069494,0.085163,0.118556,-0.052860,0.043455,-0.005653,0.114180,0.023462,0.039482,...,-0.066921,-0.036711,0.009005,-0.034652,-0.028195,-0.011753,0.149848,-0.041861,0.076292,-0.044770
3,-0.138193,-0.074626,0.107438,0.038833,-0.006635,-0.026867,-0.024015,0.158411,-0.028221,0.025840,...,-0.088717,0.029817,0.063312,0.022100,0.002694,0.033558,0.141532,-0.029723,0.049480,-0.072720
4,-0.120171,-0.158840,0.068961,0.169084,-0.077470,0.015443,-0.104746,0.155835,0.027153,-0.067546,...,-0.073168,-0.080024,0.036555,0.097047,-0.049957,-0.088272,0.271670,0.080682,-0.050711,-0.067093
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
487,-0.033617,-0.075582,-0.007582,0.062043,-0.067786,0.068978,0.114187,0.056189,0.032998,0.008507,...,-0.094918,-0.022640,0.057202,-0.020241,-0.052173,-0.023458,0.166491,-0.008717,0.069817,-0.022519
488,-0.071867,-0.124637,0.088479,0.095598,-0.074769,0.053775,0.083204,0.052336,0.036085,0.047462,...,0.017140,-0.077419,0.023855,-0.153386,-0.035361,-0.048467,0.089661,-0.070474,0.082474,0.020870
489,-0.092404,-0.016087,0.109831,-0.108128,0.008528,0.066899,-0.015516,0.128211,-0.025145,-0.048706,...,-0.023103,-0.112768,0.086285,0.018365,-0.026911,0.033841,0.148054,0.025628,0.138151,-0.175401
490,-0.103633,0.011962,0.051125,0.109153,-0.050284,0.036209,-0.013091,0.136588,0.190557,0.060464,...,-0.045718,-0.049924,0.081672,-0.000560,-0.083853,-0.028424,0.150196,-0.167311,0.073862,0.002172


In [5]:
data = glv_df.values.tolist()

In [6]:
scores = pd.read_csv('data/articles_lem_stem.csv')['score'].values.tolist()

### Defining the training function

In [29]:
def train(model, model_type, n_steps, criterion, optimizer, print_every):
    hidden= None
    x = data
    y = scores
    x_tensor = torch.Tensor(x).unsqueeze(0)
    y_tensor = torch.Tensor(y)
    
    for batch_i in range(n_steps):
        if model_type == 'lstm':
            model.train()
            prediction = model(x_tensor)
            loss = criterion(prediction, y_tensor)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Validation
            if batch_i % print_every != 0:
                continue
            model.eval()
        else:
            prediction , hidden = model(x_tensor,hidden)
            hidden = hidden.data
            loss = criterion(prediction,y_tensor)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        if batch_i % print_every == 0:
            print('Epoch: ', batch_i, ' Loss: ',loss.item(), ' RMSE: ', torch.sqrt(loss).item())
    return model

### RNNs

In [38]:
class RNN(nn.Module):
    def __init__(self):
        super(RNN, self).__init__()

        self.rnn = nn.RNN(
            input_size=256,
            hidden_size=256,
            num_layers=1,
            batch_first=True
        )
        self.out = nn.Linear(256, 1)

    def forward(self, x, hidden):
        r_out, hidden = self.rnn(x, hidden)
        
        r_out = r_out.view(-1, 256)  
        
        output = self.out(r_out)
        
        return output, hidden

### Training RNNs

In [40]:
rnn = RNN()
criterion_rnn = nn.MSELoss()
optimizer_rnn = torch.optim.Adam(rnn.parameters(),lr=0.01)
n_steps = 100
print_every = 10
train(rnn, 'rnn', n_steps, criterion_rnn, optimizer_rnn, print_every)

Epoch:  0  Loss:  25.44704818725586  RMSE:  5.044506549835205
Epoch:  10  Loss:  8.597662925720215  RMSE:  2.9321770668029785
Epoch:  20  Loss:  7.579993724822998  RMSE:  2.753178834915161
Epoch:  30  Loss:  7.526830196380615  RMSE:  2.743506908416748
Epoch:  40  Loss:  7.505816459655762  RMSE:  2.7396745681762695
Epoch:  50  Loss:  7.49566125869751  RMSE:  2.737820625305176
Epoch:  60  Loss:  7.494852542877197  RMSE:  2.737672805786133
Epoch:  70  Loss:  7.494898796081543  RMSE:  2.7376813888549805
Epoch:  80  Loss:  7.49446439743042  RMSE:  2.7376019954681396
Epoch:  90  Loss:  7.494415760040283  RMSE:  2.737593173980713


RNN(
  (rnn): RNN(256, 256, batch_first=True)
  (out): Linear(in_features=256, out_features=1, bias=True)
)

### Bidirectional RNNs

In [10]:
class BidirectionalRNN(nn.Module):
    def __init__(self):
        super(BidirectionalRNN, self).__init__()

        self.rnn = nn.RNN(
            input_size=256,
            hidden_size=256,
            num_layers=1,
            batch_first=True,
            bidirectional=True
        )
        self.out = nn.Linear(256, 1)

    def forward(self, x, hidden):
        r_out, hidden = self.rnn(x, hidden)
        
        r_out = r_out.view(-1, 256)  
        
        output = self.out(r_out)
        
        return output, hidden

### Training Bidirectional RNNs

In [11]:
birnn = BidirectionalRNN()
criterion_birnn = nn.MSELoss()
optimizer_birnn = torch.optim.Adam(birnn.parameters(),lr=0.01)
n_steps = 100
print_every = 10
train(birnn, 'birnn', n_steps, criterion_birnn, optimizer_birnn, print_every)

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch:  0  Loss:  25.454334259033203  RMSE:  5.045228958129883
Epoch:  10  Loss:  8.720856666564941  RMSE:  2.9531097412109375
Epoch:  20  Loss:  7.766345024108887  RMSE:  2.786816358566284
Epoch:  30  Loss:  7.6300368309021  RMSE:  2.76225209236145
Epoch:  40  Loss:  7.509574890136719  RMSE:  2.7403602600097656
Epoch:  50  Loss:  7.509927749633789  RMSE:  2.740424633026123
Epoch:  60  Loss:  7.496352195739746  RMSE:  2.7379467487335205
Epoch:  70  Loss:  7.496115207672119  RMSE:  2.737903356552124
Epoch:  80  Loss:  7.494965076446533  RMSE:  2.7376933097839355
Epoch:  90  Loss:  7.494449615478516  RMSE:  2.7375991344451904


BidirectionalRNN(
  (rnn): RNN(256, 256, batch_first=True, bidirectional=True)
  (out): Linear(in_features=256, out_features=1, bias=True)
)

### GRU

In [12]:
class GRU(nn.Module):
    def __init__(self):
        super(GRU, self).__init__()

        self.gru = nn.GRU(
            input_size=256,
            hidden_size=256,
            num_layers=1,
            batch_first=True
        )
        self.out = nn.Linear(256, 1)

    def forward(self, x, hidden):
        r_out, hidden = self.gru(x, hidden)
        
        r_out = r_out.view(-1, 256)  
        
        output = self.out(r_out)
        
        return output, hidden

### Training GRU

In [13]:
gru = GRU()
n_steps = 100
criterion_gru = nn.MSELoss()
optimizer_gru = torch.optim.Adam(gru.parameters(),lr=0.01)
print_every = 10
train(gru, 'gru', n_steps, criterion_gru, optimizer_gru, print_every)

Epoch:  0  Loss:  24.550174713134766  RMSE:  4.954813480377197
Epoch:  10  Loss:  7.865761756896973  RMSE:  2.804596424102783
Epoch:  20  Loss:  7.717563629150391  RMSE:  2.778050422668457
Epoch:  30  Loss:  7.582731246948242  RMSE:  2.753675937652588
Epoch:  40  Loss:  7.521261215209961  RMSE:  2.7424917221069336
Epoch:  50  Loss:  7.498653411865234  RMSE:  2.7383668422698975
Epoch:  60  Loss:  7.501723289489746  RMSE:  2.7389273643493652
Epoch:  70  Loss:  7.495872497558594  RMSE:  2.737859010696411
Epoch:  80  Loss:  7.494462490081787  RMSE:  2.7376015186309814
Epoch:  90  Loss:  7.49442720413208  RMSE:  2.7375950813293457


GRU(
  (gru): GRU(256, 256, batch_first=True)
  (out): Linear(in_features=256, out_features=1, bias=True)
)

### LSTM

In [14]:
class LSTM(nn.Module):
    def __init__(self):
        super(LSTM, self).__init__()

        self.lstm = nn.LSTM(
            input_size=256,
            hidden_size=256,
            num_layers=1
        )
        self.out = nn.Linear(256, 1)

    def forward(self, x):
        r_out, hidden = self.lstm(x)
        output = self.out(r_out)
        return output

### Training LSTMs

In [15]:
lstm = LSTM()
n_steps = 100
criterion_lstm = nn.MSELoss()
optimizer_lstm = torch.optim.Adam(lstm.parameters(),lr=0.01)
print_every = 10
train(lstm, 'lstm', n_steps, criterion_lstm, optimizer_lstm, print_every)

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch:  0  Loss:  25.73954200744629  RMSE:  5.073415279388428
Epoch:  10  Loss:  8.455005645751953  RMSE:  2.9077491760253906
Epoch:  20  Loss:  7.738240718841553  RMSE:  2.781769275665283
Epoch:  30  Loss:  7.522417068481445  RMSE:  2.7427024841308594
Epoch:  40  Loss:  7.5138702392578125  RMSE:  2.7411439418792725
Epoch:  50  Loss:  7.525935649871826  RMSE:  2.7433438301086426
Epoch:  60  Loss:  7.5205078125  RMSE:  2.742354393005371
Epoch:  70  Loss:  7.508261203765869  RMSE:  2.7401206493377686
Epoch:  80  Loss:  7.504075050354004  RMSE:  2.739356756210327
Epoch:  90  Loss:  7.503660202026367  RMSE:  2.739280939102173


LSTM(
  (lstm): LSTM(256, 256)
  (out): Linear(in_features=256, out_features=1, bias=True)
)

## Part 2: Transformers (Text generation)

### Checking whether cuda is available

In [3]:
logging.getLogger().setLevel(logging.CRITICAL)

warnings.filterwarnings('ignore')

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

### Loading the GPT2 tokenizer and base model

In [4]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
model = model.to(device)

In [5]:
def choose_from_top(probs, n=5):
    ind = np.argpartition(probs, -n)[-n:]
    top_prob = probs[ind]
    top_prob = top_prob / np.sum(top_prob) # Normalize
    choice = np.random.choice(n, 1, p = top_prob)
    token_id = ind[choice][0]
    return int(token_id)

### Fake News dataset loader

In [7]:
class FakeNewsDataset(Dataset):
    def __init__(self):
        super().__init__()

        short_jokes_path = os.path.join('data/Fake.csv')

        self.joke_list = []
        self.end_of_text_token = "<|endoftext|>"
        
        with open(short_jokes_path,encoding='utf-8') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            
            x = 0
            for i, row in enumerate(csv_reader):
              joke_str = f"Fake News:{row[1]}{self.end_of_text_token}"
              self.joke_list.append(joke_str)
              if i == 1000:
                break
        
    def __len__(self):
        return len(self.joke_list)

    def __getitem__(self, item):
        return self.joke_list[item]

In [8]:
dataset = FakeNewsDataset()
news_loader = DataLoader(dataset, batch_size=1, shuffle=True)

### Defining training variables

In [9]:
BATCH_SIZE = 16
EPOCHS = 100
LEARNING_RATE = 3e-5
WARMUP_STEPS = 5000
MAX_SEQ_LEN = 400

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

In [11]:
model = model.to(device)
model.train()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps = -1)
proc_seq_count = 0
sum_loss = 0.0
batch_count = 0

### Fine tuning loop and saving model

In [55]:
tmp_news_tens = None
models_folder = "trained_models"
if not os.path.exists(models_folder):
    os.mkdir(models_folder)

for epoch in range(EPOCHS):
    
    print(f"EPOCH {epoch} started" + '=' * 30)
    
    for idx,news in enumerate(news_loader):
        
        #################### "Fit as many news sequences into MAX_SEQ_LEN sequence as possible" logic start ####
        news_tens = torch.tensor(tokenizer.encode(news[0])).unsqueeze(0).to(device)
        #Skip sample from dataset if it is longer than MAX_SEQ_LEN
        if news_tens.size()[1] > MAX_SEQ_LEN:
            continue
        
        #The first news sequence in the sequence
        if not torch.is_tensor(tmp_news_tens):
            tmp_news_tens = news_tens
            continue
        else:
            #The next news does not fit in so we process the sequence and leave the last news 
            #as the start for next sequence 
            if tmp_news_tens.size()[1] + news_tens.size()[1] > MAX_SEQ_LEN:
                work_news_tens = tmp_news_tens
                tmp_news_tens = news_tens
            else:
                #Add the news to sequence, continue and try to add more
                tmp_news_tens = torch.cat([tmp_news_tens, news_tens[:,1:]], dim=1)
                continue
        ################## Sequence ready, process it trough the model ##################
            
        outputs = model(work_news_tens, labels=work_news_tens)
        loss, logits = outputs[:2]                        
        loss.backward()
        sum_loss = sum_loss + loss.detach().data
                       
        proc_seq_count = proc_seq_count + 1
        if proc_seq_count == BATCH_SIZE:
            proc_seq_count = 0    
            batch_count += 1
            optimizer.step()
            scheduler.step() 
            optimizer.zero_grad()
            model.zero_grad()

        if batch_count == 100:
            print(f"sum loss {sum_loss}")
            batch_count = 0
            sum_loss = 0.0
    
    # Store the model after each epoch to compare the performance of them
torch.save(model.state_dict(), os.path.join(models_folder, f"gpt2_medium_news_{epoch}.pt"))



Token indices sequence length is longer than the specified maximum sequence length for this model (1152 > 1024). Running this sequence through the model will result in indexing errors


sum loss 5096.57568359375
sum loss 4888.392578125
sum loss 4678.61572265625
sum loss 4478.1904296875
sum loss 4290.79443359375
sum loss 4101.33544921875
sum loss 3907.399169921875
sum loss 3703.618896484375
sum loss 3484.5693359375
sum loss 3248.380615234375
sum loss 2992.611083984375


### Generating Fake news using the fine tuned model

In [None]:
MODEL_EPOCH = 99

models_folder = "trained_models"

model_path = os.path.join(models_folder, f"gpt2_medium_news_{MODEL_EPOCH}.pt")
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

news_output_file_path = f'generated_{MODEL_EPOCH}.news.txt'

model.eval()
if os.path.exists(news_output_file_path):
    os.remove(news_output_file_path)
    
news_num = 0
with torch.no_grad():
   
        for news_idx in range(1000):
        
            news_finished = False

            cur_ids = torch.tensor(tokenizer.encode("news:")).unsqueeze(0).to(device)

            for i in range(100):
                outputs = model(cur_ids, labels=cur_ids)
                loss, logits = outputs[:2]
                softmax_logits = torch.softmax(logits[0,-1], dim=0) #Take the first(from only one in this case) batch and the last predicted embedding
                if i < 3:
                    n = 20
                else:
                    n = 3
                next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=n) #Randomly(from the topN probability distribution) select the next word
                cur_ids = torch.cat([cur_ids, torch.ones((1,1)).long().to(device) * next_token_id], dim = 1) # Add the last word to the running sequence

                if next_token_id in tokenizer.encode('<|endoftext|>'):
                    news_finished = True
                    break

            
            if news_finished:
                
                news_num = news_num + 1
                
                output_list = list(cur_ids.squeeze().to('cpu').numpy())
                output_text = tokenizer.decode(output_list)

                with open(news_output_file_path, 'a') as f:
                    f.write(f"{output_text} \n\n")

2024-05-24 11:23:01.880471: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-24 11:23:01.880614: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-24 11:23:02.033009: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-24 11:23:01.880471: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-24 11:23:01.880614: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factor