In [16]:
import pandas as pd
from transformers import RobertaTokenizer, RobertaModel, AdamW, get_linear_schedule_with_warmup

import warnings
warnings.filterwarnings('ignore')

import logging
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)

In [17]:
# Load the raw training set; the path is relative to the current working directory.
trainNews = pd.read_csv('TrainNews.csv')

In [18]:
# Show the freshly loaded frame (the notebook renders a truncated head/tail view).
trainNews

Unnamed: 0,label,text,title
0,1,the head of a conservative republican faction...,as u s budget fight looms republicans flip the...
1,1,transgender people will be allowed for the fi...,u s military to accept transgender recruits on...
2,1,the special counsel investigation of links be...,senior u s republican senator let mr mueller d...
3,1,trump campaign adviser george papadopoulos to...,fbi russia probe helped by australian diplomat...
4,1,president donald trump called on the u s post...,trump wants postal service to charge much more...
...,...,...,...
44893,0,21st century wire says as 21wire reported earl...,mcpain john mccain furious that iran treated u...
44894,0,21st century wire says it s a familiar theme w...,justice yahoo settles e mail privacy class act...
44895,0,patrick henningsen 21st century wireremember w...,sunnistan us and allied safe zone plan to take...
44896,0,21st century wire says al jazeera america will...,how to blow 700 million al jazeera america fin...


In [19]:
# Keep only rows whose `text` holds at least 5 characters, then build the
# combined "<title>. <text>" field used as model input.
#
# `.loc[...]` followed by `.assign(...)` produces a brand-new frame, so the new
# column is never written onto a filtered slice of the original frame (the
# pattern that raises pandas' SettingWithCopyWarning).
trainNews = (
    trainNews
    .loc[trainNews["text"].str.len() >= 5]
    .assign(fullNewsText=lambda frame: frame["title"] + ". " + frame["text"])
)

In [20]:
# Inspect the frame again to check the new `fullNewsText` column.
trainNews

Unnamed: 0,label,text,title,fullNewsText
0,1,the head of a conservative republican faction...,as u s budget fight looms republicans flip the...,as u s budget fight looms republicans flip the...
1,1,transgender people will be allowed for the fi...,u s military to accept transgender recruits on...,u s military to accept transgender recruits on...
2,1,the special counsel investigation of links be...,senior u s republican senator let mr mueller d...,senior u s republican senator let mr mueller d...
3,1,trump campaign adviser george papadopoulos to...,fbi russia probe helped by australian diplomat...,fbi russia probe helped by australian diplomat...
4,1,president donald trump called on the u s post...,trump wants postal service to charge much more...,trump wants postal service to charge much more...
...,...,...,...,...
44893,0,21st century wire says as 21wire reported earl...,mcpain john mccain furious that iran treated u...,mcpain john mccain furious that iran treated u...
44894,0,21st century wire says it s a familiar theme w...,justice yahoo settles e mail privacy class act...,justice yahoo settles e mail privacy class act...
44895,0,patrick henningsen 21st century wireremember w...,sunnistan us and allied safe zone plan to take...,sunnistan us and allied safe zone plan to take...
44896,0,21st century wire says al jazeera america will...,how to blow 700 million al jazeera america fin...,how to blow 700 million al jazeera america fin...


In [21]:
# From here on only the combined text is needed: discard the two source
# columns and order the rows by the combined text.
trainNews = (
    trainNews
    .drop(columns=["title", "text"])
    .sort_values(by="fullNewsText")
)

In [22]:
# Cap every article at its first 512 whitespace-separated words (runs of
# whitespace collapse to single spaces), then persist the prepared frame
# without the pandas index.
clipped_text = trainNews["fullNewsText"].map(
    lambda article: " ".join(article.split()[:512])
)
trainNews["fullNewsText"] = clipped_text
trainNews.to_csv("FinalTrainNews.csv", index=False)

In [23]:
# Load the validation set and drop the 'Unnamed: 0' column
# (presumably a row index saved with the CSV -- confirm against the file).
validData = pd.read_csv("news.csv")
validData = validData.drop(['Unnamed: 0'], axis=1)

# String labels in the file -> integer class ids.
encode_label = {'FAKE' : 0, 'REAL' : 1}

# Discard items whose text is shorter than 5 *characters* (`.str.len()` counts
# characters, not words). `.copy()` detaches the result from the
# original frame so the column assignments below do not write onto a slice
# (which raises pandas' SettingWithCopyWarning).
validData = validData[validData.text.str.len() >= 5].copy()

validData['label'] = validData['label'].map(encode_label)

# `.map` turns every label missing from `encode_label` into NaN; fail loudly
# here instead of silently writing unlabeled rows to the output CSV.
unmapped_labels = validData['label'].isna()
if unmapped_labels.any():
    raise ValueError(
        "{} validation rows have a label outside {}".format(
            int(unmapped_labels.sum()), sorted(encode_label)
        )
    )

# Same input format as the training set: "<title>. <text>", capped at the
# first 512 whitespace-separated words.
validData['fullNewsText'] = validData['title'] + ". " + validData['text']
validData['fullNewsText'] = validData['fullNewsText'].apply(lambda x: " ".join(x.split()[:512]))

In [24]:
# Keep only the label and the combined text, then write the prepared
# validation set to disk without the pandas index.
validData = validData.drop(columns=["title", "text"])
validData.to_csv("FinalValidNews.csv", index=False)

In [25]:
import torch

class ROBERTAClassifier(torch.nn.Module):
    """RoBERTa encoder followed by a small two-class classification head.

    The head is: dropout -> linear (hidden_size -> 64) -> layer norm -> tanh
    -> dropout -> linear (64 -> 2).

    Args:
        dropout_rate: Dropout probability used by both dropout layers.
    """

    def __init__(self, dropout_rate=0.3):
        super().__init__()

        self.roberta = RobertaModel.from_pretrained('roberta-base')
        # Read the encoder width from its config instead of hard-coding 768,
        # so the head always matches the loaded checkpoint.
        hidden_size = self.roberta.config.hidden_size
        self.d1 = torch.nn.Dropout(dropout_rate)
        self.l1 = torch.nn.Linear(hidden_size, 64)
        # Despite the "bn" name this is layer normalization; the attribute name
        # is kept so the state_dict keys stay unchanged.
        self.bn1 = torch.nn.LayerNorm(64)
        self.d2 = torch.nn.Dropout(dropout_rate)
        self.l2 = torch.nn.Linear(64, 2)

    def forward(self, input_ids, attention_mask):
        """Return the output of the final 64 -> 2 linear layer (class logits).

        Args:
            input_ids: Token ids, forwarded unchanged to the encoder.
            attention_mask: Attention mask, forwarded unchanged to the encoder.
        """
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Take the second encoder output (the pooled output) by position.
        # Integer indexing works for a plain tuple and for the dict-like
        # ModelOutput newer transformers versions return by default;
        # tuple-unpacking a ModelOutput would yield its key names instead.
        x = outputs[1]
        x = self.d1(x)
        x = self.l1(x)
        x = self.bn1(x)
        x = torch.tanh(x)
        x = self.d2(x)
        x = self.l2(x)

        return x

In [27]:
# Shared tokenizer plus the batching constants used by the training code.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
MAX_SEQ_LEN, BATCH_SIZE = 256, 16

# Vocabulary ids of the padding and unknown-token markers.
PAD_INDEX = tokenizer.pad_token_id
UNK_INDEX = tokenizer.unk_token_id

def _unpack_batch(batch):
    """Return ``(source, target)`` from a ``(source, target)`` or ``((source, target), _)`` batch."""
    first = batch[0]
    if isinstance(first, (tuple, list)):
        return first[0], first[1]
    return batch[0], batch[1]


def _mean_validation_loss(model, batches, criterion, pad_index):
    """Average ``criterion`` over ``batches`` with gradients disabled (0.0 when there are none)."""
    total, count = 0.0, 0
    with torch.no_grad():
        for batch in batches:
            source, target = _unpack_batch(batch)
            mask = (source != pad_index).long()
            y_pred = model(input_ids=source, attention_mask=mask)
            total += criterion(y_pred, target).item()
            count += 1
    return total / count if count else 0.0


def pretrain(model, optimizer, trainNews, validNews, scheduler = None, valid_period = None,
             num_epochs = 5, pad_index = None):
    """Train only the classification head while the RoBERTa encoder stays frozen.

    Every parameter under ``model.roberta`` has ``requires_grad`` switched off
    for the duration of the call and switched back on afterwards, even when
    training raises.

    Args:
        model: Module exposing a ``roberta`` sub-module and callable as
            ``model(input_ids=..., attention_mask=...)``.
        optimizer: Optimizer stepped once per training batch.
        trainNews: Sized, re-iterable collection of training batches. Each
            batch is ``(source, target)`` or ``((source, target), _)``.
            NOTE(review): the batch layout is taken from the previously
            commented-out loop -- confirm it matches the data loader in use.
        validNews: Validation batches in the same format.
        scheduler: Optional learning-rate scheduler, stepped after the optimizer.
        valid_period: Number of training steps between validation passes.
            ``None`` (default) means once per epoch, i.e. ``len(trainNews)``
            of the argument actually passed in. It is resolved at call time,
            not when the function is defined.
        num_epochs: Number of passes over ``trainNews``.
        pad_index: Token id treated as padding when building the attention
            mask. ``None`` falls back to the module-level ``PAD_INDEX``.

    Raises:
        ValueError: If an explicit ``valid_period`` is smaller than 1.
    """
    if pad_index is None:
        pad_index = PAD_INDEX

    steps_per_epoch = len(trainNews)
    if valid_period is None:
        # max(..., 1) keeps the modulo below well-defined for an empty training set.
        valid_period = max(steps_per_epoch, 1)
    elif valid_period < 1:
        raise ValueError("valid_period must be >= 1, got {}".format(valid_period))

    criterion = torch.nn.CrossEntropyLoss()

    # Pretrain linear layers, do not train the encoder
    for param in model.roberta.parameters():
        param.requires_grad = False

    model.train()

    # Running train loss since the last validation pass, and step counter
    train_loss = 0.0
    global_step = 0

    try:
        # Train loop
        for epoch in range(num_epochs):
            for batch in trainNews:
                source, target = _unpack_batch(batch)
                # 1 for real tokens, 0 for padding
                mask = (source != pad_index).long()

                y_pred = model(input_ids=source,
                               attention_mask=mask)

                loss = criterion(y_pred, target)

                loss.backward()

                # Optimizer and (optional) scheduler step
                optimizer.step()
                if scheduler is not None:
                    scheduler.step()

                optimizer.zero_grad()

                # Update train loss and global step
                train_loss += loss.item()
                global_step += 1

                # Validation pass: evaluate, report, and reset the running loss.
                if global_step % valid_period == 0:
                    model.eval()
                    valid_loss = _mean_validation_loss(model, validNews, criterion, pad_index)
                    model.train()

                    # print summary
                    print('Epoch [{}/{}], global step [{}/{}], PT Loss: {:.4f}, Val Loss: {:.4f}'
                          .format(epoch+1, num_epochs, global_step, num_epochs*steps_per_epoch,
                                  train_loss / valid_period, valid_loss))

                    train_loss = 0.0
    finally:
        # Set encoder parameters back to trainable, even if training failed.
        for param in model.roberta.parameters():
            param.requires_grad = True

    print('Pre-training done!')

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=898823.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=456318.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1355863.0), HTML(value='')))


