In [1]:
import torch
import pandas as pd
from transformers import RobertaTokenizer, RobertaModel, AdamW, get_linear_schedule_with_warmup,RobertaForSequenceClassification

import logging
import warnings

# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Only ERROR-and-above records from the tokenizer base module are let through.
logging.getLogger(name="transformers.tokenization_utils_base").setLevel(level=logging.ERROR)

In [2]:
# Read the training set from disk into a DataFrame.
trainNews = pd.read_csv(filepath_or_buffer='OutputFiles/TrainNews.csv')

In [3]:
# Display the freshly loaded training frame (bare expression -> rich notebook output).
trainNews

Unnamed: 0,label,text,title
0,1,the head of a conservative republican faction ...,as u s budget fight looms republicans flip the...
1,1,transgender people will be allowed for the fir...,u s military to accept transgender recruits on...
2,1,the special counsel investigation of links bet...,senior u s republican senator let mr mueller d...
3,1,trump campaign adviser george papadopoulos tol...,fbi russia probe helped by australian diplomat...
4,1,president donald trump called on the u s posta...,trump wants postal service to charge much more...
...,...,...,...
44893,0,21st century wire says as 21wire reported earl...,mcpain john mccain furious that iran treated u...
44894,0,21st century wire says it s a familiar theme w...,justice yahoo settles e mail privacy class act...
44895,0,patrick henningsen 21st century wireremember w...,sunnistan us and allied safe zone plan to take...
44896,0,21st century wire says al jazeera america will...,how to blow 700 million al jazeera america fin...


In [4]:
# Keep only rows whose article body has at least 5 characters.
# `.copy()` detaches the filtered frame from the original so the column
# assignment below writes to an independent DataFrame instead of a slice
# (avoids pandas' chained-assignment / SettingWithCopy ambiguity).
trainNews = trainNews[trainNews.text.str.len() >= 5].copy()

# Model input is "<title>. <text>".
# NOTE(review): a row with a missing title would yield NaN here - confirm none exist.
trainNews['fullNewsText'] = trainNews['title'] + ". " + trainNews['text']

In [5]:
# Display the training frame again, now including the fullNewsText column.
trainNews

Unnamed: 0,label,text,title,fullNewsText
0,1,the head of a conservative republican faction ...,as u s budget fight looms republicans flip the...,as u s budget fight looms republicans flip the...
1,1,transgender people will be allowed for the fir...,u s military to accept transgender recruits on...,u s military to accept transgender recruits on...
2,1,the special counsel investigation of links bet...,senior u s republican senator let mr mueller d...,senior u s republican senator let mr mueller d...
3,1,trump campaign adviser george papadopoulos tol...,fbi russia probe helped by australian diplomat...,fbi russia probe helped by australian diplomat...
4,1,president donald trump called on the u s posta...,trump wants postal service to charge much more...,trump wants postal service to charge much more...
...,...,...,...,...
44893,0,21st century wire says as 21wire reported earl...,mcpain john mccain furious that iran treated u...,mcpain john mccain furious that iran treated u...
44894,0,21st century wire says it s a familiar theme w...,justice yahoo settles e mail privacy class act...,justice yahoo settles e mail privacy class act...
44895,0,patrick henningsen 21st century wireremember w...,sunnistan us and allied safe zone plan to take...,sunnistan us and allied safe zone plan to take...
44896,0,21st century wire says al jazeera america will...,how to blow 700 million al jazeera america fin...,how to blow 700 million al jazeera america fin...


In [6]:
# The separate title/text columns are no longer needed once fullNewsText
# exists; remove them and order the rows by the combined text.
trainNews = (
    trainNews
    .drop(columns=['title', 'text'])
    .sort_values(by='fullNewsText')
)

In [7]:
# Persist the prepared training frame without writing the index column.
trainNews.to_csv(path_or_buf="OutputFiles/FinalTrainNews.csv", index=False)

In [8]:
# Load the validation set and discard the index column saved in the CSV.
validData = pd.read_csv("OriginalInputFiles/news.csv")
validData = validData.drop(['Unnamed: 0'], axis=1)

# String labels -> integer class ids (same 0/1 coding as the training labels).
encode_label = {'FAKE' : 0, 'REAL' : 1}

# Keep only rows whose article body has at least 5 characters.
# `.copy()` makes the filtered frame independent so the column assignments
# below do not write to a slice of the original DataFrame.
validData = validData[validData.text.str.len() >= 5].copy()

# Labels not present in `encode_label` become NaN after the mapping.
validData['label'] = validData['label'].map(encode_label)
# Model input is "<title>. <text>".
validData['fullNewsText'] = validData['title'] + ". " + validData['text']

In [9]:
# Remove the raw title/text columns, then persist the frame without the index.
validData = validData.drop(columns=['title', 'text'])
validData.to_csv(path_or_buf="OutputFiles/FinalValidNews.csv", index=False)

In [10]:
# Pull the training texts and their labels out of the frame as arrays.
trainNewsSentences = trainNews.fullNewsText.values
targetLabels = trainNews.label.values

In [11]:
# Pull the validation texts and their labels out of the frame as arrays.
validNewsSentences = validData.fullNewsText.values
validLabels = validData.label.values

In [12]:
# Pretrained "roberta-base" weights with a 2-class sequence-classification
# head; attention maps and hidden states are not returned.
robertaModel = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)
robertaTokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Use the first GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    # Deterministic cuDNN kernels, no auto-tuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# NOTE: this cell only selects `device`; it does not move the model onto it.
print(device)
print('RoBERTa loaded')

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

cpu
RoBERTa loaded


In [18]:
from torch.utils.data import TensorDataset

def convertToTensor(newsSentences, labels, tokenizer=None, max_length=512):
    """Tokenize news texts and bundle them with their labels as a TensorDataset.

    Args:
        newsSentences: iterable of strings, one news item per element.
        labels: class labels aligned with ``newsSentences`` (anything
            ``torch.tensor`` accepts).
        tokenizer: object exposing ``encode_plus``; when ``None`` the
            module-level ``robertaTokenizer`` is used.
        max_length: every encoded sequence is padded or truncated to exactly
            this many tokens.

    Returns:
        TensorDataset yielding ``(input_ids, attention_mask, label)`` triples.
    """
    if tokenizer is None:
        tokenizer = robertaTokenizer

    idsRoberta = []
    robertaMasks = []
    for news in newsSentences:
        # Explicit padding/truncation replaces the deprecated
        # `pad_to_max_length` flag and the implicit truncation fallback, so
        # each row is guaranteed to be exactly `max_length` tokens long.
        robertaEncoded = tokenizer.encode_plus(news,
                                               add_special_tokens=True,
                                               max_length=max_length,
                                               padding='max_length',
                                               truncation=True,
                                               return_attention_mask=True,
                                               return_tensors='pt')
        idsRoberta.append(robertaEncoded['input_ids'])
        robertaMasks.append(robertaEncoded['attention_mask'])

    # Each encoding has a leading batch dimension of 1; stack them row-wise.
    idsRoberta = torch.cat(idsRoberta, dim=0)
    robertaMasks = torch.cat(robertaMasks, dim=0)

    labels = torch.tensor(labels)

    return TensorDataset(idsRoberta, robertaMasks, labels)

In [19]:
# Fix the PyTorch RNG seed before the datasets are built.
torch.manual_seed(0)

# Encode the training and validation texts into TensorDatasets.
trainTensor = convertToTensor(newsSentences=trainNewsSentences, labels=targetLabels)
validTensor = convertToTensor(newsSentences=validNewsSentences, labels=validLabels)

In [21]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batchSize = 32

# Training batches are drawn in random order.
trainDataloader = DataLoader(
    dataset=trainTensor,
    batch_size=batchSize,
    sampler=RandomSampler(trainTensor),
)

# Validation batches are read in dataset order.
validationDataloader = DataLoader(
    dataset=validTensor,
    batch_size=batchSize,
    sampler=SequentialSampler(validTensor),
)

In [22]:
# AdamW over every model parameter.
# NOTE(review): this is the AdamW imported from `transformers`; newer releases
# deprecate it in favour of torch.optim.AdamW - confirm the pinned version.
optimizer = AdamW(
    params=robertaModel.parameters(),
    lr=5e-5,
    eps=1e-8,
)


In [23]:
from transformers import get_linear_schedule_with_warmup

epochs = 2

# Total number of training batches across all epochs.
totalSteps = epochs * len(trainDataloader)

# Linear learning-rate schedule with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=totalSteps,
)

In [24]:
import numpy as np

def getAccuracy(preds, labels):
    """Return the fraction of rows whose arg-max class equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: array of integer class ids; it is flattened before comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predictedClasses = np.argmax(preds, axis=1).ravel()
    trueClasses = labels.ravel()
    matches = predictedClasses == trueClasses
    return np.sum(matches) / len(trueClasses)

In [25]:
import time
import datetime

def format_time(elapsed):
    """Render a duration given in seconds as a ``timedelta`` string (h:mm:ss).

    The value is rounded to the nearest whole second before formatting.
    """
    wholeSeconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=wholeSeconds)
    return str(duration)

In [None]:
import random

# Seed the Python, NumPy and PyTorch RNGs before training starts.
seed_val = 100

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)

# Every batch is moved to `device` below, so the model has to live there too.
robertaModel.to(device)

# One dict of summary metrics per epoch.
roberta_training_stats = []

total_t0 = time.time()

for epoch_i in range(0, epochs):
    # ------------------------------------------------------------------
    # Training pass
    # ------------------------------------------------------------------
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_train_loss = 0

    robertaModel.train()

    for step, batch in enumerate(trainDataloader):
        # Progress report every 40 batches (skipping the very first one).
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(trainDataloader), elapsed))

        # Each batch is (input_ids, attention_mask, labels).
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients accumulated by the previous step.
        robertaModel.zero_grad()

        outputs = robertaModel(b_input_ids,
                               attention_mask=b_input_mask,
                               labels=b_labels)
        # Positional indexing works whether the model returns a plain tuple
        # or a ModelOutput; with labels supplied the loss comes first.
        loss = outputs[0]

        total_train_loss += loss.item()
        loss.backward()

        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(robertaModel.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(trainDataloader)
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # ------------------------------------------------------------------
    # Validation pass
    # ------------------------------------------------------------------
    print("")
    print("Running Validation...")

    t0 = time.time()
    robertaModel.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0

    for batch in validationDataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = robertaModel(b_input_ids,
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
        loss, logits = outputs[0], outputs[1]

        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        total_eval_accuracy += getAccuracy(logits, label_ids)

    # Reported accuracy/loss are the means of the per-batch values.
    avg_val_accuracy = total_eval_accuracy / len(validationDataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    avg_val_loss = total_eval_loss / len(validationDataloader)
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))
    roberta_training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))