In [49]:
!pip install torchtext==0.6.0 --quiet
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import re
import string
import nltk
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/fake-news-classification/WELFake_Dataset.csv


In [50]:
data = pd.read_csv('/kaggle/input/fake-news-classification/WELFake_Dataset.csv')

In [51]:
data.sample(10)

Unnamed: 0.1,Unnamed: 0,title,text,label
64490,64490,Finger Wagging Maxine Waters on Illegal Aliens...,Maxine Waters is one angry woman. She has gone...,1
34593,34593,Labor activists target Midwest politicians opp...,CHICAGO (Reuters) - U.S. activists plan protes...,0
59997,59997,"Germany drops mass U.S., UK spying probe on la...",BERLIN (Reuters) - German prosecutors have clo...,0
22769,22769,House Democrats seek documents on former Trump...,WASHINGTON (Reuters) - U.S. Democratic lawmake...,0
68616,68616,WATCH: A GOP Candidate Just Released This Ama...,Republican congressional candidates have been ...,1
70807,70807,The economy of Wolfgang Schaeuble - at a glance,"LONDON (Reuters) - Wolfgang Schaeuble, Germany...",0
13869,13869,Lebanon's president says crisis over with PM H...,BEIRUT (Reuters) - Lebanese President Michel A...,0
61562,61562,Netanyahu’s Critical Foreign Tour,Netanyahu’s Critical Foreign Tour Israel's str...,1
8580,8580,"MARKO, SOON, ET AL: To Put America First Is to...",The article below was contributed by Istvan M...,0
43604,43604,DETROIT AIRPORT USING NEW HIGH-TECH SYSTEM: TA...,Home › POLICE STATE | SCIENCE & TECHNOLOGY | U...,1


In [52]:
data.drop(columns=['Unnamed: 0','title'],inplace=True)

In [53]:
data.sample(10)

Unnamed: 0,text,label
5307,The U.S. Revolution is Underway - Hillary Clin...,1
40548,#CRUX NCA REF 1122930JG AIFL IPR TO USA FEC IN...,1
59178,Hearing about violence at Trump s rallies was ...,1
33991,"In NBC s attempt to discredit Trump, they ve s...",1
18088,Here s just one more piece to the puzzle Hilla...,1
52065,LIMA (Reuters) - Three Peruvian policemen were...,0
50843,WASHINGTON (Reuters) - Thirteen U.S. industry ...,0
51299,JAKARTA (Reuters) - Six police were wounded an...,0
5348,NEW YORK (Reuters) - Dominican-born Adriano Es...,0
31042,Long time Trump opponent Sen. John McCain ( ) ...,0


In [54]:
# Defining function for cleaning the string 

def clean_string(text, stem="None"):
    # Check if the input is a string
    if not isinstance(text, str):
        return "" 
    final_string = ""
    text = text.lower()
    text = re.sub(r'\n', '', text)
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    text = text.split()
    useless_words = nltk.corpus.stopwords.words("english")
    text_filtered = [word for word in text if word not in useless_words]
    final_string = ' '.join(text_filtered)
    
    return final_string


In [55]:
data['cleaned_text'] = data['text'].apply(clean_string)

In [56]:
data.drop(columns=['text'])

Unnamed: 0,label,cleaned_text
0,1,comment expected barack obama members fyf911 f...
1,1,post votes hillary already
2,1,demonstrators gathered last night exercising c...
3,0,dozen politically active pastors came private ...
4,1,rs28 sarmat missile dubbed satan 2 replace ss1...
...,...,...
72129,0,washington reuters hackers believed working ru...
72130,1,know fantasyland republicans never questioned ...
72131,0,migrants refuse leave train refugee camp hunga...
72132,0,mexico city reuters donald trump’s combative s...


In [57]:
from torch.utils.data import Dataset
import torch

class TextClassificationDataset(Dataset):
    def __init__(self, df, tokenizer, max_length):
        self.texts = df['cleaned_text'].tolist()  # Convert the cleaned_text column to a list
        self.labels = df['label'].tolist()  # Convert the label column to a list
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        text = self.texts[idx]  # Access the text by index
        label = self.labels[idx]  # Access the label by index
        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)  # Convert label to tensor
        }

    def __len__(self):
        return len(self.texts)  # Length of the dataset


In [58]:
class FakeNewsClassifier(nn.Module):
    def __init__(self, n_classes):
        super(FakeNewsClassifier, self).__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(self.bert.config.hidden_size, n_classes)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs[0][:, 0, :]  # Extract the [CLS] token representation
        output = self.dropout(pooled_output)
        return self.fc(output)

In [59]:
from tqdm import tqdm

def train(model, data_loader, optimizer, scheduler, device):
    model.train()
    # Use tqdm for the training loop
    for batch in tqdm(data_loader, desc="Training", leave=False):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

def evaluate(model, data_loader, device):
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        # Use tqdm for the evaluation loop
        for batch in tqdm(data_loader, desc="Evaluating", leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions)

def predict_sentiment(text, model, tokenizer, device, max_length=128):
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
        return "positive" if preds.item() == 1 else "negative"


In [67]:
# Set up parameters
bert_model_name = 'bert-base-uncased'
num_classes = 2
max_length = 128
batch_size = 128
num_epochs = 5
learning_rate = 2e-5

In [68]:
texts = data['cleaned_text']
labels = data['label']

In [69]:
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)
train_df = pd.DataFrame({
    'cleaned_text': train_texts,
    'label': train_labels
})

val_df = pd.DataFrame({
    'cleaned_text': val_texts,
    'label': val_labels
})

In [70]:
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
train_dataset = TextClassificationDataset(train_df, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_df, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [71]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = FakeNewsClassifier(num_classes).to(device)

In [72]:
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [73]:
from tqdm import tqdm
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    model.eval() 
    predictions = []
    actual_labels = []
    
    with torch.no_grad():
        # Use tqdm for the validation dataloader
        for batch in tqdm(val_dataloader, desc="Evaluating", leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())

    accuracy = accuracy_score(actual_labels, predictions)
    report = classification_report(actual_labels, predictions)
    
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)


Epoch 1/5


                                                             

Validation Accuracy: 0.9786
              precision    recall  f1-score   support

           0       0.98      0.97      0.98      7089
           1       0.97      0.99      0.98      7338

    accuracy                           0.98     14427
   macro avg       0.98      0.98      0.98     14427
weighted avg       0.98      0.98      0.98     14427

Epoch 2/5


                                                             

Validation Accuracy: 0.9784
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      7089
           1       0.99      0.97      0.98      7338

    accuracy                           0.98     14427
   macro avg       0.98      0.98      0.98     14427
weighted avg       0.98      0.98      0.98     14427

Epoch 3/5


                                                             

Validation Accuracy: 0.9827
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      7089
           1       0.98      0.99      0.98      7338

    accuracy                           0.98     14427
   macro avg       0.98      0.98      0.98     14427
weighted avg       0.98      0.98      0.98     14427

Epoch 4/5


                                                             

Validation Accuracy: 0.9834
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      7089
           1       0.98      0.98      0.98      7338

    accuracy                           0.98     14427
   macro avg       0.98      0.98      0.98     14427
weighted avg       0.98      0.98      0.98     14427

Epoch 5/5


                                                             

Validation Accuracy: 0.9837
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      7089
           1       0.98      0.98      0.98      7338

    accuracy                           0.98     14427
   macro avg       0.98      0.98      0.98     14427
weighted avg       0.98      0.98      0.98     14427



