In [1]:
import pandas as pd


In [2]:
data = pd.read_csv('/kaggle/input/newsdata/dataset_fakenews.csv')

In [3]:
!pip install transformers



In [4]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as functional
import matplotlib.pyplot as plt
from transformers import BertForSequenceClassification, AdamW, BertConfig
import gc
from transformers import BertModel
from sklearn.metrics import roc_auc_score,f1_score
import time
import datetime



In [5]:
torch.manual_seed(0)
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
if use_cuda:
    torch.cuda.manual_seed(0)

print("Using GPU: {}".format(use_cuda))

Using GPU: True


In [6]:
from transformers import BertTokenizer
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Loading BERT tokenizer...


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
tweets = data.Tweet.values
labels = data.label.values

In [8]:
import re
from string import punctuation
def preprocess(data):
    #remove url and hashtag
    for i in range(data.shape[0]):
        text=data[i].lower()
        text1=''.join([word+" " for word in text.split()])
        data[i]=text1
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    space_pattern = '\s+'

    for i in range(data.shape[0]):
        text_string = data[i]
        parsed_text = re.sub(hashtag_regex, '', text_string)
        parsed_text = re.sub(giant_url_regex, '', parsed_text)
        parsed_text = re.sub(mention_regex, '', parsed_text)
        #remove punctuation
        parsed_text = re.sub(r"[{}]+".format(punctuation), '', parsed_text)
        parsed_text = re.sub(space_pattern, ' ', parsed_text)
        data[i] = parsed_text
    return data
tweets = preprocess(tweets)
print(tweets)

['the cdc currently reports 99031 deaths in general the discrepancies in death counts between different sources are small and explicable the death toll stands at roughly 100000 people today '
 'states reported 1121 deaths a small rise from last tuesday southern states reported 640 of those deaths '
 'politically correct woman almost uses pandemic as excuse not to reuse plastic bag '
 ...
 'breaking news according to documents released to the press was connected to an earlier robbery url '
 'ebola vaccines url '
 'concerned airport passenger suits up in homemade hazmat before flight url url ']


In [9]:
input_ids = []
attention_masks = []
for tweet in tweets:
    encoded_dict = tokenizer.encode_plus(
                        tweet,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 512,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])
# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Print sentence 0, now as a list of IDs.
print('Original: ', tweets[0])
print('Token IDs:', input_ids[0])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  the cdc currently reports 99031 deaths in general the discrepancies in death counts between different sources are small and explicable the death toll stands at roughly 100000 people today 
Token IDs: tensor([  101,  1996, 26629,  2747,  4311,  5585,  2692, 21486,  6677,  1999,
         2236,  1996,  5860,  2890,  9739,  9243,  1999,  2331,  9294,  2090,
         2367,  4216,  2024,  2235,  1998,  4654, 24759,  5555,  3468,  1996,
         2331,  9565,  4832,  2012,  5560,  6694,  8889,  2111,  2651,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
         

In [10]:
from torch.utils.data import TensorDataset, random_split

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Create a 90-10 train-validation split.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size],generator=torch.Generator().manual_seed(42))

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

9,045 training samples
1,005 validation samples


In [11]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 16

train_dataloader = DataLoader(
            train_dataset,
            shuffle = True,
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_dataset,
            shuffle = False,
            batch_size = batch_size
        )

In [12]:
def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [13]:
from transformers import BertForSequenceClassification, AdamW, BertConfig
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)
model.to(device)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [14]:
optimizer = AdamW(model.parameters(),
                  lr = 5e-5,
                  eps = 1e-8
                )
epochs = 1
criterion = nn.CrossEntropyLoss()



In [15]:
import random
import numpy as np

seed_val = 42

random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

training_stats = []
total_t0 = time.time()
best_accuracy = 0
for epoch_i in range(0, epochs):
    #Training
    print("")
    print('Epoch {:} / {:}'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_train_loss = 0
    total_train_accuracy = 0
    model.train()
    for step, batch in enumerate(train_dataloader):

        input_ids = batch[0].to(device)
        input_mask = batch[1].to(device)
        labels = batch[2].to(device)

        model.zero_grad()
        out = model(input_ids, token_type_ids=None, attention_mask=input_mask, labels=labels)
        loss = out[0]
        logits = out[1]

        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        pred = torch.argmax(logits, dim = 1)
        total_train_accuracy +=  torch.sum(pred == labels).item()

    avg_train_accuracy = total_train_accuracy / len(train_dataloader.dataset)
    avg_train_loss = total_train_loss / len(train_dataloader.dataset)
    print("  Accuracy: {}".format(avg_train_accuracy))
    print("  Training loss: {}".format(avg_train_loss))


    # Validation
    print("")
    print("Validation...")
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    y_true = []
    y_pred = []

    for batch in validation_dataloader:
        input_ids = batch[0].to(device)
        input_mask = batch[1].to(device)
        labels = batch[2].to(device)

        with torch.no_grad():
            out = model(input_ids, token_type_ids=None, attention_mask=input_mask,labels=labels)
            loss = out[0]
            logits = out[1]

        total_eval_loss += loss.item()
        pred = torch.argmax(logits, dim = 1)
        total_eval_accuracy += torch.sum(pred == labels).item()
        y_true.append(labels.flatten())
        y_pred.append(pred.flatten())

    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader.dataset)
    print("  Accuracy: {}".format(avg_val_accuracy))
    avg_val_loss = total_eval_loss / len(validation_dataloader.dataset)
    print("  Validation loss: {}".format(avg_val_loss))
    training_time = format_time(time.time() - t0)
    print()

    y_true = torch.cat(y_true).tolist()
    y_pred = torch.cat(y_pred).tolist()
    print("This epoch took: {:}".format(training_time))
    print('roc_auc score: ', roc_auc_score(y_true,y_pred))
    print('F1 score:',f1_score(y_true, y_pred))
    print()

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Train Accur.': avg_train_accuracy,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
        }
    )
    print()

    if avg_val_accuracy > best_accuracy:
        best_accuracy = avg_val_accuracy
        best_model = model

print()
print("="*10)
print("Summary")
print("Total time {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Epoch 1 / 10
Training...
  Accuracy: 0.8725262576008844
  Training loss: 0.018541454331822707

Validation...
  Accuracy: 0.9014925373134328
  Validation loss: 0.017291091498339652

This epoch took: 0:13:55
roc_auc score:  0.9005151575193185
F1 score: 0.8950159066808059



Epoch 2 / 10
Training...
  Accuracy: 0.9551133222775013
  Training loss: 0.008771878010030905

Validation...
  Accuracy: 0.9233830845771144
  Validation loss: 0.020284295202214364

This epoch took: 0:13:58
roc_auc score:  0.9235585496334456
F1 score: 0.9221435793731041



Epoch 3 / 10
Training...
  Accuracy: 0.9828634604754007
  Training loss: 0.004135872849818846

Validation...
  Accuracy: 0.9203980099502488
  Validation loss: 0.025912592157782913

This epoch took: 0:13:59
roc_auc score:  0.919407568852784
F1 score: 0.9150743099787685



Epoch 4 / 10
Training...
  Accuracy: 0.9931453841901603
  Training loss: 0.0020053668732077823

Validation...
  Accuracy: 0.9243781094527364
  Validation loss: 0.02809630734969072



In [16]:
# v="model_5"
# model.save(f"/kaggle/working/{v}")

In [21]:
PATH1 = "/kaggle/working/text_classification_model2.pt"
torch.save(model, PATH1)

In [18]:
def preprocess_text(data):
    text=data.lower()
    text1=''.join([word+" " for word in text.split()])
    data=text1
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
    '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    space_pattern = '\s+'


    text_string = data
    parsed_text = re.sub(hashtag_regex, '', text_string)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    #remove punctuation
    parsed_text = re.sub(r"[{}]+".format(punctuation), '', parsed_text)
    parsed_text = re.sub(space_pattern, ' ', parsed_text)
    data = parsed_text
    return data

In [19]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Define the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the pre-trained model architecture
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2, output_attentions=False, output_hidden_states=False)

# Move the model to the appropriate device
device = "cpu"
model.to(device)

# Create a function to predict for a single text input
def predict_single_text(text):
    # Preprocess the text input
    text = preprocess_text(text)  # Replace with your preprocessing function

    # Tokenize and convert the input to the required format
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

    # Make the prediction
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=1)
    predicted_class = torch.argmax(probabilities, dim=1).item()

    return predicted_class, probabilities

# Example usage
text_input = "abc reportedly gave darren wilson 6-figures for an interview. if true, they essentially paid the bounty for killing a black child. #ferguson"
predicted_class, class_probabilities = predict_single_text(text_input)
print("Predicted Class:", predicted_class)
print("Class Probabilities:", class_probabilities)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predicted Class: 0
Class Probabilities: tensor([[0.5691, 0.4309]])


In [20]:
text_input = "a good relationship: â˜‘ calls you to just say hi â˜‘ wants to see you â˜‘ brings you bk"
predicted_class, class_probabilities = predict_single_text(text_input)
print("Predicted Class:", predicted_class)
print("Class Probabilities:", class_probabilities)

Predicted Class: 0
Class Probabilities: tensor([[0.6160, 0.3840]])


In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import re
from string import punctuation

# Define the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Load the trained model
model_path = 'path_to_your_saved_model.pth'  # Replace with the actual path to your saved model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)
model.eval()  # Set the model to evaluation mode

# Example raw tweet (not preprocessed)
raw_tweet = "???Clearly, the Obama administration did not leave any kind of game plan for something like this.??�"

# Preprocessing function to replicate training preprocessing steps
def preprocess_text(data):
    text = data.lower()
    text1 = ''.join([word + " " for word in text.split()])
    data = text1
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
    '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    space_pattern = '\s+'

    text_string = data
    parsed_text = re.sub(hashtag_regex, '', text_string)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    # Remove punctuation
    parsed_text = re.sub(r"[{}]+".format(punctuation), '', parsed_text)
    parsed_text = re.sub(space_pattern, ' ', parsed_text)
    data = parsed_text
    return data

# Preprocess the raw tweet using your provided function
preprocessed_tweet = preprocess_text(raw_tweet)

# Tokenize the preprocessed tweet
encoded_dict = tokenizer.encode_plus(
                    preprocessed_tweet, add_special_tokens=True, max_length=512,
                    pad_to_max_length=True, return_attention_mask=True,
                    return_tensors='pt'
               )

# Extract the encoded input IDs and attention mask
input_ids = encoded_dict['input_ids'].to(device)
attention_mask = encoded_dict['attention_mask'].to(device)

# Make predictions
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()

# Print the predicted class (0 for fake news, 1 for real news)
if predicted_class == 1:
    print("Predicted: Fake News")
else:
    print("Predicted: Real News")


In [13]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import re
from string import punctuation

# Define the device (CPU or GPU)
device = torch.device("cpu")

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Replace 'path_to_your_saved_model.pth' with the actual path to your saved model checkpoint
model_path = '/kaggle/input/test-analysis-model/text_classification_model2.pt'

# Load the entire model
loaded_model = torch.load(model_path, map_location=device)

# Set the model to evaluation mode
loaded_model.eval()

# Example raw tweet (not preprocessed)
raw_tweet = ""
# Preprocessing function to replicate training preprocessing steps
def preprocess_text(data):
    text = data.lower()
    text1 = ''.join([word + " " for word in text.split()])
    data = text1
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
    '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    space_pattern = '\s+'

    text_string = data
    parsed_text = re.sub(hashtag_regex, '', text_string)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    # Remove punctuation
    parsed_text = re.sub(r"[{}]+".format(punctuation), '', parsed_text)
    parsed_text = re.sub(space_pattern, ' ', parsed_text)
    data = parsed_text
    return data

# Preprocess the raw tweet using your provided function
preprocessed_tweet = preprocess_text(raw_tweet)

# Tokenize the preprocessed tweet
encoded_dict = tokenizer.encode_plus(
                    preprocessed_tweet, add_special_tokens=True, max_length=512,
                    pad_to_max_length=True, return_attention_mask=True,
                    return_tensors='pt'
               )

# Extract the encoded input IDs and attention mask
input_ids = encoded_dict['input_ids'].to(device)
attention_mask = encoded_dict['attention_mask'].to(device)

with torch.no_grad():
    outputs = loaded_model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()

# Print the predicted class (1 for fake news, 0 for real news)
if predicted_class == 1:
    print("Predicted: Fake News")
else:
    print("Predicted: Real News")


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Predicted: Fake News
