In [None]:
!pip install transformers datasets
!pip install accelerate -U

# Import Libraries

In [1]:
import re
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import string

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Import Data

In [2]:
train_df = pd.read_csv('/content/train (1).csv')
test_df = pd.read_csv('/content/test (1).csv')

In [3]:
train_df

Unnamed: 0,label,news
0,False,Says the Annies List political group supports ...
1,True,When did the decline of coal start? It started...
2,True,"Hillary Clinton agrees with John McCain ""by vo..."
3,False,Health care reform legislation is likely to ma...
4,True,The economic turnaround started at the end of ...
...,...,...
10235,True,There are a larger number of shark attacks in ...
10236,True,Democrats have now become the party of the [At...
10237,True,Says an alternative to Social Security that op...
10238,False,On lifting the U.S. Cuban embargo and allowing...


In [4]:
train_df.shape, test_df.shape

((10240, 2), (1267, 2))

In [5]:
train_df.isna().sum(), test_df.isna().sum()

(label    0
 news     0
 dtype: int64,
 label    0
 news     0
 dtype: int64)

In [6]:
train_df.columns

Index(['label', 'news'], dtype='object')

# Data PreProcessing

In [7]:
def preprocessing(text):
    text = text.lower()
    text = re.sub('\[.*?\]','',text)
    text = re.sub("\\W"," ",text)
    text = re.sub('https?://\S+|www\.\S+','',text)
    text = re.sub('<.*?>+',b'',text)
    text = re.sub('[%s]' % re.escape(string.punctuation),'',text)
    text = re.sub('\w*\d\w*','',text)
    return text

In [8]:
train_df['news'] = train_df['news'].apply(preprocessing)

# Model Building and Evaluation

**Decision Tree**

In [9]:
x = train_df['news']
y = train_df['label']

In [10]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.25)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

In [12]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()
tree.fit(xv_train, y_train)

tree_pred = tree.predict(xv_test)
tree.score(xv_test, y_test)

0.55234375

In [13]:
print(classification_report(y_test, tree_pred))

              precision    recall  f1-score   support

       False       0.46      0.46      0.46      1062
        True       0.62      0.62      0.62      1498

    accuracy                           0.55      2560
   macro avg       0.54      0.54      0.54      2560
weighted avg       0.55      0.55      0.55      2560



**Grid Search CV**

In [19]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

grid_search = GridSearchCV(tree, param_grid, cv=5)
grid_search.fit(xv_train, y_train)

best_tree = grid_search.best_estimator_
best_tree_pred = best_tree.predict(xv_test)
print(classification_report(y_test, best_tree_pred))

              precision    recall  f1-score   support

       False       0.48      0.45      0.47      1062
        True       0.63      0.65      0.64      1498

    accuracy                           0.57      2560
   macro avg       0.55      0.55      0.55      2560
weighted avg       0.57      0.57      0.57      2560



**Random Forest Classifier**

In [20]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(xv_train, y_train)

rf_pred = rf.predict(xv_test)
print(classification_report(y_test, rf_pred))

              precision    recall  f1-score   support

       False       0.57      0.44      0.50      1062
        True       0.66      0.77      0.71      1498

    accuracy                           0.63      2560
   macro avg       0.62      0.61      0.61      2560
weighted avg       0.62      0.63      0.62      2560



**Logistic Regression**

In [14]:
from sklearn.linear_model import LogisticRegression

Logistic_reg = LogisticRegression()
Logistic_reg.fit(xv_train, y_train)

lr_pred = Logistic_reg.predict(xv_test)
Logistic_reg.score(xv_test, y_test)

0.625390625

In [15]:
print(classification_report(y_test, lr_pred))

              precision    recall  f1-score   support

       False       0.56      0.45      0.50      1062
        True       0.66      0.75      0.70      1498

    accuracy                           0.63      2560
   macro avg       0.61      0.60      0.60      2560
weighted avg       0.62      0.63      0.62      2560



**Distilbert PreTraining**

In [18]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric, Dataset

train_df = pd.read_csv('/content/test (1).csv')
test_df = pd.read_csv('/content/train (1).csv')

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

def tokenize_function(data):
    return tokenizer(data['news'], padding="max_length", truncation=True)

tokenized_train_datasets = train_dataset.map(tokenize_function, batched=True)
tokenized_test_datasets = test_dataset.map(tokenize_function, batched=True)

train_dataset = tokenized_train_datasets.shuffle(seed=42).select(range(1267))
test_dataset = tokenized_test_datasets.shuffle(seed=42).select(range(8000, 10000))

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    logits = torch.Tensor(logits)
    predictions = torch.argmax(logits, dim=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.evaluate()

model.save_pretrained("fake_news_classifier")
tokenizer.save_pretrained("fake_news_classifier")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1267 [00:00<?, ? examples/s]

Map:   0%|          | 0/10240 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.6499,0.64
2,No log,0.665014,0.635
3,No log,0.693581,0.6315


('fake_news_classifier/tokenizer_config.json',
 'fake_news_classifier/special_tokens_map.json',
 'fake_news_classifier/vocab.txt',
 'fake_news_classifier/added_tokens.json',
 'fake_news_classifier/tokenizer.json')