<a href="https://colab.research.google.com/github/azizamirsaidova/fake-news-detection/blob/main/notebooks/fake_news_detection_using_machine_learning_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Fake & True News Detection Classification Modeling**

In [None]:
# Clone the project repository into the current working directory.
!git clone https://github.com/azizamirsaidova/fake-news-detection.git

Cloning into 'fake-news-detection'...
remote: Enumerating objects: 80, done.[K
remote: Counting objects: 100% (80/80), done.[K
remote: Compressing objects: 100% (69/69), done.[K
remote: Total 80 (delta 24), reused 16 (delta 4), pack-reused 0[K
Unpacking objects: 100% (80/80), done.


In [None]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import tensorflow_hub as hub
import tensorflow_text

In [None]:
# Load the two result tables into DataFrames (`false` first, then `true`).
# NOTE(review): both paths sit directly under /content rather than inside the
# cloned 'fake-news-detection' directory - confirm the CSVs are placed there.
false, true = (
    pd.read_csv('/content/false_results.csv'),
    pd.read_csv('/content/true_results.csv'),
)

In [None]:
# Pre-process the data
def _strip_generated_text_key(frame):
    """Return a one-column 'text' DataFrame with the "'generated_text':" marker removed.

    The marker is removed wherever it occurs inside a string value
    (regex substring replacement); all other columns are dropped.
    """
    return frame['text'].replace("'generated_text':", '', regex=True).to_frame()


# The previous `false_replaced = map(...)` line was dropped: it iterated over the
# DataFrame's column labels, was never consumed, and its result was unused.
true = _strip_generated_text_key(true)
false = _strip_generated_text_key(false)

# Rows coming from `true` are labelled 1, rows coming from `false` are labelled 0.
true['label'] = 1
false['label'] = 0

frames = [true, false]
# ignore_index=True gives every row a unique index label; without it the labels
# of both source frames are kept, so they repeat in the combined frame.
df = pd.concat(frames, ignore_index=True)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=8,
)

In [None]:
# TF-IDF vectorizer (DataFlair recipe): English stop words are removed and
# max_df=0.7 caps the document frequency of the terms that are kept.
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')

# Fit on the training texts only, then reuse the fitted vectorizer on the test texts.
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

In [None]:
# Baseline (DataFlair recipe): Passive-Aggressive classifier on the TF-IDF features.
# random_state fixes the classifier's internal shuffling of the training data so the
# reported accuracy is the same on every run; an output recorded from an unseeded
# run may therefore differ from what this cell prints.
pac = PassiveAggressiveClassifier(max_iter=1000, random_state=8)
pac.fit(tfidf_train, y_train)

# Predict on the held-out texts and report accuracy as a percentage.
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 59.25%


## **Other Classification Techniques**

1. Logistic Regression
2. Multinomial Naive Bayes
3. Decision Tree Classifier

In [None]:
# Features (raw text) and targets (labels) for the models in this section.
x_df, y_df = df['text'], df['label']

In [None]:
# TfidfTransformer is not imported by the notebook's import cell, so bring it
# into scope here; without this the cell raises NameError on a fresh kernel.
from sklearn.feature_extraction.text import TfidfTransformer

# Term counts for every document. fit_transform learns the vocabulary and
# returns the count matrix in one pass (previously the corpus was vectorised twice).
count_vectorizer = CountVectorizer()
freq_term_matrix = count_vectorizer.fit_transform(x_df)

# Re-weight the counts with TF-IDF and L2-normalise each row.
# A single fit_transform replaces the earlier fit() followed by fit_transform().
tfidf = TfidfTransformer(norm="l2")
tf_idf_matrix = tfidf.fit_transform(freq_term_matrix)

# NOTE(review): vocabulary and IDF weights are fitted on the full corpus, and the
# train/test split is applied to tf_idf_matrix afterwards, so test documents
# influence the features. Consider fitting on the training rows only.

In [None]:
# Split the TF-IDF matrix and labels with a fixed seed. test_size is not passed,
# so scikit-learn's default hold-out fraction applies (0.25 per its documentation).
X_train, X_test, Y_train, Y_test = train_test_split(
    tf_idf_matrix,
    y_df,
    random_state=2,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings; fit() returns the estimator itself.
logreg = LogisticRegression().fit(X_train, Y_train)

# Mean accuracy on the held-out split, printed as a percentage.
Accuracy = logreg.score(X_test, Y_test)
print(Accuracy * 100)

58.199999999999996


In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default settings; fit() returns the estimator itself.
NB = MultinomialNB().fit(X_train, Y_train)

# Mean accuracy on the held-out split, printed as a percentage.
Accuracy_NB = NB.score(X_test, Y_test)
print(Accuracy_NB * 100)

59.199999999999996


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyper-parameters. random_state is fixed because the
# tree's split selection is randomised internally; unseeded, the score below
# changes from run to run, so an output recorded earlier may differ.
clf = DecisionTreeClassifier(random_state=2)
clf.fit(X_train, Y_train)

# Mean accuracy on the held-out split, printed as a percentage.
Accuracy_DT = clf.score(X_test, Y_test)
print(Accuracy_DT*100)

56.39999999999999


## **BERT**

In [None]:
!pip install "tensorflow>=1.7.0"
!pip install tensorflow-hub

In [None]:
# Fresh 80/20 split of the raw text for the BERT model (fixed seed).
# Note: this rebinds y_train / y_test, replacing the values from the earlier TF-IDF split.
title_train, title_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=1000,
)

In [None]:
# TensorFlow Hub handles for the BERT preprocessing model and the BERT encoder.
BERT_PREPROCESS_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
BERT_ENCODER_URL = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'

# Wrap both hub models as Keras layers so they can be used inside a Keras model.
bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_URL)
bert_encoder = hub.KerasLayer(BERT_ENCODER_URL)

In [None]:
import tensorflow as tf

# Model input: one scalar string per example, named 'news'.
input_layer = tf.keras.layers.Input(shape=(), dtype=tf.string, name='news')

# Run the text through the hub preprocessing layer, then through the BERT encoder.
processed = bert_preprocess(input_layer)
output = bert_encoder(processed)

# Classification head on the encoder's pooled output:
# dropout (rate 0.2) -> 10-unit ReLU layer -> single sigmoid unit.
dropped = tf.keras.layers.Dropout(0.2, name='dropout')(output['pooled_output'])
hidden = tf.keras.layers.Dense(10, activation='relu', name='hidden')(dropped)
layer = tf.keras.layers.Dense(1, activation='sigmoid', name='output')(hidden)

model = tf.keras.Model(inputs=[input_layer], outputs=[layer])

# Binary classification set-up: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 5 epochs on the training split.
model.fit(title_train, y_train, epochs=5)

# Evaluate on the held-out split; as the cell's last expression, the result is displayed.
model.evaluate(title_test, y_test)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[0.6913196444511414, 0.49000000953674316]