<a href="https://colab.research.google.com/github/arnabbiswas66/multimodal-fake-news-classifier/blob/main/Twitter_text_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# tensorflow-text is pinned to the 2.11 series; %pip (rather than !pip) installs
# into the environment of the running kernel.
%pip install -q -U "tensorflow-text==2.11.*"

In [2]:
# Provides official.nlp.optimization (AdamW with warm-up); %pip installs into
# the environment of the running kernel.
%pip install -q tf-models-official==2.11.0

In [3]:

import math
import os
import re

import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer
from tensorflow import keras
tf.get_logger().setLevel('ERROR')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [4]:
# Mount Google Drive at /content/drive; the dataset files read later in the
# notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
def get_df(file):
    """Read a tab-separated file into a DataFrame.

    Args:
        file: path or file-like object accepted by pandas; the first row is
            used as the header.

    Returns:
        pandas.DataFrame with one row per data line of the file.
    """
    # read_table is read_csv with a tab as the default field separator.
    return pd.read_table(file)

In [27]:
# Root folder of the mediaeval2015 tweet files on the mounted Drive. Change
# this one constant to point the notebook at a different copy of the data.
DATA_DIR = '/content/drive/MyDrive/multimodal-news/mediaeval2015'

# The 'devset' file is used as the training split, 'testset' as the test split.
train_df = get_df(os.path.join(DATA_DIR, 'devset', 'tweets.txt'))
test_df = get_df(os.path.join(DATA_DIR, 'testset', 'tweets.txt'))

In [28]:
# Column names, non-null counts and dtypes of the training split.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14277 entries, 0 to 14276
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweetId     14277 non-null  int64 
 1   tweetText   14277 non-null  object
 2   userId      14277 non-null  int64 
 3   imageId(s)  14277 non-null  object
 4   username    14277 non-null  object
 5   timestamp   14277 non-null  object
 6   label       14277 non-null  object
dtypes: int64(2), object(5)
memory usage: 780.9+ KB


In [29]:
# Column names, non-null counts and dtypes of the test split.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweetId     3755 non-null   int64 
 1   tweetText   3755 non-null   object
 2   userId      3755 non-null   int64 
 3   imageId(s)  3755 non-null   object
 4   username    3755 non-null   object
 5   timestamp   3755 non-null   object
 6   label       3755 non-null   object
dtypes: int64(2), object(5)
memory usage: 205.5+ KB


In [30]:
# Fold the 'humor' class into 'fake' in both splits so only two classes remain
# to be encoded later.
for frame in (train_df, test_df):
    is_humor = frame.label == 'humor'
    frame.loc[is_humor, 'label'] = 'fake'

In [31]:
# Characters treated as emoji / pictographs and stripped from tweets.
# The symbol blocks are listed one by one on purpose: a single span from
# U+24C2 to U+1F251 would also contain CJK ideographs, kana and Hangul and
# would therefore delete the text of non-Latin tweets.
emojis = re.compile("["
                    u"\U0001F600-\U0001F64F"  # emoticons
                    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                    u"\U0001F680-\U0001F6FF"  # transport & map symbols
                    u"\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
                    u"\U00002702-\U000027B0"  # dingbats
                    u"\U000024C2"             # circled M
                    u"\U000025A0-\U000025FF"  # geometric shapes
                    u"\U00002600-\U000026FF"  # miscellaneous symbols
                    u"\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
                    u"\U00003030\U0000303D\U00003297\U00003299"  # emoji code points inside CJK blocks
                    u"\U0000FE00-\U0000FE0F"  # variation selectors
                    u"\U0001F000-\U0001F251"  # mahjong, dominoes, cards, enclosed alphanumerics/ideographs
                    "]+", flags=re.UNICODE)

In [32]:
def _clean_tweet(text):
  """Return one tweet's text with handles, emojis, '&amp;' and URLs removed.

  Args:
    text: raw tweet string.

  Returns:
    The cleaned string, with every run of whitespace collapsed to one space
    and leading/trailing whitespace dropped.
  """
  # Twitter handles (@username).
  text = re.sub(r'@\w*', '', text)
  # Emoji / pictographs, using the module-level `emojis` pattern.
  text = emojis.sub('', text)
  # HTML-escaped ampersands are dropped outright.
  text = re.sub(r'&amp;', '', text)
  # The two-character sequence backslash + n (an escaped newline) becomes a
  # space rather than nothing. Otherwise the words on either side are fused,
  # and a URL directly followed by it takes the next word with it when URLs
  # are stripped below.
  text = re.sub(r'\\n', ' ', text)
  # URLs, plus leftover fragments that start with an escaped slash (\/...).
  text = re.sub(r'http\S+', '', text)
  text = re.sub(r'\\\/\S+', '', text)
  return ' '.join(text.split())


def preprocess(df):
  """Clean the 'tweetText' column of a tweet DataFrame.

  Args:
    df: DataFrame with a 'tweetText' column; every value is assumed to be a
      str (the regex calls raise TypeError on non-string values).

  Returns:
    The same DataFrame object that was passed in; its 'tweetText' column is
    overwritten with the cleaned text.
  """
  df['tweetText'] = df['tweetText'].apply(_clean_tweet)
  return df

In [33]:
# Clean the training tweets. preprocess() overwrites the 'tweetText' column of
# the frame it receives and returns that same frame.
train_df = preprocess(train_df)
train_df.head()

Unnamed: 0,tweetId,tweetText,userId,imageId(s),username,timestamp,label
0,263046056240115712,¿Se acuerdan de la película: “El día después d...,21226711,sandyA_fake_46,iAnnieM,Mon Oct 29 22:34:01 +0000 2012,fake
1,262995061304852481,: Miren a Sandy en NY! Tremenda imagen del hur...,192378571,sandyA_fake_09,CarlosVerareal,Mon Oct 29 19:11:23 +0000 2012,fake
2,262979898002534400,"Buena la foto del Huracán Sandy, me recuerda a...",132303095,sandyA_fake_09,LucasPalape,Mon Oct 29 18:11:08 +0000 2012,fake
3,262996108400271360,Scary shit #hurricane #NY,241995902,sandyA_fake_29,Haaaaarryyy,Mon Oct 29 19:15:33 +0000 2012,fake
4,263018881839411200,My fave place in the world #nyc #hurricane #sa...,250315890,sandyA_fake_15,princess__natt,Mon Oct 29 20:46:02 +0000 2012,fake


In [34]:
# Clean the test tweets with the same steps used for the training split.
test_df = preprocess(test_df)
test_df.head()

Unnamed: 0,tweetId,tweetText,userId,imageId(s),username,timestamp,label
0,578854927457349632,kereeen RT : Eclipse from ISS....,70824972,eclipse_01,peay_s,Fri Mar 20 09:45:43 +0000 2015,fake
1,578874632670953472,Absolutely beautiful! RT : Eclipse from ISS....,344707006,eclipse_01,JaredUcanChange,Fri Mar 20 11:04:02 +0000 2015,fake
2,578891261353984000,“: Eclipse from ISS.... 3.20 Wow! amazing!,224839607,eclipse_01,tpjp1231,Fri Mar 20 12:10:06 +0000 2015,fake
3,578846612312748032,Eclipse from ISS....,134543073,eclipse_01,Shyman33,Fri Mar 20 09:12:41 +0000 2015,fake
4,578975333841551360,: Éclipse vue de l'ISS... Autre chose... créat...,1150728872,eclipse_01,Epimethee_,Fri Mar 20 17:44:11 +0000 2015,fake


In [35]:
def _encode_label(label):
  """Map a raw label to its binary class id: 'fake' -> 0, anything else -> 1.

  Values that are already 0 or 1 pass through unchanged, so running this cell
  a second time does not turn every previously encoded 0 ('fake') into 1.
  """
  if label in (0, 1):
    return label
  return 0 if label == 'fake' else 1


train_df['label'] = train_df['label'].apply(_encode_label)
test_df['label'] = test_df['label'].apply(_encode_label)

In [36]:
# Keep only the model input (tweet text) and the binary target.
final_train_df = train_df.loc[:, ['tweetText', 'label']]
final_train_df.head()

Unnamed: 0,tweetText,label
0,¿Se acuerdan de la película: “El día después d...,0
1,: Miren a Sandy en NY! Tremenda imagen del hur...,0
2,"Buena la foto del Huracán Sandy, me recuerda a...",0
3,Scary shit #hurricane #NY,0
4,My fave place in the world #nyc #hurricane #sa...,0


In [37]:
# Same two columns for the test split: tweet text and binary target.
final_test_df = test_df.loc[:, ['tweetText', 'label']]
final_test_df.head()

Unnamed: 0,tweetText,label
0,kereeen RT : Eclipse from ISS....,0
1,Absolutely beautiful! RT : Eclipse from ISS....,0
2,“: Eclipse from ISS.... 3.20 Wow! amazing!,0
3,Eclipse from ISS....,0
4,: Éclipse vue de l'ISS... Autre chose... créat...,0


In [38]:
# BERT: load the text-preprocessing layer and the encoder from TF Hub.
# The encoder is loaded with trainable=False, so its weights stay frozen.
# NOTE: the "Electra" cell that follows binds the same two names again, so
# whichever of the two cells ran last decides which encoder the model uses.
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3", name='preprocessing')
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4", trainable=False, name='encoder')

In [55]:
# Electra: same preprocessing layer as the BERT cell, but the encoder handle
# points at electra_small. The encoder is again loaded with trainable=False.
# NOTE: this rebinds bert_preprocess / bert_encoder; despite the names, after
# this cell runs `bert_encoder` is the Electra encoder.
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3", name='preprocessing')
bert_encoder = hub.KerasLayer("https://tfhub.dev/google/electra_small/2", trainable=False, name='encoder')

In [160]:
# Classifier: raw string -> hub preprocessing -> hub encoder -> dropout ->
# single sigmoid unit that outputs the probability of class 1.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text-layer')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Head layers are created first and then applied to the encoder's pooled output.
dropout_layer = tf.keras.layers.Dropout(0.1, name="dropout-layer")
output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name="output")
d_layer = output_layer(dropout_layer(outputs['pooled_output']))

model = tf.keras.Model(inputs=[text_input], outputs=[d_layer])

In [161]:
# Print the layer-by-layer overview of the classifier built above.
model.summary()

Model: "model_13"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text-layer (InputLayer)        [(None,)]            0           []                               
                                                                                                  
 preprocessing (KerasLayer)     {'input_mask': (Non  0           ['text-layer[0][0]']             
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128)}                                               

In [162]:
# Metrics reported by Keras during fit/evaluate; passed to model.compile as `m`.
m = [
    keras.metrics.BinaryAccuracy(name='accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
]

In [163]:
# Training hyperparameters, used both for the learning-rate schedule and for
# model.fit.
batch_size = 32
epochs = 10


In [173]:
# AdamW optimizer with a warm-up schedule, then compile the classifier.
# `math` is imported with the other imports at the top of the notebook.

# Share of all optimizer steps spent in warm-up.
WARMUP_FRACTION = 0.1
# NOTE(review): 0.01 is far above the learning rates usually used when
# fine-tuning a transformer; here the hub encoder is loaded with
# trainable=False, so only the head is trained. Confirm the value is intended.
init_lr = 0.01

# One optimizer step per batch; a final partial batch still counts as a step.
steps_per_epoch = math.ceil(len(final_train_df) / batch_size)
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(WARMUP_FRACTION * num_train_steps)

optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=m)

In [174]:
# Features are the cleaned tweet strings; targets are the 0/1 labels.
X_train, y_train = final_train_df['tweetText'], final_train_df['label']
X_test, y_test = final_test_df['tweetText'], final_test_df['label']

In [175]:
# Class balance of the training split (0 = 'fake', 1 = every other label).
final_train_df.groupby('label')['label'].count()

label
0    9356
1    4921
Name: label, dtype: int64

In [176]:
# Class balance of the test split (0 = 'fake', 1 = every other label).
final_test_df.groupby('label')['label'].count()

label
0    2546
1    1209
Name: label, dtype: int64

In [177]:
# Train the model. The test split is passed as validation data, so the
# per-epoch val_* metrics are computed on the test set.
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fe02f12dca0>

In [103]:
# Sigmoid outputs for the test tweets, thresholded at 0.5 into a flat list of
# 0/1 class ids.
pred_probs = model.predict(X_test)
y_pred = (pred_probs >= 0.5).astype(int).ravel().tolist()



In [104]:
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score

In [105]:
# Overall accuracy, then per-class scores: average=None returns one value per
# class, ordered [class 0, class 1].
print(f'Accuracy == {accuracy_score(y_test,y_pred)}')
for metric_name, metric_fn in (('F1', f1_score),
                               ('Precision', precision_score),
                               ('Recall', recall_score)):
    print(f'{metric_name} == {metric_fn(y_test,y_pred,average=None)}')

Accuracy == 0.6756324900133156
F1 == [0.80006566 0.14104372]
Precision == [0.68725324 0.4784689 ]
Recall == [0.95718775 0.08271299]
