<a href="https://colab.research.google.com/github/arnabbiswas66/multimodal-fake-news-classifier/blob/main/Twitter_text_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install -q -U "tensorflow-text==2.11.*"

In [5]:
!pip install -q tf-models-official==2.11.0

In [6]:

import pandas as pd
import tensorflow as tf
from tensorflow import keras
import os
import re
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer
tf.get_logger().setLevel('ERROR')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
def get_df(file):
    return pd.read_csv(file,sep = '\t')

In [9]:
train_df = get_df('/content/drive/MyDrive/multimodal-news/mediaeval2015/devset/tweets.txt')
test_df = get_df('/content/drive/MyDrive/multimodal-news/mediaeval2015/testset/tweets.txt')

In [10]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14277 entries, 0 to 14276
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweetId     14277 non-null  int64 
 1   tweetText   14277 non-null  object
 2   userId      14277 non-null  int64 
 3   imageId(s)  14277 non-null  object
 4   username    14277 non-null  object
 5   timestamp   14277 non-null  object
 6   label       14277 non-null  object
dtypes: int64(2), object(5)
memory usage: 780.9+ KB


In [11]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweetId     3755 non-null   int64 
 1   tweetText   3755 non-null   object
 2   userId      3755 non-null   int64 
 3   imageId(s)  3755 non-null   object
 4   username    3755 non-null   object
 5   timestamp   3755 non-null   object
 6   label       3755 non-null   object
dtypes: int64(2), object(5)
memory usage: 205.5+ KB


In [12]:
#Changing 'humor' to 'fake'
train_df.loc[(train_df.label == 'humor'),'label'] = 'fake'
test_df.loc[(test_df.label == 'humor'),'label'] = 'fake'

In [13]:
#Removing retweets, reposts, and modified tweets
rtPattern1 = "(RT|rt|MT|mt|RP|rp):? @\w*:?"
rtPattern2 = "(\bRT\b|\brt\b|\bMT\b|\bmt\b|\bRP\b|\brp\b)"
rtPattern3 = "(@\w*:)"
rtPattern4 = "(#rt|#RT|#mt|#MT|#rp|#retweet|#Retweet|#modifiedtweet|#modifiedTweet|#ModifiedTweet|#repost|#Repost)"
rtPattern5 = "(via @\w*)"

retweets = train_df['tweetText'].str.contains(rtPattern1)
train_df = train_df[~retweets]

retweets = train_df['tweetText'].str.contains(rtPattern2)
train_df = train_df[~retweets]

retweets = train_df['tweetText'].str.contains(rtPattern3)
train_df = train_df[~retweets]

retweets = train_df['tweetText'].str.contains(rtPattern4)
train_df = train_df[~retweets]

retweets = train_df['tweetText'].str.contains(rtPattern5)
train_df = train_df[~retweets]

train_df.reset_index(drop=True, inplace=True)
train_df.shape

  retweets = train_df['tweetText'].str.contains(rtPattern1)
  retweets = train_df['tweetText'].str.contains(rtPattern2)
  retweets = train_df['tweetText'].str.contains(rtPattern3)
  retweets = train_df['tweetText'].str.contains(rtPattern4)
  retweets = train_df['tweetText'].str.contains(rtPattern5)


(11550, 7)

In [14]:

#Removing remaining twitter handles @username
train_df['tweetText'] = train_df['tweetText'].apply(lambda text: re.sub(r'@\w*', "", text))

In [15]:
#Removing emojis
emojis = re.compile("["
                    u"\U0001F600-\U0001F64F"
                    u"\U0001F300-\U0001F5FF"
                    u"\U0001F680-\U0001F6FF"
                    u"\U0001F1E0-\U0001F1FF"
                    u"\U00002702-\U000027B0"
                    u"\U000024C2-\U0001F251"
                    "]+", flags=re.UNICODE)

train_df['tweetText'] = train_df['tweetText'].apply(lambda text: emojis.sub(r'', text) if emojis.search(text) else text)

In [16]:
#Cleaning symbols - ampersand and newline
train_df['tweetText'] = train_df['tweetText'].apply(lambda text: re.sub(r'&amp;|\\n', '', text))

In [17]:
#Removing urls
train_df['tweetText'] = train_df['tweetText'].apply(lambda text: re.sub(r'http\S+', '', text))
train_df['tweetText'] = train_df['tweetText'].apply(lambda text: re.sub(r'\\\/\S+', '', text))

In [18]:
#Removing whitespace
train_df['tweetText'] = train_df['tweetText'].apply(lambda text: " ".join(text.split()))

In [19]:
#Initialise stopwords

stopwords = nltk.corpus.stopwords.words()
stopwords.extend([':', ';', '[', ']', '"', "'", '(', ')', '.', '?', '#', '@', '...'])

In [20]:
#Removing stopwords
train_df['filteredTweet'] = train_df['tweetText'].apply(lambda x: ' '.join([w for w in x.split() if w not in stopwords]))

In [21]:
train_df['label'].unique()

array(['fake', 'real'], dtype=object)

In [22]:
train_df['label'] = train_df['label'].apply(lambda x: 0 if x=='fake' else 1)

In [23]:
train_df['label'].unique()

array([0, 1])

In [24]:
final_train_df = train_df[['tweetText', 'label']]
final_train_df.head()

Unnamed: 0,tweetText,label
0,¿Se acuerdan de la película: “El día después d...,0
1,"Buena la foto del Huracán Sandy, me recuerda a...",0
2,Scary shit #hurricane #NY,0
3,My fave place in the world #nyc #hurricane #sa...,0
4,42nd #time #square #NYC #subway #hurricane,0


In [25]:
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

In [26]:
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text-layer')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)
d_layer = tf.keras.layers.Dropout(0.1, name="dropout-layer")(outputs['pooled_output'])
d_layer = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(d_layer)
model = tf.keras.Model(inputs=[text_input], outputs = [d_layer])

In [27]:
m= [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
]
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=m)

In [28]:
X_train = final_train_df['tweetText']
y_train = final_train_df['label']

In [29]:
model.fit(X_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fbb41725c70>