# Code for predicting type of fake news

Following two datasets used for this study:

**All Fakes -->**
1:"Disinformation", 2:"Hoax", 3:"Propaganda", 4:"Trusted"

**TOVS -->**
1:"Satire", 2:"Hoax", 3:"Propaganda", 4:"Trusted"



In [22]:
import numpy as np
import os
import pickle
import pandas as pd
import re
import tensorflow as tf
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score

In [24]:
data_path = 'data/'
intermed_path = 'intermediate/'
model_path = 'model/'

In [62]:
df = pd.read_csv(os.path.join(data_path, 'allfakes_test.csv'))
print(df.shape)
df[:5]

(772, 2)


Unnamed: 0,label,content
0,1,what with warships in our harbour and troops i...
1,2,hillary vp pick turns on her and humiliates he...
2,2,angry liberals tried to take down ivanka so sh...
3,4,prices of chinas domestically made automobiles...
4,4,a medical doctor in taichung recently pubished...


In [63]:

import string
df['content'] = df['content'].apply(lambda x: str(x))
df['content'] = df['content'].apply(lambda x: x.lower())
df['content'] = df['content'].apply(lambda x: re.sub('\\d+', ' ', x))
df['content'] = df['content'].apply(lambda x: x.replace('\n', ' '))
df['content'] = df['content'].apply(lambda x: x.replace('\r', ' '))
df['content'] = df['content'].apply(lambda x: x.replace('-', ' '))
df['content'] = df['content'].apply(lambda x: x.replace('::', ' '))
df['content'] = df['content'].apply(lambda x: x.translate(str.maketrans(' ', ' ', string.punctuation)))
df['content'] = df['content'].apply(lambda x: x.encode("ascii", errors="ignore").decode())
df['content'] = df['content'].apply(lambda x: re.sub('<[^<]+?>', ' ', x))
df['content'] = df['content'].apply(lambda x: x.strip())
print(df.shape)
df.head()
df.label.value_counts()

(772, 2)


4    213
2    213
3    199
1    147
Name: label, dtype: int64

In [64]:
#Load Tokenizer
field_lengths = [1000]
field_names = ['content']

with open(os.path.join(intermed_path, 'allfakes_tokenizer.pickle'), 'rb') as handle:
  tokenizer = pickle.load(handle)

num_of_cols = sum(field_lengths)
test_data_result = np.zeros(shape=(df.shape[0], num_of_cols), dtype=np.float32)
cur_start = 0
cur_end = 0

for field_name,field_len in zip(field_names,field_lengths):
       cur_start = cur_end
       cur_end += field_len
       seq = tokenizer.texts_to_sequences(df[field_name].values)

       test_data_result[:, cur_start:cur_end] = tf.keras.preprocessing.sequence.pad_sequences(
               seq,
               maxlen=field_len,
               padding='post',
               truncating='post',
               value=0)

print(test_data_result)


[[4.1000e+01 1.2000e+01 1.2964e+04 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [1.6000e+02 8.4050e+03 2.3830e+03 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [2.3120e+03 1.6670e+03 9.3200e+02 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 ...
 [3.7000e+01 2.2400e+03 2.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [4.0384e+04 6.4580e+03 2.1396e+04 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [6.4100e+02 2.0400e+03 6.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]]


In [72]:
#Load model

model = tf.keras.models.load_model(
    filepath= os.path.join(model_path, 'lstm_model_weight_allfakes.h5'))

model.summary()


Model: "model_8"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
lstm_input (InputLayer)         [(None, 1000)]       0                                            
__________________________________________________________________________________________________
embeddings (Embedding)          (None, 1000, 100)    30000000    lstm_input[0][0]                 
__________________________________________________________________________________________________
lstm_11 (LSTM)                  (None, 512)          1255424     embeddings[0][0]                 
__________________________________________________________________________________________________
lstm_12 (LSTM)                  (None, 512)          1255424     embeddings[0][0]                 
____________________________________________________________________________________________

In [73]:
predictions = model.predict(test_data_result,batch_size=32)
print('Number of predictions by model:', len(predictions))
print(predictions.shape)

if predictions.shape[-1] > 1:
  preds = np.argmax(predictions, axis=-1).tolist()
else:
  preds = (predictions > 0.5).astype('int32')
scores = np.max(predictions, axis=-1).tolist()

with open(os.path.join(intermed_path, 'allfakes_lb.pkl'), 'rb') as handle:
       mlb = pickle.load(handle)

classes = mlb.classes_
labels = [classes[c] for c in preds]


Number of predictions by model: 772
(772, 4)


In [74]:
result = df[['content','label']]
result['predict'] = labels
result['predict_score'] = scores

result['true'] = result.apply(lambda row: str(row['label']) == str(row['predict']), axis=1)

result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,content,label,predict,predict_score,true
0,what with warships in our harbour and troops i...,1,1,0.816495,True
1,hillary vp pick turns on her and humiliates he...,2,2,0.999942,True
2,angry liberals tried to take down ivanka so sh...,2,2,0.999993,True
3,prices of chinas domestically made automobiles...,4,4,0.999995,True
4,a medical doctor in taichung recently pubished...,4,4,0.992795,True
...,...,...,...,...,...
767,breaking its not over gop stand up to overturn...,2,2,0.999976,True
768,predicting the future is always notoriously di...,1,1,0.679003,True
769,its nice to see people coming out and supporti...,1,2,0.998994,False
770,reefer madness grips bureaucratsrady ananda fo...,3,3,0.997850,True


In [75]:
overall_df = result[result['true']==True]
acc = overall_df.shape[0]/result.shape[0]

print(acc)

0.8756476683937824


In [76]:
y_true = result['label'].astype('int32').tolist()
y_preds = result['predict'].astype('int32').tolist()
n_labels = result['label'].unique().astype('int32').tolist()
cm = confusion_matrix(y_true, y_preds, labels = n_labels)
print('Confusion Matrix \n')
print(cm)

f1_each= f1_score(y_true, y_preds, average=None)
print('F1 Each score:', f1_each)

f1_all= f1_score(y_true, y_preds, average='weighted')
print('F1 All score:', f1_all)

Confusion Matrix 

[[127  10   3   7]
 [  7 191   4  11]
 [  9   1 185  18]
 [  4   8  14 173]]
F1 Each score: [0.86394558 0.90307329 0.84803922 0.88305489]
F1 All score: 0.8759133497082275
