**All Fakes -->**
1:"Disinformation", 2:"Hoax", 3:"Propaganda", 4:"Trusted"

**TOVS -->**
1:"Satire", 2:"Hoax", 3:"Propaganda", 4:"Trusted"



In [None]:
import numpy as np
import os
import pandas as pd
import pickle
import re
import tensorflow as tf
from sklearn.preprocessing import MultiLabelBinarizer
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
# Directory layout, relative to the notebook's working directory:
#   data_path     - input CSV files read by the cells below
#   intermed_path - tokenizers, label binarizers and .npz arrays written below
#   model_path    - defined here but not referenced by the preprocessing cells
data_path, intermed_path, model_path = 'data/', 'intermediate/', 'model/'

In [6]:
# Load the complete All Fakes dataset and preview it.
all_fakes_csv = os.path.join(data_path, 'all_fakes.csv')
df = pd.read_csv(all_fakes_csv)
print(df.shape)
df.head()

(3856, 3)


Unnamed: 0,label,content,file
0,3,CISA Systemic Domestic SpyingBy SARTRE Coercio...,
1,4,The proposed trade agreement with China will b...,
2,4,"The program, launched after the terrorist atta...",
3,2,Obama Administration Threatening Ben Carsons C...,
4,2,Black Student President Forced To Resign After...,


## **Preprocess All Fakes Train Set**

In [None]:
# Load the All Fakes training split and preview it.
allfakes_train_csv = os.path.join(data_path, 'allfakes_train.csv')
train = pd.read_csv(allfakes_train_csv)
print(train.shape)
train.head()

(3084, 2)


Unnamed: 0,label,content
0,1,donate as of midday on march th the number o...
1,1,president trump on saturday slammed the mainst...
2,2,state requires welfare recipients to work nort...
3,2,young boy flags down cop gives him this and le...
4,4,hungarys marton fucsovics won the wimbledon bo...


In [None]:
import string

# Normalise the article text before tokenisation. The whole pipeline is one
# chained expression, so the column is overwritten exactly once.
#
# HTML tags are stripped *before* punctuation is removed. The tag pattern needs
# literal '<' and '>' characters, both of which are in string.punctuation; when
# the tag pass ran after the punctuation pass it could never match anything.
# NOTE(review): any other notebook that cleans evaluation/test text should use
# the same step order so train and test inputs stay consistent.
punctuation_table = str.maketrans('', '', string.punctuation)
train['content'] = (
    train['content']
    .astype(str)                                  # non-string cells (e.g. NaN) become text
    .str.lower()
    .str.replace(r'<[^<]+?>', ' ', regex=True)    # drop HTML tags
    .str.replace(r'\d+', ' ', regex=True)         # each run of digits -> one space
    .str.replace('\n', ' ', regex=False)
    .str.replace('\r', ' ', regex=False)
    .str.replace('-', ' ', regex=False)           # split hyphenated words
    .str.replace('::', ' ', regex=False)
    .str.translate(punctuation_table)             # delete remaining punctuation
    .str.encode('ascii', errors='ignore')         # discard non-ASCII characters
    .str.decode('ascii')
    .str.strip()
)
print(train.shape)
train.head()

(3084, 2)


Unnamed: 0,label,content
0,1,donate as of midday on march th the number o...
1,1,president trump on saturday slammed the mainst...
2,2,state requires welfare recipients to work nort...
3,2,young boy flags down cop gives him this and le...
4,4,hungarys marton fucsovics won the wimbledon bo...


In [None]:
# Keep only the first occurrence of each distinct cleaned article text.
train = train.drop_duplicates(subset=['content'], keep='first')
train.shape

(3072, 2)

In [None]:
# The tokenizer is fitted on a plain Python list of the cleaned texts.
corpus = train['content'].tolist()
print(len(corpus))

3072


In [None]:
# Fit a Keras tokenizer on the All Fakes corpus.
# `vocab_size_src` is handed to the tokenizer as its `num_words` setting.
vocab_size_src = 300_000
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=vocab_size_src,
)
tokenizer.fit_on_texts(corpus)


In [None]:
# Persist the fitted tokenizer so the same word index can be reloaded later.
tokenizer_pickle_path = os.path.join(intermed_path, 'allfakes_tokenizer.pickle')
with open(tokenizer_pickle_path, 'wb') as fh:
    pickle.dump(tokenizer, fh, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Turn each text field into a fixed-width block of token ids and place the
# blocks side by side in one matrix (one row per article).
field_names = ['content']
field_lengths = [1000]

num_of_cols = sum(field_lengths)
data_result = np.zeros((len(train), num_of_cols), dtype=np.float32)

# `col_offset` marks where the current field's block starts in the matrix.
col_offset = 0
for field_name, field_len in zip(field_names, field_lengths):
    token_ids = tokenizer.texts_to_sequences(train[field_name].values)
    # Short sequences are zero-padded at the end; long ones are cut at the end.
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        token_ids, maxlen=field_len, padding='post', truncating='post', value=0)
    data_result[:, col_offset:col_offset + field_len] = padded
    col_offset += field_len

features_path = os.path.join(intermed_path, 'allfakes_feat.npz')
np.savez(features_path, data_result)
print(data_result)
print(data_result.shape)

[[3.6130e+03 1.3000e+01 3.0000e+00 ... 1.0000e+00 3.8280e+03 3.0000e+00]
 [7.7000e+01 8.1000e+01 1.0000e+01 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [8.7000e+01 1.9960e+03 2.1030e+03 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 ...
 [8.2000e+01 9.2000e+01 9.1000e+01 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [1.5353e+04 1.4530e+03 2.5400e+02 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [3.3200e+02 8.1000e+01 6.6000e+01 ... 0.0000e+00 0.0000e+00 0.0000e+00]]
(3072, 1000)


In [None]:
# MultiLabelBinarizer expects an iterable of label collections, so every label
# is normalised to an integer string and wrapped in a one-element list.
train['label'] = (
    train['label']
    .astype(int)
    .astype(str)
    .apply(lambda label: [label])
)

# Only the fitted binarizer is needed here; the encoded matrix is built later.
mlb = MultiLabelBinarizer().fit(train['label'])

binarizer_path = os.path.join(intermed_path, 'allfakes_lb.pkl')
with open(binarizer_path, 'wb') as fh:
    pickle.dump(mlb, fh)


In [None]:
# Encode the labels as a 0/1 indicator matrix (one column per class) and save it.
num_of_rows = train.shape[0]
num_of_labels = mlb.classes_.shape[0]

# `np.int` was only an alias for the builtin `int` and has been removed from
# NumPy, so the builtin is used directly. The former np.zeros preallocation was
# overwritten immediately and has been dropped.
label_result = mlb.transform(train['label']).astype(int)
assert label_result.shape == (num_of_rows, num_of_labels)

np.savez(os.path.join(intermed_path, 'allfakes_lb.npz'), label_result)
print(mlb.classes_)

['1' '2' '3' '4']


## **Preprocess TOVS Train Set**

In [None]:
# Load the TOVS training split and preview it.
tovs_train_csv = os.path.join(data_path, 'tovs_train.csv')
train = pd.read_csv(tovs_train_csv)
print(train.shape)
train.head()

(16000, 2)


Unnamed: 0,label,content
0,1,"GREEN BAY, WIDavid Horsted, 45, announced Mond..."
1,3,CISA Systemic Domestic SpyingBy SARTRE Coercio...
2,1,A local resident's search for a public bathroo...
3,1,A five-minute sampling of Hindi-language chann...
4,4,The proposed trade agreement with China will b...


In [None]:
import string

# Normalise the article text before tokenisation. The whole pipeline is one
# chained expression, so the column is overwritten exactly once.
#
# HTML tags are stripped *before* punctuation is removed. The tag pattern needs
# literal '<' and '>' characters, both of which are in string.punctuation; when
# the tag pass ran after the punctuation pass it could never match anything.
# NOTE(review): any other notebook that cleans evaluation/test text should use
# the same step order so train and test inputs stay consistent.
punctuation_table = str.maketrans('', '', string.punctuation)
train['content'] = (
    train['content']
    .astype(str)                                  # non-string cells (e.g. NaN) become text
    .str.lower()
    .str.replace(r'<[^<]+?>', ' ', regex=True)    # drop HTML tags
    .str.replace(r'\d+', ' ', regex=True)         # each run of digits -> one space
    .str.replace('\n', ' ', regex=False)
    .str.replace('\r', ' ', regex=False)
    .str.replace('-', ' ', regex=False)           # split hyphenated words
    .str.replace('::', ' ', regex=False)
    .str.translate(punctuation_table)             # delete remaining punctuation
    .str.encode('ascii', errors='ignore')         # discard non-ASCII characters
    .str.decode('ascii')
    .str.strip()
)
print(train.shape)
train.head()

(16000, 2)


Unnamed: 0,label,content
0,1,green bay widavid horsted announced monday t...
1,3,cisa systemic domestic spyingby sartre coercio...
2,1,a local residents search for a public bathroom...
3,1,a five minute sampling of hindi language chann...
4,4,the proposed trade agreement with china will b...


In [None]:
# Keep only the first occurrence of each distinct cleaned article text.
train = train.drop_duplicates(subset=['content'], keep='first')
train.shape

(15962, 2)

In [None]:
# The tokenizer is fitted on a plain Python list of the cleaned texts.
corpus = train['content'].tolist()
print(len(corpus))

15962


In [None]:
# Fit a Keras tokenizer on the TOVS corpus.
# `vocab_size_src` is handed to the tokenizer as its `num_words` setting.
vocab_size_src = 300_000
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=vocab_size_src,
)
tokenizer.fit_on_texts(corpus)


In [None]:
# Persist the fitted tokenizer so the same word index can be reloaded later.
tokenizer_pickle_path = os.path.join(intermed_path, 'tovs_tokenizer.pickle')
with open(tokenizer_pickle_path, 'wb') as fh:
    pickle.dump(tokenizer, fh, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Turn each text field into a fixed-width block of token ids and place the
# blocks side by side in one matrix (one row per article).
field_names = ['content']
field_lengths = [1000]

num_of_cols = sum(field_lengths)
data_result = np.zeros((len(train), num_of_cols), dtype=np.float32)

# `col_offset` marks where the current field's block starts in the matrix.
col_offset = 0
for field_name, field_len in zip(field_names, field_lengths):
    token_ids = tokenizer.texts_to_sequences(train[field_name].values)
    # Short sequences are zero-padded at the end; long ones are cut at the end.
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        token_ids, maxlen=field_len, padding='post', truncating='post', value=0)
    data_result[:, col_offset:col_offset + field_len] = padded
    col_offset += field_len

features_path = os.path.join(intermed_path, 'tovs_feat.npz')
np.savez(features_path, data_result)
print(data_result)
print(data_result.shape)

[[1.2050e+03 2.2430e+03 6.3857e+04 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [1.6472e+04 8.2350e+03 1.0700e+03 ... 8.1560e+03 1.4600e+02 1.0000e+00]
 [5.0000e+00 2.2500e+02 1.0290e+03 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 ...
 [2.9700e+02 4.0760e+03 2.6000e+01 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [1.3300e+03 3.9200e+02 1.0260e+03 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [6.7600e+02 1.1284e+04 1.4000e+01 ... 0.0000e+00 0.0000e+00 0.0000e+00]]
(15962, 1000)


In [None]:
# MultiLabelBinarizer expects an iterable of label collections, so every label
# is normalised to an integer string and wrapped in a one-element list.
train['label'] = (
    train['label']
    .astype(int)
    .astype(str)
    .apply(lambda label: [label])
)

# Only the fitted binarizer is needed here; the encoded matrix is built later.
mlb = MultiLabelBinarizer().fit(train['label'])

binarizer_path = os.path.join(intermed_path, 'tovs_lb.pkl')
with open(binarizer_path, 'wb') as fh:
    pickle.dump(mlb, fh)


In [None]:
# Encode the labels as a 0/1 indicator matrix (one column per class) and save it.
num_of_rows = train.shape[0]
num_of_labels = mlb.classes_.shape[0]

# `np.int` was only an alias for the builtin `int` and has been removed from
# NumPy, so the builtin is used directly. The former np.zeros preallocation was
# overwritten immediately and has been dropped.
label_result = mlb.transform(train['label']).astype(int)
assert label_result.shape == (num_of_rows, num_of_labels)

np.savez(os.path.join(intermed_path, 'tovs_lb.npz'), label_result)
print(mlb.classes_)

['1' '2' '3' '4']
