In [0]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# 'drive/My Drive/...' paths used by later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Imports**

In [0]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize 
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

**Functions**

In [0]:
# Functions
def get_details(nested_list):
  '''
  Flatten per-article sentence lists and count the sentences of each article.

  Parameters
  ----------
  nested_list : iterable of sequences
    One sequence of sentences per article, e.g. a list of lists or a pandas
    Series of lists. The container is iterated directly, so a Series with a
    non-default index (not 0..n-1) is handled as well.

  Returns
  -------
  (locc, flat_) : tuple
    locc  - list with the number of sentences of each article, in order.
    flat_ - list of all sentences of all articles, in article order.

  Also prints the total sentence count and the first five per-article counts.
  '''
  # Materialise once so that the input is walked a single time and one-shot
  # iterables are supported.
  articles = list(nested_list)

  # locc contains the sentence count for each article
  locc = [len(article) for article in articles]
  # flatten
  flat_ = [item for article in articles for item in article]

  print('Number of sentences {}'.format(len(flat_)))
  print('First five articles sentence count {} '.format(locc[:5]))
  return (locc,flat_)

# google embeddings
def get_sentence_embeddings(x):
  '''
  Embed `x` with the module-level TF-Hub module `google_embed`.

  A new TF1 session is opened for every call; the global-variable and table
  initialisers are run in it before `google_embed(x)` is evaluated.
  Returns whatever `session.run` yields for `google_embed(x)`.
  '''
  init_ops = [tf.global_variables_initializer(), tf.tables_initializer()]
  with tf.Session() as sess:
    sess.run(init_ops)
    embeddings = sess.run(google_embed(x))
  return embeddings

# TF-Hub handle of the sentence encoder (universal-sentence-encoder-large, version 3).
sentence_encoder_module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3" 
# Instantiated once at module level; get_sentence_embeddings() calls it on each batch.
google_embed = hub.Module(sentence_encoder_module_url)


INFO:tensorflow:Using /tmp/tfhub_modules to cache modules.
INFO:tensorflow:Downloading TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder-large/3'.
INFO:tensorflow:Downloaded https://tfhub.dev/google/universal-sentence-encoder-large/3, Total size: 810.60MB
INFO:tensorflow:Downloaded TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder-large/3'.


# **Dataset-3**

In [0]:
# Only show TensorFlow log messages of level ERROR.
tf.logging.set_verbosity(tf.logging.ERROR)

In [0]:
# Integer encoding of the string labels: FAKE -> 1, REAL -> 0.
dic = {'FAKE':1,'REAL':0}

In [0]:
# Read the news CSV, keeping only the article body and its FAKE/REAL label.
data = pd.read_csv('drive/My Drive/CSV/fake_or_real_news.csv')[['text','label']]
# Encode the labels as integers through `dic`; an unknown label raises KeyError.
data['label'] = [dic[raw_label] for raw_label in data['label']]
# Number of sentences in each article (the text is coerced to str first).
data['sent_c'] = [len(sent_tokenize(str(body))) for body in data['text']]
data.head()

Unnamed: 0,text,label,sent_c
0,"Daniel Greenfield, a Shillman Journalism Fello...",1,87
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,1,26
2,U.S. Secretary of State John F. Kerry said Mon...,0,16
3,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",1,17
4,It's primary day in New York and front-runners...,0,21


In [0]:
# Maximum and mean number of sentences per article.
# NOTE(review): the trailing "1 unreliable" remark is kept from the original; its meaning is unclear - confirm.
np.max(data['sent_c']),np.mean(data['sent_c']) #1 unreliable

(1344, 34.31349644830308)

In [0]:
# Number of articles per encoded label (1 = FAKE, 0 = REAL).
data['label'].value_counts()

0    3171
1    3164
Name: label, dtype: int64

In [0]:
# Vectorized selection.
# Keep only articles with more than 7 and fewer than 50 sentences (i.e. 8-49),
# split them by class and tokenize each article into a list of sentences.
# Each class is stored in its own Series with a fresh 0..n-1 index.
sent_ok = (data['sent_c'] > 7) & (data['sent_c'] < 50)
reliable = data.loc[sent_ok & (data['label'] == 0), 'text'].astype(str).reset_index(drop=True).apply(sent_tokenize)
fake = data.loc[sent_ok & (data['label'] == 1), 'text'].astype(str).reset_index(drop=True).apply(sent_tokenize)
print('Type\t Count \nFake\t {} \nReal\t {} '.format(len(fake),len(reliable)))

Type	 Count 
Fake	 1993 
Real	 2005 


In [0]:
# Per-class sentence counts and flattened sentence lists, as returned by get_details:
#   locc_x - number of sentences of the article at index i,
#   flat_x - all sentences of that class in one flat list, in article order.
locc_r,flat_r = get_details(reliable)
locc_f,flat_f = get_details(fake)

Number of sentences 59622
First five articles sentence count [16, 21, 32, 33, 13] 
Number of sentences 42579
First five articles sentence count [26, 17, 18, 15, 18] 


In [0]:
%%time
# fake
# get flatten embeddings
emb_l = []
for i in range(100,2100,100):
  g = get_sentence_embeddings(flat_f[np.sum(locc_f[:i-100],dtype=int):np.sum(locc_f[:i],dtype=int)])
  emb_l.extend(g)
  print('Done - ',i)

Done -  100
Done -  200
Done -  300
Done -  400
Done -  500
Done -  600
Done -  700
Done -  800
Done -  900
Done -  1000
Done -  1100
Done -  1200
Done -  1300
Done -  1400
Done -  1500
Done -  1600
Done -  1700
Done -  1800
Done -  1900
Done -  2000
CPU times: user 3min 20s, sys: 29 s, total: 3min 49s
Wall time: 4min 18s


In [0]:
# Padding helper
# Zero-pad each article's embedding matrix to 50 rows (one row per sentence slot).
def fix(arr,shp=50,dim=512):
  '''
  Zero-pad a 2-D array of sentence embeddings to exactly `shp` rows.

  Parameters
  ----------
  arr : array-like of shape (n_sentences, width)
    Embeddings of one article. An empty non-2-D input (e.g. an empty slice of
    an empty array) is treated as zero sentences of width `dim`.
  shp : int
    Number of rows of the result.
  dim : int
    Embedding width used only when `arr` is empty and not 2-D; otherwise the
    width is taken from `arr` itself.

  Returns
  -------
  numpy.ndarray of shape (shp, width): `arr` followed by rows of zeros.

  Raises
  ------
  ValueError
    If `arr` has more than `shp` rows, or is non-empty and not 2-D.
  '''
  arr = np.asarray(arr)
  if arr.ndim != 2:
    if arr.size:
      raise ValueError('expected a 2-D array, got shape {}'.format(arr.shape))
    arr = np.zeros((0, dim))
  dif = shp - arr.shape[0]
  if dif < 0:
    raise ValueError('array has {} rows, cannot pad to {}'.format(arr.shape[0], shp))
  # The padding block follows the input's own width rather than a fixed 512.
  temp = np.concatenate((arr, np.zeros((dif, arr.shape[1]))))
  return temp

In [0]:
# Converts emb_l (the embeddings collected batch by batch for the fake
# sentences) to a single numpy.ndarray.
kf = np.array(emb_l)

In [0]:
# Stores embeddings article-wise in a 3-D numpy array:
# first axis is the article, second the sentence within that article
# (zero-padded to 50 by fix), last the embedding of that sentence.
# The array holds exactly one slot per fake article, i.e. shape (len(locc_f),50,512).
# For Fake.
n_fake = len(locc_f)
# art_offsets_f[k] is the row of kf holding the first sentence of article k;
# computed once instead of re-summing locc_f on every iteration.
art_offsets_f = np.concatenate(([0], np.cumsum(locc_f, dtype=int)))
fake_eb = np.zeros((n_fake,50,512))
for i in range(n_fake):
  fake_eb[i] = fix(kf[art_offsets_f[i]:art_offsets_f[i+1]])
  if i % 1000 == 0:
    print(i)

0
1000
2000


In [0]:
# Sentence count of the last fake article.
locc_f[-1]

49

In [0]:
# Embedding matrix of the fake article at index 1992; rows past the article's
# own sentences are the zero padding added by fix().
fake_eb[1992]

array([[ 0.00516833,  0.04501137,  0.01362785, ...,  0.01684136,
        -0.0797141 ,  0.08928163],
       [-0.00821706,  0.0650119 ,  0.02585351, ..., -0.01776878,
        -0.08600824,  0.03477053],
       [ 0.00519345,  0.06998067,  0.04389189, ...,  0.00185938,
        -0.06818746,  0.02950338],
       ...,
       [-0.02150729,  0.0653121 ,  0.0278233 , ...,  0.06458346,
        -0.09412916, -0.01395573],
       [ 0.05143762, -0.00100844,  0.01426676, ...,  0.05077499,
        -0.08185079, -0.04849487],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [0]:
# Save the padded embeddings of every fake article.
# The slice uses the actual article count: a fixed [:1992] stops one short and
# drops the article at index 1992, which holds real embeddings.
np.save('drive/My Drive/CSV/emb/gm_fake_art_emb.npy',fake_eb[:len(locc_f)])
print('DONE')

DONE


In [0]:
# For Real articles
%%time
# get flatten embeddings
emb_l = []
for i in range(100,2100,100):
  g = get_sentence_embeddings(flat_r[np.sum(locc_r[:i-100],dtype=int):np.sum(locc_r[:i],dtype=int)])
  emb_l.extend(g)
  print('Done - ',i)

kr = np.array(emb_l)
# store article wise
# real
print('Storing article wise')
real_eb = np.zeros((2005,50,512))
for i in range(2005):
  real_eb[i] = fix(kr[np.sum(locc_r[:i],dtype=int):np.sum(locc_r[:i+1],dtype=int)])
  if i % 1000 == 0:
    print(i)



Done -  500
Done -  600
Done -  700
Done -  800
Done -  900
Done -  1000
Done -  1100
Done -  1200
Done -  1300
Done -  1400
Done -  1500
Done -  1600
Done -  1700
Done -  1800
Done -  1900
Done -  2000
Storing article wise
0
1000
2000
CPU times: user 7min 48s, sys: 38 s, total: 8min 26s
Wall time: 9min 5s


In [0]:
# Sentence count of the last real article.
locc_r[-1]

40

In [0]:
# Embedding matrix of the real article at index 1900 (trailing zero rows are padding from fix()).
real_eb[1900]

array([[-0.00738448,  0.06155591, -0.01431142, ...,  0.04967426,
        -0.0654976 , -0.0503073 ],
       [ 0.03444352,  0.02450357, -0.02920063, ...,  0.0609739 ,
        -0.06556787, -0.02084291],
       [-0.05341914,  0.05327339, -0.02250584, ...,  0.00187581,
        -0.06644455, -0.02493873],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [0]:
# Save the real-article embeddings.
# NOTE(review): only the first 1900 articles are written although real_eb is
# allocated with more rows - confirm that this truncation is intended.
np.save('drive/My Drive/CSV/emb/gm_real_art_emb.npy',real_eb[:1900])
print('DONE')

DONE


In [0]:
# sanity check: element-wise difference between the first real and the first
# fake article matrices; non-zero rows show the two arrays hold different data.
real_eb[0] - fake_eb[0]

array([[-0.01181953, -0.0329638 , -0.0759801 , ...,  0.01908643,
        -0.03077014, -0.02270482],
       [-0.04236378,  0.04218095, -0.03740076, ..., -0.03504987,
        -0.07494948, -0.07400515],
       [-0.02795526, -0.02811907, -0.09401192, ..., -0.00776716,
        -0.06690258, -0.11275351],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

## Done