In [36]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [37]:
!pip install np_utils

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting np_utils
  Downloading np_utils-0.6.0.tar.gz (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 KB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: np_utils
  Building wheel for np_utils (setup.py) ... [?25l[?25hdone
  Created wheel for np_utils: filename=np_utils-0.6.0-py3-none-any.whl size=56460 sha256=0325bca05c9967892883f936f2065ce6889d3674841ab04d5be32aae82b7ab0b
  Stored in directory: /root/.cache/pip/wheels/65/07/4b/1c96f437e1bec60b3d2acd0b81d0a7969505f5251efbb5a060
Successfully built np_utils
Installing collected packages: np_utils
Successfully installed np_utils-0.6.0


In [38]:
import numpy as np
import pandas as pd
from plotly.offline import iplot
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))
from tqdm import tqdm

In [39]:
politifact = pd.read_json(r'/content/politifact.json')
snopes = pd.read_json(r'/content/snopes.json')

In [40]:
# column names
poli_cols = politifact.columns.to_list()
snop_cols = snopes.columns.to_list()

poli_row_count, poli_col_count = politifact.shape
snop_row_count, snop_col_count = snopes.shape

In [41]:
common_cols = set(poli_cols) & set(snop_cols)

td = pd.concat([snopes[list(common_cols)], politifact[list(common_cols)]])
td = td.reindex(columns = ['doc', 'claim', 'factchecker', 'url', 'sources', 'topic', 'published' ,'label' ])


In [42]:
td2 =td.dropna(subset=['doc','claim','label'])

In [43]:
print(td2.shape, td.shape)

(22298, 8) (22298, 8)


In [48]:
td2['label']=td2.label.map(lambda x: x.lower())

In [51]:
valid_labels = ['false','true','half-true','mostly-true','barely-true','pants-fire','mixture']

td2 = td2[td2.label.isin(valid_labels)]

In [55]:
td2 = td2.reset_index(drop=True)
REPLACE_BY_SPACE_RE = re.compile('[/(){}\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = text.replace('x', '')
#    text = re.sub(r'\W+', '', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text
td2['claim'] = td2['claim'].apply(clean_text)
td2['doc'] = td2['doc'].apply(clean_text)

In [57]:
td2['body'] = td2['doc'].str.cat(td2['claim'], sep=' ')

In [59]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(td2['body'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 223445 unique tokens.


In [60]:
X = tokenizer.texts_to_sequences(td2['body'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (20874, 250)


In [61]:
Y = pd.get_dummies(td2['label']).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (20874, 7)


In [62]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(18786, 250) (18786, 7)
(2088, 250) (2088, 7)


In [65]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(7, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 250, 100)          5000000   
                                                                 
 spatial_dropout1d_2 (Spatia  (None, 250, 100)         0         
 lDropout1D)                                                     
                                                                 
 lstm_4 (LSTM)               (None, 100)               80400     
                                                                 
 dense_4 (Dense)             (None, 7)                 707       
                                                                 
Total params: 5,081,107
Trainable params: 5,081,107
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
epochs = 5
batch_size = 64

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
     

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
 31/265 [==>...........................] - ETA: 3:25 - loss: 0.0663 - accuracy: 0.9839

In [15]:
embedding_vector_fetures = 64
vocab_size = 5000

model = Sequential()
model.add(Embedding(vocab_size,embedding_vector_fetures,input_length= sent_len))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(24, activation = 'softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 300, 64)           320000    
                                                                 
 lstm_1 (LSTM)               (None, 100)               66000     
                                                                 
 dense_1 (Dense)             (None, 24)                2424      
                                                                 
Total params: 388,424
Trainable params: 388,424
Non-trainable params: 0
_________________________________________________________________
None


In [26]:
model.fit(x_train, final_y_train, epochs=1, batch_size=32, validation_data=(x_test,final_y_test))

InvalidArgumentError: ignored