In [1]:
import pandas as pd
import tensorflow as tf
import os
import re
import json
import numpy as np
from string import punctuation
from zipfile import ZipFile
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.linear_model import LogisticRegression
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# importing neural network libraries
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, GRU, LSTM, RNN, SpatialDropout1D, Concatenate, Bidirectional

In [3]:
train_tweets = []
validation_tweets = []
test_tweets = []

for line in open(r'C:\Users\Dell\Desktop\train.json','r'):
    train_tweets.append(json.loads(line))
    
for line in open(r'C:\Users\Dell\Desktop\val.json','r'):
    validation_tweets.append(json.loads(line))

for line in open(r'C:\Users\Dell\Desktop\test.json', 'r'):
    test_tweets.append(json.loads(line))   

In [4]:
train_data = train_tweets.copy()

In [5]:
train_data = pd.DataFrame(train_data)
validation_df = pd.DataFrame(validation_tweets)
test_df = pd.DataFrame(test_tweets)

In [6]:
train_data.columns

Index(['claim', 'id', 'json_file_id', 'justification', 'label', 'originator',
       'party', 'title', 'topics'],
      dtype='object')

In [7]:
len(train_data)

10269

In [8]:
# swc_y_train = train_data['label']
# swc_y_validation = validation_df['label']
# swc_y_test = test_df['label']

In [9]:
# swc_y_train

In [8]:
#train data
train_data['new'] = train_data['claim'].map(str) + train_data['justification'].map(str)
for i in range(len(train_data['label'])):
    train_data['label'].replace(['mostly-true','half-true'], 'true', regex=True, inplace=True)
    train_data['label'].replace(['barely-true','pants-fire'], 'false', regex=True, inplace=True)
    
y_train = pd.get_dummies(train_data['label'], drop_first=True)

#validation data 
for i in range(len(validation_df['label'])):
    validation_df['label'].replace(['mostly-true','half-true'], 'true', regex=True, inplace=True)
    validation_df['label'].replace(['barely-true','pants-fire'], 'false', regex=True, inplace=True)

y_validation = pd.get_dummies(validation_df['label'], drop_first=True)

#test data
for i in range(len(test_df['label'])):
    test_df['label'].replace(['mostly-true','half-true'], 'true', regex=True, inplace=True)
    test_df['label'].replace(['barely-true','pants-fire'], 'false', regex=True, inplace=True)
    
y_test = pd.get_dummies(test_df['label'], drop_first=True)

In [9]:
#combining claim and justification
train_data['new'] = train_data['claim'].map(str) + train_data['justification'].map(str)
validation_df['new'] = validation_df['claim'].map(str) + validation_df['justification'].map(str)
test_df['new'] = test_df['claim'].map(str) + test_df['justification'].map(str)

In [10]:
train_data.columns

Index(['claim', 'id', 'json_file_id', 'justification', 'label', 'originator',
       'party', 'title', 'topics', 'new'],
      dtype='object')

In [11]:
len(train_data['new'])

10269

In [12]:
length = []
for text in train_data['new']:
    length.append(len(str(text))) 
train_data['length'] = length
train_data.head()

Unnamed: 0,claim,id,json_file_id,justification,label,originator,party,title,topics,new,length
0,Says the Annies List political group supports ...,0,2635.json,That's a premise that he fails to back up. Ann...,False,dwayne-bohac,Texas,State representative,[abortion],Says the Annies List political group supports ...,331
1,When did the decline of coal start? It started...,1,10540.json,"""Surovell said the decline of coal """"started w...",True,scott-surovell,Virginia,State delegate,"[energy, history, job-accomplishments]",When did the decline of coal start? It started...,783
2,"""Hillary Clinton agrees with John McCain """"by ...",2,324.json,"""Obama said he would have voted against the am...",True,barack-obama,Illinois,President,[foreign-policy],"""Hillary Clinton agrees with John McCain """"by ...",521
3,Health care reform legislation is likely to ma...,3,1123.json,"""The release may have a point that Mikulskis c...",False,blog-posting,,,[health-care],Health care reform legislation is likely to ma...,711
4,The economic turnaround started at the end of ...,4,9028.json,"""Crist said that the economic """"turnaround sta...",True,charlie-crist,Florida,,"[economy, jobs]",The economic turnaround started at the end of ...,618


In [13]:
min(train_data['length']), max(train_data['length']), round(sum(train_data['length'])/len(train_data['length']))

(32, 9579, 535)

In [14]:
max_features = 520

TOKENIZING FOR CLAIMS

In [15]:
#training data
tokenizer = Tokenizer(num_words = max_features, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower = True, split = ' ')
tokenizer.fit_on_texts(texts = train_data['claim'])
X_claim = tokenizer.texts_to_sequences(texts = train_data['claim'])
X_claim = pad_sequences(sequences = X_claim, maxlen = max_features, padding = 'pre')

#validation data
X_claim_validation = tokenizer.texts_to_sequences(texts = validation_df['claim'])
X_claim_validation = pad_sequences(sequences = X_claim_validation, maxlen = max_features, padding = 'pre')

#testing data
X_claim_test = tokenizer.texts_to_sequences(texts = test_df['claim'])
X_claim_test = pad_sequences(sequences = X_claim_test, maxlen = max_features, padding = 'pre')


In [16]:
print(X_claim.shape)
print(y_train.shape)
print(X_claim_validation.shape)
print(y_validation.shape)
print(X_claim_test.shape)
print(y_test.shape)

(10269, 520)
(10269, 1)
(1284, 520)
(1284, 1)
(1283, 520)
(1283, 1)


In [20]:
#BiLSTM model

bilstm_model = Sequential(name = 'lstm_nn_model')
bilstm_model.add(layer = Embedding(input_dim = max_features, output_dim = 120, name = '1st_layer'))
bilstm_model.add(layer = Bidirectional(LSTM(units = 120, dropout = 0.2, recurrent_dropout = 0.2, name = '2nd_layer')))
bilstm_model.add(layer = Dropout(rate = 0.5, name = '3rd_layer'))
bilstm_model.add(layer = Dense(units = 100,  activation = 'softmax', name = '4th_layer'))
bilstm_model.add(layer = Dropout(rate = 0.5, name = '5th_layer'))
bilstm_model.add(layer = Dense(units = len(set(y_train)),  activation = 'sigmoid', name = 'output_layer'))
# compiling the model
bilstm_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [21]:
lstm_model_val_lstm_model_fit = bilstm_model.fit(X_claim, y_train,validation_split=0.1, batch_size= 64,epochs = 1)



In [22]:
#bidirectional
accr1 = bilstm_model.evaluate(X_claim_validation,y_validation)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr1[0],accr1[1]))
accr2 = bilstm_model.evaluate(X_claim_test,y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr2[0],accr2[1]))

Test set
  Loss: 0.695
  Accuracy: 0.520
Test set
  Loss: 0.684
  Accuracy: 0.567


In [20]:
#LSTM model

lstm_model = Sequential(name = 'lstm_nn_model')
lstm_model.add(layer = Embedding(input_dim = max_features, output_dim = 120, name = '1st_layer'))
lstm_model.add(layer = LSTM(units = 120, dropout = 0.2, recurrent_dropout = 0.2, name = '2nd_layer'))
lstm_model.add(layer = Dropout(rate = 0.5, name = '3rd_layer'))
lstm_model.add(layer = Dense(units = 100,  activation = 'softmax', name = '4th_layer'))
lstm_model.add(layer = Dropout(rate = 0.5, name = '5th_layer'))
lstm_model.add(layer = Dense(units = len(set(y_train)),  activation = 'sigmoid', name = 'output_layer'))
# compiling the model
lstm_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [24]:
claim_val_lstm_model_fit = lstm_model.fit(X_claim, y_train,validation_split=0.1, batch_size= 64,epochs = 1)



In [25]:
accr1 = lstm_model.evaluate(X_claim_validation,y_validation)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr1[0],accr1[1]))
accr2 = lstm_model.evaluate(X_claim_test,y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr2[0],accr2[1]))

Test set
  Loss: 0.695
  Accuracy: 0.520
Test set
  Loss: 0.684
  Accuracy: 0.567


TFIDF FOR CLAIMS

In [17]:
#only for claims

stop_words = set(stopwords.words("english"))
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer() 

claim_train_corpus = []
for i in range(0, len(train_data['claim'])):
    new_data = re.sub('[^a-zA-Z]'," ", train_data['claim'][i])
    new_data = new_data.lower()
    new_data = new_data.split()
    
    new_data = [lemmatizer.lemmatize(word) for word in new_data if not word in stop_words]
    new_data = ' '.join(new_data)
    claim_train_corpus.append(new_data)
    
claim_val_corpus = []
for i in range(0, len(validation_df['claim'])):
    new_data = re.sub('[^a-zA-Z]'," ", validation_df['claim'][i])
    new_data = new_data.lower()
    new_data = new_data.split()
    
    new_data = [lemmatizer.lemmatize(word) for word in new_data if not word in stop_words]
    new_data = ' '.join(new_data)
    claim_val_corpus.append(new_data)
    
claim_test_corpus = []
for i in range(0, len(test_df['claim'])):
    new_data = re.sub('[^a-zA-Z]'," ", test_df['claim'][i])
    new_data = new_data.lower()
    new_data = new_data.split()
    
    new_data = [lemmatizer.lemmatize(word) for word in new_data if not word in stop_words]
    new_data = ' '.join(new_data)
    claim_test_corpus.append(new_data)

In [18]:
#tfidf
vectorizer = TfidfVectorizer(max_features = 700)
tfidf_x_train = vectorizer.fit_transform(claim_train_corpus).toarray()
tfidf_x_train = pad_sequences(sequences = tfidf_x_train, maxlen = max_features, padding = 'pre')

#tfidf validation data
tfidf_x_val = vectorizer.transform(claim_val_corpus).toarray()
tfidf_x_val = pad_sequences(sequences = tfidf_x_val, maxlen = max_features, padding = 'pre')
# tfidf_claim_val_lstm_model_fit = lstm_model.fit(tfidf_x_train, y_train,
#                                                 validation_data = (tfidf_x_val,y_validation), epochs = 1)

#tfidf test data
tfidf_x_test = vectorizer.transform(claim_test_corpus).toarray()
tfidf_x_test = pad_sequences(sequences = tfidf_x_test, maxlen = max_features, padding = 'pre')
#tfidf_claim_test_lstm_model_fit = lstm_model.fit(tfidf_x_train, y_train,
#                                                validation_data = (tfidf_x_test,y_test), epochs = 1)

In [21]:
tfidf_claim_val_lstm_model_fit = lstm_model.fit(tfidf_x_train, y_train,validation_split=0.1, batch_size= 64,epochs = 1)



In [22]:
accr1 = lstm_model.evaluate(tfidf_x_val,y_validation)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr1[0],accr1[1]))
accr2 = lstm_model.evaluate(tfidf_x_test,y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr2[0],accr2[1]))

Test set
  Loss: 0.694
  Accuracy: 0.520
Test set
  Loss: 0.685
  Accuracy: 0.567


In [32]:
#bidirectional

lstm_model_val_lstm_model_fit = bilstm_model.fit(tfidf_x_train, y_train,validation_split=0.1, batch_size= 64,epochs = 1)




TOKENIZE FOR NEW

In [23]:
#train
tokenizer = Tokenizer(num_words = max_features, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower = True, split = ' ')
tokenizer.fit_on_texts(texts = train_data['new'])
X_train = tokenizer.texts_to_sequences(texts = train_data['new'])
X_train = pad_sequences(sequences = X_train, maxlen = max_features, padding = 'pre')

#validation
X_validation = tokenizer.texts_to_sequences(texts = validation_df['new'])
X_validation = pad_sequences(sequences = X_validation, maxlen = max_features, padding = 'pre')

#test
test_text = tokenizer.texts_to_sequences(texts = test_df['new'])
test_text = pad_sequences(sequences = test_text, maxlen = max_features, padding = 'pre')

In [34]:
tfidf_claim_val_lstm_model_fit = lstm_model.fit(X_train, y_train,validation_split=0.1, batch_size= 10,epochs = 3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [35]:
accr1 = lstm_model.evaluate(X_validation,y_validation)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr1[0],accr1[1]))
accr2 = lstm_model.evaluate(test_text,y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr2[0],accr2[1]))

Test set
  Loss: 0.698
  Accuracy: 0.520
Test set
  Loss: 0.684
  Accuracy: 0.567


In [None]:
# val_lstm_model_fit = lstm_model.fit(X_train, y_train, validation_data = (X_validation,y_validation), epochs = 1)
# test_lstm_model_fit = lstm_model.fit(X_train, y_train, validation_data = (test_text,y_test), epochs = 1)

In [36]:
# print(val_lstm_model_fit.history['accuracy'])
# print(val_lstm_model_fit.history['loss'])
# print(val_lstm_model_fit.history['val_accuracy'])
# print(val_lstm_model_fit.history['val_loss'])

NameError: name 'val_lstm_model_fit' is not defined

TFIDF FOR NEW

In [24]:
#tfidf for new
new_train_corpus = []
for i in range(0, len(train_data['new'])):
    new_data = re.sub('[^a-zA-Z]'," ", train_data['new'][i])
    new_data = new_data.lower()
    new_data = new_data.split()
    
    new_data = [lemmatizer.lemmatize(word) for word in new_data if not word in stop_words]
    new_data = ' '.join(new_data)
    new_train_corpus.append(new_data)
    
new_validation_corpus = []
for i in range(0, len(validation_df['new'])):
    new_data = re.sub('[^a-zA-Z]'," ", validation_df['new'][i])
    new_data = new_data.lower()
    new_data = new_data.split()
    
    new_data = [lemmatizer.lemmatize(word) for word in new_data if not word in stop_words]
    new_data = ' '.join(new_data)
    new_validation_corpus.append(new_data)
    
new_test_corpus = []
for i in range(0, len(test_df['new'])):
    new_data = re.sub('[^a-zA-Z]'," ", test_df['new'][i])
    new_data = new_data.lower()
    new_data = new_data.split()
    
    new_data = [lemmatizer.lemmatize(word) for word in new_data if not word in stop_words]
    new_data = ' '.join(new_data)
    new_test_corpus.append(new_data)

In [25]:
#tfidf
vectorizer = TfidfVectorizer(max_features = 700)
tfidf_x_train_new = vectorizer.fit_transform(new_train_corpus).toarray()
tfidf_x_train_new = pad_sequences(sequences = tfidf_x_train_new, maxlen = max_features, padding = 'pre')

#tfidf validation data
tfidf_x_val_new = vectorizer.transform(new_validation_corpus).toarray()
tfidf_x_val_new = pad_sequences(sequences = tfidf_x_val_new, maxlen = max_features, padding = 'pre')
#tfidf_new_val_lstm_model_fit = lstm_model.fit(tfidf_x_train_new, y_train,
#                                               validation_data = (tfidf_x_val_new,y_validation), epochs = 1)

#tfidf test data
tfidf_x_test_new = vectorizer.transform(new_test_corpus).toarray()
tfidf_x_test_new = pad_sequences(sequences = tfidf_x_test_new, maxlen = max_features, padding = 'pre')
#tfidf_new_test_lstm_model_fit = lstm_model.fit(tfidf_x_train_new, y_train,
#                                                validation_data = (tfidf_x_test_new,y_test), epochs = 1)

In [26]:
tfidf_claim_val_lstm_model_fit = lstm_model.fit(tfidf_x_train_new, y_train,validation_split=0.1, batch_size= 10,epochs = 3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [27]:
accr1 = lstm_model.evaluate(tfidf_x_val_new,y_validation)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr1[0],accr1[1]))
accr2 = lstm_model.evaluate(tfidf_x_test_new,y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr2[0],accr2[1]))

Test set
  Loss: 0.694
  Accuracy: 0.520
Test set
  Loss: 0.685
  Accuracy: 0.567
