# Importing all required packages

In [None]:
# Silence warning messages by swapping warnings.warn for a no-op
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing (stand-in for warnings.warn)."""
    return None


warnings.warn = warn

# Handle table-like data and matrices
import numpy as np
import pandas as pd

# Computations
import itertools

import gensim

# Modelling Algorithms
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Modelling Helpers
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.model_selection import KFold, cross_val_score

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Conv1D, MaxPool1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score


import seaborn as sns
import nltk
import re
from wordcloud import WordCloud

# Visualization
import matplotlib.pyplot as plt

# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

# Reading Input Data from CSV Files

In [None]:
# Read the training, test and validation files (tab-separated, no header row)
tsv_options = dict(delimiter='\t', encoding='utf-8', header=None)
train = pd.read_csv('./liar_plus_dataset/dataset/tsv/train2.tsv', **tsv_options)
test = pd.read_csv('./liar_plus_dataset/dataset/tsv/test2.tsv', **tsv_options)
valid = pd.read_csv('./liar_plus_dataset/dataset/tsv/val2.tsv', **tsv_options)

# The same header is applied to all three frames
column_names = [
    'values', 'id', 'label', 'statement', 'subject', 'speaker', 'job',
    'state', 'party', 'barely_true_c', 'false_c', 'half_true_c',
    'mostly_true_c', 'pants_on_fire_c', 'venue', 'extracted_justification',
]
for frame in (train, test, valid):
    frame.columns = column_names

## Appending Train and Validation Sets to Df

In [None]:
# Stack the training and validation frames, then shuffle the rows with a
# fixed seed. pd.concat is used because DataFrame.append was removed in
# pandas 2.0.
df = pd.concat([train, valid]).sample(frac=1, random_state=1)
# Renumber the shuffled rows 0..n-1
df = df.reset_index(drop=True)

In [None]:
# df

### Merging 'Statement' + 'Justification' Columns for Analysis 

In [None]:
# Merge each statement with its justification into one text column,
# separated by a single space
statement_text = df['statement']
justification_text = df['extracted_justification']
df['total'] = statement_text + ' ' + justification_text
df.head()

In [None]:
# Same statement + justification merge for the test frame
test_statement_text = test['statement']
test_justification_text = test['extracted_justification']
test['total'] = test_statement_text + ' ' + test_justification_text
test.head()

### Dropping Null Values

In [None]:
# Keep only rows that have both a label and the merged 'total' text.
# A bare dropna() would also discard every row with a missing value in an
# unrelated metadata column (job, state, venue, ...); those columns are not
# used by the modelling code in this notebook, so dropping on them only
# throws away usable examples.
required_columns = ['label', 'total']
df = df.dropna(subset=required_columns)
test = test.dropna(subset=required_columns)

In [None]:
# df[df['label'].isna()]

### Transforming Classification Labels for Binary Analysis

In [None]:
# Ordinal truthfulness scale: 0 = 'true' ... 5 = 'pants-fire'
truth_ = {'pants-fire':5, 'false':4, 'barely-true':3, 'half-true':2, 'mostly-true':1, 'true':0}

# The models in this notebook end in a single sigmoid unit trained with
# binary_crossentropy, and precision/recall/F1 are computed with scikit-learn's
# default binary averaging, so lblClass has to be 0/1 rather than 0..5.
# NOTE(review): the cut-off below (barely-true and worse count as fake = 1)
# is an assumption; confirm the intended binary split.
_FAKE_FROM = truth_['barely-true']


def _to_binary_label(label):
    """Return 1 for labels at or beyond 'barely-true' on the scale, else 0.

    Raises KeyError for a label that is not in ``truth_``.
    """
    return int(truth_[label] >= _FAKE_FROM)


df['lblClass'] = df['label'].apply(_to_binary_label)
test['lblClass'] = test['label'].apply(_to_binary_label)

In [None]:
# df.info()
# test.info()

In [None]:
# df.isnull().sum()
# test.isnull().sum()

### 5-fold cross validation

In [None]:
# Shuffled 5-fold splitter with a fixed seed.
# NOTE(review): cv is not referenced again in the visible part of the notebook.
cv = KFold(n_splits=5, random_state=1,shuffle=True)

### Selecting Inputs and Labels from the Train and Test Sets

In [None]:
# Text inputs and labels taken from the train+validation frame and the test frame
X_train, y_train = df['total'], df['lblClass']
X_test, y_test = test['total'], test['lblClass']

### Removing Stop Words

In [None]:
# Labels as a plain array
y = df["lblClass"].values

# English stop words and a tokenizer that keeps runs of word characters
stop_words = set(nltk.corpus.stopwords.words("english"))
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

# One token list per document: split into sentences, lower-case, tokenize,
# then drop stop words and single-character tokens
X = [
    [
        token.strip()
        for sentence in nltk.sent_tokenize(paragraph)
        for token in tokenizer.tokenize(sentence.lower())
        if len(token) > 1 and token not in stop_words
    ]
    for paragraph in df["total"].values
]

### Word2Vec Initialization

In [None]:
# Width of the word vectors to learn
EMBEDDING_DIM = 100

# Train Word2Vec on the tokenized documents (this can take a while)
w2v_model = gensim.models.Word2Vec(
    sentences=X,
    vector_size=EMBEDDING_DIM,
    window=5,
    min_count=1,
)

In [None]:
# len(w2v_model.wv.key_to_index)

### Word Tokenization for Input Data

In [None]:
# Rebind `tokenizer` to a Keras Tokenizer (it previously held the NLTK
# RegexpTokenizer) and fit it on the token lists in X
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

# Replace each token list in X with the tokenizer's integer sequence for it
X = tokenizer.texts_to_sequences(X)

In [None]:
# Word -> integer index mapping held by the fitted tokenizer
word_index = tokenizer.word_index

In [None]:
# Number of entries in the word index (shown as the cell output)
len(word_index)

### Padding / Chopping Data to max length of 500

In [None]:
# Target length for every sequence: shorter ones get padded, longer ones truncated
maxlen = 500 

# Bring every sequence in X to exactly maxlen entries
X = pad_sequences(X, maxlen=maxlen)

Defining the size of the vocabulary

In [None]:
# One more than the number of indexed words: the Keras Tokenizer does not
# assign index 0 to any word, so the embedding needs an extra row for it
vocab_size = len(tokenizer.word_index) + 1

### Function to create weight matrix from word2vec gensim model

In [1]:

def get_weight_matrix(model, vocab):
    """Build an embedding weight matrix aligned with a word -> index mapping.

    Args:
        model: Object exposing word vectors as ``model.wv`` (for example a
            trained gensim Word2Vec model). ``model.wv`` must support
            ``word in wv``, ``wv[word]`` and ``wv.vector_size``.
        vocab: Mapping of word -> integer row index, such as the Keras
            Tokenizer's ``word_index``. Indices are assumed to lie in
            ``1..len(vocab)``.

    Returns:
        A NumPy array of shape ``(len(vocab) + 1, model.wv.vector_size)``.
        Row ``i`` holds the vector of the word mapped to ``i``; rows with no
        word or no vector in the model stay all-zero.
    """
    keyed_vectors = model.wv
    # Take the width from the model itself rather than the EMBEDDING_DIM
    # global, which the notebook later rebinds to a different size.
    weight_matrix = np.zeros((len(vocab) + 1, keyed_vectors.vector_size))

    for word, i in vocab.items():
        # Words the model has no vector for keep their zero row instead of
        # aborting the whole build with a KeyError.
        if word in keyed_vectors:
            weight_matrix[i] = keyed_vectors[word]
    return weight_matrix

In [None]:
# #print(word_index)
# for word, i in word_index.items():
#     print(word)
#     weight_matrix[i] = w2v_model.wv[word]

Creating Embedding Vectors

In [None]:
# Weight matrix with one row per tokenizer index, filled from the Word2Vec model
embedding_vectors = get_weight_matrix(w2v_model, word_index)

In [None]:
# Number of rows in the weight matrix (shown as the cell output)
len(embedding_vectors)

## Defining LSTM Model

In [None]:

# Non-trainable embedding layer initialised from the Word2Vec weight matrix
embedding_layer = Embedding(
    vocab_size,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_vectors],
    input_length=maxlen,
    trainable=False,
)

# Embedding -> LSTM -> single sigmoid output
model = Sequential([
    embedding_layer,
    LSTM(units=128),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

### Train-Test 80-20 Split

In [None]:
# Hold out 20% of the padded sequences for testing (fixed seed). This rebinds
# X_train/X_test/y_train/y_test, replacing the earlier column selections.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0) 

### Evaluating LSTM

In [None]:
# Train for 6 epochs, holding out 30% of the training data for validation
model.fit(X_train, y_train, validation_split=0.3, epochs=6)

In [None]:
# Threshold the model outputs at 0.5 to get 0/1 predictions
predicted_probabilities = model.predict(X_test)
y_pred = (predicted_probabilities >= 0.5).astype("int")

### Defining Performance Evaluation Metrics

In [None]:
def precision_recall(true_label, predicted_label):
    """Print precision, recall, accuracy and F1 score for a set of predictions.

    Each metric is computed by the corresponding scikit-learn function with
    its default settings and printed on its own line. Nothing is returned.

    Args:
        true_label: Ground-truth labels.
        predicted_label: Predicted labels, aligned with ``true_label``.
    """
    # (output template, scoring function), in the order they are reported
    report = (
        ('Precision: %f', precision_score),
        ('Recall: %f', recall_score),
        ('Accuracy: %f', metrics.accuracy_score),
        ('F1 Score: %f', metrics.f1_score),
    )
    for template, scorer in report:
        print(template % scorer(true_label, predicted_label))

In [None]:
# Report the metrics of the Word2Vec-embedding LSTM on the held-out split
precision_recall(y_test, y_pred)

## Using Pre Trained Google News Word-Vectors

In [None]:
from gensim.models.keyedvectors import KeyedVectors

In [None]:
# Load the pre-trained Google News vectors (binary word2vec format)
word_vectors = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)
# Take the embedding width from the loaded vectors instead of hard-coding 300,
# so the embedding matrix always matches the vector file that was read
EMBEDDING_DIM = word_vectors.vector_size

### Creating Embedding Matrix

In [None]:
# One row per tokenizer index; rows no word maps to stay zero
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for token, row in word_index.items():
    try:
        vector = word_vectors[token]
    except KeyError:
        # No pre-trained vector for this word: draw a random one (std = sqrt(0.25))
        vector = np.random.normal(0, np.sqrt(0.25), EMBEDDING_DIM)
    embedding_matrix[row] = vector

In [None]:
# Number of training labels
print(len(y_train))

### Defining CNN-LSTM Model

In [None]:
# Non-trainable embedding layer initialised from the pre-trained embedding matrix
pretrained_embedding = Embedding(
    vocab_size,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=False,
)

# Embedding -> Conv1D -> max pooling -> LSTM -> single sigmoid output
model = Sequential([
    pretrained_embedding,
    Conv1D(activation='relu', filters=4, kernel_size=4),
    MaxPool1D(),
    LSTM(units=128),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [None]:
# Train for 12 epochs, holding out 30% of the training data for validation
model.fit(X_train, y_train, validation_split=0.3, epochs=12)

In [None]:
# Threshold the model outputs at 0.5 to get 0/1 predictions
predicted_probabilities = model.predict(X_test)
y_pred = (predicted_probabilities >= 0.5).astype("int")

In [None]:
# Report the metrics of the pre-trained-embedding model on the held-out split
precision_recall(y_test, y_pred)