In [1]:
# One time installation of necessary libraries
#!pip install spacy
#!pip install tensorflow, keras
#!pip install nltk
#!pip install gensim

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Importing all necessary libraries
import pandas as pd
import numpy as np
from keras.preprocessing import sequence
from keras.layers import TimeDistributed, GlobalAveragePooling1D, GlobalAveragePooling2D, BatchNormalization
from keras.layers.recurrent import LSTM
from keras.layers.convolutional import Conv1D, MaxPooling1D, Conv2D, MaxPooling2D, AveragePooling1D
from keras.layers import Dropout, Flatten, Bidirectional, Dense, Activation, TimeDistributed
from keras.models import Model, Sequential
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from string import ascii_lowercase
from collections import Counter
from gensim.models import Doc2Vec
from gensim.models import doc2vec
from gensim.models import KeyedVectors
import itertools, nltk, snowballstemmer, re
from gensim.models.doc2vec import TaggedDocument
from gensim.models import KeyedVectors
TaggedDocument = doc2vec.TaggedDocument
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Reading data into a dataframe
df = pd.read_csv("/content/drive/MyDrive/Practicum/Amazon Reviews Dataset.csv")

In [5]:
# Converting polarity and deceptive field to binary
df['polarity'] = np.where(df['polarity']=='positive', 1, 0)
df['deceptive'] = np.where(df['deceptive']=='truthful', 1, 0)

In [6]:
# Classes and functions used for text vectorization
class LabeledLineSentence(object):
    def __init__(self, sources):
        self.sources = sources
        
        flipped = {}
        
        # Checking uniqueness of the key
        for key, value in sources.items():
            if value not in flipped:
                flipped[value] = [key]
            else:
                raise Exception('Non-unique prefix encountered')
    
    def __iter__(self):
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield TaggedDocument(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])
    
    def to_array(self):
        self.sentences = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    self.sentences.append(TaggedDocument(utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
        return self.sentences
    
    def sentences_perm(self):
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled

def create_class(c):
    if c['polarity'] == 1 and c['deceptive'] == 1:
        return [1,1]
    elif c['polarity'] == 1 and c['deceptive'] == 0:
        return [1,0]
    elif c['polarity'] == 0 and c['deceptive'] == 1:
        return [0,1]
    else:
        return [0,0]
    
def specific_class(c):
    if c['polarity'] == 1 and c['deceptive'] == 1:
        return "TRUE_POSITIVE"
    elif c['polarity'] == 1 and c['deceptive'] == 0:
        return "FALSE_POSITIVE"
    elif c['polarity'] == 0 and c['deceptive'] == 1:
        return "TRUE_NEGATIVE"
    else:
        return "FALSE_NEGATIVE"
    



df['final_class'] = df.apply(create_class, axis=1)
df['given_class'] = df.apply(specific_class, axis=1)

In [7]:
Y =df['final_class'].astype("string")

# Encoding class values as integers
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)

# Converting integers to dummy variables
dummy_y = np_utils.to_categorical(encoded_Y)

In [8]:
# Treating each review as a document
textData = pd.DataFrame(list(df['text']))

In [9]:
# Initializing stemmer
stemmer = snowballstemmer.EnglishStemmer()

# Getting stopword list
stop = stopwords.words('english')

# Extending stopword list
stop.extend(['may','also','zero','one','two','three','four','five','six','seven','eight','nine','ten','across','among','beside','however','yet','within']+list(ascii_lowercase))
stoplist = stemmer.stemWords(stop)

# Converting it into a set for later
stoplist = set(stoplist)
stop = set(sorted(stop + list(stoplist))) 

In [10]:
# Removing characters and stoplist words
textData[0].replace('[!"#%\'()*+,-./:;<=>?@\[\]^_`{|}~1234567890’”“′‘\\\]',' ',inplace=True,regex=True)

# Generating dictionary of unique words
wordlist = filter(None, " ".join(list(set(list(itertools.chain(*textData[0].str.split(' ')))))).split(" "))
df['stemmed_text_data'] = [' '.join(filter(None,filter(lambda word: word not in stop, line))) for line in textData[0].str.lower().str.split(' ')]

In [11]:
# Removing all words that are appearing less frequently
minimum_count = 1
str_frequencies = pd.DataFrame(list(Counter(filter(None,list(itertools.chain(*df['stemmed_text_data'].str.split(' '))))).items()),columns=['word','count'])
low_frequency_words = set(str_frequencies[str_frequencies['count'] < minimum_count]['word'])
df['stemmed_text_data'] = [' '.join(filter(None,filter(lambda word: word not in low_frequency_words, line))) for line in df['stemmed_text_data'].str.split(' ')]
df['stemmed_text_data'] = [" ".join(stemmer.stemWords(re.sub('[!"#%\'()*+,-./:;<=>?@\[\]^_`{|}~1234567890’”“′‘\\\]',' ', next_text).split(' '))) for next_text in df['stemmed_text_data']]    

In [12]:
# Lemmatizing the text data
lmtzr = WordNetLemmatizer()
w = re.compile("\w+",re.I)

def label_sentences(df, input_point):
    labeled_sentences = []
    list_sen = []
    for index, datapoint in df.iterrows():
        tokenized_words = re.findall(w,datapoint[input_point].lower())
        labeled_sentences.append(TaggedDocument(words=tokenized_words, tags=['SENT_%s' %index]))
        list_sen.append(tokenized_words)
    return labeled_sentences, list_sen

def train_doc2vec_model(labeled_sentences):
    model = Doc2Vec(min_count=1, window=9, vector_size=512, sample=1e-4, negative=5, workers=7)
    model.build_vocab(labeled_sentences)
    pretrained_weights = model.wv.vectors
    vocab_size, embedding_size = pretrained_weights.shape
    model.train(labeled_sentences, total_examples=vocab_size, epochs=400)
    
    return model



In [13]:
textData = df['stemmed_text_data'].to_frame().reset_index()
sen, corpus = label_sentences(textData, 'stemmed_text_data')

In [14]:
# Training the Doc2Vec model
doc2vec_model = train_doc2vec_model(sen)

In [15]:
# Saving the Doc2Vec model
doc2vec_model.save("doc2vec_model_opinion_corpus.d2v")

In [16]:
# Loading the Doc2Vec model
doc2vec_model = Doc2Vec.load("doc2vec_model_opinion_corpus.d2v") 

In [17]:
# Further vectorizing the data
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

tfidf1 = TfidfVectorizer(tokenizer=lambda i:i, lowercase=False, ngram_range=(1,1))
result_train1 = tfidf1.fit_transform(corpus)

tfidf2 = TfidfVectorizer(tokenizer=lambda i:i, lowercase=False, ngram_range=(1,2))
result_train2 = tfidf2.fit_transform(corpus)

tfidf3 = TfidfVectorizer(tokenizer=lambda i:i, lowercase=False, ngram_range=(1,3))
result_train3 = tfidf3.fit_transform(corpus)

svd = TruncatedSVD(n_components=512, n_iter=40, random_state=34)
tfidf_data1 = svd.fit_transform(result_train1)
tfidf_data2 = svd.fit_transform(result_train2)
tfidf_data3 = svd.fit_transform(result_train3)

In [18]:
# Performing POS tagging on the data
from sklearn.feature_extraction.text import CountVectorizer
import spacy

nlp = spacy.blank('en')
temp_textData = pd.DataFrame(list(df['text']))

overall_pos_tags_tokens = []
overall_pos = []
overall_tokens = []
overall_dep = []

for i in range(1600):
    doc = nlp(temp_textData[0][i])
    given_pos_tags_tokens = []
    given_pos = []
    given_tokens = []
    given_dep = []
    for token in doc:
        output = "%s_%s" % (token.pos_, token.tag_)
        given_pos_tags_tokens.append(output)
        given_pos.append(token.pos_)
        given_tokens.append(token.tag_)
        given_dep.append(token.dep_)
        
    overall_pos_tags_tokens.append(given_pos_tags_tokens)
    overall_pos.append(given_pos)
    overall_tokens.append(given_tokens)
    overall_dep.append(given_dep)


In [19]:
# Normalizing the data

from sklearn.preprocessing import MinMaxScaler

count = CountVectorizer(tokenizer=lambda i:i, lowercase=False)
pos_tags_data = count.fit_transform(overall_pos_tags_tokens).todense()
pos_data = count.fit_transform(overall_pos).todense()
tokens_data = count.fit_transform(overall_tokens).todense()
dep_data = count.fit_transform(overall_dep).todense()
min_max_scaler = MinMaxScaler()
normalized_pos_tags_data = min_max_scaler.fit_transform(pos_tags_data)
normalized_pos_data = min_max_scaler.fit_transform(pos_data)
normalized_tokens_data = min_max_scaler.fit_transform(tokens_data)
normalized_dep_data = min_max_scaler.fit_transform(dep_data)

final_pos_tags_data = np.zeros(shape=(1600, 512)).astype(np.float32)
final_pos_data = np.zeros(shape=(1600, 512)).astype(np.float32)
final_tokens_data = np.zeros(shape=(1600, 512)).astype(np.float32)
final_dep_data = np.zeros(shape=(1600, 512)).astype(np.float32)
final_pos_tags_data[:normalized_pos_tags_data.shape[0],:normalized_pos_tags_data.shape[1]] = normalized_pos_tags_data
final_pos_data[:normalized_pos_data.shape[0],:normalized_pos_data.shape[1]] = normalized_pos_data
final_tokens_data[:normalized_tokens_data.shape[0],:normalized_tokens_data.shape[1]] = normalized_tokens_data
final_dep_data[:normalized_dep_data.shape[0],:normalized_dep_data.shape[1]] = normalized_dep_data



In [20]:
# Finding max no of words for each review

maxlength = []
for i in range(0,len(sen)):
    maxlength.append(len(sen[i][0]))
    
print(max(maxlength))   

370


In [21]:
# Function to vectorize
def vectorize_comments(df,d2v_model):
    y = []
    comments = []
    for i in range(0,df.shape[0]):
        label = 'SENT_%s' %i
        comments.append(d2v_model.docvecs[label])
    df['vectorized_comments'] = comments
    return df


textData = vectorize_comments(textData,doc2vec_model)
print (textData.head(2))

   index                                  stemmed_text_data  \
0      0  stay night getaway famili thursday tripl aaa r...   
1      1  tripl rate upgrad view room less $ includ brea...   

                                 vectorized_comments  
0  [0.73591495, 0.43036973, -0.45590287, 0.104917...  
1  [1.030476, -0.25929967, 0.3994606, 0.264935, -...  


In [22]:
# Splitting the reviews into train-test split randomly

from sklearn.model_selection import GridSearchCV
X_train, X_test, y_train, y_test = train_test_split(textData["vectorized_comments"].T.tolist(), 
                                                                     dummy_y, 
                                                                    test_size=0.1, 
                                                                     random_state=56)

In [23]:
# Reshaping the vectorized data to fit the input of the model

X = np.array(textData["vectorized_comments"].T.tolist()).reshape((1,1600,512))
y = np.array(dummy_y).reshape((1600,4))
X_train2 = np.array(X_train).reshape((1,1440,512))
y_train2 = np.array(y_train).reshape((1,1440,4))
X_test2 = np.array(X_test).reshape((1,160,512))
y_test2 = np.array(y_test).reshape((1,160,4))

In [24]:
# Stratified K fold repeated 10 times for cross validation
from sklearn.model_selection import StratifiedKFold
Xtemp = textData["vectorized_comments"].T.tolist()
ytemp = df['given_class']
training_indices = []
testing_indices = []

skf = StratifiedKFold(n_splits=10)
skf.get_n_splits(Xtemp, ytemp)

for train_index, test_index in skf.split(Xtemp, ytemp):
    training_indices.append(train_index)
    testing_indices.append(test_index)

In [25]:
def extractTrainingAndTestingData(givenIndex):
    X_train3 = np.zeros(shape=(1440, max(maxlength)+10+379, 512)).astype(np.float32)
    Y_train3 = np.zeros(shape=(1440, 4)).astype(np.float32)
    X_test3 = np.zeros(shape=(160, max(maxlength)+10+379, 512)).astype(np.float32)
    Y_test3 = np.zeros(shape=(160, 4)).astype(np.float32)

    empty_word = np.zeros(512).astype(np.float32)

    cnt_i = 0
    for i in training_indices[givenIndex]:
        len1 = len(sen[i][0])
        avg_v1 = np.zeros(512).astype(np.float32)
        avg_v2 = np.zeros(512).astype(np.float32)
        avg_v3 = np.zeros(512).astype(np.float32)
        for j in range(max(maxlength)+10+379):
            if j < len1:
                X_train3[cnt_i,j,:] = doc2vec_model[sen[i][0][j]]
                avg_v1 += result_train1[i, tfidf1.vocabulary_[sen[i][0][j]]] * doc2vec_model[sen[i][0][j]]
                avg_v2 += result_train2[i, tfidf2.vocabulary_[sen[i][0][j]]] * doc2vec_model[sen[i][0][j]]
                avg_v3 += result_train3[i, tfidf3.vocabulary_[sen[i][0][j]]] * doc2vec_model[sen[i][0][j]]
            #elif j >= len1 and j < len1 + 379:
                #X_train3[cnt_i,j,:] = glove_data[i, j-len1, :]
            elif j == len1 + 379:
                X_train3[cnt_i,j,:] = tfidf_data1[i]
            elif j == len1 + 380:
                X_train3[cnt_i,j,:] = tfidf_data2[i]
            elif j == len1+381:
                X_train3[cnt_i,j,:] = tfidf_data3[i]
            elif j == len1+382:
                X_train3[cnt_i,j,:] = avg_v1
            elif j == len1+383:
                X_train3[cnt_i,j,:] = avg_v2
            elif j == len1+384:
                X_train3[cnt_i,j,:] = avg_v3
            elif j == len1+385:
                X_train3[cnt_i,j,:] = final_pos_tags_data[i] 
            elif j == len1+386:
                X_train3[cnt_i,j,:] = final_pos_data[i]
            elif j == len1+387:
                X_train3[cnt_i,j,:] = final_tokens_data[i]
            elif j == len1+388:
                X_train3[cnt_i,j,:] = final_dep_data[i]
            else:
                X_train3[cnt_i,j,:] = empty_word

        Y_train3[cnt_i,:] = dummy_y[i]
        cnt_i += 1


    cnt_i = 0
    for i in testing_indices[givenIndex]:
        len1 = len(sen[i][0])
        avg_v1 = np.zeros(512).astype(np.float32)
        avg_v2 = np.zeros(512).astype(np.float32)
        avg_v3 = np.zeros(512).astype(np.float32)
        for j in range(max(maxlength)+10+379):
            if j < len1:
                X_test3[cnt_i,j,:] = doc2vec_model[sen[i][0][j]]
                avg_v1 += result_train1[i, tfidf1.vocabulary_[sen[i][0][j]]] * doc2vec_model[sen[i][0][j]]
                avg_v2 += result_train2[i, tfidf2.vocabulary_[sen[i][0][j]]] * doc2vec_model[sen[i][0][j]]  
                avg_v3 += result_train3[i, tfidf3.vocabulary_[sen[i][0][j]]] * doc2vec_model[sen[i][0][j]]
            #elif j >= len1 and j < len1 + 379:
               # X_test3[cnt_i,j,:] = glove_data[i, j-len1, :]
            elif j == len1 + 379:
                X_test3[cnt_i,j,:] = tfidf_data1[i]
            elif j == len1 + 380:
                X_test3[cnt_i,j,:] = tfidf_data2[i]
            elif j == len1+381:
                X_test3[cnt_i,j,:] = tfidf_data3[i]
            elif j == len1+382:
                X_test3[cnt_i,j,:] = avg_v1
            elif j == len1+383:
                X_test3[cnt_i,j,:] = avg_v2
            elif j == len1+384:
                X_test3[cnt_i,j,:] = avg_v3
            elif j == len1+385:
                X_test3[cnt_i,j,:] = final_pos_tags_data[i]
            elif j == len1+386:
                X_test3[cnt_i,j,:] = final_pos_data[i]
            elif j == len1+387:
                X_test3[cnt_i,j,:] = final_tokens_data[i]
            elif j == len1+388:
                X_test3[cnt_i,j,:] = final_dep_data[i]
            else:
                X_test3[cnt_i,j,:] = empty_word

        Y_test3[cnt_i,:] = dummy_y[i]
        cnt_i += 1
        
    return X_train3, X_test3, Y_train3, Y_test3
    

In [26]:
# Setting the layers of CNN-BiLSTM model and training the model
model = Sequential()
# Changing Activation Functions as required for the model
#model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu', input_shape=(max(maxlength)+10+379,512)))
#model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='selu', input_shape=(max(maxlength)+10+379,512)))
#model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='tanh', input_shape=(max(maxlength)+10+379,512)))
#model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='sigmoid', input_shape=(max(maxlength)+10+379,512)))
model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='swish', input_shape=(max(maxlength)+10+379,512)))
model.add(Dropout(0.25))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.25))
# Removing layers as the required output is achieved and computational power is reduced
#model.add(Conv1D(filters=32, kernel_size=7, padding='same', activation='swish'))
#model.add(Dropout(0.25))
#model.add(MaxPooling1D(pool_size=2))
#model.add(Dropout(0.25))

model.add(Bidirectional(LSTM(50, dropout=0.25, recurrent_dropout=0.2)))
model.add(Dense(4, activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 759, 128)          327808    
                                                                 
 dropout (Dropout)           (None, 759, 128)          0         
                                                                 
 max_pooling1d (MaxPooling1D  (None, 379, 128)         0         
 )                                                               
                                                                 
 dropout_1 (Dropout)         (None, 379, 128)          0         
                                                                 
 bidirectional (Bidirectiona  (None, 100)              71600     
 l)                                                              
                                                                 
 dense (Dense)               (None, 4)                 4

In [None]:
# Running the model and calculating the accuracy, loss, validation accuracy and validation loss at each iteration, each epoch

from sklearn.metrics import accuracy_score
from keras.callbacks import ModelCheckpoint 

final_accuracies = []

for i in range(4):
    filename = 'weights.best.from_scratch%s.hdf5' % i
    checkpointer = ModelCheckpoint(filepath=filename, verbose=1, save_best_only=True)
    X_train3, X_test3, Y_train3, Y_test3 = extractTrainingAndTestingData(i)
    model.fit(X_train3, Y_train3, epochs=10, batch_size=512, callbacks=[checkpointer], validation_data=(X_test3, Y_test3))
    model.load_weights(filename)
    predicted = np.rint(model.predict(X_test3))
    final_accuracies.append(accuracy_score(Y_test3, predicted))
    print(accuracy_score(Y_test3, predicted))

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.57060, saving model to weights.best.from_scratch0.hdf5
Epoch 2/10
Epoch 2: val_loss improved from 0.57060 to 0.54007, saving model to weights.best.from_scratch0.hdf5
Epoch 3/10
Epoch 3: val_loss improved from 0.54007 to 0.52463, saving model to weights.best.from_scratch0.hdf5
Epoch 4/10
Epoch 4: val_loss improved from 0.52463 to 0.50067, saving model to weights.best.from_scratch0.hdf5
Epoch 5/10
Epoch 5: val_loss improved from 0.50067 to 0.45532, saving model to weights.best.from_scratch0.hdf5
Epoch 6/10
Epoch 6: val_loss improved from 0.45532 to 0.41859, saving model to weights.best.from_scratch0.hdf5
Epoch 7/10
Epoch 7: val_loss improved from 0.41859 to 0.38379, saving model to weights.best.from_scratch0.hdf5
Epoch 8/10
Epoch 8: val_loss improved from 0.38379 to 0.35339, saving model to weights.best.from_scratch0.hdf5
Epoch 9/10
Epoch 9: val_loss improved from 0.35339 to 0.33471, saving model to weights.best.from_scratch0.hdf5
Epoch

In [31]:
# Calculating the overall accuracy
print(sum(final_accuracies) / len(final_accuracies)*100,"%")

91.38%
