In [1]:
from __future__ import print_function
import  random
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Activation ,Dropout, Conv1D, Conv2D, MaxPooling1D, Flatten, Embedding
from keras.callbacks import EarlyStopping 
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import sent_tokenize, word_tokenize

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

stop_words = set(stopwords.words('english')) 




def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def get_unbalance_data_set(container_path, truthful_percentage, deceptive_percentage):
    training_data = load_files(container_path, description=None, load_content=True,
                               shuffle=True, encoding='ISO-8859-1', decode_error='strict', random_state=0)

    filter_truthful_index = []
    filter_deceptive_index = []
    for index in range(0, len(training_data.data)):
        if (training_data.target[index] == True):
            filter_truthful_index.append(index)
        else:
            filter_deceptive_index.append(index)

    filter_data_truthful = random.sample(filter_truthful_index, (int)(len(training_data.data)/2 * truthful_percentage))
    filter_data_deceptive = random.sample(filter_deceptive_index, (int)(len(training_data.data)/2 * deceptive_percentage))

    filter_data = filter_data_truthful+filter_data_deceptive

    list_need_to_delete = []
    new_data_list = []

    for index in range(0, len(training_data.data)):
        if index not in filter_data:

            list_need_to_delete.append(index)
        else:
            new_data_list.append(training_data.data[index])
    training_data.target = np.delete(training_data.target, list_need_to_delete)
    training_data.data =  new_data_list

    return training_data


def words_tag_freq_calculation(container_path, truthful):

    #load data set from given directory path
    training_data = load_files(container_path, description=None,  load_content=True,
                              shuffle=True, encoding='ISO-8859-1', decode_error='strict', random_state=0)

    filter_data = []
    for index in range(0, len(training_data.data)) :
        if(training_data.target[index] == truthful):
            #print('Target: ', training_data.target[index], 'Content: ', training_data.data[index])
            filter_data.append(training_data.data[index])

    return filter_data

def topic_word_distribution(topic_word_list, container_path):
    training_data = load_files(container_path, description=None, load_content=True,
                               shuffle=True, encoding='ISO-8859-1', decode_error='strict', random_state=0)

    filter_data = []
    for each_word in topic_word_list:
        count_T = 0
        count_F = 0
        for index in range(0, len(training_data.data)):
            # print('Target: ', training_data.target[index], 'Content: ', training_data.data[index])
            str_data = training_data.data[index]
            if str_data.find(each_word) != -1 :
                if (training_data.target[index] == True):
                    count_T += 1
                else:
                    count_F += 1

        print(each_word, " appears in Truthful review : ", count_T, " and Deceptive review : ", count_F)

def get_data_sentence_containing_topic_model_words(container_path, topic_word_list, data):
    training_data = data
    if (training_data == None):
        training_data = load_files(container_path, description=None, load_content=True,
                               shuffle=True, encoding='ISO-8859-1', decode_error='strict', random_state=0)
    filter_data = []
    #print(training_data.data[2])
    for index in range(0, len(training_data.data)):
        #print("Before: ", training_data.data[index])
        document = sent_tokenize(training_data.data[index])
        new_document = ""
        for sentence in document:
            word_list = word_tokenize(sentence.lower())
            for word in topic_word_list:
                if word.lower() in word_list:
                    new_document += sentence+" "
                    break

        training_data.data[index] = new_document
        #print("After: ", new_document)

    #print(training_data.data[2])
    return training_data

def get_lemmatize_data_set(container_path):
    training_data = load_files(container_path, description=None, load_content=True,
                               shuffle=True, encoding='ISO-8859-1', decode_error='strict', random_state=0)

    lemmatizer = WordNetLemmatizer()

    for index in range(0, len(training_data.data)):
        # print("Before: ", training_data.data[index])
        #document = word_tokenize(training_data.data[index])
        #print(training_data.data[index])
        word_list = word_tokenize(training_data.data[index])
        new_document = ' '.join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in word_list])
       # print(new_document)

        training_data.data[index] = new_document

    return training_data

def print_top_words(container_path, model, feature_names, n_top_words):
    full_str = ""
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        full_str += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        full_str += " "
        str_topic_word = " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        #topic_word_list = str_topic_word.split()
        #topic_word_distribution(topic_word_list, container_path)
        print(message)

    print()
    full_str = full_str.split()
    return full_str


def Remove(duplicate, fullRemove):
    final_list = []
    for num in duplicate:
        if num not in final_list:
            final_list.append(num)
        else:
            final_list.remove(num)
            if fullRemove == False:
                final_list.append(num)
    return final_list

def lemmatize_word_list(wordList):
    lemmatize_list = []
    lemmatizer = WordNetLemmatizer()
    for word in wordList:
        lemmatize_list.append(lemmatizer.lemmatize(word, get_wordnet_pos(word)))

    return lemmatize_list


def get_top_words_from_topic_modeling(container_path, n_topic, n_top_words, data):
    training_data = data
    if(training_data == None):
        training_data = load_files(container_path, description=None, load_content=True,
                               shuffle=True, encoding='ISO-8859-1', decode_error='strict', random_state=0)

    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    tf_review = tf_vectorizer.fit_transform(training_data.data)

    lda_review = LatentDirichletAllocation(n_components=n_topic, max_iter=20,
                                               learning_method='online',
                                               learning_offset=50.,
                                               random_state=0)
    lda_review.fit(tf_review)
    tf_feature_names = tf_vectorizer.get_feature_names()

    return print_top_words(container_path, lda_review, tf_feature_names, n_top_words)

def get_top_words_from_topic_modeling_(data, n_topic, n_top_words):

    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    tf_review = tf_vectorizer.fit_transform(data)

    lda_review = LatentDirichletAllocation(n_components=n_topic, max_iter=20,
                                               learning_method='online',
                                               learning_offset=50.,
                                               random_state=0)
    lda_review.fit(tf_review)
    tf_feature_names = tf_vectorizer.get_feature_names()

    return print_top_words("", lda_review, tf_feature_names, n_top_words)

def remove_stopwords(dat):
    punc = '''!()-[]{};:'"\, <>./?@#$%^&*_~'''
    output_array=[]
    for sentence in dat.data:
        temp_list=[]
        for word in sentence.split():
            if word.lower() not in stop_words:
                temp_list.append(word)
        output_array.append(' '.join(temp_list))
    return output_array

def punc_removal(data): 
    output_array=[]
    punc = '''!()-[]{};:'"\, <>./?@#$%^&*_~'''
    for article in data:
        temp=[]
        for word in article.split():
            temp_w=[]
            for char in word:
                if char not in punc:
                    temp_w.append(''.join(char))
            temp.append(''.join(temp_w))
            temp.append(''.join(" "))
        output_array.append(''.join(temp))
    return output_array



print("Loading dataset...")

container_path_neg = r"C:\Users\ayman\Downloads\MSU_Thesis-master\MSU_Thesis-master\dataset\trip_advisor_dataset\negative_polarity\negative_polarity"
container_path_pos = r"C:\Users\ayman\Downloads\MSU_Thesis-master\MSU_Thesis-master\dataset\trip_advisor_dataset\positive_polarity\positive_polarity"
container_path_comb = r"C:\Users\ayman\Downloads\MSU_Thesis-master\MSU_Thesis-master\dataset\trip_advisor_dataset\combined/"

container_path_temp = "../data/amazon/temp/"


categories = ['deceptive_from_MTurk', 'truthful_from_Web']

n_components = 2 #was 2
n_top_words =200 #was 200

 
data_path = container_path_comb

full_data = get_unbalance_data_set(container_path=data_path, truthful_percentage=1, deceptive_percentage=1)



top_word_list = get_top_words_from_topic_modeling(data_path,n_components,n_top_words, full_data)

top_word_list = Remove(top_word_list, False)

print("Total top word list:", len(top_word_list))


full_data = get_data_sentence_containing_topic_model_words(data_path,top_word_list, full_data)
data= remove_stopwords(full_data)
data=punc_removal(data)
vocabularyList = top_word_list


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ayman\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ayman\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Loading dataset...
Topic #0: hotel chicago room great stay staff location rooms stayed nice comfortable clean service friendly just city recommend good view best helpful place definitely night michigan really wonderful area excellent beautiful breakfast business bed free like time restaurant shopping downtown perfect bar loved enjoyed close lobby spacious hotels bathroom walk right floor food walking amazing day beds trip restaurants mile weekend modern nights away large home highly experience visit located got suite fantastic internet distance staying hilton husband ave tv little concierge feel park street water pool tower need views river magnificent check coffee huge dinner price small wife king ve family access quiet ambassador looking navy lake easy desk extremely pier want east overall center hard big amenities especially wine return decor say reviews better flat avenue blocks way felt luxurious recently rock screen absolutely amalfi rate love places morning bit sofitel suites fi

In [2]:
full_data.data

['My wife and I just spent the Labor Day weekend in the city, choosing the Hard Rock Hotel as our pad. We\'ve stayed at the HRH in Orlando twice and are blown away by the ambience there. It\'s not that way here. These may be all ticky tacky things, but they added up to us. First of all, the room. We upgraded our reservation thru the HRH website for a room with "an Incredible view". Well, our 9th floor view was of nothing more than the venting on top of the building next to us. So there was no view. The room was muggy. We had the air turned down all the way and it never got below 74. We woke up sweaty both days. When we opened the window to get a breeze, the noise from the rooftop next to us was overbearing. The room was small and dark, but it didn\'t bother us that much. The bed may have been the mosr comfortable bed I ever slept in. We had the same issue with the honor bar as a previous poster, but they removed the charges no prob. Throughout the entire hotel, I counted a total of 3 p

In [3]:
"""Word2vec Implementation"""
#imports

from gensim.models.word2vec import Word2Vec
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)



In [4]:
"""Data import and split"""

training_data = data
print(full_data.target_names)

#Attribute values for each tuple
X = pd.DataFrame(data)

#Target output for each tuple
Y = pd.DataFrame(full_data.target)
#making lowercase
X[0] = X[0].str.lower()
#checking for null values
X.isnull().sum()



"""Preprocessing"""

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

# X= word_tokenize(X)

stop_words = set(stopwords.words('english')) 

filtered_sentence = [] 

# for w in X[0]: 
#     if
#     if w not in stop_words:
#         filtered_sentence.append(word) 

#Creates the relevant phrases [BIGRAMS] from the list of sentences:

from gensim.models.phrases import Phrases, Phraser
sent = [row.split() for row in X[0]]
# filtered_sent=[]
# for s in sent:
#     for x in s:
#         if x not in stop_words:
#             filtered_sent[s].append(x)
        

# sent = filtered_sent
phrases = Phrases(sent, min_count=30, progress_per=10000) #min count was  30 
bigram = Phraser(phrases)
sentences = bigram[sent]



#split into train and test
X_train, X_test, y_train, y_test = train_test_split(sentences, Y, test_size=0.20, random_state=42)

print(X_train)

# print("-----------------------================================================================")
#print(filtered_sentence) 
# print("-----------------------================================================================")
# print(stop_words)



INFO - 00:11:35: collecting all words and their counts
INFO - 00:11:35: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 00:11:35: collected 90889 word types from a corpus of 117635 words (unigram + bigrams) and 1600 sentences


['deceptive_from_MTurk', 'truthful_from_Web']


INFO - 00:11:35: using 90889 counts as vocab in Phrases<0 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 00:11:35: source_vocab length 90889
INFO - 00:11:36: Phraser built with 46 phrasegrams




In [9]:

"""Training Model"""

import multiprocessing
from gensim.models import Word2Vec

cores = multiprocessing.cpu_count() # Count the number of cores in a computer



num_words = 5000
size=50 #was 50



w2v_model = Word2Vec(min_count=1,
                     window=50,
                     size=size,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.00007, 
                     negative=30, ##WAS 30 which gave 88%
                     workers=cores-1) ##alpha was 0.03, minalpha 0.0007
t = time()

clf=w2v_model.build_vocab(X_train, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

t = time()

pect=w2v_model.train(X_train, total_examples=w2v_model.corpus_count, epochs=50, report_delay=1) #epoch was 50
words=w2v_model.wv.vocab

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))
w2v_model.init_sims(replace=True)

INFO - 00:12:46: collecting all words and their counts
INFO - 00:12:46: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 00:12:46: collected 9195 word types from a corpus of 92226 raw words and 1280 sentences
INFO - 00:12:46: Loading a fresh vocabulary
INFO - 00:12:46: effective_min_count=1 retains 9195 unique words (100% of original 9195, drops 0)
INFO - 00:12:46: effective_min_count=1 leaves 92226 word corpus (100% of original 92226, drops 0)
INFO - 00:12:46: deleting the raw counts dictionary of 9195 items
INFO - 00:12:46: sample=6e-05 downsamples 1111 most-common words
INFO - 00:12:46: downsampling leaves estimated 45199 word corpus (49.0% of prior 92226)
INFO - 00:12:46: estimated required memory for 9195 words and 50 dimensions: 8275500 bytes
INFO - 00:12:46: resetting layer weights
INFO - 00:12:48: training model with 7 workers on 9195 vocabulary and 50 features, using sg=0 hs=0 sample=6e-05 negative=30 window=50
INFO - 00:12:48: worker thread finished; a

Time to build vocab: 0.02 mins


INFO - 00:12:48: worker thread finished; awaiting finish of 2 more threads
INFO - 00:12:48: worker thread finished; awaiting finish of 1 more threads
INFO - 00:12:48: worker thread finished; awaiting finish of 0 more threads
INFO - 00:12:48: EPOCH - 2 : training on 92226 raw words (45152 effective words) took 0.1s, 432141 effective words/s
INFO - 00:12:48: worker thread finished; awaiting finish of 6 more threads
INFO - 00:12:48: worker thread finished; awaiting finish of 5 more threads
INFO - 00:12:48: worker thread finished; awaiting finish of 4 more threads
INFO - 00:12:48: worker thread finished; awaiting finish of 3 more threads
INFO - 00:12:48: worker thread finished; awaiting finish of 2 more threads
INFO - 00:12:48: worker thread finished; awaiting finish of 1 more threads
INFO - 00:12:48: worker thread finished; awaiting finish of 0 more threads
INFO - 00:12:48: EPOCH - 3 : training on 92226 raw words (45494 effective words) took 0.1s, 414444 effective words/s
INFO - 00:12:48:

INFO - 00:12:49: worker thread finished; awaiting finish of 4 more threads
INFO - 00:12:49: worker thread finished; awaiting finish of 3 more threads
INFO - 00:12:49: worker thread finished; awaiting finish of 2 more threads
INFO - 00:12:49: worker thread finished; awaiting finish of 1 more threads
INFO - 00:12:49: worker thread finished; awaiting finish of 0 more threads
INFO - 00:12:49: EPOCH - 15 : training on 92226 raw words (45182 effective words) took 0.1s, 471727 effective words/s
INFO - 00:12:49: worker thread finished; awaiting finish of 6 more threads
INFO - 00:12:49: worker thread finished; awaiting finish of 5 more threads
INFO - 00:12:49: worker thread finished; awaiting finish of 4 more threads
INFO - 00:12:49: worker thread finished; awaiting finish of 3 more threads
INFO - 00:12:49: worker thread finished; awaiting finish of 2 more threads
INFO - 00:12:49: worker thread finished; awaiting finish of 1 more threads
INFO - 00:12:49: worker thread finished; awaiting finish 

INFO - 00:12:51: worker thread finished; awaiting finish of 6 more threads
INFO - 00:12:51: worker thread finished; awaiting finish of 5 more threads
INFO - 00:12:51: worker thread finished; awaiting finish of 4 more threads
INFO - 00:12:51: worker thread finished; awaiting finish of 3 more threads
INFO - 00:12:51: worker thread finished; awaiting finish of 2 more threads
INFO - 00:12:51: worker thread finished; awaiting finish of 1 more threads
INFO - 00:12:51: worker thread finished; awaiting finish of 0 more threads
INFO - 00:12:51: EPOCH - 28 : training on 92226 raw words (45225 effective words) took 0.1s, 471432 effective words/s
INFO - 00:12:51: worker thread finished; awaiting finish of 6 more threads
INFO - 00:12:51: worker thread finished; awaiting finish of 5 more threads
INFO - 00:12:51: worker thread finished; awaiting finish of 4 more threads
INFO - 00:12:51: worker thread finished; awaiting finish of 3 more threads
INFO - 00:12:51: worker thread finished; awaiting finish 

INFO - 00:12:52: EPOCH - 40 : training on 92226 raw words (45494 effective words) took 0.1s, 419985 effective words/s
INFO - 00:12:52: worker thread finished; awaiting finish of 6 more threads
INFO - 00:12:52: worker thread finished; awaiting finish of 5 more threads
INFO - 00:12:52: worker thread finished; awaiting finish of 4 more threads
INFO - 00:12:52: worker thread finished; awaiting finish of 3 more threads
INFO - 00:12:52: worker thread finished; awaiting finish of 2 more threads
INFO - 00:12:52: worker thread finished; awaiting finish of 1 more threads
INFO - 00:12:52: worker thread finished; awaiting finish of 0 more threads
INFO - 00:12:52: EPOCH - 41 : training on 92226 raw words (45324 effective words) took 0.1s, 442071 effective words/s
INFO - 00:12:52: worker thread finished; awaiting finish of 6 more threads
INFO - 00:12:52: worker thread finished; awaiting finish of 5 more threads
INFO - 00:12:52: worker thread finished; awaiting finish of 4 more threads
INFO - 00:12:5

Time to train the model: 0.09 mins


In [6]:
# import gensim.downloader
# glove_vectors = gensim.downloader.load('glove-twitter-25')

INFO - 00:11:43: loading projection weights from C:\Users\ayman/gensim-data\glove-twitter-25\glove-twitter-25.gz
INFO - 00:12:07: loaded (1193514, 25) matrix from C:\Users\ayman/gensim-data\glove-twitter-25\glove-twitter-25.gz


In [10]:
print(w2v_model.corpus_count) #Checking document length (number of samples)
print(pect)
w2v_model.save("word2vec.model")
w2v_model.save("model.bin")
# wv = KeyedVectors.load("word2vec.model", mmap='r')
#print(w2v_model.wv["and"])

INFO - 00:12:57: saving Word2Vec object under word2vec.model, separately None
INFO - 00:12:57: not storing attribute vectors_norm
INFO - 00:12:57: not storing attribute cum_table
INFO - 00:12:57: saved word2vec.model
INFO - 00:12:57: saving Word2Vec object under model.bin, separately None
INFO - 00:12:57: not storing attribute vectors_norm
INFO - 00:12:57: not storing attribute cum_table
INFO - 00:12:57: saved model.bin


1280
(2259184, 4611300)


In [11]:
embeddings_index = {}
for w in glove_vectors.wv.vocab.keys():
    embeddings_index[w] = w2v_model.wv[w]

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer(num_words= num_words)
tokenizer.fit_on_texts(X_train)

sequences = tokenizer.texts_to_sequences(X_train)
sequences_val = tokenizer.texts_to_sequences(X_test)


  for w in glove_vectors.wv.vocab.keys():


KeyError: "word '<user>' not in vocabulary"

In [None]:
sequences[:5]

In [None]:
length = []
for x in X_train:
    length.append(len(x))
max_length=max(length)

x_train_seq = pad_sequences(sequences, maxlen=max_length)
x_val_seq = pad_sequences(sequences_val, maxlen=max_length)

In [None]:
# """Scaling the data"""

# scaler=StandardScaler()
# scaler.fit(x_train_seq)
# x_train_seq = scaler.transform(x_train_seq)
# scaler.fit(x_val_seq)
# x_val_seq = scaler.transform(x_val_seq)
# """Normalizing the data"""
# x_train_seq=preprocessing.normalize(x_train_seq)
# x_val_seq=preprocessing.normalize(x_val_seq)

In [None]:
 
embedding_matrix = np.zeros((num_words, size))
for word, i in tokenizer.word_index.items():
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
x_train_seq.shape

In [None]:
n_fold= 1  #was:10
result ={}
index = 1
accuracy = 0
for n in range(n_fold):
    print("Fold ",n, " is started...")
    """ 82 percent accuracy  """

    """ test size was 20 percent,three cnn layers: filters 1200 x3 and kernel size 15,15,13, maxpooling pool size= 6, dense:2-layers:250 neurons and last layer dense 1 with softmax"""

    # define model
    model = Sequential()
    e =Embedding(num_words, size, weights=[embedding_matrix], trainable= True)
    
    model.add(e)
#     model.add(Conv1D(filters=1200,kernel_size=15, activation='relu' ),)
#     model.add(MaxPooling1D(pool_size=2))
#     model.add(Dropout(0.2))
    
    #ORIGINAL
    model.add(Conv1D(filters=600,kernel_size=18, activation='relu' , ),) #kernel size was 18, filters 600
    model.add(MaxPooling1D(pool_size=6))
    model.add(Dropout(0.2))

    model.add(Conv1D(filters=500, kernel_size=14, activation='relu' , ),) #kernel size was 14, filters 500
    model.add(MaxPooling1D(pool_size=6))
    model.add(Dropout(0.2))

    model.add(Conv1D(filters=400, kernel_size= 8, activation='relu', ) ,)#kernel size was 8, filters 400
    model.add(MaxPooling1D(pool_size=1))
    model.add(Dropout(0.2))


    callback = tf.keras.callbacks.EarlyStopping(monitor='sparse_categorical_accuracy',mode= 'max', patience=20)

    # model.add(Dense(750, activation='relu'))
#     model.add(Dense(250, activation='relu', kernel_regularizer=keras.regularizers.l2(l=0.1)))
#     model.add(Dense(500, activation='relu' ,kernel_regularizer=keras.regularizers.l2(l=0.1)))

    model.add(Dense(2, activation='softmax', kernel_regularizer=keras.regularizers.l2(l=0.01) , activity_regularizer=tf.keras.regularizers.l2(0.01)) )
    print(model.summary())
    # model.save("my_model-74")

    #Training the data: Compiling and fitting
    # opt = keras.optimizers.Nadam(learning_rate=0.0030)
    model.compile(loss="sparse_categorical_crossentropy", optimizer= 'Adam', metrics=[ "sparse_categorical_accuracy"])
    clf = model.fit(x_train_seq, y_train, validation_split =0.3 ,epochs=50, callbacks =[callback])#15
    print("Loss and test Accuracy in fold: ",n)
    model.evaluate(x_val_seq, y_test)
    predicted1 = model.predict(x_val_seq) 
    predicted1= np.squeeze(predicted1)
    predicted = []
    pred_final=[]
    
    """Turning one hot encoded output to array with each sample with one valued output"""
    for i,x in predicted1:
        if(i>x):
            pred_final.append([1,0])
            predicted.append(0)
        else:
            pred_final.append([0,1])
            predicted.append(1)
    (pred_final)
#     for i in predicted1:
#         if i<0.50:
#             predicted.append(0)
#         else:
#             predicted.append(1)
#     print(predicted)
    
    y_test= np.squeeze(np.array(y_test))
    y_test_ohe = tf.keras.utils.to_categorical(y_test, num_classes=2)

   
    print(y_test)
    """Accumulating accuracy for each fold"""
    
    accuracy += np.count_nonzero(predicted == y_test) *100/ len(y_test)
    
    """f1 score support for each fold"""
    result[index] = precision_recall_fscore_support(y_test, predicted)
    index = index +1
    print(metrics.classification_report(y_test, predicted,target_names=full_data.target_names))

"""calculating average scores for all the folds"""


precision = 0
recall = 0
f1_score = 0
#Adds up each fold precesion, recallm f1_score
for key, value in result.items():
    #print(key, " = ",  value)
    precision += value[0]
    recall += value[1]
    f1_score += value[2]
    

avg_accuracy = accuracy/n_fold
avg_precision = precision/n_fold
avg_recall = recall/n_fold
avg_f1_score = f1_score/n_fold

calculated_result = {}
calculated_result['accuracy'] = avg_accuracy
calculated_result['precision'] = avg_precision
calculated_result['recall'] = avg_recall
calculated_result['f1_score'] = avg_f1_score
#return mean of accuracy, precision, recall, f1_score
print(calculated_result)



In [None]:
"""Complete results"""

for i in calculated_result.items():
    print(i[0],":", i[1])

In [None]:
model.evaluate(x_val_seq, y_test)

In [None]:
# print(np.squeeze(y_test))

In [None]:
# z= predicted==y_test
# print(z[0])

In [None]:
y_test

In [None]:
# pred_final=[]
# for i,x in predicted1:
#     if(i>x):
#         pred_final.append([1,0])
#     else:
#         pred_final.append([0,1])
# (pred_final)

In [None]:
# y_test_ohe = tf.keras.utils.to_categorical(y_test, num_classes=2)


In [None]:
# model.save("my_model-89.06-p-1f")


### model.save("my_model-80-p-10f")