In [1]:
import pandas as pd
import numpy as np
from keras.preprocessing import sequence
from keras.layers import TimeDistributed, GlobalAveragePooling2D, BatchNormalization
from keras.layers.recurrent import LSTM
from keras.layers.convolutional import Conv1D, MaxPooling1D, Conv2D, MaxPooling2D, AveragePooling1D
from keras.layers import Dropout, Flatten, Bidirectional, Dense, Activation, TimeDistributed
from keras.models import Model, Sequential
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from string import ascii_lowercase
from collections import Counter
from gensim.models import Word2Vec
from gensim.models import Doc2Vec
from gensim.models import doc2vec
from gensim.models import KeyedVectors
import itertools, nltk, snowballstemmer, re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
import spacy
from sklearn.metrics import accuracy_score
from keras.callbacks import ModelCheckpoint
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

class LabeledSentence(doc2vec.LabeledSentence):
    pass

class LabeledLineSentence(object):
    def __init__(self, sources):
        self.sources = sources
        flipped = {}
        # make sure that keys are unique for key, value in sources.items():
        for value in sources.values():
            if value not in flipped:
                flipped[value] = [key]
            else:
                raise Exception("Non-unique prefix encountered")

    def __iter__(self):
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '%s' % item_no])

    def to_array(self):
        self.sentences = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    self.sentences.append(LabeledSentence(utils.to_unicode(line).split(), [prefix + '%s' % item_no]))
        return self.sentences

    def sentences_perm(self):
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled

data = pd.read_csv("deceptive-opinion.csv")
data.head()
data['polarity'] = np.where(data['polarity'] == 'positive', 1, 0)
data['deceptive'] = np.where(data['deceptive'] == 'truthful', 1, 0)


def create_class(c):
    if c['polarity'] == 1 and c['deceptive'] == 1:
        return [1, 1]
    elif c['polarity'] == 1 and c['deceptive'] == 0:
        return [1, 0]
    elif c['polarity'] == 0 and c['deceptive'] == 1:
        return [0, 1]
    else:
        return [0, 0]


def specific_class(c):
    if c['polarity'] == 1 and c['deceptive'] == 1:
        return "TRUE_POSITIVE"
    elif c['polarity'] == 1 and c['deceptive'] == 0:
        return "FALSE POSITIVE"
    elif c['polarity'] == 0 and c['deceptive'] == 1:
        return "TRUE_NEGATIVE"
    else:
        return "FALSE NEGATIVE"


data['final_class'] = data.apply(create_class, axis=1)
data['given_class'] = data.apply(specific_class, axis=1)
data
Y = data['given_class']
Y

encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
encoded_Y
dummy_y = np_utils.to_categorical(encoded_Y)
textData = pd.DataFrame(list(data['text']))
stemmer = snowballstemmer.EnglishStemmer()
stop = stopwords.words('english')

stop.extend(['may', 'also', 'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'across', 'among', 'beside', 'however', 'yet', 'within'])
stop.extend(list(ascii_lowercase))
stoplist = stemmer.stemWords(stop)
stoplist = set(stoplist)
stop = set(sorted(stop + list(stoplist)))
textData[0].replace('[!"#%\'()*+,-./:;<=>?@\[\]^_~1234567890\'\\\]', '', inplace=True, regex=True)
wordlist = filter(None, " ".join(list(set(list(itertools.chain(*textData[0].str.split('')))))).split(""))

data['stemmed_text_data'] = [".join(filter(None, filter(lambda word: word not in stop, line))) for line in textData[0].str.lower().str.split(' ')]
minimum_count = 1
str_frequencies = pd.DataFrame(list(Counter(filter(None, list(itertools.chain(*data['stemmed_text_data'].str.split(' '))))).items()), columns=['word', 'count'])
low_frequency_words = set(str_frequencies[str_frequencies['count'] <= minimum_count]['word'])
data['stemmed_text_data'] = [' '.join(filter(None, filter(lambda word: word not in low_frequency_words, line))) for line in data['stemmed_text_data'].str.split(' ')]
data['stemmed_text_data'] = [" ".join(stemmer.stemWords(re.sub("[!"#%\'()*+,-/:;<=>?@\[\]^_~{|}~1234567890\\]", '', next_text).split(' '))) for next_text in data['stemmed_text_data']]

lemmatizer = WordNetLemmatizer()
w = re.compile("\w+", re.I)

def label_sentences(df, input_point):
  labeled_sentences = []
  list_sen = []
  for index, datapoint in df.iterrows():
    tokenized_words = re.findall(w, datapoint[input_point].lower())
    labeled_sentences.append(LabeledSentence(words=tokenized_words, tags=['SENT_%s' % index]))
    list_sen.append(tokenized_words)
  return labeled_sentences, list_sen

def train_doc2vec_model(labeled_sentences):
  model = Doc2Vec(min_count=1, window=9, size=512, sample=6e-5, negative=5, workers=7)
  model.build_vocab(labeled_sentences)
  pretrained_weights = model.wv.syn0
  vocab_size, embedding_size = pretrained_weights.shape
  model.train(labeled_sentences, total_examples=vocab_size, epochs=400)
  return model

textData = data['stemmed_text_data'].to_frame().reset_index()
sen, corpus = label_sentences(textData, 'stemmed_text_data')
doc2vec_model = train_doc2vec_model(sen)
doc2vec_model.save("doc2vec_model_opinion_corpus.d2v")
doc2vec_model = Doc2Vec.load("doc2vec_model_opinion_corpus.d2v")

tfidfl = TfidfVectorizer(tokenizer=lambda i: i, lowercase=False, ngram_range=(1, 1))
result_train1 = tfidfl.fit_transform(corpus)
tfidf2 = TfidfVectorizer(tokenizer=lambda i: i, lowercase=False, ngram_range=(1, 2))
result_train2 = tfidf2.fit_transform(corpus)
tfidf3 = TfidfVectorizer(tokenizer=lambda i: i, lowercase=False, ngram_range=(1, 3))
result_train3 = tfidf3.fit_transform(corpus)

# Truncated SVD with n_components set to 512 and n_iter set to 40 for reproducibility
svd = TruncatedSVD(n_components=512, n_iter=40, random_state=34)

# Applying SVD on train data transformed by tfidf1, tfidf2 and tfidf3
tfidf_datal = svd.fit_transform(result_train1)
tfidf_data2 = svd.fit_transform(result_train2)
tfidf_data3 = svd.fit_transform(result_train3)

# Load spaCy language model for English ('en')
nlp = spacy.load('en')

# Assuming temp_textData is a DataFrame containing the text data
temp_textData = pd.DataFrame(list(data['text']))

# Initialize empty lists to store POS tags, tokens, dependency tags
overall_pos_tags_tokens = []
overall_pos = []
overall_tokens = []
overall_dep = []

# Iterate over each text data point in temp_textData
for i in range(1600):
  doc = nlp(temp_textData[0][i])  # Assuming text data is in the first column (index 0)
  given_pos_tags_tokens = []
  given_pos = []
  given_tokens = []
  given_dep = []

  # Iterate over each token in the processed document
  for token in doc:
    output = "%s %s" % (token.pos_, token.tag_)  # Combine POS tag and tag info
    given_pos_tags_tokens.append(output)
    given_pos.append(token.pos_)
    given_tokens.append(token.tag_)
    given_dep.append(token.dep_)

  # Append the processed information for each data point
  overall_pos_tags_tokens.append(given_pos_tags_tokens)
  overall_pos.append(given_pos)
  overall_tokens.append(given_tokens)
  overall_dep.append(given_dep)

# Create count vectorizer for POS tags, tokens, and dependency tags
count = CountVectorizer(tokenizer=lambda i: i, lowercase=False)

# Convert POS tag tokens, POS tags, tokens, and dependency tags into sparse matrices
pos_tags_data = count.fit_transform(overall_pos_tags_tokens).todense()
pos_data = count.fit_transform(overall_pos).todense()
tokens_data = count.fit_transform(overall_tokens).todense()
dep_data = count.fit_transform(overall_dep).todense()

# MinMaxScaler for normalization
min_max_scaler = MinMaxScaler()

# Normalize POS tag tokens, POS tags, tokens, and dependency tags data
normalized_pos_tags_data = min_max_scaler.fit_transform(pos_tags_data)
normalized_pos_data = min_max_scaler.fit_transform(pos_data)
normalized_tokens_data = min_max_scaler.fit_transform(tokens_data)
normalized_dep_data = min_max_scaler.fit_transform(dep_data)

# Initialize final feature matrices for POS tags, tokens, etc. with zeros
final_pos_tags_data = np.zeros(shape=(1600, 512)).astype(np.float32)
final_pos_data = np.zeros(shape=(1600, 512)).astype(np.float32)
final_tokens_data = np.zeros(shape=(1600, 512)).astype(np.float32)
final_dep_data = np.zeros(shape=(1600, 512)).astype(np.float32)

# Fill the final feature matrices with normalized data
final_pos_tags_data[:normalized_pos_tags_data.shape[0], :normalized_pos_tags_data.shape[1]] = normalized_pos_tags_data
final_pos_data[:normalized_pos_data.shape[0], :normalized_pos_data.shape[1]] = normalized_pos_data
final_tokens_data[:normalized_tokens_data.shape[0], :normalized_tokens_data.shape[1]]=normalized_tokens_data
final_dep_data[:normalized_dep_data.shape[0],:normalized_dep_data.shape[1]]=normalized_dep_data

 maxlength = []
for i in range(0, len(sen)):
    maxlength.append(len(sen[i][0]))
print(max(maxlength))

def vectorize_comments(df, d2v_model):
    y = []
    comments = []
    for i in range(0, df.shape[0]):
        label = 'SENT_%s' % i
        comments.append(d2v_model.docvecs[label])
    df['vectorized_comments'] = comments
    return df

textData = vectorize_comments(textData, doc2vec_model)
print(textData.head(2))

X_train, X_test, y_train, y_test = train_test_split(
    textData["vectorized_comments"].T.tolist(),
    dummy_y,
    test_size=0.1,
    random_state=56
)

X = np.array(textData["vectorized_comments"].T.tolist()).reshape((1, 1600, 512))
y = np.array(dummy_y).reshape((1600, 4))

X_train2 = np.array(X_train).reshape((1, 1440, 512))
y_train2 = np.array(y_train).reshape((1, 1440, 4))

X_test2 = np.array(X_test).reshape((1, 160, 512))
y_test2 = np.array(y_test).reshape((1, 160, 4))

Xtemp = textData["vectorized_comments"].T.tolist()
ytemp = data['given_class']
training_indices = []
testing_indices = []

skf = StratifiedKFold(n_splits=10)
skf.get_n_splits(Xtemp, ytemp)

for train_index, test_index in skf.split(Xtemp, ytemp):
    count_i = 0  # Initialize counter for training data index
    for i in train_index:
        len1 = len(sen[i][0])  # Get the length of the current sentence
        average_vector1 = np.zeros(512).astype(np.float32)  # Initialize averaging vectors
        average_vector2 = np.zeros(512).astype(np.float32)
        average_vector3 = np.zeros(512).astype(np.float32)

        for j in range(max(maxlength) + 10):
            if j < len1:
                # Fill X_train3 with doc2vec vectors and update averaging vectors
                X_train3[count_i, j, :] = doc2vec_model[sen[i][0][j]]
                average_vector1 += result_train1[i, tfidfl.vocabulary_[sen[i][0][j]]] * doc2vec_model[sen[i][0][j]]
                average_vector2 += result_train2[i, tfidf2.vocabulary_[sen[i][0][j]]] * doc2vec_model[sen[i][0][j]]
                average_vector3 += result_train3[i, tfidf3.vocabulary_[sen[i][0][j]]] * doc2vec_model[sen[i][0][j]]
            elif j == len1:
                X_train3[count_i, j, :] = tfidf_data1[i]  # Fill with tfidf data
            elif j == len1 + 1:
                X_train3[count_i, j:, :] = tfidf_data2[i]  # Fill with tfidf data
            elif j == len1 + 2:
                X_train3[count_i, j, :] = tfidf_data3[i]  # Fill with tfidf data
            elif j == len1 + 3:
                X_train3[count_i, j, :] = average_vector1  # Fill with averaged doc2vec vector
            elif j == len1 + 4:
            X_test3[count_i, j, :] = average_vector2  # Fill with averaged doc2vec vector
            elif j == len1 + 5:
                X_test3[count_i, j, :] = average_vector3  # Fill with averaged doc2vec vector
            elif j == len1 + 6:
                X_test3[count_i, j:, :] = final_pos_tags_data[i]  # Fill with POS tag features
            elif j == len1 + 7:
                X_test3[count_i, j:, :] = final_pos_data[i]  # Fill with POS features
            elif j == len1 + 8:
                X_test3[count_i, j:, :] = final_tokens_data[i]  # Fill with token features
            elif j == len1 + 9:
                X_test3[count_i, j:, :] = final_dep_data[i]  # Fill with dependency features
            else:
                X_test3[count_i, j, :] = empty_word  # Fill with padding
            Y_test3[count_i, :] = dummy_y[i]  # Fill target labels
        count_i += 1

      return X_train3, X_test3, Y_train3, Y_test3
model = Sequential()
model.add(Conv1D(filters=128, kernel_size=9, padding='same', activation='relu',
                 input_shape=(max(maxlength)+10, 512)))  # Input layer with Conv1D
model.add(Dropout(0.25))  # Regularization with Dropout
model.add(MaxPooling1D(pool_size=2))  # Downsampling with MaxPooling1D
model.add(Dropout(0.25))  # Regularization
model.add(Conv1D(filters=128, kernel_size=7, padding='same', activation='relu'))  # Another Conv1D layer
model.add(Dropout(0.25))  # Regularization
model.add(MaxPooling1D(pool_size=2))  # Downsampling
model.add(Dropout(0.25))  # Regularization
model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))  # Another Conv1D layer
model.add(Dropout(0.25))  # Regularization
model.add(Bidirectional(LSTM(50, dropout=0.25, recurrent_dropout=0.2)))  # Bidirectional LSTM layer
model.add(Dense(4, activation='softmax'))  # Output layer with 4 classes

# Compile the model with optimizer and loss function
model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])

# Print model summary
print(model.summary())

# Initialize a list to store final accuracies
final_accuracies = []

# Perform 10-fold cross-validation
for i in range(10):
    filename = 'weights.best.from_scratch%s.hdf5' % i  # Filename for best weights
    checkpointer = ModelCheckpoint(filepath=filename, save_best_only=True, verbose=1)  # Checkpoint callback

    # Get training and testing data for the current fold
    X_train3, X_test3, Y_train3, Y_test3 = extractTrainingAndTestingData(i)

    # Train the model
    model.fit(X_train3,
              Y_train3,
              epochs=10,
              callbacks=[checkpointer],
              validation_data=(X_test3, Y_test3),
              batch_size=512)

    # Load the best weights for this fold
    model.load_weights(filename)

    # Make predictions and calculate accuracy
    predicted = np.rint(model.predict(X_test3))
    accuracy = accuracy_score(Y_test3, predicted)
    print("Accuracy for fold", i, ":", accuracy)
    final_accuracies.append(accuracy)

# Print the average accuracy across folds
print("Average accuracy:", sum(final_accuracies) / len(final_accuracies))
Y_test


SyntaxError: unterminated string literal (detected at line 116) (<ipython-input-1-a46f4a765478>, line 116)