# Implementing the LPU Model with LDA and Synonyms

In [1]:
# importing the libraries used

import pandas as pd
import re
import string
import sklearn
import csv

# 1 . Data Set Formation And Preprocessing

In [2]:

# reading comments and synopsis files
dataset_file="dataset_shawshank.csv"
synopsis_file='synopsis_shawshank.txt'
df = pd.read_csv(dataset_file)
file = open(synopsis_file,"r")
text=file.read()
sentences = text.split('. ')
df2 = pd.DataFrame(sentences, columns=["review_text"])
df2.to_csv('synopsis3.csv', index=False)

#separating the spoiler and non-spoiler comments into two data frames
df_pos=df.loc[df['is_spoiler'] == True]
df_neg=df.loc[df['is_spoiler'] == False]
pos_size=min(1000,df_pos.shape[0])
neg_size=min(500,df_neg.shape[0])

# selecting half of spoiler comments from total spoiler comments as positive dataset
df_pos1=(df_pos.loc[:0.5*pos_size,:])
df_pos1.drop(["is_spoiler"],axis=1,inplace=True)
train_size=0.5*(pos_size)
test_size=0.5*(pos_size+neg_size)
df_test_pos=df_pos.loc[0.5*pos_size+1:,:]
df_test_neg=df_neg.loc[:,:]

# training data includes synopsis and half of spoiler comments
# test data includes half of spoiler comments and all the non-spoiler comments
frames_train=[df_pos1,df2]
frames_test=[df_test_pos,df_test_neg]
df_train=pd.concat(frames_train)
df_test=pd.concat(frames_test)
df_train.to_csv('train.csv', index=False)
df_test.to_csv('test.csv', index=False)

(479, 6)
956 500
(479, 5)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [3]:
#giving index to the comments.

index=[]
for i in range(len(df_train.index)):
    index.append("Positive "+str(i))
df_train['index']=index
df_train.set_index('index', inplace=True)

index2=[]
for i in range(len(df_test.index)):
    index2.append("Comment "+str(i))
df_test['index']=index2
df_test.set_index('index', inplace=True)


In [4]:
# cleaning the input comments i.e. removing punctuation marks and digits

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text
round1 = lambda x: clean_text_round1(x)

In [5]:
# Cleaned output data

data_clean = pd.DataFrame(df_train.review_text.apply(round1))
data_clean2 = pd.DataFrame(df_test.review_text.apply(round1))


In [6]:

# Used CountVectorizer to get count of all the words present in the comment 
# It is used for preparing feature vector and training and test set

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.review_text)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = data_clean.index


cv2 = CountVectorizer(stop_words='english')
data_cv2 = cv2.fit_transform(data_clean2.review_text)
data_dtm2 = pd.DataFrame(data_cv2.toarray(), columns=cv2.get_feature_names())
data_dtm2.index = data_clean2.index

In [7]:
# Imporing libraries

import numpy as np
from gensim import matutils, models
import scipy.sparse


In [8]:
#corpus formation for lda input.
sparse_counts = scipy.sparse.csr_matrix(data_dtm.transpose())
corpus = matutils.Sparse2Corpus(sparse_counts)

# word to identifier(a number) dictionary
id_word_dict={}
for i in range(len(data_dtm.columns)):
    id_word_dict[i]=data_dtm.columns[i]

In [9]:
#  applied LDA on corpus. 
total_topics=50
lda = models.LdaMulticore(corpus=corpus, id2word=id_word_dict, num_topics=total_topics, passes=50)

In [10]:

# word to index dictionary
# Indexed all words in data_dtm and data_dtm2 matrix
word_dict={}
word_dict2={}
j=0
for i in data_dtm.columns:
    word_dict[i]=j
    j+=1
j=0
for i in data_dtm2.columns:
    word_dict2[i]=j
    j+=1

In [11]:
# Scraped words from lda topic distribution output
# Stored in dictionary of form word-index
word_weight_positive={}
cnt=0
for i in range(total_topics):
    word_topic=lda.print_topic(i).split('+')
    j=0
    for word_probab in word_topic:
        val,wor=word_probab.split('*')        
        wor=str(wor)
        #print(wor)
        y=len(wor)
        if j==len(word_topic)-1:
            wor = wor[1:y-1]
        else:
            wor = wor[1:y-2]
        #print(wor)
        if wor not in word_weight_positive:
            word_weight_positive[wor]=cnt
            cnt+=1
            
        
        j=j+1

In [12]:
# Imported some libraries for similarity checking.

import nltk
from nltk.corpus import wordnet
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to /home/rushi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#  2. Preparation of Training and Test  Data Set

In [13]:
# Initializing training and test set as all zeros 
# Here number of features for each training sample is
# number of words LDA gave

train_size = len(df_train.index)
test_size = len(df_test.index)
no_of_words = len(word_weight_positive)
X_train = np.zeros((train_size,no_of_words))
X_test = np.zeros((test_size,no_of_words))
Y_test = np.zeros(test_size)
print(len(word_weight_positive))


198


In [None]:
# For each comment (in training set) we were iterating over words of each comment.
# If word was not found in lda-output words , we were searching weather  similar
# word  from set given by 'Wordnet' is present in lda-output or not.
# If we found similar word is present then we are updating weight of that feature
# with weight of current word.


#Finaly we have created matrix input for training one class svm.
for i in range(train_size):
    s = set(data_clean.iloc[i][0].split(' '))
    for wor in s:
        if wor not in word_weight_positive:
            flag=0
            for syn in wordnet.synsets(wor): 
                for l in syn.lemmas(): 
                    temp_word = l.name()
                    if temp_word in word_weight_positive:
                        flag=1
                        break
                if flag==1:
                    break
            if flag==1:
                index = word_weight_positive[temp_word]
                ind2 = word_dict[temp_word]
                X_train[i][index]+=data_dtm.iloc[i][ind2]
        else :
            index = word_weight_positive[wor]
            ind2 = word_dict[wor]
            X_train[i][index]+=data_dtm.iloc[i][ind2]

In [None]:
# IN the same way as we have created testing matrix
for i in range(test_size):
    s = set(data_clean2.iloc[i][0].split(' '))
    for wor in s:
        if wor in word_dict2:
            if wor not in word_weight_positive:
                flag=0
                for syn in wordnet.synsets(wor): 
                    for l in syn.lemmas(): 
                        temp_word = l.name()
                        if temp_word in word_weight_positive:
                            flag=1
                            break
                    if flag==1:
                        break
                if flag==1:
                    index = word_weight_positive[temp_word]
                    ind2 = word_dict2[wor]
                    X_test[i][index]+=data_dtm2.iloc[i][ind2]
            else :
                index = word_weight_positive[wor]
                ind2 = word_dict2[wor]
                X_test[i][index]+=data_dtm2.iloc[i][ind2]

In [None]:
# Setting the labels of test data

for i in range(test_size):
    Y_test[i]=df_test.iloc[i]["is_spoiler"]

# 3. One Class SVM Model

In [None]:
#  Applied One class SVM model.
# you can change gamma attribute with polynomial and give degree as input.
from sklearn.svm import OneClassSVM
lpu_model = OneClassSVM(gamma='auto').fit(X_train)
Y_out = lpu_model.predict(X_test)

# 4.  Reslult Calculations

In [None]:
# result calculations.

total=len(Y_out)
false_positive=0
false_negative=0
true_positive=0
true_negative=0

for i in range(len(Y_out)):
    if Y_out[i]==-1 and Y_test[i]==1:
        false_negative+=1
    elif Y_out[i]==1 and Y_test[i]==0:
        false_positive+=1
    elif Y_out[i]==1 and Y_test[i]==1:
        true_positive+=1
    else:
        true_negative+=1

In [None]:
# result declaration

print(true_positive,true_negative)
print(false_negative,false_positive)
print((true_negative+true_positive)/total)

precision=true_positive/(true_positive+false_positive)
recall=true_positive/(true_positive+false_negative)
f1score=(2*precision*recall)/(precision+recall)
print(f1score)