In [1]:
import pandas as pd
import numpy as np
import os
import nltk
from nltk import word_tokenize
from nltk.stem import PorterStemmer

from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.svm import SVC

# Download the 'punkt' tokenizer data used by nltk's word_tokenize (reported as already up-to-date when present).
nltk.download('punkt')
# Marker line showing that the setup cell finished running.
print("Hello!!!")

Hello!!!


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sumit_hzrt\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Root directory of the positive-polarity review corpus.
rootpath = 'data/positive_polarity/'
# Path prefixes for the numbered fold directories; the fold number is appended later.
# Index 0 -> deceptive (MTurk) reviews, index 1 -> truthful (TripAdvisor) reviews.
basepaths = [
    rootpath + source + '/fold'
    for source in ('deceptive_from_MTurk', 'truthful_from_TripAdvisor')
]
basepaths

['data/positive_polarity/deceptive_from_MTurk/fold',
 'data/positive_polarity/truthful_from_TripAdvisor/fold']

In [3]:
# Number of folds: fold directories 1..k are read from disk and k train/test splits are evaluated.
k = 5

In [4]:
def create_data_list(basepath, num_folds=None):
    """Read the review files of every fold directory below ``basepath``.

    Fold ``i`` (1-based) lives in the directory ``basepath + str(i)``. Only the
    first line of each file is read.

    Parameters
    ----------
    basepath : str
        Path prefix of the fold directories, e.g. ``'.../fold'``.
    num_folds : int, optional
        Number of fold directories to read. Defaults to the module-level ``k``.

    Returns
    -------
    list of list of str
        One inner list per fold, holding the first line of every file in that
        fold, ordered by file name.
    """
    if num_folds is None:
        num_folds = k  # fall back to the notebook-wide fold count

    k_fold_data = []
    for fold in range(1, num_folds + 1):
        path = basepath + str(fold)
        with os.scandir(path) as entries:
            # scandir yields entries in arbitrary order; sort by name so the
            # review order (and everything derived from it) is reproducible.
            # Sub-directories are skipped because they cannot be opened as text.
            review_files = sorted(
                (entry for entry in entries if entry.is_file()),
                key=lambda entry: entry.name,
            )
        data = []
        for entry in review_files:
            with open(entry.path, encoding="utf8") as file:
                data.append(file.readline())
        k_fold_data.append(data)
    return k_fold_data

# Each k_fold_data_* is a list of folds; every fold is a list of review texts.
deceptive_path, truthful_path = basepaths[0], basepaths[1]
k_fold_data_deceptive = create_data_list(deceptive_path)
k_fold_data_true = create_data_list(truthful_path)

# Sanity check: number of reviews in the first fold of each class.
for first_fold in (k_fold_data_deceptive[0], k_fold_data_true[0]):
    print(len(first_fold))

80
80


In [5]:
# Vocabulary size (unique unigrams + bigrams) reported by the next cell: 46753

In [6]:
def build_bigram_plus_dict(k_fold_data, bigram_plus_dict):
    """Extend a vocabulary list with every unseen unigram and bigram.

    ``k_fold_data`` is a list of folds, each a list of review texts.
    ``bigram_plus_dict`` is extended in place (and also returned): unigrams are
    stored as strings, bigrams as 2-tuples of tokens. Tokens are used exactly
    as produced by ``word_tokenize`` (no lower-casing or stemming). Terms keep
    first-seen order; for each review its unigrams are registered before its
    bigrams.
    """
    known_terms = set(bigram_plus_dict)

    def register(term):
        # Append the term only the first time it is encountered.
        if term in known_terms:
            return
        known_terms.add(term)
        bigram_plus_dict.append(term)

    for fold in k_fold_data:
        for review in fold:
            words = word_tokenize(review)
            for word in words:
                register(word)
            for pair in nltk.bigrams(words):
                register(pair)

    return bigram_plus_dict

# Build the vocabulary from the truthful reviews first, then the deceptive ones.
bigram_plus_dict = []
for corpus in (k_fold_data_true, k_fold_data_deceptive):
    bigram_plus_dict = build_bigram_plus_dict(corpus, bigram_plus_dict)

# Reverse index: term -> position in bigram_plus_dict (its feature column).
bigram_plus_reverse_dict = {
    term: position for position, term in enumerate(bigram_plus_dict)
}
print(len(bigram_plus_dict))
print(len(bigram_plus_reverse_dict))

46753
46753


In [10]:
def create_document_vector(data, bigram_plus_reverse_dict):
    """Turn review texts into L2-normalised unigram+bigram count vectors.

    Parameters
    ----------
    data : sequence of str
        Review texts.
    bigram_plus_reverse_dict : dict
        Maps each vocabulary term (unigram string or bigram tuple) to its
        column index.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(data), len(bigram_plus_reverse_dict))`` and dtype
        float64; row ``i`` holds the term counts of ``data[i]`` scaled with
        ``preprocessing.normalize(..., norm='l2')``.

    Notes
    -----
    Terms that are not in the vocabulary are ignored instead of raising
    ``KeyError``, so texts unseen at vocabulary-building time can be vectorised.
    """
    data = list(data)
    # Allocate the whole matrix once: growing it with np.append copies every
    # previously stored row on each iteration.
    reviews = np.zeros((len(data), len(bigram_plus_reverse_dict)), dtype=np.float64)
    if not data:
        # Nothing to count or normalise; keep the (0, n_terms) result shape.
        return reviews

    for row, text in enumerate(data):
        tokens = word_tokenize(text)
        terms = list(tokens) + list(nltk.bigrams(tokens))
        for term in terms:
            column = bigram_plus_reverse_dict.get(term)
            if column is not None:
                reviews[row, column] += 1

    # Row-wise L2 scaling, applied to all rows in a single call.
    return preprocessing.normalize(reviews, norm='l2')

# document_vector = create_document_vector(k_fold_data_true[0], bigram_plus_reverse_dict)
# np.sqrt(np.sum(document_vector**2, axis=1))   # --> check that every normalized row has an L2 norm of one

In [11]:
def _vectorize_fold(true_reviews, deceptive_reviews, bigram_plus_reverse_dict):
    """Return (features, labels) for one fold: truthful rows (label 0) first, then deceptive rows (label 1)."""
    features = np.concatenate(
        (
            create_document_vector(true_reviews, bigram_plus_reverse_dict),
            create_document_vector(deceptive_reviews, bigram_plus_reverse_dict),
        ),
        axis=0,
    )
    labels = np.concatenate(
        (np.zeros(len(true_reviews)), np.ones(len(deceptive_reviews)))
    )
    return features, labels


def svm(k_fold_data_true, k_fold_data_deceptive, bigram_plus_reverse_dict):
    """Evaluate a linear SVM with each fold used once as the held-out test set.

    For every fold ``i`` the remaining folds form the training set. ``C`` is
    chosen by ``GridSearchCV`` (5-fold, on the training set only), and the best
    estimator is then scored on the training set and on fold ``i``. Results are
    printed; nothing is returned.

    Parameters
    ----------
    k_fold_data_true : list of list of str
        Truthful reviews per fold (label 0).
    k_fold_data_deceptive : list of list of str
        Deceptive reviews per fold (label 1). Must have the same number of
        folds as ``k_fold_data_true``.
    bigram_plus_reverse_dict : dict
        Term -> feature column mapping passed to ``create_document_vector``.

    Raises
    ------
    ValueError
        If the two fold lists differ in length, or if there is exactly one
        fold (no training data would remain).
    """
    # The fold count is taken from the data rather than from a global.
    num_folds = len(k_fold_data_true)
    if len(k_fold_data_deceptive) != num_folds:
        raise ValueError("truthful and deceptive data must have the same number of folds")
    if num_folds == 1:
        raise ValueError("at least two folds are required to build a training set")

    params_grid = [{'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

    # Vectorise every fold exactly once and reuse the matrices for each split;
    # doing it inside the loop would tokenise each fold once per split.
    folds = [
        _vectorize_fold(true_reviews, deceptive_reviews, bigram_plus_reverse_dict)
        for true_reviews, deceptive_reviews in zip(k_fold_data_true, k_fold_data_deceptive)
    ]

    # 0 -> true, 1 -> deceptive
    for i in range(num_folds):
        print('Fold', str(i+1), ' as Test Dataset')

        test_x, test_y = folds[i]
        # Training set: all other folds, stacked in fold order.
        train_x = np.concatenate(
            [features for j, (features, _) in enumerate(folds) if j != i], axis=0
        )
        train_y = np.concatenate(
            [labels for j, (_, labels) in enumerate(folds) if j != i]
        )

        print('\t' + 'Shape-> Train_x: ', train_x.shape, ' Train_y: ', train_y.shape, ' Test_x: ', test_x.shape, ' Test_y: ', test_y.shape)
        svm_model = GridSearchCV(SVC(), params_grid, cv=5)
        svm_model.fit(train_x, train_y)

        # best_score_ is the mean cross-validated score of the best parameter setting
        print('\t Best score for training data:', svm_model.best_score_,"\n") 

        # View the best parameters for the model found using grid search
        print('\t Best C:',svm_model.best_estimator_.C,"\n") 

        best_model = svm_model.best_estimator_

        test_y_predicted = best_model.predict(test_x)
        print(classification_report(test_y, test_y_predicted))
        print('\t' + '\033[1m' + "Training set score for SVM: %f" % best_model.score(train_x , train_y))
        print('\t' + '\033[1m' + "Test  set score for SVM: %f" % best_model.score(test_x , test_y ))
        
        


In [12]:
# Run the fold-by-fold SVM evaluation; a report is printed for every held-out fold.
svm(k_fold_data_true, k_fold_data_deceptive, bigram_plus_reverse_dict)

Fold 1  as Test Dataset
	Shape-> Train_x:  (640, 46753)  Train_y:  (640,)  Test_x:  (160, 46753)  Test_y:  (160,)
	 Best score for training data: 0.88125 

	 Best C: 10 

	              precision    recall  f1-score   support

         0.0       0.89      0.88      0.88        80
         1.0       0.88      0.89      0.88        80

    accuracy                           0.88       160
   macro avg       0.88      0.88      0.88       160
weighted avg       0.88      0.88      0.88       160

	[1mTraining set score for SVM: 1.000000
	[1mTest  set score for SVM: 0.881250
Fold 2  as Test Dataset
	Shape-> Train_x:  (640, 46753)  Train_y:  (640,)  Test_x:  (160, 46753)  Test_y:  (160,)
	 Best score for training data: 0.8640625 

	 Best C: 10 

	              precision    recall  f1-score   support

         0.0       0.90      0.95      0.93        80
         1.0       0.95      0.90      0.92        80

    accuracy                           0.93       160
   macro avg       0.93     