In [1]:
import pandas as pd
import string
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import spacy

In [2]:
df=pd.read_csv("deceptive-opinion.csv")
df.head()

Unnamed: 0,deceptive,hotel,polarity,source,text
0,truthful,conrad,positive,TripAdvisor,We stayed for a one night getaway with family ...
1,truthful,hyatt,positive,TripAdvisor,Triple A rate with upgrade to view room was le...
2,truthful,hyatt,positive,TripAdvisor,This comes a little late as I'm finally catchi...
3,truthful,omni,positive,TripAdvisor,The Omni Chicago really delivers on all fronts...
4,truthful,hyatt,positive,TripAdvisor,I asked for a high floor away from the elevato...


In [3]:
df['deceptive'] = df['deceptive'].map({'truthful': 1, 'deceptive': 0})
df.head()

Unnamed: 0,deceptive,hotel,polarity,source,text
0,1,conrad,positive,TripAdvisor,We stayed for a one night getaway with family ...
1,1,hyatt,positive,TripAdvisor,Triple A rate with upgrade to view room was le...
2,1,hyatt,positive,TripAdvisor,This comes a little late as I'm finally catchi...
3,1,omni,positive,TripAdvisor,The Omni Chicago really delivers on all fronts...
4,1,hyatt,positive,TripAdvisor,I asked for a high floor away from the elevato...


In [4]:
#text_process function for pos tagging using spacy
sp=spacy.load('en')

def text_process(mess):
    
    pos_tagged = []
 
    data=sp(mess)
    
    for word in data:
        pos_tagged.append((word.text,  word.pos_))
    
    return pos_tagged

In [5]:
#creating a bow transformer with token analyzer
from sklearn.feature_extraction.text import CountVectorizer


bow_transformer = CountVectorizer(analyzer=text_process).fit(df['text'])

print(len(bow_transformer.vocabulary_))

13240


In [6]:
print(bow_transformer.get_feature_names())



In [7]:
#print(len(bow_transformer.vocabulary_))
#print(bow_transformer)
#message_bow.shape
#non nulls
#message_bow.nnz
#message_bow

In [8]:
#transforming the text in df to vectors after tokenization using count vectorizer
message_bow=bow_transformer.transform(df['text'])

In [9]:
from sklearn.model_selection import train_test_split
text_train,text_test,label_train,label_test= train_test_split(df['text'],df['deceptive'],test_size=0.3)


In [11]:
#data pipeline is used to store pipeline of workflow
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier ,export_text
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score


#creating a pipeline for svm and doing predictions
pipeline_svm=Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('classifier',SVC())
])
pipeline_svm.fit(text_train,label_train)
predictions_svm=pipeline_svm.predict(text_test)
score_svm=cross_val_score(SVC(),message_bow,df['deceptive'],cv=5)

print("Classification report with POS Tagging for 1600 reviews using SVM Classifier")
print(" ")
print("svm",classification_report(label_test,predictions_svm))
print("5 Fold Cross Validation Score:",score_svm)
print("")

print("------------------------------------------------------------------------------")


#creating a pipeline for text processing and fitting our data in the pipeline NB
pipeline=Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('classifier',MultinomialNB())
])

pipeline.fit(text_train,label_train)

#prediction
predictions=pipeline.predict(text_test)
score_NB=cross_val_score(MultinomialNB(),message_bow,df['deceptive'],cv=5)
print("Classification report with POS Tagging for 1600 reviews using MultinomialNB Classifier")
print(" ")
print(classification_report(label_test,predictions))
print("5 Fold Cross Validation Score:",score_NB)
print("")

print("------------------------------------------------------------------------------")

#creating a pipeline for Logistic Regression and doing predictions
pipeline_LR=Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('classifier',LogisticRegression(max_iter=200))
])
pipeline_LR.fit(text_train,label_train)
predictions_LR=pipeline_LR.predict(text_test)
score_LR=cross_val_score(LogisticRegression(max_iter=200),message_bow,df['deceptive'],cv=5)
print("Classification report with POS Tagging for 1600 reviews using Logistic Regression Classifier")
print(" ")
print(classification_report(label_test,predictions_LR))
print("5 Fold Cross Validation Score:",score_LR)
print("")

print("------------------------------------------------------------------------------")

#creating a pipeline for DecisionTreeClassifier and doing predictions
pipeline_DT=Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('classifier',DecisionTreeClassifier())
])
pipeline_DT.fit(text_train,label_train)
predictions_DT=pipeline_DT.predict(text_test)
score_DT=cross_val_score(DecisionTreeClassifier(),message_bow,df['deceptive'],cv=5)
print("Classification report with POS Tagging for 1600 reviews using Decision Tree Classifier")
print(" ")
print(classification_report(label_test,predictions_DT))
print("5 Fold Cross Validation Score:",score_DT)
print("")

print("------------------------------------------------------------------------------")

#creating a pipeline for RandomForestClassifier and doing predictions
pipeline_RF=Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('classifier',RandomForestClassifier())
])
pipeline_RF.fit(text_train,label_train)
predictions_RF=pipeline_RF.predict(text_test)
score_RF=cross_val_score(RandomForestClassifier(),message_bow,df['deceptive'],cv=5)
print("Classification report with POS Tagging for 1600 reviews using Random Forest Classifier")
print(" ")
print(classification_report(label_test,predictions_RF))
print("5 Fold Cross Validation Score:",score_RF)
print("")

print("------------------------------------------------------------------------------")




Classification report with POS Tagging for 1600 reviews using SVM Classifier
 
svm               precision    recall  f1-score   support

           0       0.92      0.41      0.57       260
           1       0.58      0.96      0.72       220

    accuracy                           0.66       480
   macro avg       0.75      0.69      0.65       480
weighted avg       0.77      0.66      0.64       480

5 Fold Cross Validation Score: [0.703125 0.66875  0.7      0.70625  0.703125]

------------------------------------------------------------------------------
Classification report with POS Tagging for 1600 reviews using MultinomialNB Classifier
 
              precision    recall  f1-score   support

           0       0.93      0.91      0.92       260
           1       0.89      0.92      0.91       220

    accuracy                           0.91       480
   macro avg       0.91      0.92      0.91       480
weighted avg       0.92      0.91      0.91       480

5 Fold Cross Val



Classification report with POS Tagging for 1600 reviews using Logistic Regression Classifier
 
              precision    recall  f1-score   support

           0       0.90      0.82      0.86       260
           1       0.81      0.89      0.85       220

    accuracy                           0.85       480
   macro avg       0.85      0.86      0.85       480
weighted avg       0.86      0.85      0.85       480

5 Fold Cross Validation Score: [0.88125  0.878125 0.85625  0.828125 0.83125 ]

------------------------------------------------------------------------------
Classification report with POS Tagging for 1600 reviews using Decision Tree Classifier
 
              precision    recall  f1-score   support

           0       0.73      0.68      0.70       260
           1       0.65      0.70      0.67       220

    accuracy                           0.69       480
   macro avg       0.69      0.69      0.69       480
weighted avg       0.69      0.69      0.69       480

5 Fo



Classification report with POS Tagging for 1600 reviews using Random Forest Classifier
 
              precision    recall  f1-score   support

           0       0.70      0.77      0.74       260
           1       0.70      0.61      0.65       220

    accuracy                           0.70       480
   macro avg       0.70      0.69      0.69       480
weighted avg       0.70      0.70      0.70       480

5 Fold Cross Validation Score: [0.74375 0.78125 0.775   0.68125 0.7125 ]

------------------------------------------------------------------------------


