In [2]:
import pandas as pd
import string
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import spacy
import warnings
warnings.filterwarnings('ignore')

In [3]:
opinion_spam=pd.read_csv("deceptive-opinion.csv")
opinion_spam.head()

Unnamed: 0,deceptive,hotel,polarity,source,text
0,truthful,conrad,positive,TripAdvisor,We stayed for a one night getaway with family ...
1,truthful,hyatt,positive,TripAdvisor,Triple A rate with upgrade to view room was le...
2,truthful,hyatt,positive,TripAdvisor,This comes a little late as I'm finally catchi...
3,truthful,omni,positive,TripAdvisor,The Omni Chicago really delivers on all fronts...
4,truthful,hyatt,positive,TripAdvisor,I asked for a high floor away from the elevato...


In [4]:
df=opinion_spam.loc[opinion_spam['polarity']=="positive"]
df.head()

Unnamed: 0,deceptive,hotel,polarity,source,text
0,truthful,conrad,positive,TripAdvisor,We stayed for a one night getaway with family ...
1,truthful,hyatt,positive,TripAdvisor,Triple A rate with upgrade to view room was le...
2,truthful,hyatt,positive,TripAdvisor,This comes a little late as I'm finally catchi...
3,truthful,omni,positive,TripAdvisor,The Omni Chicago really delivers on all fronts...
4,truthful,hyatt,positive,TripAdvisor,I asked for a high floor away from the elevato...


In [5]:
df['deceptive'] = df['deceptive'].map({'truthful': 1, 'deceptive': 0})
df.head()

Unnamed: 0,deceptive,hotel,polarity,source,text
0,1,conrad,positive,TripAdvisor,We stayed for a one night getaway with family ...
1,1,hyatt,positive,TripAdvisor,Triple A rate with upgrade to view room was le...
2,1,hyatt,positive,TripAdvisor,This comes a little late as I'm finally catchi...
3,1,omni,positive,TripAdvisor,The Omni Chicago really delivers on all fronts...
4,1,hyatt,positive,TripAdvisor,I asked for a high floor away from the elevato...


In [6]:
#text_process function for pos tagging using spacy
sp=spacy.load('en')

def text_process(mess):
    
    pos_tagged = []
 
    data=sp(mess)
    
    for word in data:
        pos_tagged.append((word.text,  word.pos_))
    
    return pos_tagged

In [7]:
#creating a bow transformer with pos tagging analyzer
from sklearn.feature_extraction.text import CountVectorizer


bow_transformer = CountVectorizer(analyzer=text_process).fit(df['text'])

print(len(bow_transformer.vocabulary_))

7466


In [8]:
print(bow_transformer.get_feature_names())

[('\n', 'SPACE'), (' ', 'SPACE'), ('!', 'PUNCT'), ('#', 'SYM'), ('$', 'SYM'), ('$48', 'NUM'), ('%', 'NOUN'), ('&', 'CCONJ'), ("'", 'NOUN'), ("'", 'PART'), ("'", 'PUNCT'), ("'d", 'VERB'), ("'ll", 'VERB'), ("'m", 'AUX'), ("'re", 'AUX'), ("'s", 'AUX'), ("'s", 'NOUN'), ("'s", 'PART'), ("'s", 'PRON'), ("'s", 'VERB'), ("'ve", 'AUX'), ('(', 'PUNCT'), (')', 'PUNCT'), ('*', 'PUNCT'), ('+', 'CCONJ'), ('+', 'SYM'), (',', 'PUNCT'), ('-', 'ADJ'), ('-', 'NOUN'), ('-', 'PROPN'), ('-', 'PUNCT'), ('-', 'SYM'), ('-', 'VERB'), ('--', 'PUNCT'), ('-20', 'PROPN'), ('-Atlanta', 'PROPN'), ('-Bobby', 'PROPN'), ('-Clean', 'PROPN'), ('-Helpful', 'X'), ('-Review', 'PROPN'), ('-Well', 'VERB'), ('.', 'PROPN'), ('.', 'PUNCT'), ('..', 'PUNCT'), ('...', 'PUNCT'), ('....', 'PUNCT'), ('.....', 'PUNCT'), ('.An', 'PUNCT'), ('.but', 'PUNCT'), ('/', 'PUNCT'), ('/', 'SYM'), ('/sight', 'NOUN'), ('06/04/05', 'PROPN'), ('08', 'NUM'), ('1', 'NUM'), ('1', 'X'), ('1&1/2', 'NUM'), ('1.5', 'NUM'), ('1.99', 'NUM'), ('1/2', 'NUM'), ('

In [9]:
print(len(bow_transformer.vocabulary_))

7466


In [10]:
print(bow_transformer)

CountVectorizer(analyzer=<function text_process at 0x12a4bc158>, binary=False,
                decode_error='strict', dtype=<class 'numpy.int64'>,
                encoding='utf-8', input='content', lowercase=True, max_df=1.0,
                max_features=None, min_df=1, ngram_range=(1, 1),
                preprocessor=None, stop_words=None, strip_accents=None,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None,
                vocabulary=None)


In [11]:
#transforming the text in df to vectors after pos tagging using count vectorizer
message_bow=bow_transformer.transform(df['text'])

In [12]:
message_bow.shape

(800, 7466)

In [13]:
#non nulls
message_bow.nnz

71181

In [14]:
message_bow

<800x7466 sparse matrix of type '<class 'numpy.int64'>'
	with 71181 stored elements in Compressed Sparse Row format>

In [15]:
from sklearn.model_selection import train_test_split
text_train,text_test,label_train,label_test= train_test_split(df['text'],df['deceptive'],test_size=0.3)


In [18]:
#data pipeline is used to store pipeline of workflow
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier ,export_text
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score


#creating a pipeline for svm and doing predictions
pipeline_svm=Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('classifier',SVC())
])
pipeline_svm.fit(text_train,label_train)
predictions_svm=pipeline_svm.predict(text_test)
score_svm=cross_val_score(SVC(),message_bow,df['deceptive'],cv=5)

print("Classification report with POS Tagging for 800 reviews using SVM Classifier")
print(" ")
print("svm",classification_report(label_test,predictions_svm))
print("5 Fold Cross Validation Score:",score_svm)
print("")

print("------------------------------------------------------------------------------")


#creating a pipeline for text processing and fitting our data in the pipeline NB
pipeline=Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('classifier',MultinomialNB())
])

pipeline.fit(text_train,label_train)

#prediction
predictions=pipeline.predict(text_test)
score_NB=cross_val_score(MultinomialNB(),message_bow,df['deceptive'],cv=5)
print("Classification report with POS Tagging for 800 reviews using MultinomialNB Classifier")
print(" ")
print(classification_report(label_test,predictions))
print("5 Fold Cross Validation Score:",score_NB)
print("")

print("------------------------------------------------------------------------------")

#creating a pipeline for Logistic Regression and doing predictions
pipeline_LR=Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('classifier',LogisticRegression(max_iter=200))
])
pipeline_LR.fit(text_train,label_train)
predictions_LR=pipeline_LR.predict(text_test)
score_LR=cross_val_score(LogisticRegression(max_iter=200),message_bow,df['deceptive'],cv=5)
print("Classification report with POS Tagging for 800 reviews using Logistic Regression Classifier")
print(" ")
print(classification_report(label_test,predictions_LR))
print("5 Fold Cross Validation Score:",score_LR)
print("")

print("------------------------------------------------------------------------------")

#creating a pipeline for DecisionTreeClassifier and doing predictions
pipeline_DT=Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('classifier',DecisionTreeClassifier())
])
pipeline_DT.fit(text_train,label_train)
predictions_DT=pipeline_DT.predict(text_test)
score_DT=cross_val_score(DecisionTreeClassifier(),message_bow,df['deceptive'],cv=5)
print("Classification report with POS Tagging for 800 reviews using Decision Tree Classifier")
print(" ")
print(classification_report(label_test,predictions_DT))
print("5 Fold Cross Validation Score:",score_DT)
print("")

print("------------------------------------------------------------------------------")

#creating a pipeline for RandomForestClassifier and doing predictions
pipeline_RF=Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('classifier',RandomForestClassifier())
])
pipeline_RF.fit(text_train,label_train)
predictions_RF=pipeline_RF.predict(text_test)
score_RF=cross_val_score(RandomForestClassifier(),message_bow,df['deceptive'],cv=5)
print("Classification report with POS Tagging for 800 reviews using Random Forest Classifier")
print(" ")
print(classification_report(label_test,predictions_RF))
print("5 Fold Cross Validation Score:",score_RF)
print("")

print("------------------------------------------------------------------------------")


Classification report with POS Tagging for 800 reviews using SVM Classifier
 
svm               precision    recall  f1-score   support

           0       0.50      0.93      0.65       104
           1       0.85      0.29      0.44       136

    accuracy                           0.57       240
   macro avg       0.68      0.61      0.55       240
weighted avg       0.70      0.57      0.53       240

5 Fold Cross Validation Score: [0.71875 0.7     0.7375  0.73125 0.625  ]

------------------------------------------------------------------------------
Classification report with POS Tagging for 800 reviews using MultinomialNB Classifier
 
              precision    recall  f1-score   support

           0       0.89      0.94      0.92       104
           1       0.95      0.91      0.93       136

    accuracy                           0.93       240
   macro avg       0.92      0.93      0.92       240
weighted avg       0.93      0.93      0.93       240

5 Fold Cross Validation