# Import Libraries

In [1]:
import pandas as pd
import numpy as np
# Prevent future/deprecation warnings from showing in output
import warnings
warnings.filterwarnings(action='ignore')

import sklearn
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

# Location of the tab-separated deception dataset on the local machine.
DATA_PATH = 'C:\\Users\\User\\Documents\\deception_data_converted_final.tsv'

# Read the file into a DataFrame and preview the first ten rows.
df = pd.read_csv(DATA_PATH, sep='\t')
df.head(10)

Unnamed: 0,lie,sentiment,review
0,f,n,"'Mike\'s Pizza High Point, NY Service was very..."
1,f,n,'i really like this buffet restaurant in Marsh...
2,f,n,"'After I went shopping with some of my friend,..."
3,f,n,'Olive Oil Garden was very disappointing. I ex...
4,f,n,'The Seven Heaven restaurant was never known f...
5,f,n,'I went to XYZ restaurant and had a terrible e...
6,f,n,'I went to ABC restaurant two days ago and I h...
7,f,n,'I went to the Chilis on Erie Blvd and had the...
8,f,n,'OMG. This restaurant is horrible. The recepti...
9,f,n,"'Yesterday, I went to a casino-restaurant call..."


In [2]:
# Count missing entries per column (isna is the same check as isnull).
df.isna().sum()

lie          0
sentiment    0
review       0
dtype: int64

In [3]:
# Features are the review text, labels are the sentiment column.
# Single brackets are used on purpose: they return a 1-D Series of strings,
# which is the input form passed to the text vectorizers in later cells.
X, y = df['review'], df['sentiment']

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.33
)

# Report how many reviews landed in each split.
print('Training Data Shape:', X_train.shape)
print('Testing Data Shape: ', X_test.shape)

Training Data Shape: (61,)
Testing Data Shape:  (31,)


# Multinomial Naive Bayes #1

In [5]:
# Multinomial NB #1: review text -> token counts -> TF-IDF weights -> classifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

# fit() returns the fitted pipeline itself, so it can be bound in one statement.
nb = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]).fit(X_train, y_train)

# Display the fitted pipeline as the cell output.
nb

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [6]:
from sklearn.metrics import classification_report

# Predict sentiment for the held-out reviews with the fitted pipeline.
y_pred = nb.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the
# printed value is unchanged, but the documented order now matches the
# classification_report call below.
print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

accuracy 0.8709677419354839
              precision    recall  f1-score   support

           n       1.00      0.78      0.88        18
           p       0.76      1.00      0.87        13

   micro avg       0.87      0.87      0.87        31
   macro avg       0.88      0.89      0.87        31
weighted avg       0.90      0.87      0.87        31



# Bernoulli #1

In [7]:
# Bernoulli NB #1
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

# NOTE: this TF-IDF + BernoulliNB pipeline is fitted here, but the Bernoulli
# scores reported in the following cells come from the separate `bnb` model,
# not from this pipeline.
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', BernoulliNB()),
              ])
nb.fit(X_train, y_train)

# Stand-alone classifier with an explicit numeric threshold of 1.0 (passing
# the boolean True gives the same threshold, but hides that it is a number).
# Feature values greater than `binarize` map to 1, while values less than or
# equal to it map to 0.
# This object is created but not fitted or used in this cell.
BernNB = BernoulliNB(binarize=1.0)



In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the bag-of-words vocabulary from the training reviews only, using
# scikit-learn's built-in English stop-word list. fit() returns the vectorizer.
vect = CountVectorizer(stop_words='english').fit(X_train)

# Display the fitted vectorizer as the cell output.
vect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [9]:
# Encode both splits with the vocabulary learned from the training reviews.
# (The `X_test_tranformed` spelling is kept because later cells use that name.)
X_train_transformed, X_test_tranformed = (
    vect.transform(split) for split in (X_train, X_test)
)

In [10]:
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics

# Train a Bernoulli NB model on the vectorized training reviews.
bnb = BernoulliNB().fit(X_train_transformed, y_train)

# Class labels and class probabilities for the held-out reviews.
y_pred_class = bnb.predict(X_test_tranformed)
y_pred_proba = bnb.predict_proba(X_test_tranformed)

# Accuracy on the held-out reviews, shown as the cell output.
metrics.accuracy_score(y_test, y_pred_class)

0.7096774193548387

In [11]:
# Per-class precision / recall / F1 for the Bernoulli model
from sklearn import metrics
from sklearn.metrics import classification_report

# Predict in this cell and report on those same predictions, instead of
# computing `y_pred` and then reporting on `y_pred_class` left over from the
# previous cell. Both come from the same fitted `bnb` on the same test matrix.
y_pred = bnb.predict(X_test_tranformed)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           n       1.00      0.50      0.67        18
           p       0.59      1.00      0.74        13

   micro avg       0.71      0.71      0.71        31
   macro avg       0.80      0.75      0.70        31
weighted avg       0.83      0.71      0.70        31



# Modeling with Data Processing

In [12]:
# Work on an independent copy: the cleaning step later in the notebook
# overwrites df2['review'], and a plain `df2 = df` alias would silently
# overwrite the raw reviews in `df` as well.
df2 = df.copy()
df2.head(10)

Unnamed: 0,lie,sentiment,review
0,f,n,"'Mike\'s Pizza High Point, NY Service was very..."
1,f,n,'i really like this buffet restaurant in Marsh...
2,f,n,"'After I went shopping with some of my friend,..."
3,f,n,'Olive Oil Garden was very disappointing. I ex...
4,f,n,'The Seven Heaven restaurant was never known f...
5,f,n,'I went to XYZ restaurant and had a terrible e...
6,f,n,'I went to ABC restaurant two days ago and I h...
7,f,n,'I went to the Chilis on Erie Blvd and had the...
8,f,n,'OMG. This restaurant is horrible. The recepti...
9,f,n,"'Yesterday, I went to a casino-restaurant call..."


## Data Cleaning: Stopwords, Punctuation, Symbols

In [13]:
import itertools
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
%matplotlib inline

# Separator characters; clean_text replaces each match with a single space.
# Raw strings keep the backslashes literal, avoiding invalid-escape warnings
# for `\[`, `\]` and `\|` that a normal string literal triggers.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Every character other than digits, lowercase a-z, space, '#', '+' and '_';
# clean_text deletes each match.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# NLTK's English stop-word list as a set; clean_text drops any word found in it.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        Normalise one review before vectorisation.

        Steps, in order: strip HTML markup, lowercase, replace characters
        matched by REPLACE_BY_SPACE_RE with a space, delete characters matched
        by BAD_SYMBOLS_RE, then drop every word found in STOPWORDS.

        text: a string

        return: the cleaned string, remaining words joined by single spaces
    """
    # BeautifulSoup's .text attribute is the content with HTML tags removed.
    plain = BeautifulSoup(text, "lxml").text
    lowered = plain.lower()
    spaced = REPLACE_BY_SPACE_RE.sub(' ', lowered)
    stripped = BAD_SYMBOLS_RE.sub('', spaced)
    # split() with no argument splits on any run of whitespace.
    kept_words = [token for token in stripped.split() if token not in STOPWORDS]
    return ' '.join(kept_words)

# Run clean_text over every review, store the result back in the `review`
# column, then preview the first rows.
cleaned_reviews = df2['review'].apply(clean_text)
df2['review'] = cleaned_reviews
df2.head()

Unnamed: 0,lie,sentiment,review
0,f,n,mikes pizza high point ny service slow quality...
1,f,n,really like buffet restaurant marshall street ...
2,f,n,went shopping friend went dodo restaurant dinn...
3,f,n,olive oil garden disappointing expect good foo...
4,f,n,seven heaven restaurant never known superior s...


# Multinomial Naive Bayes #2 (Processed Text)

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report

# Features are the cleaned review text, labels are the sentiment column.
# Single brackets return a 1-D Series of strings, which is the input form
# passed to the vectorizer below.
X = df2['review']
y = df2['sentiment']

# Same 33% hold-out and seed as the earlier split on the raw text.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Token counts -> TF-IDF weights -> Multinomial NB.
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the
# printed value is unchanged, but the order now matches classification_report.
print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

accuracy 0.7419354838709677
              precision    recall  f1-score   support

           n       1.00      0.56      0.71        18
           p       0.62      1.00      0.76        13

   micro avg       0.74      0.74      0.74        31
   macro avg       0.81      0.78      0.74        31
weighted avg       0.84      0.74      0.74        31



# Bernoulli #2 processed

In [15]:
# NOTE: this TF-IDF + BernoulliNB pipeline is fitted here, but the metrics
# printed at the end of this cell come from the separate `bnb` model below.
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', BernoulliNB()),
              ])
nb.fit(X_train, y_train)

# Stand-alone classifier with a binarize threshold of 1: feature values greater
# than the threshold map to 1, values less than or equal to it map to 0.
# It is created but not fitted or used in this cell.
BernNB = BernoulliNB(binarize=1)

# Learn the vocabulary from the training reviews, dropping English stop words.
vect = CountVectorizer(stop_words='english')
vect.fit(X_train)

# Encode both splits with the fitted vocabulary.
X_train_transformed = vect.transform(X_train)
X_test_tranformed = vect.transform(X_test)

# Fit Bernoulli NB on the vectorized training reviews.
bnb = BernoulliNB()
bnb.fit(X_train_transformed, y_train)

# Class labels and class probabilities for the held-out reviews.
y_pred_class = bnb.predict(X_test_tranformed)
y_pred_proba = bnb.predict_proba(X_test_tranformed)

# Keep the `y_pred` name pointing at the same predictions; a second predict()
# call on the same model and matrix would only recompute identical labels.
y_pred = y_pred_class

# scikit-learn metrics take (y_true, y_pred); accuracy itself is symmetric.
print('accuracy %s' % accuracy_score(y_test, y_pred_class))
print(classification_report(y_test, y_pred_class))

accuracy 0.7096774193548387
              precision    recall  f1-score   support

           n       1.00      0.50      0.67        18
           p       0.59      1.00      0.74        13

   micro avg       0.71      0.71      0.71        31
   macro avg       0.80      0.75      0.70        31
weighted avg       0.83      0.71      0.70        31



# Multinomial Naive Bayes #3: Predicting Truthfulness (`lie` column)

In [16]:
# Independent copy of the cleaned frame, so any later in-place change to df3
# cannot alter df2 (a plain `df3 = df2` would only create a second name for
# the same object).
df3 = df2.copy()
df3.head()

Unnamed: 0,lie,sentiment,review
0,f,n,mikes pizza high point ny service slow quality...
1,f,n,really like buffet restaurant marshall street ...
2,f,n,went shopping friend went dodo restaurant dinn...
3,f,n,olive oil garden disappointing expect good foo...
4,f,n,seven heaven restaurant never known superior s...


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report

# Features are the cleaned review text, labels are the `lie` column.
# Single brackets return a 1-D Series of strings, which is the input form
# passed to the vectorizer below.
X2 = df3['review']
y2 = df3['lie']

# Same 33% hold-out and seed as the sentiment experiments.
X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.33, random_state=42)

# Token counts -> TF-IDF weights -> Multinomial NB.
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the
# printed value is unchanged, but the order now matches classification_report.
print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

accuracy 0.5806451612903226
              precision    recall  f1-score   support

           f       0.59      0.62      0.61        16
           t       0.57      0.53      0.55        15

   micro avg       0.58      0.58      0.58        31
   macro avg       0.58      0.58      0.58        31
weighted avg       0.58      0.58      0.58        31



# Bernoulli #3 processed

In [21]:
# NOTE: this TF-IDF + BernoulliNB pipeline is fitted here, but the metrics
# printed at the end of this cell come from the separate `bnb` model below.
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', BernoulliNB()),
              ])
nb.fit(X_train, y_train)

# Stand-alone classifier with a binarize threshold of 1: feature values greater
# than the threshold map to 1, values less than or equal to it map to 0.
# It is created but not fitted or used in this cell.
BernNB = BernoulliNB(binarize=1)

# Learn the vocabulary from the training reviews, dropping English stop words.
vect = CountVectorizer(stop_words='english')
vect.fit(X_train)

# Encode both splits with the fitted vocabulary.
X_train_transformed = vect.transform(X_train)
X_test_tranformed = vect.transform(X_test)

# Fit Bernoulli NB on the vectorized training reviews.
bnb = BernoulliNB()
bnb.fit(X_train_transformed, y_train)

# Class labels and class probabilities for the held-out reviews.
y_pred_class = bnb.predict(X_test_tranformed)
y_pred_proba = bnb.predict_proba(X_test_tranformed)

# Keep the `y_pred` name pointing at the same predictions; a second predict()
# call on the same model and matrix would only recompute identical labels.
y_pred = y_pred_class

# scikit-learn metrics take (y_true, y_pred); accuracy itself is symmetric.
print('accuracy %s' % accuracy_score(y_test, y_pred_class))
print(classification_report(y_test, y_pred_class))

accuracy 0.5806451612903226
              precision    recall  f1-score   support

           f       0.71      0.31      0.43        16
           t       0.54      0.87      0.67        15

   micro avg       0.58      0.58      0.58        31
   macro avg       0.63      0.59      0.55        31
weighted avg       0.63      0.58      0.55        31

