In [136]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/yelp-dataset-based-on-fake-reviewers/cleaned_data.csv
/kaggle/input/deceptive-opinion-spam-corpus/deceptive-opinion.csv


In [137]:
import re
import string
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [138]:
df = pd.read_csv('/kaggle/input/deceptive-opinion-spam-corpus/deceptive-opinion.csv')

In [139]:
df.head()

Unnamed: 0,deceptive,hotel,polarity,source,text
0,truthful,conrad,positive,TripAdvisor,We stayed for a one night getaway with family ...
1,truthful,hyatt,positive,TripAdvisor,Triple A rate with upgrade to view room was le...
2,truthful,hyatt,positive,TripAdvisor,This comes a little late as I'm finally catchi...
3,truthful,omni,positive,TripAdvisor,The Omni Chicago really delivers on all fronts...
4,truthful,hyatt,positive,TripAdvisor,I asked for a high floor away from the elevato...


In [140]:
df = df.drop(["hotel", "polarity","source"], axis=1)

In [141]:
df = df.sample(frac=1)

In [142]:
df.head()

Unnamed: 0,deceptive,text
494,deceptive,Homewood Suites by Hilton Chicago Downtown is ...
1547,deceptive,When I first made reservations at The Palmer H...
782,deceptive,The Palmer House Hilton was recommended to me ...
75,truthful,"The reviews we read were a bit mixed, but I th..."
1421,deceptive,My wife and I stayed at the Ambassador East Ho...


In [143]:
from sklearn import preprocessing 

# label_encoder object knows how to understand word labels. 
label_encoder = preprocessing.LabelEncoder() 

# Encode labels in column 'species'. 
df['deceptive']= label_encoder.fit_transform(df['deceptive']) 

df['deceptive'].unique() 

array([0, 1])

In [144]:
df.head()

Unnamed: 0,deceptive,text
494,0,Homewood Suites by Hilton Chicago Downtown is ...
1547,0,When I first made reservations at The Palmer H...
782,0,The Palmer House Hilton was recommended to me ...
75,1,"The reviews we read were a bit mixed, but I th..."
1421,0,My wife and I stayed at the Ambassador East Ho...


In [145]:
#dataset description #truthful=1 deceptive=0
df.groupby('deceptive').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
deceptive,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,800,800,I stayed at the Swissotel Chicago while I was ...,1
1,800,796,Very disappointed in our stay in Chicago Monoc...,2


In [146]:
def clean_text(text):
    
    ## Remove puncuation
    text = text.translate(string.punctuation)
    
    ## Convert words to lower case and split them
    text = text.lower().split()
    
    ## Remove stop words
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops and len(w) >= 3]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    text = text.split()
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)

    return text

In [147]:
# Some preprocesssing that will be common to all the text classification methods

puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

def clean_char(x):
    x = str(x)
    for punct in puncts:
        if punct in x:
            x = x.replace(punct, f' {punct} ')
    return x

In [148]:
def clean_numbers(x):
    if bool(re.search(r'\d', x)):
        x = re.sub('[0-9]{5,}', '#####', x)
        x = re.sub('[0-9]{4}', '####', x)
        x = re.sub('[0-9]{3}', '###', x)
        x = re.sub('[0-9]{2}', '##', x)
    return x

In [149]:
df['text'] = df['text'].map(lambda a: clean_numbers(a))

In [150]:
df['text'] = df['text'].map(lambda a: clean_char(a))

In [151]:
df['text'] = df['text'].map(lambda a: clean_text(a))

In [152]:
df['text']

494     homewood suit hilton chicago downtown wonder h...
1547    first made reserv palmer hous hilton excit gor...
782     palmer hous hilton recommend friend visit chic...
75      review read bit mix thought excel stay splendi...
1421    wife stay ambassador east hotel last month son...
447     hyatt regenc chicago hotel delight stay never ...
1542    servic subpar room need better clean check cou...
170     stay hard rock januari night locat michigan av...
264     recent trip chicago attend major trade show pl...
43      husband decid take trip chicago last minut qui...
897     noisi constant water run pipe terribl much bet...
1292    disappoint hotel stay swissotel enjoy much ser...
1316    one better experi first arriv hotel tri get ro...
331     wife redeem hilton reward point stay night pal...
0       stay one night getaway famili thursday tripl a...
1306    husband stay hotel suppos romant weekend far c...
1085    disappoint stay chicago monoco stay mani time ...
1121    use ho

In [153]:
df.describe()

Unnamed: 0,deceptive
count,1600.0
mean,0.5
std,0.500156
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [154]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1600 entries, 494 to 696
Data columns (total 2 columns):
deceptive    1600 non-null int64
text         1600 non-null object
dtypes: int64(1), object(1)
memory usage: 37.5+ KB


In [155]:
x = df['text']
y = df['deceptive']

In [156]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix

In [157]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['text'], df['deceptive'], random_state=5)
print('Number of rows in the total set: {}'.format(df.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 1600
Number of rows in the training set: 1200
Number of rows in the test set: 400


In [158]:
X_train, X_test, y_train, y_test

(894     general speak noth bad place would clean issu ...
 471     husband stay short get away weekend love conve...
 1455    made regular busi trip chicago decid stay hote...
 595     magnific mile chicago great place visit stay a...
 22      actual book reserv hotel phone got great rate ...
 322     wife decid spend three day chicago last summer...
 865     line check desk tremend long decid use compute...
 1385    recent trip chicago stay ambassador east hotel...
 862     omni chosen locat whichwork perfect bed wond e...
 874     stay fairmont two saturday row stay disappoint...
 1464    sofitel chicago water tower downtown area adve...
 281     want nice place stay night dinner theater daug...
 481     thank sheraton tower invit enjoy indoor pool g...
 474     hyatt regenc chicago one beauti hotel ever sta...
 690     hilton hotel help make trip chicago central lo...
 680     stay sever differ hotel chicago jame best ever...
 908     would recommend stay swissotel chicago travel .

In [159]:
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer()
print(count_vector)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)


In [160]:
training_data = count_vector.fit_transform(X_train)
testing_data = count_vector.transform(X_test)

In [161]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [162]:
predictions = naive_bayes.predict(testing_data)

In [164]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
mnbaccuracy = accuracy_score(y_test, predictions)
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Precision score: ', format(precision_score(y_test, predictions)))
print('Recall score: ', format(recall_score(y_test, predictions)))
print('F1 score: ', format(f1_score(y_test, predictions)))

Accuracy score:  0.9025
Precision score:  0.9325842696629213
Recall score:  0.8601036269430051
F1 score:  0.894878706199461


In [165]:
from sklearn.svm import SVC 
svc = SVC()
svc.fit(training_data, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [166]:
svc_predictions = svc.predict(testing_data)

In [167]:
from sklearn.metrics import accuracy_score
svcaccuracy = accuracy_score(y_test,svc_predictions)
print('Accuracy score: ', format(accuracy_score(y_test,svc_predictions)))
print('Precision score: ', format(precision_score(y_test,svc_predictions)))
print('Recall score: ', format(recall_score(y_test, svc_predictions)))
print('F1 score: ', format(f1_score(y_test, svc_predictions)))

Accuracy score:  0.5625
Precision score:  0.525
Recall score:  0.9792746113989638
F1 score:  0.6835443037974684


In [168]:
from sklearn.neighbors import KNeighborsClassifier 
knn = KNeighborsClassifier(n_neighbors = 7)
knn.fit(training_data, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='uniform')

In [169]:
knn_predictions = knn.predict(testing_data)

In [170]:
from sklearn.metrics import accuracy_score
knnaccuracy = accuracy_score(knn_predictions,y_test )
print('Accuracy score: ', format(accuracy_score(knn_predictions,y_test )))
print('Precision score: ', format(precision_score(y_test,knn_predictions)))
print('Recall score: ', format(recall_score(y_test, knn_predictions)))
print('F1 score: ', format(f1_score(y_test, knn_predictions)))

Accuracy score:  0.5825
Precision score:  0.8421052631578947
Recall score:  0.16580310880829016
F1 score:  0.27705627705627706


In [171]:
from sklearn.naive_bayes import GaussianNB 
gnb = GaussianNB()
training_data1 = training_data.toarray()
gnb.fit(training_data1, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [172]:
testing_data1= testing_data.toarray()
gnb_predictions = gnb.predict(testing_data1)

In [173]:
from sklearn.metrics import accuracy_score
gnb_accuracy = accuracy_score(gnb_predictions,y_test )
print('Accuracy score: ', format(accuracy_score(gnb_predictions,y_test )))
print('Precision score: ', format(precision_score(y_test,gnb_predictions)))
print('Recall score: ', format(recall_score(y_test, gnb_predictions)))
print('F1 score: ', format(f1_score(y_test, gnb_predictions)))

Accuracy score:  0.665
Precision score:  0.6577540106951871
Recall score:  0.6373056994818653
F1 score:  0.6473684210526316


In [174]:
# training a DescisionTreeClassifier 
from sklearn.tree import DecisionTreeClassifier 
dtree_model = DecisionTreeClassifier(max_depth = 2)
dtree_model.fit(training_data1, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [175]:
dtree_predictions = dtree_model.predict(testing_data) 

In [176]:
from sklearn.metrics import accuracy_score
dtree_accuracy = accuracy_score(dtree_predictions,y_test )
print('Accuracy score: ', format(accuracy_score(dtree_predictions,y_test )))
print('Precision score: ', format(precision_score(y_test,dtree_predictions)))
print('Recall score: ', format(recall_score(y_test, dtree_predictions)))
print('F1 score: ', format(f1_score(y_test, dtree_predictions)))

Accuracy score:  0.66
Precision score:  0.6255506607929515
Recall score:  0.7357512953367875
F1 score:  0.6761904761904761


In [178]:
from sklearn.linear_model import SGDClassifier

In [179]:
sgd_clf = SGDClassifier()
sgd_clf.fit(training_data, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [180]:
sgdpredicted = sgd_clf.predict(testing_data)

In [181]:
from sklearn.metrics import accuracy_score
sgd_accuracy = accuracy_score(sgdpredicted,y_test )
print('Accuracy score: ', format(accuracy_score(sgdpredicted,y_test )))
print('Precision score: ', format(precision_score(y_test,sgdpredicted)))
print('Recall score: ', format(recall_score(y_test, sgdpredicted)))
print('F1 score: ', format(f1_score(y_test, sgdpredicted)))

Accuracy score:  0.8775
Precision score:  0.8913043478260869
Recall score:  0.8497409326424871
F1 score:  0.8700265251989391


In [184]:
from sklearn.linear_model import LogisticRegression

In [185]:
lr = LogisticRegression()
lr.fit(training_data, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [186]:
lrpredicted = lr.predict(testing_data)

In [187]:
from sklearn.metrics import accuracy_score
lr_accuracy = accuracy_score(lrpredicted,y_test )
print('Accuracy score: ', format(accuracy_score(lrpredicted,y_test )))
print('Precision score: ', format(precision_score(y_test,lrpredicted)))
print('Recall score: ', format(recall_score(y_test, lrpredicted)))
print('F1 score: ', format(f1_score(y_test, lrpredicted)))

Accuracy score:  0.87
Precision score:  0.8691099476439791
Recall score:  0.8601036269430051
F1 score:  0.8645833333333333


In [188]:
print('Multinomial Naive Bayes:',mnbaccuracy)
print('Gausian Naive Bayes:',gnb_accuracy)
print('Decision tree:',dtree_accuracy)
print('Support Vector Classifier:',svcaccuracy)
print('K-Nearest Neighbour:',knnaccuracy)
print('Stochastic Gradient Descent:',sgd_accuracy)
print('LogisticRegression:',lr_accuracy)

Multinomial Naive Bayes: 0.9025
Gausian Naive Bayes: 0.665
Decision tree: 0.66
Support Vector Classifier: 0.5625
K-Nearest Neighbour: 0.5825
Stochastic Gradient Descent: 0.8775
LogisticRegression: 0.87


In [None]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2)
kmeans.fit(training_data)
y_kmeans = kmeans.predict(training_data)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set() 

In [None]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [None]:
ps = PorterStemmer()

In [None]:
stemmed_dataset = []
for i in range(0,1600):
    stemmed_array = df['text'][i].split()
    stemmed = [ps.stem(word) for word in stemmed_array if not word in set(stopwords.words('english'))]
    stemmed = ' '.join(stemmed)
    stemmed_dataset.append(stemmed)

In [None]:
print(stemmed[0:1600])

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
cv = CountVectorizer()
X = cv.fit_transform(stemmed_dataset)

In [None]:
from sklearn.cluster import KMeans
wcss =[]

In [None]:
for i in range(1,1600):
    kmeans =  KMeans(n_clusters=i, init='k-means++', max_iter = 300, n_init=10, random_state = 0, verbose=True)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)