# Import all required modules

In [25]:
import os
import fnmatch
from textblob import TextBlob
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,TfidfVectorizer
from nltk.corpus import stopwords
from nltk import pos_tag,pos_tag_sents
import re
import operator
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.cross_validation import StratifiedKFold, cross_val_score, train_test_split  
from sklearn import metrics
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.grid_search import GridSearchCV
import pickle
from nltk.corpus import stopwords

# Fetch all text files under the path, extract the label from each file path, and build a DataFrame of the labels

In [8]:
path = '/home/icecreamlabs/Documents/op_spam_train'

# Every *.txt file below `path`, in os.walk order. The order is deliberately
# left untouched: the reviews are read in a separate walk and paired with
# these labels by position.
configfiles = [
    os.path.join(subdir, f)
    for subdir, dirs, files in os.walk(path)
    for f in fnmatch.filter(files, '*.txt')
]

# Raw string so that `\w` reaches the regex engine instead of being treated
# as an (invalid) string escape. Matches "truth" or "deceptive".
label_pattern = re.compile(r'(trut|deceptiv)\w')

save1 = []
for f in configfiles:
    # Search only the part of the path below `path`, so the location of the
    # dataset on disk can never be mistaken for a label.
    c = label_pattern.search(os.path.relpath(f, path))
    if c is None:
        # Fail with the offending path instead of an AttributeError on None.
        raise ValueError('Cannot infer a truth/deceptive label from path: %s' % f)
    save1.append(c.group())

labels = pd.DataFrame(save1, columns=['Labels'])

# Read all the reviews and append them to a list

In [9]:
save = []
directory = "/home/icecreamlabs/Documents/op_spam_train"
for subdir, dirs, files in os.walk(directory):
    # Filter per file: only *.txt files are read. Iterating the filtered list
    # keeps the same file order as the label-extraction walk, so reviews and
    # labels stay aligned by position.
    for file in fnmatch.filter(files, '*.txt'):
        # Context manager closes each handle as soon as the text is read.
        with open(os.path.join(subdir, file), 'r') as f:
            save.append(f.read())

# Merge the review dataframe and label dataframe

In [10]:
# One row per review text, in the order the files were read.
reviews = pd.DataFrame(save, columns=['HotelReviews'])

# Pair each review with its label by row position (index-on-index join).
result = reviews.merge(labels, left_index=True, right_index=True)

# Normalise case so that later token comparisons are case-insensitive.
result['HotelReviews'] = result['HotelReviews'].str.lower()
result.head()

Unnamed: 0,HotelReviews,Labels
0,the conrad chicago hotel had some of the worst...,deceptive
1,overpriced is the best word to describe the co...,deceptive
2,there is a great difference between a hyatt re...,deceptive
3,i recently stayed at the hyatt regency chicago...,deceptive
4,this hotel is rather far from the airport and ...,deceptive


# Remove stopwords from the Hotel Reviews column

In [11]:
stop = stopwords.words('english')
# Set copy for constant-time membership tests; `stop` itself stays a list.
stop_set = set(stop)

# Tokens come from plain whitespace splitting, so a word with punctuation
# attached (e.g. "hotel,") is compared as-is against the stopword list.
result['review_without_stopwords'] = result['HotelReviews'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_set)
)
result.head()

Unnamed: 0,HotelReviews,Labels,review_without_stopwords
0,the conrad chicago hotel had some of the worst...,deceptive,conrad chicago hotel worst service ever dealt ...
1,overpriced is the best word to describe the co...,deceptive,overpriced best word describe conrad chicago h...
2,there is a great difference between a hyatt re...,deceptive,"great difference hyatt regency hotel, normal h..."
3,i recently stayed at the hyatt regency chicago...,deceptive,"recently stayed hyatt regency chicago hotel, d..."
4,this hotel is rather far from the airport and ...,deceptive,hotel rather far airport traffic took almost 2...


# Extract parts of speech from Hotel Reviews which will be fed as a Feature Input to the model

In [12]:
def pos(review_without_stopwords):
    """Return the part-of-speech tags TextBlob assigns to a review string.

    Args:
        review_without_stopwords: review text to tag.

    Returns:
        The ``tags`` attribute of ``TextBlob(review_without_stopwords)``.
    """
    blob = TextBlob(review_without_stopwords)
    return blob.tags

In [13]:
# Named `pos_tags` rather than `os`: rebinding `os` would shadow the imported
# os module and break the os.walk / os.path.join cells when they are re-run.
pos_tags = result.review_without_stopwords.apply(pos)
# Single-column frame; the column keeps the Series name
# 'review_without_stopwords'.
os1 = pd.DataFrame(pos_tags)

In [14]:
# Flatten each list of (word, tag) pairs into one "word/TAG word/TAG ..." string.
os1['pos'] = os1['review_without_stopwords'].map(
    lambda tagged: " ".join("/".join(pair) for pair in tagged)
)
# Both frames carry a 'review_without_stopwords' column, so the merge suffixes
# them: `_x` is the cleaned text, `_y` the list of tag pairs.
result = pd.merge(result, os1, right_index=True, left_index=True)
result.head()

Unnamed: 0,HotelReviews,Labels,review_without_stopwords_x,review_without_stopwords_y,pos
0,the conrad chicago hotel had some of the worst...,deceptive,conrad chicago hotel worst service ever dealt ...,"[(conrad, NNS), (chicago, VBP), (hotel, NN), (...",conrad/NNS chicago/VBP hotel/NN worst/JJS serv...
1,overpriced is the best word to describe the co...,deceptive,overpriced best word describe conrad chicago h...,"[(overpriced, VBN), (best, JJS), (word, NN), (...",overpriced/VBN best/JJS word/NN describe/NN co...
2,there is a great difference between a hyatt re...,deceptive,"great difference hyatt regency hotel, normal h...","[(great, JJ), (difference, NN), (hyatt, JJ), (...",great/JJ difference/NN hyatt/JJ regency/NN hot...
3,i recently stayed at the hyatt regency chicago...,deceptive,"recently stayed hyatt regency chicago hotel, d...","[(recently, RB), (stayed, VBN), (hyatt, NN), (...",recently/RB stayed/VBN hyatt/NN regency/NN chi...
4,this hotel is rather far from the airport and ...,deceptive,hotel rather far airport traffic took almost 2...,"[(hotel, NN), (rather, RB), (far, RB), (airpor...",hotel/NN rather/RB far/RB airport/JJ traffic/N...


# Split the data into two parts: 80% train and 20% test

In [15]:
# Features are the "word/TAG" strings; 20% of the rows are held out for
# testing, with a fixed seed for the shuffle.
review_train, review_test, label_train, label_test = train_test_split(
    result['pos'],
    result['Labels'],
    test_size=0.2,
    random_state=42,
)

# Vectorize the words using Tfidf, train the model using multinomial naive bayes and predict on the test data

In [16]:
tf_vect = TfidfVectorizer(
    lowercase=True,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
)

# Fit the vectorizer on the training reviews only, then reuse it unchanged
# to transform the held-out reviews.
X_train_tf = tf_vect.fit_transform(review_train)
X_test_tf = tf_vect.transform(review_test)

mnb = MultinomialNB()
mnb.fit(X_train_tf, label_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [17]:
# Predict a label for every held-out review with the fitted Naive Bayes model.
pred = mnb.predict(X_test_tf)

# Print the accuracy score, the confusion matrix and the classification report to analyse the performance of the model

In [18]:
# Accuracy first, then the confusion matrix, then the per-class report,
# each printed on its own.
for summary in (
    accuracy_score(label_test, pred),
    confusion_matrix(label_test, pred),
    classification_report(label_test, pred),
):
    print(summary)

0.86875
[[152  16]
 [ 26 126]]
             precision    recall  f1-score   support

  deceptive       0.85      0.90      0.88       168
      truth       0.89      0.83      0.86       152

avg / total       0.87      0.87      0.87       320



# Test the model with a review from Yelp. In the recorded run, the Multinomial Naive Bayes model misclassifies it: it predicts a deceptive review as truth

In [19]:
# Sample review text that is fed to the Naive Bayes model in the next cell.
s = "My family and I are huge fans of this place. The staff is super nice and the food is great. The chicken is very good and the garlic sauce is perfect. Ice cream topped with fruit is delicious too. Highly recommended!"

In [20]:
# `mnb` was trained on the 'pos' column, i.e. lowercased, stopword-free text
# rendered as "word/TAG" strings. Put the raw review through the same steps
# so the model sees the kind of features it was fitted on. The printed label
# may therefore differ from a run that vectorized the raw string directly.
s_clean = ' '.join(word for word in s.lower().split() if word not in stop)
s_pos = " ".join("/".join(pair) for pair in pos(s_clean))

# Separate name so the held-out matrix `X_test_tf` is not overwritten.
X_review_tf = tf_vect.transform([s_pos])
y_predict = mnb.predict(X_review_tf)
print (y_predict)

['truth']


# Build an SVM classifier

In [21]:
# For the SVM the features are the stopword-free review text (the `_x` column
# from the merge) instead of the POS strings; 20% held out, fixed seed.
review_train, review_test, label_train, label_test = train_test_split(
    result['review_without_stopwords_x'],
    result['Labels'],
    test_size=0.2,
    random_state=42,
)

In [22]:
# `tf_vect` is rebound here: from this point on it is the vectorizer used by
# the SVM, not the one fitted for Naive Bayes.
tf_vect = TfidfVectorizer(
    stop_words="english",
    lowercase=True,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
)

# Fit on the training reviews only; the held-out reviews are just transformed.
X_train_tf1 = tf_vect.fit_transform(review_train)
X_test_tf1 = tf_vect.transform(review_test)

In [26]:
# Serialize the fitted vectorizer to vectorizer.pickle (binary mode, as
# pickle requires).
with open('vectorizer.pickle', 'wb') as vectorizer_file:
    pickle.dump(tf_vect, vectorizer_file)

In [27]:
def svc_param_selection(X, y, nfolds, kernel='linear'):
    """Grid-search C and gamma for an SVC and return the best combination.

    Args:
        X: training feature matrix passed to ``GridSearchCV.fit``.
        y: training labels passed to ``GridSearchCV.fit``.
        nfolds: value forwarded as the ``cv`` argument of the grid search.
        kernel: SVC kernel to tune; defaults to ``'linear'``.

    Returns:
        The ``best_params_`` dict of the fitted grid search, with keys
        ``'C'`` and ``'gamma'``.

    Note:
        Per the scikit-learn SVC documentation, ``gamma`` is the kernel
        coefficient for the 'rbf', 'poly' and 'sigmoid' kernels. With the
        default linear kernel it has no effect on the fit, so the reported
        gamma carries no information. It stays in the grid so the returned
        dict keeps its shape and so non-linear kernels can be tuned.
    """
    param_grid = {
        'C': [0.001, 0.01, 0.1, 1, 10],
        'gamma': [0.001, 0.01, 0.1, 1],
    }
    grid_search = GridSearchCV(svm.SVC(kernel=kernel), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    return grid_search.best_params_

In [28]:
# 5-fold grid search on the SVM training features; the best parameters are
# shown as the cell output.
svc_param_selection(X_train_tf1, label_train, nfolds=5)

{'C': 1, 'gamma': 0.001}

In [31]:
# NOTE(review): the grid-search cell reported C=1 as the best value, but
# C=10 is used here. Confirm this is intentional.
clf = svm.SVC(
    C=10,
    gamma=0.001,
    kernel='linear',
)
clf.fit(X_train_tf1, label_train)

# Labels predicted for the held-out reviews.
pred1 = clf.predict(X_test_tf1)

# Save the model to a pickle file and print the accuracy score, confusion matrix and classification report

In [32]:
# Serialize the fitted SVM classifier to mlmodel.pickle (binary mode, as
# pickle requires).
with open('mlmodel.pickle', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [37]:
# Accuracy first, then the confusion matrix, then the per-class report,
# each printed on its own.
for summary in (
    accuracy_score(label_test, pred1),
    confusion_matrix(label_test, pred1),
    classification_report(label_test, pred1),
):
    print(summary)

0.878125
[[147  21]
 [ 18 134]]
             precision    recall  f1-score   support

  deceptive       0.89      0.88      0.88       168
      truth       0.86      0.88      0.87       152

avg / total       0.88      0.88      0.88       320



# Test the SVM model on two reviews from Yelp. The SVM model performs a lot better than Multinomial Naive Bayes, since it predicts the labels for both reviews correctly

In [34]:
def test_string(s):
    X_test_tf = tf_vect.transform([s])
    #print(tf_vect.get_feature_names())
    y_predict = clf.predict(X_test_tf)
    return y_predict

In [35]:
# First probe for the SVM: a short hotel review.
hotel_review = "The hotel was good.The room had a 27 inch samsung led tv, a microwave.The room had a double bed"
test_string(hotel_review)

array(['truth'], dtype=object)

In [36]:
# Second probe for the SVM: the same review text that was given to Naive Bayes.
restaurant_review = "My family and I are huge fans of this place. The staff is super nice and the food is great. The chicken is very good and the garlic sauce is perfect. Ice cream topped with fruit is delicious too. Highly recommended!"
test_string(restaurant_review)

array(['deceptive'], dtype=object)