In [2]:
import os
import fnmatch
from textblob import TextBlob
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk import pos_tag, pos_tag_sents
import regex as re
import operator
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn import svm
from sklearn.grid_search import GridSearchCV
import pickle



In [3]:
#You will first iterate over all the text files and get the absolute path of all the text files 
#through which you will then extract the corresponding labels.

In [4]:
path = "op_spam_train/"
label = []

configFiles = [os.path.join(subdir, f)
               for subdir, dirs, files in os.walk(path)
               for f in fnmatch.filter(files, '*.txt')]


In [5]:
len(configFiles)


1600

In [6]:
configFiles[1]

'op_spam_train/negative_polarity\\deceptive_from_MTurk\\fold1\\d_hilton_10.txt'

In [7]:
#So, from the above output, you can observe that in order to extract the labels we need some kind of filter.
#And for that, you will use Regex also known as a regular expression.

In [8]:
for i in configFiles:
    c=re.search('(trut|deceptiv)\w',i)
    label.append(c.group())

In [9]:
labels=pd.DataFrame(label,columns=['Labels'])

In [10]:
labels.head(10)

Unnamed: 0,Labels
0,deceptive
1,deceptive
2,deceptive
3,deceptive
4,deceptive
5,deceptive
6,deceptive
7,deceptive
8,deceptive
9,deceptive


In [11]:
##Fetch all the reviews and append in a list

In [12]:
review=[]
directory=os.path.join(path)
for subdir,dirs,files in os.walk(directory):
    for file in files:
        if fnmatch.filter(files,"*.txt"):
            f=open(os.path.join(subdir,file),'r')
            a=f.read()
            review.append(a)
reviews=pd.DataFrame(review,columns=['HotelReviews'])

In [13]:
reviews.head(10)

Unnamed: 0,HotelReviews
0,We stayed at the Schicago Hilton for 4 days an...
1,Hotel is located 1/2 mile from the train stati...
2,I made my reservation at the Hilton Chicago be...
3,"When most people think Hilton, they think luxu..."
4,My husband and I recently stayed stayed at the...
5,My wife and I booked a room at the Hilton Chic...
6,"For a hotel rated with four diamonds by AAA, o..."
7,"I had high hopes for the Hilton Chicago, but I..."
8,We booked a room at the Hilton Chicago for two...
9,"I've stayed at other hotels in Chicago, but th..."


In [14]:
##Merge the review dataframe and label dataframe


In [15]:
result=pd.merge(reviews,labels,right_index=True,left_index=True)
result['HotelReviews']=result['HotelReviews'].map(lambda x:x.lower())

In [16]:
result.head()

Unnamed: 0,HotelReviews,Labels
0,we stayed at the schicago hilton for 4 days an...,deceptive
1,hotel is located 1/2 mile from the train stati...,deceptive
2,i made my reservation at the hilton chicago be...,deceptive
3,"when most people think hilton, they think luxu...",deceptive
4,my husband and i recently stayed stayed at the...,deceptive


In [17]:
#Remove stopwords from the Hotel Reviews column

In [18]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bahat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
stop=stopwords.words('english')
result['review_without_stopwords']=result['HotelReviews'].apply(lambda x:' '.join([word for word in x.split() if word not in (stop)]))

In [20]:
result.head()

Unnamed: 0,HotelReviews,Labels,review_without_stopwords
0,we stayed at the schicago hilton for 4 days an...,deceptive,stayed schicago hilton 4 days 3 nights confere...
1,hotel is located 1/2 mile from the train stati...,deceptive,hotel located 1/2 mile train station quite hik...
2,i made my reservation at the hilton chicago be...,deceptive,made reservation hilton chicago believing goin...
3,"when most people think hilton, they think luxu...",deceptive,"people think hilton, think luxury. know did. w..."
4,my husband and i recently stayed stayed at the...,deceptive,husband recently stayed stayed hilton chicago ...


In [21]:
#Extract parts of speech from Hotel Reviews which will be fed as a Feature Input to the model

In [22]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bahat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\bahat\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [23]:
def pos(review_without_stopwords):
    return TextBlob(review_without_stopwords).tags

In [24]:
os=result.review_without_stopwords.apply(pos)
os1=pd.DataFrame(os)

In [25]:
os1.head()

Unnamed: 0,review_without_stopwords
0,"[(stayed, VBN), (schicago, JJ), (hilton, NN), ..."
1,"[(hotel, NN), (located, VBD), (1/2, CD), (mile..."
2,"[(made, VBN), (reservation, NN), (hilton, NN),..."
3,"[(people, NNS), (think, VBP), (hilton, NN), (t..."
4,"[(husband, NN), (recently, RB), (stayed, VBD),..."


In [26]:
os1['pos']=os1['review_without_stopwords'].map(lambda x :" ".join(["/".join(x) for x in x]))

In [27]:
result =pd.merge(result,os1,right_index=True,left_index=True)
result.head()

Unnamed: 0,HotelReviews,Labels,review_without_stopwords_x,review_without_stopwords_y,pos
0,we stayed at the schicago hilton for 4 days an...,deceptive,stayed schicago hilton 4 days 3 nights confere...,"[(stayed, VBN), (schicago, JJ), (hilton, NN), ...",stayed/VBN schicago/JJ hilton/NN 4/CD days/NNS...
1,hotel is located 1/2 mile from the train stati...,deceptive,hotel located 1/2 mile train station quite hik...,"[(hotel, NN), (located, VBD), (1/2, CD), (mile...",hotel/NN located/VBD 1/2/CD mile/NN train/NN s...
2,i made my reservation at the hilton chicago be...,deceptive,made reservation hilton chicago believing goin...,"[(made, VBN), (reservation, NN), (hilton, NN),...",made/VBN reservation/NN hilton/NN chicago/NN b...
3,"when most people think hilton, they think luxu...",deceptive,"people think hilton, think luxury. know did. w...","[(people, NNS), (think, VBP), (hilton, NN), (t...",people/NNS think/VBP hilton/NN think/VBP luxur...
4,my husband and i recently stayed stayed at the...,deceptive,husband recently stayed stayed hilton chicago ...,"[(husband, NN), (recently, RB), (stayed, VBD),...",husband/NN recently/RB stayed/VBD stayed/JJ hi...


In [28]:
###Training Phase
##Split the Data into two parts 80% train and 20% test data

In [29]:
review_train,review_test,label_train,label_test=train_test_split(result['pos'],result['Labels'],test_size=0.2,random_state=13)

In [30]:
#Vectorize the Training and Testing data using TfidfVectorizer


In [31]:
tf_vect = TfidfVectorizer(lowercase=True,use_idf=True,smooth_idf=True,sublinear_tf=False)
X_train_tf=tf_vect.fit_transform(review_train)
X_test_tf=tf_vect.transform(review_test)

In [32]:
def svc_param_selection(X,y,nfolds):
    Cs=[0.001,0.01,0.1,1,10]
    gammas=[0.001,0.01,0.1,1]
    param_grid={'C':Cs,'gamma':gammas}
    grid_search=GridSearchCV(svm.SVC(kernel='linear'),param_grid,cv=nfolds)
    grid_search.fit(X,y)
    return grid_search.best_params_

In [33]:
svc_param_selection(X_train_tf,label_train,5)

{'C': 1, 'gamma': 0.001}

In [34]:
clf=svm.SVC(C=10,gamma=0.001,kernel='linear')
clf.fit(X_train_tf,label_train)
pred=clf.predict(X_test_tf)

In [35]:
##Save the Tfidf vectorizer and the ML model

In [36]:
with open('vectorizer.pickle','wb') as fin:
    pickle.dump(tf_vect,fin)

In [37]:
with open('mlmodel.pickle','wb') as f:
    pickle.dump(clf,f)

In [38]:
#Load the Tfidf vectorizer and the ML model

In [39]:
pkl=open('mlmodel.pickle','rb')
clf=pickle.load(pkl)
vec=open('vectorizer.pickle','rb')
tf_vect=pickle.load(vec)

In [40]:
def test_string(s):
    X_test_tf=tf_vect.transform([s])
    y_predict=clf.predict(X_test_tf)
    return y_predict

In [41]:
#Predict on the Test Data

In [42]:
X_test_tf=tf_vect.transform(review_test)
pred=clf.predict(X_test_tf)

In [43]:
print(metrics.accuracy_score(label_test,pred))

0.884375


In [44]:
print(confusion_matrix(label_test,pred))

[[139  16]
 [ 21 144]]


In [45]:
print(classification_report(label_test,pred))

             precision    recall  f1-score   support

  deceptive       0.87      0.90      0.88       155
      truth       0.90      0.87      0.89       165

avg / total       0.88      0.88      0.88       320



In [46]:
#Test the model with two reviews from Yelp

In [47]:
test_string("The hotel was good.The room had a 27-inch Samsung led tv, a microwave.The room had a double bed")

array(['truth'], dtype=object)

In [48]:
test_string("My family and I are huge fans of this place. The staff is super nice, and the food is great. The chicken is very good, and the garlic sauce is perfect. Ice cream topped with fruit is delicious too. Highly recommended!")

array(['truth'], dtype=object)

In [49]:
test_string("very boring mcdoanlds good")

array(['truth'], dtype=object)

In [50]:
# Test the Model using different Random States

In [51]:
review_train, review_test, label_train, label_test = train_test_split(result['pos'],result['Labels'], test_size=0.2,random_state=20)

In [52]:
X_test_tf=tf_vect.transform(review_test)
pred=clf.predict(X_test_tf)

In [53]:
print(metrics.accuracy_score(label_test,pred))

0.984375


In [54]:
print (confusion_matrix(label_test, pred))

[[158   1]
 [  4 157]]


In [55]:
print (classification_report(label_test, pred))

             precision    recall  f1-score   support

  deceptive       0.98      0.99      0.98       159
      truth       0.99      0.98      0.98       161

avg / total       0.98      0.98      0.98       320

