In [1]:
import pandas as pd
import os
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
from gensim import matutils,corpora, models
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

  from numpy.core.umath_tests import inner1d


# We have positive reviews and negative reviews in separate directories

In [2]:
negative_list = os.listdir("negative_reviews") # names of all files in the negative_polarity dir into a list
positive_list = os.listdir("positive_reviews") # names of all files in the positive_polarity dir into a list

In [3]:
def preprocess(files_list,root_dir,polarity):
    labeled_class = []
    reviews = []
    actual_class =[]
    for j in files_list:
        labeled_class.append(polarity)
        k = str(open( root_dir + '/' + j).read())
        reviews.append(k)
        actual_class.append(str(j.split('_')[0]))
    data = pd.DataFrame({'labeled_class':labeled_class,'review':reviews,'actual_class':actual_class})
    return data

In [4]:
negative_df = preprocess(negative_list,'negative_reviews','negative')
positive_df = preprocess(positive_list,'positive_reviews','positive')

In [5]:
negative_df.head()

Unnamed: 0,labeled_class,review,actual_class
0,negative,My wife and I just spent a long weekend at the...,t
1,negative,The historic feel of the hotel really had a st...,t
2,negative,I haven't actually stayed at this hotel- yet- ...,t
3,negative,I was very much looking forward to our stay at...,t
4,negative,The hotel is almost always very helpful. This ...,t


In [6]:
positive_df.head()

Unnamed: 0,labeled_class,review,actual_class
0,positive,The Hard Rock Hotel Chicago has become my favo...,t
1,positive,My wife and I had to stay downtown for event t...,t
2,positive,took a weekend trip with my wife. got a great ...,t
3,positive,We stayed here because of all the great review...,t
4,positive,Just returned from a week in Chicago with the ...,t


In [7]:
target = []
for i in positive_df.index:
    if ((positive_df['labeled_class'][i] == 'positive') & (positive_df['actual_class'][i] == 't')):
        target.append(2)
    elif ((positive_df['labeled_class'][i] == 'positive') & (positive_df['actual_class'][i] == 'd')):
        target.append(1)
    else:
        print('Error!')
positive_df['target'] = target

In [8]:
target = []
for i in negative_df.index:
    if ((negative_df['labeled_class'][i] == 'negative') & (negative_df['actual_class'][i] == 't')):
        target.append(3)
    elif ((negative_df['labeled_class'][i] == 'negative') & (negative_df['actual_class'][i] == 'd')):
        target.append(4)
    else:
        print('Error!')
negative_df['target'] = target

In [9]:
data = positive_df.merge(negative_df,how='outer')

In [10]:
data = data[['review','target']]

In [11]:
data.head()

Unnamed: 0,review,target
0,The Hard Rock Hotel Chicago has become my favo...,2
1,My wife and I had to stay downtown for event t...,2
2,took a weekend trip with my wife. got a great ...,2
3,We stayed here because of all the great review...,2
4,Just returned from a week in Chicago with the ...,2


In [12]:
data.target.value_counts()

4    410
3    400
2    400
1    400
Name: target, dtype: int64

In [13]:
#Using Spacy to get tokens and pos tags

In [14]:
nlp=spacy.load('en',entity='false');

In [15]:
#pos_tags=[];
#g=[[]];
#for datapoint in data['review_tokenized']:
 #   s=""
  #  for j in datapoint:
   #     s=s+str(j)+"_"+j.pos_;
    #    s=s+","
    #t=s;    
    #pos_tags.append(t)  
    #g.append(pos_tags)
#print(g) 


In [16]:
tokens = []
lemma = []
pos = []
for doc in nlp.pipe(data['review'].astype('unicode').values, batch_size=50,
                        n_threads=6):
    if doc.is_parsed:
        tokens.append(str([n.text for n in doc ]))
        lemma.append([n.lemma_ for n in doc])
        pos.append([n.pos_ for n in doc])
    else:
        # We want to make sure that the lists of parsed results have the
        # same number of entries of the original Dataframe, so add some blanks in case the parse fails
        tokens.append(None)
        lemma.append(None)
        pos.append(None)   
        
data['species_tokens'] = tokens
data['species_lemma'] = lemma
data['species_pos'] = pos

In [17]:
data['species_tokens']

0       ['The', 'Hard', 'Rock', 'Hotel', 'Chicago', 'h...
1       ['My', 'wife', 'and', 'I', 'had', 'to', 'stay'...
2       ['took', 'a', 'weekend', 'trip', 'with', 'my',...
3       ['We', 'stayed', 'here', 'because', 'of', 'all...
4       ['Just', 'returned', 'from', 'a', 'week', 'in'...
5       ['We', 'stayed', 'at', 'the', 'Swissotel', 'fo...
6       ['I', 'just', 'left', 'the', 'Conrad', 'Chicag...
7       ['This', 'is', 'a', 'great', 'hotel', '!', 'Th...
8       ['Got', 'a', 'good', 'deal', 'on', 'a', 'weeke...
9       ['Thirty', 'years', 'ago', ',', 'we', 'had', '...
10      ['I', 'stayed', 'here', 'August', '20', 'throu...
11      ['The', 'Omni', 'Chicago', 'Hotel', 'I', 'am',...
12      ['excellent', 'staff', 'and', 'customer', 'ser...
13      ['I', 'am', 'a', 'fan', 'of', 'Kimpton', 'Hote...
14      ['I', 'stay', 'at', 'this', 'hotel', '2', 'tim...
15      ['The', 'InterContinental', 'Chicago', 'is', '...
16      ['This', 'hotel', 'was', 'wonderful', '!', 'I'...
17      ['I', 

In [18]:
#data['species_pos'] = data['species_pos'].astype(str).apply(lambda x: ', '.join(data['species_pos'].astype(str)))

In [19]:
data

Unnamed: 0,review,target,species_tokens,species_lemma,species_pos
0,The Hard Rock Hotel Chicago has become my favo...,2,"['The', 'Hard', 'Rock', 'Hotel', 'Chicago', 'h...","[the, hard, rock, hotel, chicago, have, become...","[DET, PROPN, PROPN, PROPN, PROPN, VERB, VERB, ..."
1,My wife and I had to stay downtown for event t...,2,"['My', 'wife', 'and', 'I', 'had', 'to', 'stay'...","[-PRON-, wife, and, -PRON-, have, to, stay, do...","[ADJ, NOUN, CCONJ, PRON, VERB, PART, VERB, NOU..."
2,took a weekend trip with my wife. got a great ...,2,"['took', 'a', 'weekend', 'trip', 'with', 'my',...","[take, a, weekend, trip, with, -PRON-, wife, ....","[VERB, DET, NOUN, NOUN, ADP, ADJ, NOUN, PUNCT,..."
3,We stayed here because of all the great review...,2,"['We', 'stayed', 'here', 'because', 'of', 'all...","[-PRON-, stay, here, because, of, all, the, gr...","[PRON, VERB, ADV, ADP, ADP, ADJ, DET, ADJ, NOU..."
4,Just returned from a week in Chicago with the ...,2,"['Just', 'returned', 'from', 'a', 'week', 'in'...","[just, return, from, a, week, in, chicago, wit...","[ADV, VERB, ADP, DET, NOUN, ADP, PROPN, ADP, D..."
5,We stayed at the Swissotel for a special birth...,2,"['We', 'stayed', 'at', 'the', 'Swissotel', 'fo...","[-PRON-, stay, at, the, swissotel, for, a, spe...","[PRON, VERB, ADP, DET, PROPN, ADP, DET, ADJ, N..."
6,I just left the Conrad Chicago and have nothin...,2,"['I', 'just', 'left', 'the', 'Conrad', 'Chicag...","[-PRON-, just, leave, the, conrad, chicago, an...","[PRON, ADV, VERB, DET, PROPN, PROPN, CCONJ, VE..."
7,"This is a great hotel! The views are fabulous,...",2,"['This', 'is', 'a', 'great', 'hotel', '!', 'Th...","[this, be, a, great, hotel, !, the, view, be, ...","[DET, VERB, DET, ADJ, NOUN, PUNCT, DET, NOUN, ..."
8,Got a good deal on a weekend pacage in this Ho...,2,"['Got', 'a', 'good', 'deal', 'on', 'a', 'weeke...","[get, a, good, deal, on, a, weekend, pacage, i...","[VERB, DET, ADJ, NOUN, ADP, DET, NOUN, NOUN, A..."
9,"Thirty years ago, we had a tiny ""room"" and ind...",2,"['Thirty', 'years', 'ago', ',', 'we', 'had', '...","[thirty, year, ago, ,, -PRON-, have, a, tiny, ...","[NUM, NOUN, ADV, PUNCT, PRON, VERB, DET, ADJ, ..."


In [20]:
col = []
for i,j in zip(data['species_tokens'], data['species_pos']):
   col.append([ x+'_'+ y for x,y in zip(eval(i),j)])

In [21]:
col[:10]

[['The_DET',
  'Hard_PROPN',
  'Rock_PROPN',
  'Hotel_PROPN',
  'Chicago_PROPN',
  'has_VERB',
  'become_VERB',
  'my_ADJ',
  'favorite_ADJ',
  'hotel_NOUN',
  '._PUNCT',
  'I_PRON',
  "'ve_VERB",
  'stayed_VERB',
  'there_ADV',
  'at_ADV',
  'least_ADV',
  '5_NUM',
  'times_NOUN',
  'now_ADV',
  'and_CCONJ',
  'have_VERB',
  'never_ADV',
  'had_VERB',
  'anything_NOUN',
  'other_ADJ',
  'than_ADP',
  'a_DET',
  'wonderful_ADJ',
  'experience_NOUN',
  '._PUNCT',
  'As_ADP',
  'you_PRON',
  'might_VERB',
  'have_VERB',
  'guessed_VERB',
  '-_PUNCT',
  'it_PRON',
  'has_VERB',
  'a_DET',
  'super_ADJ',
  'Rock_PROPN',
  '&_CCONJ',
  'Roll_PROPN',
  'theme_NOUN',
  ',_PUNCT',
  'with_ADP',
  'some_DET',
  'music_NOUN',
  'paraphanelia_NOUN',
  'in_ADP',
  'the_DET',
  'lobby_NOUN',
  'and_CCONJ',
  'on_ADP',
  'each_DET',
  'floor_NOUN',
  "'s_PART",
  'elevator_NOUN',
  'lobby_NOUN',
  '._PUNCT',
  'The_DET',
  'rooms_NOUN',
  'all_DET',
  'have_VERB',
  'large_ADJ',
  'photo_NOUN',
  'm

In [22]:
data['review_tokenized'] = pd.Series(col)

In [23]:
data.head()

Unnamed: 0,review,target,species_tokens,species_lemma,species_pos,review_tokenized
0,The Hard Rock Hotel Chicago has become my favo...,2,"['The', 'Hard', 'Rock', 'Hotel', 'Chicago', 'h...","[the, hard, rock, hotel, chicago, have, become...","[DET, PROPN, PROPN, PROPN, PROPN, VERB, VERB, ...","[The_DET, Hard_PROPN, Rock_PROPN, Hotel_PROPN,..."
1,My wife and I had to stay downtown for event t...,2,"['My', 'wife', 'and', 'I', 'had', 'to', 'stay'...","[-PRON-, wife, and, -PRON-, have, to, stay, do...","[ADJ, NOUN, CCONJ, PRON, VERB, PART, VERB, NOU...","[My_ADJ, wife_NOUN, and_CCONJ, I_PRON, had_VER..."
2,took a weekend trip with my wife. got a great ...,2,"['took', 'a', 'weekend', 'trip', 'with', 'my',...","[take, a, weekend, trip, with, -PRON-, wife, ....","[VERB, DET, NOUN, NOUN, ADP, ADJ, NOUN, PUNCT,...","[took_VERB, a_DET, weekend_NOUN, trip_NOUN, wi..."
3,We stayed here because of all the great review...,2,"['We', 'stayed', 'here', 'because', 'of', 'all...","[-PRON-, stay, here, because, of, all, the, gr...","[PRON, VERB, ADV, ADP, ADP, ADJ, DET, ADJ, NOU...","[We_PRON, stayed_VERB, here_ADV, because_ADP, ..."
4,Just returned from a week in Chicago with the ...,2,"['Just', 'returned', 'from', 'a', 'week', 'in'...","[just, return, from, a, week, in, chicago, wit...","[ADV, VERB, ADP, DET, NOUN, ADP, PROPN, ADP, D...","[Just_ADV, returned_VERB, from_ADP, a_DET, wee..."


In [24]:
from gensim import matutils,corpora, models

def vectorize_comments(df):
    d = corpora.Dictionary(df["review_tokenized"])
    d.filter_extremes(no_below=2, no_above=1)
    d.compactify()
    corpus = [d.doc2bow(text) for text in df["review_tokenized"]]
    corpus = matutils.corpus2csc(corpus, num_terms=len(d.token2id))
    corpus = corpus.transpose()
    return d, corpus

dictionary,corpus = vectorize_comments(data)
print (corpus.shape)

(1610, 6362)


In [25]:
def train_rfc(X,y):
    n_estimators = [100]
    min_samples_split = [2]
    min_samples_leaf = [1]
    bootstrap = [True]
    parameters = {'n_estimators': n_estimators, 'min_samples_leaf': min_samples_leaf,
                  'min_samples_split': min_samples_split}
    clf = GridSearchCV(RandomForestClassifier(verbose=1,n_jobs=-1), cv=4, param_grid=parameters)
    clf.fit(X, y)
    return clf


In [54]:
print(corpus)

  (0, 0)	1.0
  (0, 1)	2.0
  (0, 2)	1.0
  (0, 3)	2.0
  (0, 4)	1.0
  (0, 5)	1.0
  (0, 6)	1.0
  (0, 7)	2.0
  (0, 8)	3.0
  (0, 9)	3.0
  (0, 10)	3.0
  (0, 11)	3.0
  (0, 12)	4.0
  (0, 13)	1.0
  (0, 14)	1.0
  (0, 15)	1.0
  (0, 16)	1.0
  (0, 17)	1.0
  (0, 18)	1.0
  (0, 19)	1.0
  (0, 20)	1.0
  (0, 21)	2.0
  (0, 22)	1.0
  (0, 23)	1.0
  (0, 24)	1.0
  :	:
  (1609, 1889)	3.0
  (1609, 1989)	1.0
  (1609, 2088)	1.0
  (1609, 2172)	1.0
  (1609, 2310)	1.0
  (1609, 2793)	1.0
  (1609, 2800)	2.0
  (1609, 2820)	1.0
  (1609, 2872)	1.0
  (1609, 2908)	1.0
  (1609, 3167)	1.0
  (1609, 3419)	1.0
  (1609, 3578)	1.0
  (1609, 3590)	1.0
  (1609, 3966)	1.0
  (1609, 4044)	1.0
  (1609, 4395)	1.0
  (1609, 4581)	1.0
  (1609, 4673)	1.0
  (1609, 5275)	1.0
  (1609, 5530)	1.0
  (1609, 6254)	1.0
  (1609, 6265)	1.0
  (1609, 6354)	1.0
  (1609, 6360)	1.0


In [26]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(corpus, data["target"], test_size=0.3, random_state=2016)
rfc_clf = train_rfc(X_train,y_train)
print ("Accuracy of RF on CV sets :{}".format(rfc_clf.best_score_))

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jo

Accuracy of RF on CV sets :0.6921029281277729


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.4s finished


In [27]:
print("Accuracy of RF on test sets is : {}".format(rfc_clf.score(X_test,y_test)))

Accuracy of RF on test sets is : 0.6790890269151139


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished


In [28]:
def train_svm(X,y):
    parameters = {'C': [10,15,20,25],'random_state':[0]}
    clf = GridSearchCV(SVC(), cv=4, param_grid=parameters)
    clf.fit(X, y)
    return clf

In [50]:
svc_clf = train_svm(X_train,y_train)
print("Best accuracy of SVM on CV sets :{}".format(svc_clf.best_score_))
print("Accuracy of SVM on test sets is : {}".format(svc_clf.score(X_test,y_test)))

Best accuracy of SVM on CV sets :0.7551020408163265
Accuracy of SVM on test sets is : 0.7660455486542443


In [60]:
def model_test(review):
    a = svc_clf.predict(review)
    if a == 1.0 :
        return('Fake Review (Positive)')
    elif a == 2.0:
        return('True Review (Positive)')
    elif a == 3.0:
        return('True Review (Negative)')
    else :
        return('Fake Review (Negative)')

In [68]:
for i in X_test:
    print(model_test(i)) 

Fake Review (Negative)
True Review (Positive)
Fake Review (Negative)
True Review (Positive)
Fake Review (Negative)
Fake Review (Negative)
Fake Review (Positive)
True Review (Positive)
Fake Review (Positive)
True Review (Positive)
True Review (Negative)
True Review (Positive)
True Review (Negative)
True Review (Positive)
True Review (Positive)
Fake Review (Positive)
True Review (Positive)
Fake Review (Negative)
True Review (Positive)
Fake Review (Negative)
Fake Review (Negative)
True Review (Positive)
Fake Review (Negative)
True Review (Negative)
True Review (Positive)
True Review (Negative)
True Review (Negative)
Fake Review (Negative)
Fake Review (Negative)
Fake Review (Negative)
Fake Review (Negative)
Fake Review (Positive)
Fake Review (Negative)
True Review (Negative)
True Review (Positive)
True Review (Negative)
Fake Review (Positive)
Fake Review (Positive)
Fake Review (Negative)
Fake Review (Negative)
True Review (Positive)
Fake Review (Positive)
Fake Review (Positive)
True Review

True Review (Positive)
True Review (Negative)
Fake Review (Negative)
True Review (Positive)
True Review (Negative)
True Review (Positive)
Fake Review (Negative)
True Review (Negative)
Fake Review (Positive)
True Review (Negative)
Fake Review (Positive)
True Review (Positive)
Fake Review (Positive)
Fake Review (Positive)
True Review (Positive)
True Review (Positive)
Fake Review (Positive)
True Review (Positive)
Fake Review (Negative)
Fake Review (Positive)
Fake Review (Negative)
Fake Review (Positive)
True Review (Negative)
Fake Review (Negative)
True Review (Negative)
Fake Review (Positive)
True Review (Positive)
True Review (Negative)
True Review (Positive)
True Review (Positive)
Fake Review (Positive)
Fake Review (Negative)
Fake Review (Positive)
Fake Review (Positive)
Fake Review (Positive)
Fake Review (Negative)
True Review (Negative)
Fake Review (Negative)
True Review (Negative)
Fake Review (Negative)
Fake Review (Negative)
True Review (Negative)
Fake Review (Positive)
True Review