In [1]:
import pandas as pd
import os
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
from gensim import matutils,corpora, models
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

  from numpy.core.umath_tests import inner1d


In [2]:
negative_list = os.listdir("negative_reviews") # names of all files in the negative_polarity dir into a list
positive_list = os.listdir("positive_reviews") # names of all files in the positive_polarity dir into a list

In [3]:
def preprocess(files_list,root_dir,polarity):
    labeled_class = []
    reviews = []
    actual_class =[]
    for j in files_list:
        labeled_class.append(polarity)
        k = str(open( root_dir + '/' + j).read())
        reviews.append(k)
        actual_class.append(str(j.split('_')[0]))
    data = pd.DataFrame({'labeled_class':labeled_class,'review':reviews,'actual_class':actual_class})
    return data

In [4]:
negative_df = preprocess(negative_list,'negative_reviews','negative')
positive_df = preprocess(positive_list,'positive_reviews','positive')

In [5]:
negative_df.head()

Unnamed: 0,labeled_class,review,actual_class
0,negative,My wife and I just spent a long weekend at the...,t
1,negative,The historic feel of the hotel really had a st...,t
2,negative,I haven't actually stayed at this hotel- yet- ...,t
3,negative,I was very much looking forward to our stay at...,t
4,negative,The hotel is almost always very helpful. This ...,t


In [6]:
positive_df.head()

Unnamed: 0,labeled_class,review,actual_class
0,positive,The Hard Rock Hotel Chicago has become my favo...,t
1,positive,My wife and I had to stay downtown for event t...,t
2,positive,took a weekend trip with my wife. got a great ...,t
3,positive,We stayed here because of all the great review...,t
4,positive,Just returned from a week in Chicago with the ...,t


In [7]:
target = []
for i in positive_df.index:
    if ((positive_df['labeled_class'][i] == 'positive') & (positive_df['actual_class'][i] == 't')):
        target.append(2)
    elif ((positive_df['labeled_class'][i] == 'positive') & (positive_df['actual_class'][i] == 'd')):
        target.append(1)
    else:
        print('Error!')
positive_df['target'] = target

In [8]:
target = []
for i in negative_df.index:
    if ((negative_df['labeled_class'][i] == 'negative') & (negative_df['actual_class'][i] == 't')):
        target.append(3)
    elif ((negative_df['labeled_class'][i] == 'negative') & (negative_df['actual_class'][i] == 'd')):
        target.append(4)
    else:
        print('Error!')
negative_df['target'] = target

In [9]:
data = positive_df.merge(negative_df,how='outer')

In [10]:
data = data[['review','target']]

In [11]:
data.head()

Unnamed: 0,review,target
0,The Hard Rock Hotel Chicago has become my favo...,2
1,My wife and I had to stay downtown for event t...,2
2,took a weekend trip with my wife. got a great ...,2
3,We stayed here because of all the great review...,2
4,Just returned from a week in Chicago with the ...,2


In [12]:
data.target.value_counts()

4    410
3    400
2    400
1    400
Name: target, dtype: int64

In [13]:
def extract_tokens(df):
    review_tokenized = []
    lmt = WordNetLemmatizer()
    for index, datapoint in df.iterrows():
        tokenize_words = word_tokenize(datapoint["review"].lower(),language='english')
        pos_word = pos_tag(tokenize_words)
        tokenize_words = ["_".join([lmt.lemmatize(i[0]),i[1]]) 
                        for i in pos_word if (i[0] not in stopwords.words("english") and len(i[0]) > 2)]
        review_tokenized.append(tokenize_words)
    df["review_tokenized"] = review_tokenized
    return df

data = extract_tokens(data)
data

Unnamed: 0,review,target,review_tokenized
0,The Hard Rock Hotel Chicago has become my favo...,2,"[hard_JJ, rock_NN, hotel_NN, chicago_NN, becom..."
1,My wife and I had to stay downtown for event t...,2,"[wife_NN, stay_VB, downtown_RB, event_NN, atte..."
2,took a weekend trip with my wife. got a great ...,2,"[took_VBD, weekend_NN, trip_NN, wife_NN, got_V..."
3,We stayed here because of all the great review...,2,"[stayed_VBD, great_JJ, review_NNS, true_JJ, hi..."
4,Just returned from a week in Chicago with the ...,2,"[returned_VBN, week_NN, chicago_NN, family_NN,..."
5,We stayed at the Swissotel for a special birth...,2,"[stayed_VBD, swissotel_NN, special_JJ, birthda..."
6,I just left the Conrad Chicago and have nothin...,2,"[left_VBD, conrad_NN, chicago_NN, nothing_NN, ..."
7,"This is a great hotel! The views are fabulous,...",2,"[great_JJ, hotel_NN, view_NNS, fabulous_JJ, se..."
8,Got a good deal on a weekend pacage in this Ho...,2,"[got_VBD, good_JJ, deal_NN, weekend_NN, pacage..."
9,"Thirty years ago, we had a tiny ""room"" and ind...",2,"[thirty_CD, year_NNS, ago_RB, tiny_JJ, room_NN..."


no_below is an int that represents a threshold filtering out number of occurrence of the tokens among documents above certain number. e.g. use no_below to filter out words appearing less than 10 times.

On the contrary, no_above is not a int but a float that represents faction of total corpus size. e.g. use no_above to filter out words appearing in more than 10% of all documents

In [14]:
from gensim import matutils,corpora, models

def vectorize_comments(df):
    d = corpora.Dictionary(df["review_tokenized"])
    d.filter_extremes(no_below=5, no_above=1)
    d.compactify()## remove gaps in id sequence after words that were removed
    corpus = [d.doc2bow(text) for text in df["review_tokenized"]]
    corpus = matutils.corpus2csc(corpus, num_terms=len(d.token2id))
    corpus = corpus.transpose()
    return d, corpus

dictionary,corpus = vectorize_comments(data)
print (corpus.shape)

(1610, 5964)


In [15]:
def train_rfc(X,y):
    n_estimators = [100]
    min_samples_split = [2]
    min_samples_leaf = [1]
    bootstrap = [True]
    parameters = {'n_estimators': n_estimators, 'min_samples_leaf': min_samples_leaf,
                  'min_samples_split': min_samples_split}
    clf = GridSearchCV(RandomForestClassifier(verbose=1,n_jobs=-1), cv=4, param_grid=parameters)
    clf.fit(X, y)
    return clf

In [16]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(corpus, data["target"], test_size=0.3, random_state=2016)
rfc_clf = train_rfc(X_train,y_train)
print ("Accuracy of RF on CV sets :{}".format(rfc_clf.best_score_))

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jo

Accuracy of RF on CV sets :0.7054125998225377


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.4s finished


In [17]:
print("Accuracy of RF on test sets is : {}".format(rfc_clf.score(X_test,y_test)))

Accuracy of RF on test sets is : 0.7018633540372671


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished


In [18]:
def train_svm(X,y):
    parameters = {'C': [10,15,20,25],'random_state':[2016]}
    clf = GridSearchCV(SVC(), cv=4, param_grid=parameters)
    clf.fit(X, y)
    return clf

In [19]:
svc_clf = train_svm(X_train,y_train)
print("Best accuracy of SVM on CV sets :{}".format(svc_clf.best_score_))
print("Accuracy of SVM on test sets is : {}".format(svc_clf.score(X_test,y_test)))

Best accuracy of SVM on CV sets :0.74622892635315
Accuracy of SVM on test sets is : 0.7412008281573499


In [20]:
def model_test(review):
    a = svc_clf.predict(review)
    if a == 1.0 :
        return('Fake Review (Positive review)')
    elif a == 2.0:
        return('True Review (Positive review)')
    elif a == 3.0:
        return('True Review (Negative review)')
    else :
        return('Fake Review (Negative review)')

In [21]:
for i in X_test[:10]:
    print(model_test(i))
    print('')

Fake Review (Negative review)

True Review (Negative review)

Fake Review (Negative review)

True Review (Positive review)

Fake Review (Negative review)

True Review (Negative review)

Fake Review (Positive review)

True Review (Positive review)

Fake Review (Positive review)

True Review (Positive review)

