In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.sparse as sps
import itertools
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from PIL import Image
import re

In [3]:
hotelReviews = pd.read_csv('Hotel_Reviews-2.csv')
cities = ['Paris', 'Amsterdam', 'London', 'Milan', 'Barcelona', 'Vienna']
nationality = ' United Kingdom '

In [4]:
#threshold for word count

wordThreshold = 3

In [5]:
#split function

def wordSplit(rev, stop_words = stopwords.words('English')):
    s = [item for sublist in [rev.lower().split() for f in re.findall('\d+|\D+',rev.lower())] for item in sublist]
    s = [x for x in s if x not in stop_words]
    return s

In [6]:
#bag-of-words shingling

def bagOfWordsShingles(review, k=3, stop_words = stopwords.words('English')):
    s = wordSplit(review, stop_words)
    tokens = []
    for i in range(len(s) - k - 1):
        bag = ''
        for j in range(i, i + k):
            bag += s[j]+' '
        tokens.append(bag[:-1])
    return tokens

In [7]:
englishReviews = hotelReviews.loc[hotelReviews['Reviewer_Nationality']==nationality]
posEnglishReviews = englishReviews.loc[englishReviews['Review_Total_Positive_Word_Counts'] > wordThreshold]
negEnglishReviews = englishReviews.loc[englishReviews['Review_Total_Negative_Word_Counts'] > wordThreshold]

In [21]:
#Counting the word count for the Positive Reviews and Thresholding

Positive_Processed_Word_Count = []
i = 0
for rev in posEnglishReviews['Positive_Review']:
    if i % (posEnglishReviews.shape[0]/10) == 0:
        print ('    ', 100 * (1.0 * i / posEnglishReviews.shape[0]), '% finished')
    i+=1
    s = wordSplit(rev)
    Positive_Processed_Word_Count.append(len(s))
    
posEnglishReviews['Positive_Processed_Word_Count'] = Positive_Processed_Word_Count
thrPosEngRevs = posEnglishReviews.loc[posEnglishReviews['Positive_Processed_Word_Count'] > wordThreshold]['Positive_Review']

     0.0 % finished


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [22]:
#Counting the word count for the Negative Reviews and Thresholding

Negative_Processed_Word_Count = []
i = 0
for rev in negEnglishReviews['Negative_Review']:
    if i % (negEnglishReviews.shape[0]/10) == 0:
        print ('    ', 100 * (1.0 * i / negEnglishReviews.shape[0]), '% finished')
    i+=1
    s = wordSplit(rev)
    Negative_Processed_Word_Count.append(len(s))
    
negEnglishReviews['Negative_Processed_Word_Count'] = Negative_Processed_Word_Count
thrNegEngRevs = negEnglishReviews.loc[negEnglishReviews['Negative_Processed_Word_Count'] > wordThreshold]['Negative_Review']

     0.0 % finished


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [24]:
#see how many reviews were removed from dataset after thresholding

print (thrPosEngRevs.shape[0], posEnglishReviews.shape[0])
print (thrNegEngRevs.shape[0], negEnglishReviews.shape[0])
print ("Percentage of Negative Reviews: ", 1.0*thrNegEngRevs.shape[0] / (thrPosEngRevs.shape[0] + thrNegEngRevs.shape[0]))
print ("Percentage of Positive Reviews: ", 1.0*thrPosEngRevs.shape[0] / (thrPosEngRevs.shape[0] + thrNegEngRevs.shape[0]))

176425 211097
135926 166078
Percentage of Negative Reviews:  0.4351706893846986
Percentage of Positive Reviews:  0.5648293106153014


In [25]:
#python object
#numpy ndarray
#sparse matrix of type <class 'numpy.float64'>'

allReviews = pd.concat([thrPosEngRevs, thrNegEngRevs])
labels = np.hstack([np.ones(thrPosEngRevs.shape[0]), np.zeros(thrNegEngRevs.shape[0])])
featureMatrix = TfidfVectorizer(stop_words=None, tokenizer=bagOfWordsShingles, lowercase=True).fit_transform(allReviews)

In [33]:
featureMatrix

<312351x2424263 sparse matrix of type '<class 'numpy.float64'>'
	with 3287595 stored elements in Compressed Sparse Row format>

In [26]:
sps.save_npz('3WordThresholdedEnglishReviews_Features', featureMatrix)
np.savez('3WordThresholdedEnglishReviews_Labels',labels)

In [27]:
thrPosEngRevs.to_csv('3WordThresholdedEnglishReviews_Positive')
thrNegEngRevs.to_csv('3WordThresholdedEnglishReviews_Negative')

In [34]:
from sklearn import svm

In [35]:
clf=svm.SVC(kernel='linear', C=1.0)
clf.fit(featureMatrix, labels)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)