# This is the code for constructing the TF-IDF vector for classification

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.sparse as sps
import itertools
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from PIL import Image
import re

In [15]:
hotelReviews = pd.read_csv('C:/Users/deniz/Documents/HotelReviewsCountryCity.csv')
cities = ['Paris', 'Amsterdam', 'London', 'Milan', 'Barcelona', 'Vienna']
nationality = ' United Kingdom '

## This parameter represents the threshold for word count. 
## Note that 3 means reviews with MORE than 3 words will be in the dataset

In [3]:
wordThreshold = 3

## This is the split and Bag-Of-Words Shingling functions
## Change k according to your wish. But 3-shingles should work fine for English Language

In [3]:
def wordSplit(rev, stop_words = stopwords.words('English')):
    s = [item for sublist in [rev.lower().split() for f in re.findall('\d+|\D+',rev.lower())] for item in sublist]
    s = [x for x in s if x not in stop_words]
    return s

def bagOfWordsShingles(review, k=3, stop_words = stopwords.words('English')):
    s = wordSplit(review, stop_words)
    if len(s) < k:
        tokens = ['']
        for s_ in s:
            tokens[0] += s_ + ' '
        tokens[0] = tokens[0][:-1]
        return tokens
        
    tokens = []
    for i in range(len(s) - k + 1):
        bag = ''
        for j in range(i, i + k):
            bag += s[j]+' '
        tokens.append(bag[:-1])
    return tokens

def bagOfWordsShingles2(review, k=3, stop_words = stopwords.words('English')):
    s = wordSplit(review, stop_words)
    tokens = []
    for K in range(0,k):
        for i in range(len(s) - (k-K) + 1):
            bag = ''
            for j in range(i, i + k-K):
                bag += s[j]+' '
            tokens.append(bag[:-1])

    return tokens

In [16]:
#englishReviews = hotelReviews.loc[(hotelReviews['Reviewer_Nationality']==nationality) & (hotelReviews['City']=='Barcelona')]
englishReviews = hotelReviews.loc[(hotelReviews['Reviewer_Nationality']==nationality)]
#posEnglishReviews = englishReviews.loc[englishReviews['Review_Total_Positive_Word_Counts'] > wordThreshold]
#negEnglishReviews = englishReviews.loc[englishReviews['Review_Total_Negative_Word_Counts'] > wordThreshold]

In [32]:
posRevs = englishReviews['Positive_Review']
negRevs = englishReviews['Negative_Review']

posRevs = posRevs.reset_index()['Positive_Review']
negRevs = negRevs.reset_index()['Negative_Review']

In [38]:
Positive_Processed_Word_Count = []
Negative_Processed_Word_Count = []
Positive_Processed_Words = []
Negative_Processed_Words = []

for i in range(posRevs.shape[0]):
    if i % (posRevs.shape[0]/20) == 0:
        print '    ', 100 * (1.0 * i / posRevs.shape[0]), '% finished'
        
    sPos = wordSplit(posRevs.iloc[i])
    sNeg = wordSplit(negRevs.iloc[i])
    
    Positive_Processed_Word_Count.append(len(sPos))
    Negative_Processed_Word_Count.append(len(sNeg))
    
    Positive_Processed_Words.append(sPos)
    Negative_Processed_Words.append(sNeg)
    
englishReviews['Positive_Processed_Word_Count'] = Positive_Processed_Word_Count
englishReviews['Negative_Processed_Word_Count'] = Negative_Processed_Word_Count
englishReviews['Positive_Processed_Words'] = Positive_Processed_Words
englishReviews['Negative_Processed_Words'] = Negative_Processed_Words
#thrPosEngRevs = posEnglishReviews.loc[posEnglishReviews['Positive_Processed_Word_Count'] > wordThreshold]['Positive_Review']
englishReviews.to_csv('UKReviewsWordCounted.csv')

     0.0 % finished
     4.99987767385 % finished
     9.99975534769 % finished
     14.9996330215 % finished
     19.9995106954 % finished
     24.9993883692 % finished
     29.9992660431 % finished
     34.9991437169 % finished
     39.9990213908 % finished
     44.9988990646 % finished
     49.9987767385 % finished
     54.9986544123 % finished
     59.9985320862 % finished
     64.99840976 % finished
     69.9982874338 % finished
     74.9981651077 % finished
     79.9980427815 % finished
     84.9979204554 % finished
     89.9977981292 % finished
     94.9976758031 % finished
     99.9975534769 % finished


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [36]:
englishReviews

Unnamed: 0.1,Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,...,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng,Country,City,Positive_Processed_Word_Count,Negative_Processed_Word_Count
3,3,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/31/2017,7.7,Hotel Arena,United Kingdom,My room was dirty and I was afraid to walk ba...,210,1403,...,1,3.8,"[' Leisure trip ', ' Solo traveler ', ' Duplex...",3 days,52.360576,4.915968,Netherlands,Amsterdam,14,105
6,6,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/17/2017,7.7,Hotel Arena,United Kingdom,Cleaner did not change our sheet and duvet ev...,33,1403,...,6,4.6,"[' Leisure trip ', ' Group ', ' Duplex Twin Ro...",17 days,52.360576,4.915968,Netherlands,Amsterdam,8,14
7,7,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/17/2017,7.7,Hotel Arena,United Kingdom,Apart from the price for the brekfast Everyth...,11,1403,...,1,10.0,"[' Leisure trip ', ' Couple ', ' Duplex Double...",17 days,52.360576,4.915968,Netherlands,Amsterdam,13,5
10,10,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/7/2017,7.7,Hotel Arena,United Kingdom,Nothing all great,5,1403,...,2,10.0,"[' Leisure trip ', ' Group ', ' Duplex Double ...",27 days,52.360576,4.915968,Netherlands,Amsterdam,486,2
12,12,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/6/2017,7.7,Hotel Arena,United Kingdom,The floor in my room was filfy dirty Very bas...,28,1403,...,7,4.6,"[' Leisure trip ', ' Solo traveler ', ' Duplex...",28 days,52.360576,4.915968,Netherlands,Amsterdam,4,42
16,16,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/3/2017,7.7,Hotel Arena,United Kingdom,Very steep steps in room up to the bed not sa...,38,1403,...,8,6.3,"[' Leisure trip ', ' Family with young childre...",31 days,52.360576,4.915968,Netherlands,Amsterdam,11,21
20,20,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,6/19/2017,7.7,Hotel Arena,United Kingdom,Bed was on upper level with a narrow twist st...,40,1403,...,1,6.3,"[' Leisure trip ', ' Travelers with friends ',...",45 days,52.360576,4.915968,Netherlands,Amsterdam,12,23
24,24,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,5/27/2017,7.7,Hotel Arena,United Kingdom,Nothing,3,1403,...,2,9.6,"[' Leisure trip ', ' Couple ', ' Duplex Double...",68 days,52.360576,4.915968,Netherlands,Amsterdam,310,1
26,26,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,5/25/2017,7.7,Hotel Arena,United Kingdom,Nothing at all to do with the Hotel of course...,51,1403,...,2,9.6,"[' Leisure trip ', ' Group ', ' Duplex Double ...",70 days,52.360576,4.915968,Netherlands,Amsterdam,243,17
31,31,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,5/11/2017,7.7,Hotel Arena,United Kingdom,Extensive restorations works going on We had ...,61,1403,...,14,7.1,"[' Leisure trip ', ' Group ', ' 2 rooms ', ' S...",84 days,52.360576,4.915968,Netherlands,Amsterdam,9,29


In [34]:
thrPosEngRevs = englishReviews.loc[englishReviews['Positive_Processed_Word_Count'] > wordThreshold]['Positive_Review']
thrNegEngRevs = englishReviews.loc[englishReviews['Negative_Processed_Word_Count'] > wordThreshold]['Negative_Review']

allReviews = pd.concat([thrPosEngRevs, thrNegEngRevs])
labels = np.hstack([np.ones(thrPosEngRevs.shape[0]), np.zeros(thrNegEngRevs.shape[0])])
featureMatrix = TfidfVectorizer(stop_words=stopwords.words('English'),tokenizer=bagOfWordsShingles, lowercase=True).fit_transform(allReviews)

sps.save_npz('UkReviews_Thresholded_BOW3_Features', featureMatrix)
np.savez('UkReviews_Thresholded_BOW3_Labels',labels)

In [35]:
thrPosEngRevs = englishReviews.loc[englishReviews['Positive_Processed_Word_Count'] > wordThreshold]['Positive_Review']
thrNegEngRevs = englishReviews.loc[englishReviews['Negative_Processed_Word_Count'] > wordThreshold]['Negative_Review']

allReviews = pd.concat([thrPosEngRevs, thrNegEngRevs])
labels = np.hstack([np.ones(thrPosEngRevs.shape[0]), np.zeros(thrNegEngRevs.shape[0])])
featureMatrix = TfidfVectorizer(stop_words=stopwords.words('English'),tokenizer=bagOfWordsShingles2, lowercase=True).fit_transform(allReviews)

sps.save_npz('UkReviews_Thresholded_BOW123_Features', featureMatrix)
np.savez('UkReviews_Thresholded_BOW123_Labels',labels)

In [36]:
thrPosEngRevs = englishReviews['Positive_Review']
thrNegEngRevs = englishReviews['Negative_Review']

allReviews = pd.concat([thrPosEngRevs, thrNegEngRevs])
labels = np.hstack([np.ones(thrPosEngRevs.shape[0]), np.zeros(thrNegEngRevs.shape[0])])
featureMatrix = TfidfVectorizer(stop_words=stopwords.words('English'),tokenizer=bagOfWordsShingles, lowercase=True).fit_transform(allReviews)

sps.save_npz('UkReviews_Raw_BOW3_Features', featureMatrix)
np.savez('UkReviews_Raw_BOW3_Labels',labels)

In [37]:
thrPosEngRevs = englishReviews['Positive_Review']
thrNegEngRevs = englishReviews['Negative_Review']

allReviews = pd.concat([thrPosEngRevs, thrNegEngRevs])
labels = np.hstack([np.ones(thrPosEngRevs.shape[0]), np.zeros(thrNegEngRevs.shape[0])])
featureMatrix = TfidfVectorizer(stop_words=stopwords.words('English'),tokenizer=bagOfWordsShingles2, lowercase=True).fit_transform(allReviews)

sps.save_npz('UkReviews_Raw_BOW123_Features', featureMatrix)
np.savez('UkReviews_Raw_BOW123_Labels',labels)