# This is the code for constructing the TF-IDF vector for calssification

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.sparse as sps
import itertools
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from PIL import Image
import re

In [2]:
hotelReviews = pd.read_csv('C:/Users/deniz/Documents/HotelReviewsCountryCity.csv')
cities = ['Paris', 'Amsterdam', 'London', 'Milan', 'Barcelona', 'Vienna']
nationality = ' United Kingdom '

## This parameter represents the threshold for word count. 
## Note that 3 means reviews with MORE than 3 words will be in the dataset

In [3]:
wordThreshold = 3

## This is the split and Bag-Of-Words Shingling functions
## Change k according to your wish. But 3-shingles should work fine for English Language

In [4]:
def wordSplit(rev, stop_words = stopwords.words('English')):
    s = [item for sublist in [rev.lower().split() for f in re.findall('\d+|\D+',rev.lower())] for item in sublist]
    s = [x for x in s if x not in stop_words]
    return s

In [5]:
def bagOfWordsShingles(review, k=3, stop_words = stopwords.words('English')):
    s = wordSplit(review, stop_words)
    tokens = []
    for i in range(len(s) - k - 1):
        bag = ''
        for j in range(i, i + k):
            bag += s[j]+' '
        tokens.append(bag[:-1])
    return tokens

In [9]:
englishReviews = hotelReviews.loc[hotelReviews['Reviewer_Nationality']==nationality]
posEnglishReviews = englishReviews.loc[englishReviews['Review_Total_Positive_Word_Counts'] > wordThreshold]
negEnglishReviews = englishReviews.loc[englishReviews['Review_Total_Negative_Word_Counts'] > wordThreshold]

## Counting the word count (after preprocessing) for the Positive Reviews and Thresholding

In [10]:
Positive_Processed_Word_Count = []
i = 0
for rev in posEnglishReviews['Positive_Review']:
    if i % (posEnglishReviews.shape[0]/10) == 0:
        print '    ', 100 * (1.0 * i / posEnglishReviews.shape[0]), '% finished'
    i+=1
    s = wordSplit(rev)
    Positive_Processed_Word_Count.append(len(s))
    
posEnglishReviews['Positive_Processed_Word_Count'] = Positive_Processed_Word_Count
thrPosEngRevs = posEnglishReviews.loc[posEnglishReviews['Positive_Processed_Word_Count'] > wordThreshold]['Positive_Review']

     0.0 % finished
     9.99966839889 % finished
     19.9993367978 % finished
     29.9990051967 % finished
     39.9986735956 % finished
     49.9983419944 % finished
     59.9980103933 % finished
     69.9976787922 % finished
     79.9973471911 % finished
     89.99701559 % finished
     99.9966839889 % finished


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


## Counting the word count (after preprocessing) for the Negative Reviews and Thresholding

In [11]:
Negative_Processed_Word_Count = []
i = 0
for rev in negEnglishReviews['Negative_Review']:
    if i % (negEnglishReviews.shape[0]/10) == 0:
        print '    ', 100 * (1.0 * i / negEnglishReviews.shape[0]), '% finished'
    i+=1
    s = wordSplit(rev)
    Negative_Processed_Word_Count.append(len(s))
    
negEnglishReviews['Negative_Processed_Word_Count'] = Negative_Processed_Word_Count
thrNegEngRevs = negEnglishReviews.loc[negEnglishReviews['Negative_Processed_Word_Count'] > wordThreshold]['Negative_Review']

     0.0 % finished
     9.99951829863 % finished
     19.9990365973 % finished
     29.9985548959 % finished
     39.9980731945 % finished
     49.9975914932 % finished
     59.9971097918 % finished
     69.9966280904 % finished
     79.996146389 % finished
     89.9956646877 % finished
     99.9951829863 % finished


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


## Just to see how many reviews were removed from the dataset after thresholding

In [12]:
print thrPosEngRevs.shape[0], posEnglishReviews.shape[0]
print thrNegEngRevs.shape[0], negEnglishReviews.shape[0]
print "Percentage of Negative Reviews: ", 1.0*thrNegEngRevs.shape[0] / (thrPosEngRevs.shape[0] + thrNegEngRevs.shape[0])
print "Percentage of Positive Reviews: ", 1.0*thrPosEngRevs.shape[0] / (thrPosEngRevs.shape[0] + thrNegEngRevs.shape[0])

176425 211097
135926 166078
Percentage of Negative Reviews:  0.435170689385
Percentage of Positive Reviews:  0.564829310615


In [13]:
allReviews = pd.concat([thrPosEngRevs, thrNegEngRevs])
labels = np.hstack([np.ones(thrPosEngRevs.shape[0]), np.zeros(thrNegEngRevs.shape[0])])
featureMatrix = TfidfVectorizer(stop_words=None, tokenizer=bagOfWordsShingles, lowercase=True).fit_transform(allReviews)

## Saving the results for later use

In [14]:
sps.save_npz('3WordThresholdedEnglishReviews_Features', featureMatrix)
np.savez('3WordThresholdedEnglishReviews_Labels',labels)

In [15]:
thrPosEngRevs.to_csv('3WordThresholdedEnglishReviews_Positive')
thrNegEngRevs.to_csv('3WordThresholdedEnglishReviews_Negative')

To load the fetureMatrix use sps.load_npz

Otherwise you will probably run out of memory.

To load the labels use np.loadz('file_name.npz')['arr_0']