In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.sparse as sps
import itertools
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from PIL import Image
import re

In [12]:
# Load the review table. The feature-export loop below reads its 'City',
# 'Positive_Review', 'Negative_Review', 'Positive_Processed_Word_Count' and
# 'Negative_Processed_Word_Count' columns.
englishReviews = pd.read_csv('UKReviewsWordCounted.csv')
# Cities handled one at a time by the export loop (compared against the 'City' column).
cities = ['Paris', 'Amsterdam', 'London', 'Milan', 'Barcelona', 'Vienna']
# NOTE(review): not referenced by any cell visible here; the surrounding spaces
# presumably mirror how the raw dataset stores nationalities -- confirm before reuse.
nationality = ' United Kingdom '

In [4]:
# Strict lower bound on the '*_Processed_Word_Count' columns: the "Thresholded"
# feature sets keep only reviews whose count is greater than this value.
wordThreshold = 3

In [5]:
def wordSplit(rev, stop_words = stopwords.words('english')):
    """Lowercase a review, tokenize it, and drop stop words.

    The lowercased text is cut into alternating digit / non-digit runs so that
    numbers glued to words (e.g. ``"2nights"``) become separate tokens; each
    run is then split on whitespace.

    Parameters
    ----------
    rev : str
        Raw review text.
    stop_words : iterable of str, optional
        Tokens to discard. Defaults to NLTK's English stop-word list, which is
        loaded once, when this function is defined.

    Returns
    -------
    list of str
        The surviving tokens, in their original order. Empty for an empty review.
    """
    # The corpus file id is lowercase ('english'); 'English' only resolves on
    # case-insensitive filesystems.
    # Raw string: '\d' / '\D' are regex escapes, not Python string escapes.
    fragments = re.findall(r'\d+|\D+', rev.lower())
    # Split each fragment, not the whole review: re-splitting `rev` for every
    # fragment repeated the full token list once per digit/non-digit run.
    tokens = [token for fragment in fragments for token in fragment.split()]
    return [token for token in tokens if token not in stop_words]

def bagOfWordsShingles(review, k=3, stop_words = stopwords.words('english')):
    """Tokenize a review into shingles of ``k`` consecutive words.

    Parameters
    ----------
    review : str
        Raw review text; it is tokenized and stop-word filtered by ``wordSplit``.
    k : int, optional
        Number of words per shingle (default 3).
    stop_words : iterable of str, optional
        Tokens discarded before shingling. Defaults to NLTK's English stop-word
        list, loaded once when this function is defined.

    Returns
    -------
    list of str
        One space-joined string per window of ``k`` consecutive words. When
        fewer than ``k`` words remain, a single-element list holding all of
        them joined by spaces (``['']`` if no word survives filtering).
    """
    # The corpus file id is lowercase ('english'); 'English' only resolves on
    # case-insensitive filesystems.
    words = wordSplit(review, stop_words)
    if len(words) < k:
        # Too short for a full shingle: emit whatever is left as one token.
        return [' '.join(words)]
    return [' '.join(words[start:start + k]) for start in range(len(words) - k + 1)]

def bagOfWordsShingles2(review, k=3, stop_words = stopwords.words('english')):
    """Tokenize a review into every word shingle of length ``k`` down to 1.

    Parameters
    ----------
    review : str
        Raw review text; it is tokenized and stop-word filtered by ``wordSplit``.
    k : int, optional
        Largest shingle length (default 3).
    stop_words : iterable of str, optional
        Tokens discarded before shingling. Defaults to NLTK's English stop-word
        list, loaded once when this function is defined.

    Returns
    -------
    list of str
        Space-joined shingles, longest length first (all k-word windows, then
        (k-1)-word windows, ... down to single words). Lengths greater than the
        number of remaining words contribute nothing, so the list is empty when
        no word survives filtering.
    """
    # The corpus file id is lowercase ('english'); 'English' only resolves on
    # case-insensitive filesystems.
    words = wordSplit(review, stop_words)
    tokens = []
    for size in range(k, 0, -1):
        # range() is empty when size exceeds len(words), so short reviews
        # simply skip the longer shingle lengths.
        tokens.extend(' '.join(words[start:start + size])
                      for start in range(len(words) - size + 1))
    return tokens

In [19]:
def saveTfidfFeatures(posReviews, negReviews, tokenizer, filePrefix, stopWordList):
    """Fit TF-IDF on positive + negative reviews and save features and labels.

    Writes the sparse feature matrix with ``sps.save_npz`` to
    ``filePrefix + '_Features'`` and the label vector with ``np.savez`` to
    ``filePrefix + '_Labels'``. Labels are 1 for every positive review followed
    by 0 for every negative review, matching the row order of the matrix.

    Parameters
    ----------
    posReviews, negReviews : pandas.Series
        Review texts of each polarity.
    tokenizer : callable
        Tokenizer handed to ``TfidfVectorizer``.
    filePrefix : str
        Common prefix of the two output files.
    stopWordList : list of str
        Stop words handed to ``TfidfVectorizer``.

    Returns
    -------
    tuple
        ``(featureMatrix, labels)`` as written to disk.
    """
    allReviews = pd.concat([posReviews, negReviews])
    labels = np.hstack([np.ones(posReviews.shape[0]), np.zeros(negReviews.shape[0])])
    vectorizer = TfidfVectorizer(stop_words=stopWordList, tokenizer=tokenizer, lowercase=True)
    featureMatrix = vectorizer.fit_transform(allReviews)

    sps.save_npz(filePrefix + '_Features', featureMatrix)
    np.savez(filePrefix + '_Labels', labels)
    return featureMatrix, labels


# Read the stop-word corpus once instead of four times per city. The file id is
# lowercase ('english'); 'English' only resolves on case-insensitive filesystems.
englishStopWords = stopwords.words('english')

# (file-name tag, tokenizer) pairs, in the order the feature sets are exported.
shingleTokenizers = [('BOW3', bagOfWordsShingles), ('BOW123', bagOfWordsShingles2)]

for c in cities:
    # Single-argument print calls emit the same text under Python 2 and Python 3.
    print("City:  " + c)
    cityRevs = englishReviews.loc[englishReviews['City']==c]

    # The thresholded selections are shared by both tokenizers, so build them once.
    keepPositive = cityRevs['Positive_Processed_Word_Count'] > wordThreshold
    keepNegative = cityRevs['Negative_Processed_Word_Count'] > wordThreshold
    reviewSelections = [
        ('Thresholded',
         cityRevs.loc[keepPositive, 'Positive_Review'],
         cityRevs.loc[keepNegative, 'Negative_Review']),
        ('Raw', cityRevs['Positive_Review'], cityRevs['Negative_Review']),
    ]

    for selectionName, thrPosEngRevs, thrNegEngRevs in reviewSelections:
        for tokenizerName, tokenizer in shingleTokenizers:
            print("    " + selectionName + " " + tokenizerName)
            featureMatrix, labels = saveTfidfFeatures(
                thrPosEngRevs,
                thrNegEngRevs,
                tokenizer,
                c + '_UkReviews_' + selectionName + '_' + tokenizerName,
                englishStopWords,
            )

City:  Paris
    Thresholded BOW3
    Thresholded BOW123
    Raw BOW3
    Raw BOW123
City:  Amsterdam
    Thresholded BOW3
    Thresholded BOW123
    Raw BOW3
    Raw BOW123
City:  London
    Thresholded BOW3
    Thresholded BOW123
    Raw BOW3
    Raw BOW123
City:  Milan
    Thresholded BOW3
    Thresholded BOW123
    Raw BOW3
    Raw BOW123
City:  Barcelona
    Thresholded BOW3
    Thresholded BOW123
    Raw BOW3
    Raw BOW123
City:  Vienna
    Thresholded BOW3
    Thresholded BOW123
    Raw BOW3
    Raw BOW123
