In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.sparse as sps
import itertools
from sklearn.metrics.pairwise import cosine_similarity
import langid
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
#from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\deniz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Load the hotel-review data set. Point this path at whichever CSV you want to analyse.
# This particular file is the one built in the 2nd week's notebook, which adds a 'City' column;
# it is used here for simplicity and can be rebuilt by following that week's code.
# NOTE(review): the path is absolute and machine-specific -- adjust it before running elsewhere.
hotelReviews = pd.read_csv('C:/Users/deniz/Documents/HotelReviewsCountryCity.csv')
# City names; presumably the values found in the 'City' column (not referenced in the cells shown here).
cities = ['Paris', 'Amsterdam', 'London', 'Milan', 'Barcelona', 'Vienna']

In [15]:
# Select the review texts to compare. The active line below takes EVERY positive review in the file.
# The commented-out alternatives narrow the selection (e.g. Barcelona hotels reviewed by UK guests,
# optionally dropping the 'No Positive' placeholder); swap one in as needed.
# Author's warning: don't feed more than about 30000 texts to getSimilarTexts, because building the
# cosine-similarity matrix becomes impractical beyond that.

#allPosRevs = hotelReviews.loc[(hotelReviews['City'] == 'Barcelona') & (hotelReviews['Reviewer_Nationality'] == ' United Kingdom ') & (hotelReviews['Positive_Review'] != 'No Positive')]['Positive_Review']
#allPosRevs = hotelReviews.loc[(hotelReviews['City'] == 'Barcelona') & (hotelReviews['Reviewer_Nationality'] == ' United Kingdom ')]['Positive_Review']
allPosRevs = hotelReviews['Positive_Review']
#allPosWords = hotelReviews.loc[hotelReviews['Positive_Review'] != 'No Positive']['Positive_Review']
#allReviews = hotelReviews.loc[(hotelReviews['Positive_Review'] != 'No Positive') & (hotelReviews['Negative_Review'] != 'No Negative')][['Positive_Review', 'Negative_Review']]

In [7]:
def reviewWordList(reviews, wordLengthThreshold = 3, stopWordList = None, displayProgress = False):
    """Collect the distinct significant words that occur in `reviews`.

    Each review is lower-cased, cut into runs of digits and runs of
    non-digits, and every run is split on whitespace. A token is kept when it
    is strictly longer than `wordLengthThreshold` characters and is not a
    stop word.

    Parameters
    ----------
    reviews : sized iterable of str
        The texts to scan (it must support ``len()``).
    wordLengthThreshold : int
        Tokens must be longer than this to be kept. With the default of 3,
        tokens of 1-3 characters are dropped.
    stopWordList : iterable of str or None
        Words to exclude. ``None`` (the default) means NLTK's English
        stop-word list.
    displayProgress : bool
        When true, print a progress line roughly every 10% of the reviews.

    Returns
    -------
    list of str
        The kept words, without duplicates, in sorted order.
    """
    if stopWordList is None:
        # Looked up at call time rather than at definition time; the corpus
        # file id is lower-case, which also resolves on case-sensitive
        # file systems.
        stopWordList = stopwords.words('english')
    stopWordSet = frozenset(stopWordList)

    total = len(reviews)
    # Never let the progress step reach 0: with fewer than 10 reviews the
    # original `i % (len(reviews)/10)` was a modulo by zero.
    step = max(1, total // 10)

    uniqueWords = set()
    for i, rev in enumerate(reviews, 1):
        if displayProgress and i % step == 0:
            print('     %s %% finished' % (100 * (1.0 * i / total)))
        # Splitting digit runs from non-digit runs separates e.g. "room101"
        # into "room" and "101" before the whitespace split.
        for chunk in re.findall(r'\d+|\D+', rev.lower()):
            uniqueWords.update(chunk.split())

    return sorted(w for w in uniqueWords
                  if len(w) > wordLengthThreshold and w not in stopWordSet)

In [8]:
def getSimilarIndices(cosSim, simThreshold = .8, displayProgress = False):
    """For each row of a similarity matrix, list the other rows similar to it.

    Parameters
    ----------
    cosSim : square 2-D matrix
        Pairwise similarities, either a SciPy sparse matrix or a dense array;
        entry (i, j) is the similarity between items i and j.
    simThreshold : float
        Item j counts as similar to item i when ``cosSim[i, j] > simThreshold``.
        The comparison is strict.
    displayProgress : bool
        When true, print a progress line roughly every 10% of the rows.

    Returns
    -------
    list of [int, list]
        One ``[index, sims]`` pair per row, in row order. ``sims`` holds the
        column indices above the threshold, excluding `index` itself.
    """
    nRows = cosSim.shape[0]
    # Never let the progress step reach 0: with fewer than 10 rows the
    # original `i % (cosSim.shape[0]/10)` was a modulo by zero.
    step = max(1, nRows // 10)

    simRevs = []
    for index in range(nRows):
        done = index + 1
        if displayProgress and done % step == 0:
            print('     %s %% finished' % (100 * (1.0 * done / nRows)))

        row = cosSim[index]
        if hasattr(row, 'toarray'):
            # Sparse input: densify one row at a time to keep memory bounded.
            row = row.toarray()
        scores = np.asarray(row).ravel()

        # Skip the diagonal entry: every item is trivially similar to itself.
        sims = [j for j in np.flatnonzero(scores > simThreshold) if j != index]
        simRevs.append([index, sims])
    return simRevs

In [9]:
def getSimilarTexts(reviews, simThreshold = .8, displayProgress = False, wordLengthThreshold = 3, stopWordList = None):
    """Find, for every text in `reviews`, the other texts that resemble it.

    Pipeline: build a vocabulary with `reviewWordList`, turn the texts into a
    TF-IDF feature matrix restricted to that vocabulary, compute the sparse
    pairwise cosine-similarity matrix, and extract the pairs above the
    threshold with `getSimilarIndices`.

    Parameters
    ----------
    reviews : sized iterable of str
        The texts to compare.
    simThreshold : float
        Passed to `getSimilarIndices`: two texts are reported as similar when
        their cosine similarity is strictly greater than this value.
    displayProgress : bool
        When true, print the name of each stage and the helpers' progress.
    wordLengthThreshold : int
        Passed to `reviewWordList`: only words longer than this many
        characters enter the vocabulary.
    stopWordList : iterable of str or None
        Stop words excluded from both the vocabulary and the vectorizer.
        ``None`` (the default) means NLTK's English stop-word list.

    Returns
    -------
    list
        Whatever `getSimilarIndices` returns for the similarity matrix.
    """
    if stopWordList is None:
        # Looked up at call time rather than at definition time; the corpus
        # file id is lower-case, which also resolves on case-sensitive
        # file systems.
        stopWordList = stopwords.words('english')
    # A concrete list is handed to both the helper and the vectorizer.
    stopWordList = list(stopWordList)

    if displayProgress:
        print("Extracting Important Texts")
    # Forward the caller's settings; they used to be replaced by fixed values.
    rwl = reviewWordList(reviews, wordLengthThreshold=wordLengthThreshold, stopWordList=stopWordList, displayProgress=displayProgress)

    if displayProgress:
        print("Constructing TF-IDF Vectorizer")
    vectorizer = TfidfVectorizer(stop_words=stopWordList, vocabulary=rwl)

    if displayProgress:
        print("Constructing Feature Matrix")
    featureMatrix = vectorizer.fit_transform(reviews)

    if displayProgress:
        print("Constructing Cosine Similarity Matrix")
    # dense_output=False keeps the similarity matrix sparse.
    cosSim = cosine_similarity(featureMatrix, Y=None, dense_output=False)

    if displayProgress:
        print("Extracting Similar Reviews")
    simRevs = getSimilarIndices(cosSim, simThreshold=simThreshold, displayProgress=displayProgress)

    return simRevs

In [52]:
# Usage:
# displayProgress: set to False if you don't want to see the progress of the algorithm.
# simThreshold: raise or lower to suit your needs. The higher it is, the more similar two texts must be.
#   The comparison is strict (similarity > simThreshold), so to find (near-)duplicates use a value
#   just below 1 (e.g. 0.999) rather than exactly 1.
# wordLengthThreshold: length cut-off for significant words (for TF-IDF). Only words LONGER than this
#   count; with the default of 3, words of 1, 2 or 3 characters are ignored.
# stopWordList: the stop words to exclude; defaults to NLTK's English stop-word list.
# NOTE(review): check that getSimilarTexts forwards wordLengthThreshold and stopWordList to the
#   vocabulary step rather than using fixed values.

# Input 'allPosRevs' is the collection of positive reviews extracted in the 3rd block.
# It must be a sized iterable of strings, e.g. a single DataFrame column (a pandas Series).

simRevs = getSimilarTexts(allPosRevs, simThreshold = .8, displayProgress=True)

Extracting Important Texts
     9.9985579003 % finished
     19.9971158006 % finished
     29.9956737009 % finished
     39.9942316012 % finished
     49.9927895015 % finished
     59.9913474018 % finished
     69.9899053021 % finished
     79.9884632024 % finished
     89.9870211027 % finished
     99.985579003 % finished
Constructing TF-IDF Vectorizer
Constructing Feature Matrix
Constructing Cosine Similarity Matrix
Extracting Similar Reviews
     9.9985579003 % finished
     19.9971158006 % finished
     29.9956737009 % finished
     39.9942316012 % finished
     49.9927895015 % finished
     59.9913474018 % finished
     69.9899053021 % finished
     79.9884632024 % finished
     89.9870211027 % finished
     99.985579003 % finished
