In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.sparse as sps
import itertools
from sklearn.metrics.pairwise import cosine_similarity
import langid
import re
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
#from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from PIL import Image
import gmaps

In [2]:
# Change this to whichever CSV YOU WANT. But In the 2nd week, I constructed a csv file that also has the 'City' name column.
# I'm using this csv for simplicity. You can construct the same by following my code in 2nd week.
hotelReviews = pd.read_csv('C:/Users/deniz/Documents/HotelReviewsCountryCity.csv')
cities = ['Paris', 'Amsterdam', 'London', 'Milan', 'Barcelona', 'Vienna']

In [3]:
def myTokenizer(word, stop_words = stopwords.words('English'), wordLengthThreshold = 1):
    tks = word.split()
    s = []
    for tok in tks:
        if tok not in stop_words and len(tok) > wordLengthThreshold:
            s.append(tok)
    tokens = list(s)
    for i in range(len(s)-1):
        tokens.append(s[i] + ' ' + s[i+1])
    for i in range(len(s)-2):
        tokens.append(s[i] + ' ' + s[i+1] + ' ' + s[i+2])
    return tokens

def myTokenizerIncludingReversed(word, stop_words = stopwords.words('English'), wordLengthThreshold = 1):
    tks = word.split()
    s = []
    for tok in tks:
        if tok not in stop_words and len(tok) > wordLengthThreshold:
            s.append(tok)
    tokens = list(s)
    for i in range(len(s)-1):
        tokens.append(s[i] + ' ' + s[i+1])
        tokens.append(s[i+1] + ' ' + s[i])
    for i in range(len(s)-2):
        tokens.append(s[i] + ' ' + s[i+1] + ' ' + s[i+2])
        tokens.append(s[i] + ' ' + s[i+2] + ' ' + s[i+1])
        tokens.append(s[i+1] + ' ' + s[i] + ' ' + s[i+2])
        tokens.append(s[i+1] + ' ' + s[i+2] + ' ' + s[i])
        tokens.append(s[i+2] + ' ' + s[i] + ' ' + s[i+1])
        tokens.append(s[i+2] + ' ' + s[i+1] + ' ' + s[i])
    return tokens

def myTokenizerWeird(word):
    if word == 'No Positive' or word == 'No Negative':
        return ['']
    stop_words = stopwords.words('English')
    s = [item for sublist in [word.split() for f in re.findall('\d+|\D+',word.lower())] for item in sublist]
    s = [x for x in s if x not in stop_words]
    tokens = list(s)
    for i in range(len(s)-1):
        tokens.append(s[i] + ' ' + s[i+1])
        #tokens.append(s[i+1] + ' ' + s[i])
    for i in range(len(s)-2):
        tokens.append(s[i] + ' ' + s[i+1] + ' ' + s[i+2])
        #tokens.append(s[i] + ' ' + s[i+2] + ' ' + s[i+1])
        #tokens.append(s[i+1] + ' ' + s[i] + ' ' + s[i+2])
        #tokens.append(s[i+1] + ' ' + s[i+2] + ' ' + s[i])
        #tokens.append(s[i+2] + ' ' + s[i] + ' ' + s[i+1])
        #tokens.append(s[i+2] + ' ' + s[i+1] + ' ' + s[i])
    return tokens

In [4]:
def getSimilarIndices(cosSim, simThreshold = .8, displayProgress = False):
    simRevs = []

    i = 1
    for index in range(cosSim.shape[0]):
        if i % (cosSim.shape[0]/10) == 0 and displayProgress:
            print '    ', 100 * (1.0 * i / cosSim.shape[0]), '% finished'
        i+=1
        i_0 = np.array(cosSim[index].todense())[0]
        sims = []
        for where in np.argwhere(i_0 > simThreshold):
            if where[0] != index:
                #print where[0], lAPR[where[0]]
                sims.append(where[0])
        simRevs.append([index, sims])
    return simRevs

def getSimilarIndices2(cosSim, simThreshold = .8, displayProgress = False):
    simRevs = []

    i = 1
    for index in range(cosSim.shape[0]):
        if i % (cosSim.shape[0]/10) == 0 and displayProgress:
            print '    ', 100 * (1.0 * i / cosSim.shape[0]), '% finished'
        i+=1
        i_0 = np.array(cosSim[index].todense())[0]
        sims = [index]
        for where in np.argwhere(i_0 > simThreshold):
            if where[0] != index:
                #print where[0], lAPR[where[0]]
                sims.append(where[0])
        simRevs.append(list(np.sort(sims)))
    return simRevs

In [5]:
def getConnectionMatrix(simRevs, displayProgress = False):
    connectionMatrix = sps.lil_matrix((len(simRevs),len(simRevs))).astype(np.bool)
    i = 1
    for s in simRevs:
        if i % ((len(simRevs))/10) == 0 and displayProgress:
            print '    ', 100 * (1.0 * i / len(simRevs)), '% finished'
        i+=1
        connectionMatrix[np.ones(len(s[1]))*s[0],s[1]] = 1
    return connectionMatrix

def getConnectionMatrix2(simRevs, nMax, displayProgress = False):
    connectionMatrix = sps.lil_matrix((nMax,nMax)).astype(np.bool)
    i = 1
    for s in simRevs:
        if i % ((len(simRevs))/10) == 0 and displayProgress:
            print '    ', 100 * (1.0 * i / len(simRevs)), '% finished'
        i+=1
        for ind in range(len(s)):
            for j in range(ind+1, len(s)-1):
                connectionMatrix[s[ind],s[j]] = 1
                connectionMatrix[s[j],s[ind]] = 1
    return connectionMatrix

In [6]:
def extractClustersSingleDegree(graphMat, minClusterSize = -1,  displayProgress = False):
    conM = graphMat.copy()
    clusters = []
    iteration = 0
    conSum = conM.sum()
    lastClusterSize = np.inf
    
    while conSum > 0 and lastClusterSize > minClusterSize:
        if displayProgress:
            print "iteration: ", iteration, " conM.sum():", conSum

        maxIndex = np.argmax(np.array(conM.sum(axis=0))[0])
        clusterIndices = np.sort(np.insert(conM[maxIndex,:].nonzero()[1], 0, maxIndex))
        
        cluster = []
        
        for index in clusterIndices:
            cluster.append(np.sort(np.insert(conM[index,:].nonzero()[1], 0, index)))
            
        cluster = np.array(list(set([item for sublist in cluster for item in sublist])))
        clusters.append(cluster)
        conM[cluster,:] = 0
        conM[:,cluster] = 0
        
        iteration += 1
        conSum = conM.sum()
        lastClusterSize = cluster.shape[0]
        
    return clusters

def extractClustersSingleDegree2(graphMat, minClusterSize = -1,  displayProgress = False):
    conM = graphMat.copy()
    clusters = []
    iteration = 0
    conSum = conM.sum()
    lastClusterSize = np.inf
    
    while conSum > 0 and lastClusterSize > minClusterSize:
        if displayProgress:
            print "iteration: ", iteration, " conM.sum():", conSum
            iteration += 1
            
        maxIndex = np.argmax(np.array(conM.sum(axis=0))[0])
        cluster = np.sort(np.insert(conM[maxIndex,:].nonzero()[1], 0, maxIndex))
        
        presize = 0
        
        while presize < cluster.shape[0]:
            presize = cluster.shape[0]
            clsList = list(cluster)
            for c in clsList:
                cluster = np.array(list(set(list(np.hstack([cluster, conM[c,:].nonzero()[1]])))))
                
            conM[cluster,:] = 0
            conM[:,cluster] = 0
            
        clusters.append(cluster)
        conSum = conM.sum()
        lastClusterSize = cluster.shape[0]
    return clusters

In [7]:
def printGraphImage(graph, fname = 'Image.bmp', sort = False):
    if sort:
        elems = np.array(graph.sum(axis=0))[0]
        sortedIndices = np.argsort(elems)[::-1]
        sortedConG = graph.tocsc()[:,sortedIndices]
        sottedConG = sortedConG.tocsr()[sortedIndices,:]
        sortedConG = sortedConG.tocsc()[:,sortedIndices]
        sortedConG = sortedConG.tolil()
        
    else:
        sortedConG = graph

    dcImg = Image.new('1', sortedConG.shape, color=1)
    pixels = dcImg.load()

    nzs = sortedConG.nonzero()

    for i in range(len(nzs[0])):
        pixels[nzs[0][i], nzs[1][i]] = 0
        
    dcImg.save(fname)

In [8]:
def clustering(reviews, simThreshold = 0.8, fName1 = 'image1.bmp', fName2='image2.bmp'):

    #featureMatrix = TfidfVectorizer(stop_words=stopwords.words('English'), tokenizer=myTokenizer, lowercase=True).fit_transform(allPosRevs)
    #featureMatrix = TfidfVectorizer(stop_words=stopwords.words('English'), tokenizer=myTokenizerIncludingReversed, lowercase=True).fit_transform(allPosRevs)
    featureMatrix = TfidfVectorizer(stop_words=None, tokenizer=myTokenizerWeird, lowercase=True).fit_transform(reviews)
    cosSim = cosine_similarity(featureMatrix, Y=None, dense_output=False)
    simRevs = getSimilarIndices(cosSim, simThreshold=simThreshold, displayProgress=True)
    connectionMatrix = getConnectionMatrix(simRevs, displayProgress=True)
    clusters = extractClustersSingleDegree2(connectionMatrix, minClusterSize = 5, displayProgress=True)

    clusterSizes = np.argsort([c.shape[0] for c in clusters])[::-1]
    sortedClusters = [clusters[cls] for cls in clusterSizes]
    sortIndex = np.array([item for sublist in sortedClusters for item in sublist])
    pltP = connectionMatrix[sortIndex,:]
    pltP = pltP[:,sortIndex]
    printGraphImage(pltP, fName1)

    preSorted = np.sort([item for sublist in clusters for item in sublist])
    pltP = connectionMatrix[preSorted,:]
    pltP = pltP[:,preSorted]
    printGraphImage(pltP, fName2)

    return clusters

In [195]:
clusters = clustering(allPosRevs, simThreshold = 0.5)

     9.9985579003 % finished
     19.9971158006 % finished
     29.9956737009 % finished
     39.9942316012 % finished
     49.9927895015 % finished
     59.9913474018 % finished
     69.9899053021 % finished
     79.9884632024 % finished
     89.9870211027 % finished
     99.985579003 % finished
     9.9985579003 % finished
     19.9971158006 % finished
     29.9956737009 % finished
     39.9942316012 % finished
     49.9927895015 % finished
     59.9913474018 % finished
     69.9899053021 % finished
     79.9884632024 % finished
     89.9870211027 % finished
     99.985579003 % finished
iteration:  0  conM.sum(): 1079278
iteration:  1  conM.sum(): 147088
iteration:  2  conM.sum(): 56186
iteration:  3  conM.sum(): 45056
iteration:  4  conM.sum(): 40288
iteration:  5  conM.sum(): 34754
iteration:  6  conM.sum(): 29550
iteration:  7  conM.sum(): 25602
iteration:  8  conM.sum(): 23050
iteration:  9  conM.sum(): 21328
iteration:  10  conM.sum(): 20412
iteration:  11  conM.sum(): 18772
ite

In [196]:
clusterSizes = np.argsort([c.shape[0] for c in clusters])[::-1]
sortedClusters = [clusters[cls] for cls in clusterSizes]

In [208]:
for s in sortedClusters[6]:
    print list(allPosRevs)[s]

 Everything
 Everything 
 Everything
 Everything
 Everything
 Everything
 Everything
 everything
 Everything
 everything
 Everything
 Everything
 Everything
 Everything
 Everything
 Everything 
 Everything
 Everything
 everything
 Everything
 Everything 
 Everything
 Everything
 Everything 
 Everything
 Everything
 Everything
 everything
 Everything
 Everything
 Everything 
 Everything
 Everything
 Everything
 Everything
 Everything
 Everything 
 Everything
 Everything
 Everything
 Everything
 Everything
 Everything
 Everything
 Everything
 Everything
 Everything
 Everything
 Everything
 everything
 Everything
 Everything
 Everything 
 Everything 
 Everything
 Everything
 Everything
 Everything
 everything
 Everything
 Everything
 Everything
 Everything
 Everything 
 everything 
 Everything
 Everything
 Everything
 Everything
 Everything 
 Everything
 everything
 everything
 Everything 
 Everything
 Everything
 everything
 Everything
 everything
 Everything 
 Everything
 Everything
 Ev

In [16]:
def connectionMatrix(reviews, simThreshold = 0.8):

    #featureMatrix = TfidfVectorizer(stop_words=stopwords.words('English'), tokenizer=myTokenizer, lowercase=True).fit_transform(allPosRevs)
    #featureMatrix = TfidfVectorizer(stop_words=stopwords.words('English'), tokenizer=myTokenizerIncludingReversed, lowercase=True).fit_transform(allPosRevs)
    featureMatrix = TfidfVectorizer(stop_words=None, tokenizer=myTokenizerWeird, lowercase=True).fit_transform(reviews)
    cosSim = cosine_similarity(featureMatrix, Y=None, dense_output=False)
    simRevs = getSimilarIndices(cosSim, simThreshold=simThreshold, displayProgress=True)
    connectionMatrix = getConnectionMatrix(simRevs, displayProgress=True)
    return connectionMatrix.tocsr()

In [10]:
# For this, I've used Only UK reviews on Barcelona. Change it as you wish.
# Don't use more than 30000 texts with the function tho. Because it gets impossible to construct the cos similarity matrix.

#allPosRevs = hotelReviews.loc[(hotelReviews['City'] == 'Barcelona') & (hotelReviews['Reviewer_Nationality'] == ' United Kingdom ') & (hotelReviews['Positive_Review'] != 'No Positive')]['Positive_Review']
allPosRevs = hotelReviews.loc[(hotelReviews['City'] == 'Barcelona') & (hotelReviews['Reviewer_Nationality'] == ' United Kingdom ')]['Positive_Review']

#allPosRevs = hotelReviews['Positive_Review']
#allPosWords = hotelReviews.loc[hotelReviews['Positive_Review'] != 'No Positive']['Positive_Review']
#allReviews = hotelReviews.loc[(hotelReviews['Positive_Review'] != 'No Positive') & (hotelReviews['Negative_Review'] != 'No Negative')][['Positive_Review', 'Negative_Review']]

In [11]:
allNegRevs = hotelReviews.loc[(hotelReviews['City'] == 'Barcelona') & (hotelReviews['Reviewer_Nationality'] == ' United Kingdom ')]['Negative_Review']

In [174]:
posConMat = connectionMatrix(allPosRevs, simThreshold=0.5)

     9.9985579003 % finished
     19.9971158006 % finished
     29.9956737009 % finished
     39.9942316012 % finished
     49.9927895015 % finished
     59.9913474018 % finished
     69.9899053021 % finished
     79.9884632024 % finished
     89.9870211027 % finished
     99.985579003 % finished
     9.9985579003 % finished
     19.9971158006 % finished
     29.9956737009 % finished
     39.9942316012 % finished
     49.9927895015 % finished
     59.9913474018 % finished
     69.9899053021 % finished
     79.9884632024 % finished
     89.9870211027 % finished
     99.985579003 % finished


In [175]:
sps.save_npz('posConMat05', posConMat)

In [177]:
negConMat = connectionMatrix(allNegRevs, simThreshold=0.5)

     9.9985579003 % finished
     19.9971158006 % finished
     29.9956737009 % finished
     39.9942316012 % finished
     49.9927895015 % finished
     59.9913474018 % finished
     69.9899053021 % finished
     79.9884632024 % finished
     89.9870211027 % finished
     99.985579003 % finished
     9.9985579003 % finished
     19.9971158006 % finished
     29.9956737009 % finished
     39.9942316012 % finished
     49.9927895015 % finished
     59.9913474018 % finished
     69.9899053021 % finished
     79.9884632024 % finished
     89.9870211027 % finished
     99.985579003 % finished


In [178]:
sps.save_npz('negConMat05', negConMat)

In [179]:
negNonZero = np.vstack([negConMat.nonzero()[0], negConMat.nonzero()[1]]).T
posNonZero = np.vstack([posConMat.nonzero()[0], posConMat.nonzero()[1]]).T

negNonZero.sort(axis = 1)
posNonZero.sort(axis = 1)

In [180]:
pcmU = posConMat.astype(np.uint8)
ncmU = negConMat.astype(np.uint8)
addition = pcmU + ncmU
row, col, val = sps.find(addition)
r_, c_, v_ = row[val == 2], col[val==2], val[val==2] / 2

In [181]:
intersMat = sps.lil_matrix(negConMat.shape)
intersMat[r_, c_] = v_

In [226]:
clusters = extractClustersSingleDegree2(intersMat, minClusterSize = 5, displayProgress=True)

iteration:  0  conM.sum(): 8656.0
iteration:  1  conM.sum(): 6004.0
iteration:  2  conM.sum(): 3842.0
iteration:  3  conM.sum(): 3160.0
iteration:  4  conM.sum(): 2458.0
iteration:  5  conM.sum(): 2000.0
iteration:  6  conM.sum(): 1760.0
iteration:  7  conM.sum(): 1494.0
iteration:  8  conM.sum(): 1416.0
iteration:  9  conM.sum(): 1364.0
iteration:  10  conM.sum(): 1308.0
iteration:  11  conM.sum(): 1270.0
iteration:  12  conM.sum(): 1228.0
iteration:  13  conM.sum(): 1186.0
iteration:  14  conM.sum(): 1146.0
iteration:  15  conM.sum(): 1104.0
iteration:  16  conM.sum(): 1074.0
iteration:  17  conM.sum(): 1044.0
iteration:  18  conM.sum(): 1014.0
iteration:  19  conM.sum(): 984.0
iteration:  20  conM.sum(): 954.0
iteration:  21  conM.sum(): 930.0
iteration:  22  conM.sum(): 908.0
iteration:  23  conM.sum(): 886.0


In [227]:
clusters = extractClustersSingleDegree2(intersMat, minClusterSize = 5, displayProgress=True)

clusterSizes = np.argsort([c.shape[0] for c in clusters])[::-1]
sortedClusters = [clusters[cls] for cls in clusterSizes]
sortIndex = np.array([item for sublist in sortedClusters for item in sublist])
pltP = intersMat[sortIndex,:]
pltP = pltP[:,sortIndex]
printGraphImage(pltP, 'IntersectionSorted05.bmp')

preSorted = np.sort([item for sublist in clusters for item in sublist])
pltP = intersMat[preSorted,:]
pltP = pltP[:,preSorted]
printGraphImage(pltP, 'IntersectionUnSorted05.bmp')

iteration:  0  conM.sum(): 8656.0
iteration:  1  conM.sum(): 6004.0
iteration:  2  conM.sum(): 3842.0
iteration:  3  conM.sum(): 3160.0
iteration:  4  conM.sum(): 2458.0
iteration:  5  conM.sum(): 2000.0
iteration:  6  conM.sum(): 1760.0
iteration:  7  conM.sum(): 1494.0
iteration:  8  conM.sum(): 1416.0
iteration:  9  conM.sum(): 1364.0
iteration:  10  conM.sum(): 1308.0
iteration:  11  conM.sum(): 1270.0
iteration:  12  conM.sum(): 1228.0
iteration:  13  conM.sum(): 1186.0
iteration:  14  conM.sum(): 1146.0
iteration:  15  conM.sum(): 1104.0
iteration:  16  conM.sum(): 1074.0
iteration:  17  conM.sum(): 1044.0
iteration:  18  conM.sum(): 1014.0
iteration:  19  conM.sum(): 984.0
iteration:  20  conM.sum(): 954.0
iteration:  21  conM.sum(): 930.0
iteration:  22  conM.sum(): 908.0
iteration:  23  conM.sum(): 886.0


In [259]:
clid = -4
print len(sortedClusters[clid])
for s in sortedClusters[clid]:
    print list(allPosRevs)[s], '\t\t', list(allNegRevs)[s]

6
No Positive 		 No tea or coffee in the room
No Positive 		 No tea or coffee in the room 
No Positive 		 No tea coffee in room
No Positive 		 No tea or coffee in the room
No Positive 		 No tea and coffee in the room
No Positive 		 Was expecting tea coffee in room 


In [185]:
allRatings = hotelReviews.loc[(hotelReviews['City'] == 'Barcelona') & (hotelReviews['Reviewer_Nationality'] == ' United Kingdom ')]['Reviewer_Score']

In [186]:
allLocations = hotelReviews.loc[(hotelReviews['City'] == 'Barcelona') & (hotelReviews['Reviewer_Nationality'] == ' United Kingdom ')][['lat','lng']]

In [187]:
scoreMeans = []
scoreStds = []
for i in range(len(sortedClusters)):
    mean = np.mean(np.array(allRatings)[sortedClusters[i]])
    std = np.std(np.array(allRatings)[sortedClusters[i]])
    scoreMeans.append(mean)
    scoreStds.append(std)

In [210]:
len(scoreMeans)

24

In [212]:
for i in range(len(scoreMeans)):
    print i, '\t', np.round(scoreMeans[i],3),'\t', np.round(scoreStds[i],3)

0 	9.842 	0.463
1 	8.709 	1.083
2 	9.195 	0.782
3 	9.259 	0.805
4 	9.77 	0.572
5 	9.421 	0.553
6 	9.461 	0.456
7 	9.281 	0.775
8 	9.4 	0.564
9 	9.075 	0.779
10 	9.7 	0.332
11 	9.7 	0.387
12 	9.75 	0.397
13 	9.229 	0.726
14 	9.6 	0.478
15 	7.914 	1.688
16 	8.571 	0.565
17 	9.117 	1.175
18 	9.25 	0.61
19 	8.15 	1.198
20 	7.633 	1.291
21 	9.733 	0.442
22 	9.667 	0.427
23 	8.76 	1.454


In [266]:
allLoc = np.array(allLocations)
clusterMap = []
for s in sortedClusters[2]:
    if not np.isnan(allLoc[s][0]) and not np.isnan(allLoc[s][1]):
        clusterMap.append(allLoc[s])

In [267]:
gmaps.configure(api_key="AIzaSyArnYbqE3btwnUDKiUWfFujTGKkjH2bbCY")

fig = gmaps.figure()
fig.add_layer(gmaps.heatmap_layer(clusterMap, max_intensity = 1, point_radius=15))
fig

In [13]:
posClusters = clustering(allPosRevs, fName1='Positive_Clustered.bmp', fName2='Positive_UnClustered.bmp')

     9.9985579003 % finished
     19.9971158006 % finished
     29.9956737009 % finished
     39.9942316012 % finished
     49.9927895015 % finished
     59.9913474018 % finished
     69.9899053021 % finished
     79.9884632024 % finished
     89.9870211027 % finished
     99.985579003 % finished
     9.9985579003 % finished
     19.9971158006 % finished
     29.9956737009 % finished
     39.9942316012 % finished
     49.9927895015 % finished
     59.9913474018 % finished
     69.9899053021 % finished
     79.9884632024 % finished
     89.9870211027 % finished
     99.985579003 % finished
iteration:  0  conM.sum(): 1079278
iteration:  1  conM.sum(): 147088
iteration:  2  conM.sum(): 56186
iteration:  3  conM.sum(): 45056
iteration:  4  conM.sum(): 40288
iteration:  5  conM.sum(): 34754
iteration:  6  conM.sum(): 29550
iteration:  7  conM.sum(): 25602
iteration:  8  conM.sum(): 23050
iteration:  9  conM.sum(): 21328
iteration:  10  conM.sum(): 20412
iteration:  11  conM.sum(): 18772
ite

In [15]:
negClusters = clustering(allNegRevs, fName1='Negative_Clustered.bmp', fName2='Negative_UnClustered.bmp')

     9.9985579003 % finished
     19.9971158006 % finished
     29.9956737009 % finished
     39.9942316012 % finished
     49.9927895015 % finished
     59.9913474018 % finished
     69.9899053021 % finished
     79.9884632024 % finished
     89.9870211027 % finished
     99.985579003 % finished


KeyboardInterrupt: 

In [18]:
reviews = allNegRevs
featureMatrix = TfidfVectorizer(stop_words=None, tokenizer=myTokenizerWeird, lowercase=True).fit_transform(reviews)
cosSim = cosine_similarity(featureMatrix, Y=None, dense_output=False)

In [19]:
simRevs = getSimilarIndices(cosSim, simThreshold=0.8, displayProgress=True)

     9.9985579003 % finished
     19.9971158006 % finished
     29.9956737009 % finished
     39.9942316012 % finished
     49.9927895015 % finished
     59.9913474018 % finished
     69.9899053021 % finished
     79.9884632024 % finished
     89.9870211027 % finished
     99.985579003 % finished


In [24]:
connectionMatrix = getConnectionMatrix(simRevs, displayProgress=True)

     9.9985579003 % finished
     19.9971158006 % finished
     29.9956737009 % finished
     39.9942316012 % finished
     49.9927895015 % finished
     59.9913474018 % finished
     69.9899053021 % finished
     79.9884632024 % finished
     89.9870211027 % finished
     99.985579003 % finished


In [28]:
unifiedclusters = extractClustersSingleDegree2(connectionMatrix*connectionMatrixPos, minClusterSize = 5, displayProgress=True)

iteration:  0  conM.sum(): 16322195




In [30]:
connectionMatrix*connectionMatrixPos

<20803x20803 sparse matrix of type '<type 'numpy.bool_'>'
	with 16322195 stored elements in Compressed Sparse Row format>

In [31]:
connectionMatrix

<20803x20803 sparse matrix of type '<type 'numpy.bool_'>'
	with 28869734 stored elements in LInked List format>

In [32]:
connectionMatrixPos

<20803x20803 sparse matrix of type '<type 'numpy.bool_'>'
	with 1057578 stored elements in LInked List format>

In [11]:
reviews = allPosRevs

In [12]:
featureMatrixPos = TfidfVectorizer(stop_words=None, tokenizer=myTokenizerWeird, lowercase=True).fit_transform(reviews)

In [13]:
cosSimPos = cosine_similarity(featureMatrixPos, Y=None, dense_output=False)

In [14]:
simRevsPos = getSimilarIndices(cosSimPos, simThreshold=0.8, displayProgress=True)

     9.9985579003 % finished
     19.9971158006 % finished
     29.9956737009 % finished
     39.9942316012 % finished
     49.9927895015 % finished
     59.9913474018 % finished
     69.9899053021 % finished
     79.9884632024 % finished
     89.9870211027 % finished
     99.985579003 % finished


In [20]:
def getConnectionMatrix3(simRevs, displayProgress = False):
    connectionMatrix = sps.lil_matrix((len(simRevs),len(simRevs))).astype(np.bool)
    i = 1
    for s in simRevs:
        if i % ((len(simRevs))/10) == 0 and displayProgress:
            print '    ', 100 * (1.0 * i / len(simRevs)), '% finished'
        i+=1
        connectionMatrix[np.ones(len(s[1]))*s[0],s[1]] = 1
    return connectionMatrix

# LIMIT HERE!

In [None]:
clusters = extractClustersSingleDegree2(connectionMatrix, minClusterSize = 5, displayProgress=True)

In [None]:
clusterSizes = np.argsort([c.shape[0] for c in clusters])[::-1]
sortedClusters = [clusters[cls] for cls in clusterSizes]
sortIndex = np.array([item for sublist in sortedClusters for item in sublist])
pltP = connectionMatrix[sortIndex,:]
pltP = pltP[:,sortIndex]
printGraphImage(pltP, fName1)

preSorted = np.sort([item for sublist in clusters for item in sublist])
pltP = connectionMatrix[preSorted,:]
pltP = pltP[:,preSorted]
printGraphImage(pltP, fName2)

In [203]:
kMeansClusters = [[] for i in range(K)]
kSimilarities = cosine_similarity(featureMatrix, kMeans)
relClusters = np.argmax(kSimilarities,axis = 1)
revIndex = 0
for r in relClusters:
    kMeansClusters[r].append(revIndex)
    revIndex += 1

kMeans = []
for cluster in kMeansClusters:
    kMeans.append(np.array(featureMatrix[cluster,:].sum(axis = 0))[0])
    
for k in kMeansClusters:
    print len(k)

1015
876
519
1205
290
593
691
370
732
856
290
363
226
331
558
1873
844
983
248
575
774
1477
1195
414
308
76
210
391
58
286
289
25
346
158
1358


In [207]:
clusterSizes = np.argsort([len(c) for c in kMeansClusters])[::-1]
sortedClusters = [kMeansClusters[cls] for cls in clusterSizes]
sortIndex = np.array([item for sublist in sortedClusters for item in sublist])
pltP = featureMatrix[sortIndex,:]
pltP = pltP[:,sortIndex]
printGraphImage(pltP, 'kMEans_clustered_3Shingle_MinS10.bmp')

In [205]:
for s in kMeansClusters[0]:
    print list(allPosRevs)[s]

No Positive
 Only positive I can think of is the location
No Positive
 
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
 
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No Positive
No

In [None]:
def getConnectivityGraph(reviews, simThreshold = .8, displayProgress = False, wordLengthThreshold = 3, stopWordList = stopwords.words('English'), getTFIDFMatrix = False):
    if displayProgress:
        print "Extracting Important Texts"
    rwl = reviewWordList(reviews, wordLengthThreshold = 3, stopWordList = stopwords.words('English'), displayProgress=displayProgress)
    
    if displayProgress:
        print "Constructing Feature Matrix"
    featureMatrix = TfidfVectorizer(stop_words=stopwords.words('English'), vocabulary=rwl).fit_transform(reviews)
    
    if displayProgress:
        print "Constructing Cosine Similarity Matrix"
    cosSim = cosine_similarity(featureMatrix, Y=None, dense_output=False)
    
    if displayProgress:
        print "Extracting Similar Reviews"
    simRevs = getSimilarIndices(cosSim, simThreshold=simThreshold, displayProgress=displayProgress)
    
    if displayProgress:
        print "Constructing Connectivity Graph"
        connectionMatrix = getConnectionMatrix(simRevs, displayProgress=displayProgress)
        
    return connectionMatrix

In [10]:
def extractClusters(graphMat, displayProgress = False):
    conM = graphMat.copy()
    clusters = []
    iteration = 0
    conSum = conM.sum()
    while conSum > 0:
        if displayProgress:
            print "iteration: ", iteration, " conM.sum():", conSum

        index = np.argmax(np.array(conM.sum(axis=0))[0])
        cluster = np.sort(np.insert(conM[index,:].nonzero()[1], 0, index))
        clusters.append(cluster)
        conM[cluster,:] = 0
        conM[:,cluster] = 0
        iteration += 1
        conSum = conM.sum()
        
    return clusters

In [70]:
def extractClustersExtensive(graphMat, minClusterSize = -1,  displayProgress = False):
    conM = graphMat.copy()
    clusters = []
    iteration = 0
    conSum = conM.sum()
    lastClusterSize = np.inf
    
    while conSum > 0 and lastClusterSize > minClusterSize:
        if displayProgress:
            print "iteration: ", iteration, " conM.sum():", conSum

        connectionSums = np.array(conM.sum(axis=0))[0]
        maxCon = np.max(connectionSums)
        indices = np.where(connectionSums == maxCon)
        
        cluster = []
        
        for index in indices:
            cluster.append(np.sort(np.insert(conM[index,:].nonzero()[1], 0, index)))
            
        cluster = np.array(list(set([item for sublist in cluster for item in sublist])))
        clusters.append(cluster)
        conM[cluster,:] = 0
        conM[:,cluster] = 0
        
        iteration += 1
        conSum = conM.sum()
        lastClusterSize = cluster.shape[0]
        
    return clusters

In [12]:
def extractClustersExtensive(graphMat, minClusterSize = -1,  displayProgress = False):
    conM = graphMat.copy()
    clusters = []
    iteration = 0
    conSum = conM.sum()
    lastClusterSize = np.inf
    
    while conSum > 0 and lastClusterSize > minClusterSize:
        if displayProgress:
            print "iteration: ", iteration, " conM.sum():", conSum

        connectionSums = np.array(conM.sum(axis=0))[0]
        maxCon = np.max(connectionSums)
        indices = np.where(connectionSums == maxCon)
        
        cluster = []
        
        for index in indices:
            cluster.append(np.sort(np.insert(conM[index,:].nonzero()[1], 0, index)))
            
        cluster = np.array(list(set([item for sublist in cluster for item in sublist])))
        clusters.append(cluster)
        conM[cluster,:] = 0
        conM[:,cluster] = 0
        
        iteration += 1
        conSum = conM.sum()
        lastClusterSize = cluster.shape[0]
        
    return clusters

In [13]:
connectivityGraph = getConnectivityGraph(allPosRevs, simThreshold = .8, displayProgress=True)

Extracting Important Texts
     9.9985579003 % finished
     19.9971158006 % finished
     29.9956737009 % finished
     39.9942316012 % finished
     49.9927895015 % finished
     59.9913474018 % finished
     69.9899053021 % finished
     79.9884632024 % finished
     89.9870211027 % finished
     99.985579003 % finished
Constructing Feature Matrix
Constructing Cosine Similarity Matrix
Extracting Similar Reviews
     9.9985579003 % finished
     19.9971158006 % finished
     29.9956737009 % finished
     39.9942316012 % finished
     49.9927895015 % finished
     59.9913474018 % finished
     69.9899053021 % finished
     79.9884632024 % finished
     89.9870211027 % finished
     99.985579003 % finished
Constructing Connectivity Graph
     9.9985579003 % finished
     19.9971158006 % finished
     29.9956737009 % finished
     39.9942316012 % finished
     49.9927895015 % finished
     59.9913474018 % finished
     69.9899053021 % finished
     79.9884632024 % finished
     89.98702

In [14]:
clusters = extractClustersSingleDegree(connectivityGraph, minClusterSize = 5, displayProgress=True)

iteration:  0  conM.sum(): 1154692
iteration:  1  conM.sum(): 222502
iteration:  2  conM.sum(): 115246
iteration:  3  conM.sum(): 96178
iteration:  4  conM.sum(): 75238
iteration:  5  conM.sum(): 60552
iteration:  6  conM.sum(): 48630
iteration:  7  conM.sum(): 42290
iteration:  8  conM.sum(): 38612
iteration:  9  conM.sum(): 36062
iteration:  10  conM.sum(): 34012
iteration:  11  conM.sum(): 32152
iteration:  12  conM.sum(): 30366
iteration:  13  conM.sum(): 28884
iteration:  14  conM.sum(): 27582
iteration:  15  conM.sum(): 26172
iteration:  16  conM.sum(): 25196
iteration:  17  conM.sum(): 24348
iteration:  18  conM.sum(): 23390
iteration:  19  conM.sum(): 22602
iteration:  20  conM.sum(): 21764
iteration:  21  conM.sum(): 20920
iteration:  22  conM.sum(): 20050
iteration:  23  conM.sum(): 19246
iteration:  24  conM.sum(): 18432
iteration:  25  conM.sum(): 17754
iteration:  26  conM.sum(): 17104
iteration:  27  conM.sum(): 16588
iteration:  28  conM.sum(): 15934
iteration:  29  conM

In [34]:
clusterSizes = np.argsort([c.shape[0] for c in clusters])[::-1]
sortedClusters = [clusters[cls] for cls in clusterSizes]
sortIndex = np.array([item for sublist in sortedClusters for item in sublist])
pltP = connectivityGraph[sortIndex,:]
pltP = pltP[:,sortIndex]
printGraphImage(pltP, 'clustered_single_deg_08.bmp')

In [None]:
l1 = [i for i in range(connectivityGraph.shape[0])]
l3 = np.array(list(set(l1) - set(sortIndex)))
displayIndices = np.hstack([sortIndex,l3])
pltP = connectivityGraph[displayIndices,:]
pltP = pltP[:,displayIndices]
printGraphImage(pltP, 'clustered_single_deg_allSamples_05.bmp')