In [1]:
import pandas as pd
import re
import numpy as np
import time
import collections
from nltk.corpus import stopwords
import nltk.data
import networkx as nx
import proj_base
from gensim.models import word2vec
import matplotlib.pyplot as plt
import scipy.spatial.distance as scpd
from numpy.linalg import inv

import time
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

#tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')



In [53]:
data = proj_base.getStandardData(numFiles=10)
unchanged = data.copy()
aspect = "Location"
data.shape

(1111, 13)

In [115]:
sentences = []

#word2vec takes a list of lists, where each internal list is a BOW


def review2sentences(review, tokenizer, remove_stopwords=True):
    #split each review into sentences
    raw_sent = tokenizer.tokenize(review.strip())
    
    sentences = []
    for s in raw_sent:
        if len(s) > 0 :
            sentences.append(review2wordlist(s, remove_stopwords))
            
    return sentences


def review2wordlist(review_text, remove_stopwords=True):
    #split the given text into BOW
    

    review_text = re.sub("[^a-zA-Z]"," ", review_text)

    words = review_text.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]

    return(words)


def sentences2wordlist(sentences, remove_stopwords=True):
    
    allWords = []
    for s in sentences: 
        s = re.sub("[^a-zA-Z]"," ", s)
        words = s.lower().split()
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            allWords.extend([w for w in words if not w in stops])

    return(allWords)

for review in data["Content"]:
    sentences += review2sentences(review, tokenizer)
    

#BUILD WORD2Vec infrastructure

num_features = 100
min_word_count = 40
num_workers = 2
context = 2
downsampling = 1e-3

from gensim.models import word2vec
print("training")
model = word2vec.Word2Vec(sentences, workers=num_workers, size=num_features)

def makeFeatureVec(words, model, num_features):
    #given a list of words and the pre-trained word model return a feature vector
    featureVec = np.zeros((num_features,), dtype="float32")
    
    nwords = 0
    index2word_set = set(model.index2word)
    
    
    #get the mean vector for each review
    for w in words:
        #print(w)
        if w in index2word_set:
            nwords = nwords + 1
            featureVec = np.add(featureVec, model[w])
            
    featureVec = np.divide(featureVec, nwords)
    return featureVec

def getAvgFeatureVec(reviews, model, num_features):
    #get average feature vec for list of bag of words
    count = 0
    reviewFeatureVec = np.zeros((len(reviews), num_features), dtype="float32")
    
    for r in reviews:
        if count % 1000 == 0:
            print("at review", count)
        #for each review add the feature vec
        reviewFeatureVec[count] = makeFeatureVec(r, model, num_features)
        count += 1
        
    return reviewFeatureVec


    
    

training


In [116]:
print(sentences2wordlist(data.loc[1]["aspectSentences"]))

['daughter', 'th', 'birthday', 'promised', 'take', 'two', 'girlfriends', 'see', 'shrek', 'musical', 'th', 'avenue', 'theater', 'decided', 'make', 'fun', 'night', 'booking', 'nearby', 'hotel', 'well', 'settled', 'monaco', 'based', 'proximity', 'theater', 'price', 'trip', 'adviser', 'reviews', 'rewarding', 'experience', 'play', 'started', 'pm', 'day', 'called', 'requested', 'early', 'check', 'told', 'guarantee', 'industry', 'standard', 'left', 'house', 'headed', 'monaco', 'real', 'expectation', 'able', 'check', 'room', 'early']


In [142]:
def isFeatureVecNull(fv):
    if sum(pd.notnull(fv)) == len(fv):
        return True
    return False


def getFeatureVec(review, useAspectSentences=False):
    if useAspectSentences:
        rev2words = sentences2wordlist(review)
    else:
        rev2words = review2wordlist(review, tokenizer)
    return makeFeatureVec(rev2words, model, num_features)


def getAllSims(rev, allRevs):
    return allRevs.apply(scpd.cosine, args=(rev,))

def buildSimilarityMatrix(data):
    numNodes = data.shape[0]
    sims = data["featureVec"].apply(getAllSims, args=(data["featureVec"],))
    
  
                
    return sims

time1 = time.time()

#similarities = buildSimilarityMatrix(data)
#g = buildGraph(data, similarities)
time2 = time.time()

print("time taken", time2-time1)
#similarities

('time taken', 9.512901306152344e-05)


In [46]:
np.linalg.norm(similarities)

4.9380903279181352

In [204]:


def calcMinLossPredictions(data):
    #define constants
    sim = buildSimilarityMatrix(data)
    sim = (sim - sim.mean()) / (sim.max() - sim.min())
    n = data.shape[0]
    print("Num data", n)
    a = 3.5
    b = 1.5
    k = 5
    k_p = 3
    M = 700
    
    isLabeled = np.isnan(data[aspect]) == False 
    
    y = np.maximum(np.nan_to_num(data[aspect]), np.ones(len(data)))
    
    kNN = np.zeros((n, n), float)
    #build kNN matrix
    for i in range(n):
        kthClosest = sorted(sim[i], reverse=True)[k]
        for j in range(n):
            thisSimilar = sim[i][j]
            if thisSimilar >= kthClosest:
                kNN[i][j] = 1
        
    
    #build matricies
    
    D = np.zeros((n, n), float)
    d_diag = np.zeros(n)
    for i in range(n):
        d_diag[i] =  sum(sorted(sim[i], reverse=True)[0:k])
    np.fill_diagonal(D, d_diag)
    
    c_diag = np.maximum(np.nan_to_num(np.maximum(data[aspect], np.ones(len(data))*M)), np.ones(len(data)))
    C = np.zeros((n, n), int)
    np.fill_diagonal(C, c_diag)
    
    W_p = np.zeros((n,n), float)
    for i in range(n):
        for j in range(n):
            if isLabeled[i]:
                W_p[i][j] = 0
            elif isLabeled[i] and kNN[i][j] == 1:
                W_p[i][j] = sim[i][j]
            elif not isLabeled[i] and kNN[i][j] == 1:
                W_p[i][j] = b*sim[i][j]
                
    W = np.maximum(W_p, np.transpose(W_p))
    
    delta = D - W
    
    constant = a*1.0/(k + k_p*b)
    toInv = C + constant*delta
    inverse = inv(toInv)
    
    C_y = C.dot(y)
    preds = inverse.dot(C_y)
    
    
    return np.round(np.clip(preds, 1, 5))


#preds = calcMinLossPredictions(data)
#preds
#

In [206]:
data = proj_base.getStandardData(numFiles=33)
unchanged = data.copy()
aspect = "Location"
data = proj_base.getTrainingData(data)
data = data.reset_index()
aspect = "aspectRating"
unchanged = data.copy()

print(aspect, data.shape)




('aspectRating', (812, 5))


In [207]:
numTrain = 700
numTest = 100
dat = data[0:numTrain]



#split data into train and test
moreData = data[numTrain:numTrain+numTest]
truth = unchanged[aspect][numTrain:numTrain+numTest]
moreData[aspect] = np.nan  #set last 10 to no prediction
fulldat = pd.concat([dat, moreData])

#remove reviews where feature vec is null

#use full content or aspect sentences?
fulldat["featureVec"] = fulldat["Content"].apply(getFeatureVec)
#fulldat["featureVec"] = fulldat["aspectSentences"].apply(getFeatureVec, args=(True,))
fvNull = fulldat["featureVec"].apply(isFeatureVecNull)
fulldat = fulldat[fvNull]


fullpreds = calcMinLossPredictions(fulldat)

print("DONE")
print(np.mean(fullpreds[numTrain:numTrain+numTest] == truth))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


('Num data', 800)
DONE
0.23


In [76]:
np.mean(fullpreds[numTrain:numTrain+numTest] == truth)

0.20000000000000001

In [171]:
predicted = fullpreds[numTrain:numTrain+numTest]

for p in range(len(truth)):
    print("pred v truth:",predicted[p], truth[numTrain+p])

('pred v truth:', 5.0, 5.0)
('pred v truth:', 5.0, 4.0)
('pred v truth:', 5.0, 5.0)
('pred v truth:', 5.0, 5.0)
('pred v truth:', 5.0, 5.0)
('pred v truth:', 5.0, 3.0)
('pred v truth:', 5.0, 3.0)
('pred v truth:', 5.0, 5.0)
('pred v truth:', 5.0, 2.0)
('pred v truth:', 5.0, 3.0)
('pred v truth:', 5.0, 5.0)
('pred v truth:', 5.0, 4.0)
('pred v truth:', 5.0, 1.0)
('pred v truth:', 5.0, 5.0)
('pred v truth:', 5.0, 5.0)
('pred v truth:', 4.0, 5.0)
('pred v truth:', 5.0, 4.0)
('pred v truth:', 5.0, 5.0)
('pred v truth:', 5.0, 5.0)
('pred v truth:', 4.0, 5.0)
('pred v truth:', 5.0, 5.0)
('pred v truth:', 4.0, 5.0)
('pred v truth:', 5.0, 5.0)
('pred v truth:', 4.0, 5.0)
('pred v truth:', 5.0, 5.0)
('pred v truth:', 5.0, 4.0)
('pred v truth:', 5.0, 3.0)
('pred v truth:', 5.0, 4.0)
('pred v truth:', 5.0, 5.0)
('pred v truth:', 5.0, 5.0)


In [111]:
truth

700    5.0
701    5.0
702    5.0
703    4.0
704    4.0
705    4.0
706    4.0
707    4.0
708    3.0
709    1.0
710    5.0
711    4.0
712    2.0
713    4.0
714    2.0
715    5.0
716    3.0
717    5.0
718    3.0
719    1.0
720    3.0
721    2.0
722    4.0
723    5.0
724    2.0
725    2.0
726    4.0
727    2.0
728    2.0
729    2.0
      ... 
740    1.0
741    3.0
742    5.0
743    5.0
744    4.0
745    5.0
746    3.0
747    2.0
748    4.0
749    4.0
750    3.0
751    5.0
752    3.0
753    5.0
754    5.0
755    5.0
756    5.0
757    4.0
758    2.0
759    4.0
760    5.0
761    3.0
762    5.0
763    5.0
764    4.0
765    2.0
766    4.0
767    4.0
768    4.0
769    2.0
Name: aspectRating, dtype: float64

In [None]:

# def buildGraph(data, sims):
    
#     G = nx.Graph()
#     neighborsForNodes = 6
#     nodes = data.shape[0]
#     # build nodes
#     for i in range(nodes):
#         rev = data.loc[i]
#         content = rev["Content"]
#         #todo only add relevant sentences or take whole content?
#         hasHardTruth = pd.notnull(rev[aspect])
#         G.add_node(i, {'author' : rev["Author"], 'content' :content, "labeled": hasHardTruth, 'truth':False})
        
#         #if i has a hard-truth rating add a node with that dongle
#         if hasHardTruth:
#             #print(rev[aspect])
#             G.add_node(i+nodes, truth=True, rating=rev[aspect], dongle=True)
#             G.add_edge(i, i+nodes, weight=1)
    
#     # build edges
#     #sims = buildSimilarityMatrix(data)
#     for i in range(nodes):
#         kthClosest = sorted(sims[i], reverse=True)[5]
#         for j in range(nodes):
#             thisSimilar = sims[i][j]
#             if thisSimilar > kthClosest:
#                 G.add_edge(i, j, weight=thisSimilar)
    
    

#     # add separate learner scores
    
#     return G