In [1]:
import pandas as pd
import re
import numpy as np
import time
import collections
from nltk.corpus import stopwords
import nltk.data
import networkx as nx
import proj_base
from gensim.models import word2vec
import matplotlib.pyplot as plt
import scipy.spatial.distance as scpd
#tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')



In [2]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [3]:
data = proj_base.getStandardData(numFiles=1)
data.shape

(124, 13)

In [4]:
sentences = []

#word2vec takes a list of lists, where each internal list is a BOW


def review2sentences(review, tokenizer, remove_stopwords=True):
    #split each review into sentences
    raw_sent = tokenizer.tokenize(review.strip())
    
    sentences = []
    for s in raw_sent:
        if len(s) > 0 :
            sentences.append(review2wordlist(s, remove_stopwords))
            
    return sentences


def review2wordlist(review_text, remove_stopwords=True):
    #split the given text into BOW
    

    review_text = re.sub("[^a-zA-Z]"," ", review_text)

    words = review_text.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]

    return(words)


for review in data["Content"]:
    sentences += review2sentences(review, tokenizer)
    
    

In [5]:
len(sentences)

1246

In [6]:
#BUILD WORD2Vec infrastructure

num_features = 100
min_word_count = 40
num_workers = 2
context = 2
downsampling = 1e-3

from gensim.models import word2vec
print("training")
model = word2vec.Word2Vec(sentences, workers=num_workers, size=num_features)

def makeFeatureVec(words, model, num_features):
    #given a list of words and the pre-trained word model return a feature vector
    featureVec = np.zeros((num_features,), dtype="float32")
    
    nwords = 0
    index2word_set = set(model.index2word)
    
    
    #get the mean vector for each review
    for w in words:
        #print(w)
        if w in index2word_set:
            nwords = nwords + 1
            featureVec = np.add(featureVec, model[w])
            
    featureVec = np.divide(featureVec, nwords)
    return featureVec

def getAvgFeatureVec(reviews, model, num_features):
    #get average feature vec for list of bag of words
    count = 0
    reviewFeatureVec = np.zeros((len(reviews), num_features), dtype="float32")
    
    for r in reviews:
        if count % 1000 == 0:
            print("at review", count)
        #for each review add the feature vec
        reviewFeatureVec[count] = makeFeatureVec(r, model, num_features)
        count += 1
        
    return reviewFeatureVec




training


In [7]:
rev1 = data["Content"][0]
rev12words = review2wordlist(rev1, tokenizer)
print(rev12words)
rev1avg = makeFeatureVec(rev12words, model, num_features)
print(rev1avg)

['wonderful', 'time', 'even', 'snow', 'great', 'experience', 'goldfish', 'room', 'daughter', 'loved', 'fact', 'valet', 'parking', 'staff', 'put', 'chains', 'fabulous', 'staff', 'attentive', 'went', 'beyond', 'make', 'stay', 'enjoyable', 'oh', 'parking', 'charge', 'would', 'pay', 'garage', 'lot', 'bet', 'help', 'snow']
[ -3.48742306e-02  -9.19236988e-03  -2.74260025e-02  -2.39881426e-02
   6.74560945e-03  -1.78331602e-02   2.67210254e-03   1.17838674e-03
   2.38685757e-02   1.77801996e-02   1.30458809e-02  -5.14267804e-03
   1.03308987e-02   3.24241817e-03  -1.80550071e-03   8.11703317e-03
  -2.68183323e-03  -6.06593443e-03  -6.57466352e-02  -2.84223892e-02
   1.06701739e-02  -8.60380661e-03  -8.69145896e-03   3.47222248e-03
   1.90774053e-02  -1.78701840e-02  -2.40057129e-02  -2.68744468e-03
  -1.19224424e-02   2.31881607e-02  -8.37193243e-03  -1.82689633e-02
   3.32673488e-04   4.54220846e-02   2.12338064e-02  -6.63632760e-03
   1.22047374e-02  -1.93763711e-02  -4.00290042e-02   3.007

In [None]:
#GET HARD TRUTH RATINGS

aspect="Location"

def filterHardTruthForAspect(data, aspect):
    #give data points that have hard truth ratings
    return  data[pd.notnull(data[aspect])]

justLoc = filterHardTruthForAspect(data, aspect)
justLoc.shape

In [None]:

def buildGraph(data, sim):
    
    G = nx.Graph()
    neighborsForNodes = 6
    nodes = data.shape[0]
    # build nodes
    for i in range(nodes):
        rev = data.loc[i]
        content = rev["Content"]
        #todo only add relevant sentences or take whole content?
        
        G.add_node(i, {'author' : rev["Author"], 'content' :content})
        
        #if i has a hard-truth rating add a node with that dongle
        if rev[aspect] != np.nan:
            G.add_node(i+nodes, truth=True, rating=rev[aspect])
            G.add_edge(i, i+nodes, weight=1)
    
    # build edges
    sims = buildSimilarityMatrix(data)
    for i in range(nodes):
        kthClosest = sorted(sims[i], reverse=True)[5]
        for j in range(nodes):
            thisSimilar = sims[i][j]
            if thisSimilar > kthClosest:
                G.add_edge(i, j, weight=thisSimilar)
    
    

    # add separate learner scores
    
    return G

def howSimilar(r1, r2):
    #use word2vec cosine similarity
    
    rev2words = review2wordlist(r1, tokenizer)
    rev1vec = makeFeatureVec(rev2words, model, num_features)
    revj2words = review2wordlist(r2)
    rev2vec = makeFeatureVec(revj2words, model, num_features)
    return scpd.cosine(rev1vec, rev2vec)


def buildSimilarityMatrix(data):
    
    similar = [[0]*nodes for i in range(nodes)]
    
    for i in range(nodes):
        dati = data.loc[i]["Content"]
        
        for j in range(nodes):
            if i != j:
                datj = data.loc[j]["Content"]
               
                similar[i][j] = howSimilar(dati, data.loc[j]["Content"])
                
    return similar



similarities = buildSimilarityMatrix(data)
g = buildGraph(data, similarities)

In [None]:
rev2words = review2wordlist(data["Content"][0], tokenizer)
rev1vec = makeFeatureVec(rev2words, model, num_features)
revj2words = review2wordlist(data["Content"][1])
rev2vec = makeFeatureVec(revj2words, model, num_features)
scpd.cosine(rev1vec, rev2vec)

In [None]:
def getLoss(graph):
    #minimize loss ratings over the graph
    a = 2
    b = 1
    M = 10000  #strength of hard truth
    
    #for edge in graph
    #   loss = sum_labeled[M*(pred_i - actual_i)^2 ] + sum_unlabeled[(pred_i - learner_i)^2] + \\
    #   + sum_unlabeled[sum_knnLabeled[a*w_i_j*(pred_i-pred_j)^2]]  + \\
    #   + sum_unlabeled[sum_knnUnlabled[b*w_i_j*(pred_i-pred_j)^2]]
    
    
    return loss

def smoothness(edge):
    return 0

In [None]:
%matplotlib inline
nx.draw(g)
plt.show()

In [None]:
#from sklearn.svm import SVC

#do svm classification
#need to split data into hard-truth train and predict
#not sure how were actually going to do this. Hard-truth ratings are needed to train the model right?
#but we'll also need to have some to test the results
#so probably going to need to have a lot of reviews

#train = sample of data with hard rating

trainDataVecs = getAvgFeatureVec(train, model, num_features)
