In [1]:
import pandas as pd
import re
import numpy as np
import time
import collections
from nltk.corpus import stopwords
import nltk.data
import networkx as nx
import proj_base
from gensim.models import word2vec
import matplotlib.pyplot as plt
import scipy.spatial.distance as scpd
from numpy.linalg import inv
import nltk.sentiment
import time
# Punkt sentence tokenizer; read as a global by review2sentences and getPSP.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
from textblob import TextBlob
import pprint as pp
#tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')



In [2]:
# Name of the rating column that the helper functions read through the global `aspect`.
aspect = "Location"

In [None]:
# Load the review data via the project helper (30 input files requested).
data = proj_base.getStandardData(numFiles=30)
# Keep an untouched copy so labels can be recovered after they are blanked out later.
unchanged = data.copy()
aspect = "Location"
# Displayed as the cell output: (rows, columns) of the loaded frame.
data.shape

In [11]:
# Global accumulator: one word list per sentence, filled by the loop over data["Content"].
sentences = []

#word2vec takes a list of lists, where each internal list is a BOW


def review2sentences(review, remove_stopwords=True):
    """Split one review into sentences and convert each sentence to a word list.

    The review text is stripped and cut into sentences with the global
    ``tokenizer``; every non-empty sentence is passed to ``review2wordlist``.

    Args:
        review: Raw review text.
        remove_stopwords: Forwarded to ``review2wordlist``.

    Returns:
        A list with one word list per non-empty sentence.
    """
    pieces = tokenizer.tokenize(review.strip())
    return [review2wordlist(piece, remove_stopwords)
            for piece in pieces
            if len(piece) > 0]


def review2wordlist(review_text, remove_stopwords=True):
    """Turn a piece of text into a list of lowercase words.

    Every character that is not an ASCII letter is replaced by a space before
    the text is lowercased and split on whitespace.

    Args:
        review_text: Text to tokenize.
        remove_stopwords: When truthy, words found in NLTK's English stopword
            list are dropped.

    Returns:
        The list of remaining words, in their original order.
    """
    lettersOnly = re.sub("[^a-zA-Z]", " ", review_text)
    tokens = lettersOnly.lower().split()

    if not remove_stopwords:
        return tokens

    stops = set(stopwords.words("english"))
    return [tok for tok in tokens if tok not in stops]


def sentences2wordlist(sentences, remove_stopwords=True):
    """Flatten a list of sentences into a single list of lowercase words.

    Each sentence has its non-letter characters replaced by spaces, is
    lowercased and split on whitespace; the words of all sentences are
    concatenated in order.

    Args:
        sentences: Iterable of sentence strings.
        remove_stopwords: When truthy, words in NLTK's English stopword list
            are dropped. When falsy, all words are kept (previously nothing
            was collected in that case and the result was always empty).

    Returns:
        A flat list of words.
    """
    # Build the stopword set once instead of once per sentence.
    stops = set(stopwords.words("english")) if remove_stopwords else None

    allWords = []
    for s in sentences:
        words = re.sub("[^a-zA-Z]", " ", s).lower().split()
        if stops is not None:
            words = [w for w in words if w not in stops]
        allWords.extend(words)

    return allWords

# Collect per-sentence word lists from every review as word2vec training input.
# The second argument of review2sentences is the boolean remove_stopwords flag;
# the tokenizer object used to be passed there and only worked because it is truthy.
for review in data["Content"]:
    sentences += review2sentences(review, remove_stopwords=True)
    

#BUILD WORD2Vec infrastructure

# Word2Vec settings. Only num_features and num_workers are passed to the
# constructor below; min_word_count, context and downsampling are defined but
# not used in this cell.
# NOTE(review): presumably they were meant to be forwarded to Word2Vec as well - confirm
# before wiring them in, since that would change the trained model.
num_features = 100
min_word_count = 40
num_workers = 2
context = 2
downsampling = 1e-3

from gensim.models import word2vec
print("training")
# Train on the global `sentences` list (one word list per sentence).
model = word2vec.Word2Vec(sentences, workers=num_workers, size=num_features)

def makeFeatureVec(words, model, num_features):
    """Average the word vectors of all words that the model knows.

    Args:
        words: Iterable of word strings.
        model: Trained word model exposing ``index2word`` and ``model[word]``.
        num_features: Length of the word vectors.

    Returns:
        The element-wise mean of the known words' vectors. When none of the
        words is in the vocabulary the sum is divided by zero, so the result
        consists of NaN values.
    """
    vocabulary = set(model.index2word)
    knownWords = [token for token in words if token in vocabulary]

    total = np.zeros((num_features,), dtype="float32")
    for token in knownWords:
        total = total + model[token]

    return total / len(knownWords)

def getAvgFeatureVec(reviews, model, num_features):
    """Build a matrix with one averaged feature vector per review.

    Args:
        reviews: Sequence of word lists (one per review).
        model: Trained word model, forwarded to ``makeFeatureVec``.
        num_features: Length of the word vectors.

    Returns:
        A float32 array of shape ``(len(reviews), num_features)``.
    """
    matrix = np.zeros((len(reviews), num_features), dtype="float32")

    for position, wordList in enumerate(reviews):
        # progress message every 1000 reviews
        if position % 1000 == 0:
            print("at review", position)
        matrix[position] = makeFeatureVec(wordList, model, num_features)

    return matrix


    
    

training


In [4]:
def isFeatureVecNull(fv):
    """Return True when every entry of ``fv`` is non-null, otherwise False.

    Note that, despite the name, True means the vector is fully populated;
    the result is used as a keep-mask for rows with usable feature vectors.
    """
    presentCount = sum(pd.notnull(fv))
    return True if presentCount == len(fv) else False


def getFeatureVec(review, useAspectSentences=False):
    """Compute the averaged word2vec feature vector for one review.

    Args:
        review: Raw review text, or a list of sentences when
            ``useAspectSentences`` is True.
        useAspectSentences: Selects ``sentences2wordlist`` (list of
            sentences) instead of ``review2wordlist`` (single text).

    Returns:
        The result of ``makeFeatureVec`` for the extracted words, using the
        global ``model`` and ``num_features``.
    """
    if useAspectSentences:
        rev2words = sentences2wordlist(review)
    else:
        # The second parameter is the boolean remove_stopwords flag; the
        # tokenizer object used to be passed here and merely acted as True.
        rev2words = review2wordlist(review, remove_stopwords=True)
    return makeFeatureVec(rev2words, model, num_features)


def doPSPCosine(r1, r2):
    """Cosine similarity of the 2-d points ``[r, 1 - r]`` built from two scores.

    Returns ``1 - cosine_distance`` of ``[r1, 1 - r1]`` and ``[r2, 1 - r2]``.
    """
    first = [r1, 1 - r1]
    second = [r2, 1 - r2]
    distance = scpd.cosine(first, second)
    return 1 - distance

def getAllPSP(rev, allRevs):
    """Apply ``doPSPCosine(other, rev)`` to every element ``other`` of ``allRevs``."""
    compareToRev = lambda other: doPSPCosine(other, rev)
    return allRevs.apply(compareToRev)

def getAllSims(rev, allRevs):
    """Apply ``cosSim(other, rev)`` to every element ``other`` of ``allRevs``."""
    compareToRev = lambda other: cosSim(other, rev)
    return allRevs.apply(compareToRev)

def cosSim(r1, r2):
    """Cosine similarity of two vectors, computed as one minus SciPy's cosine distance."""
    distance = scpd.cosine(r1, r2)
    return 1 - distance

def buildSimilarityMatrix(data):
    """Pairwise cosine similarities between the rows' ``featureVec`` entries.

    Every feature vector is compared against the whole ``featureVec`` column
    through ``getAllSims``; the result of that nested ``apply`` is returned.
    """
    vectors = data["featureVec"]
    return vectors.apply(getAllSims, args=(vectors,))


def getPSP(rev, useAspectSentences=False):
    """Sum the TextBlob polarity of every sentence in a review.

    Args:
        rev: Raw review text; it is stripped and split into sentences with
            the global ``tokenizer``.
        useAspectSentences: Accepted but not used by this function.

    Returns:
        The sum (not the mean) of the per-sentence polarities; 0 when the
        tokenizer yields no sentences.
    """
    pieces = tokenizer.tokenize(rev.strip())
    return sum(TextBlob(piece).polarity for piece in pieces)


def buildPSPSimilarityMatrix(data):
    """Pairwise similarities between the rows' ``psp`` scores.

    Every score is compared against the whole ``psp`` column through
    ``getAllPSP``; the result of that nested ``apply`` is returned.
    """
    scores = data["psp"]
    return scores.apply(getAllPSP, args=(scores,))

# Timing scaffold: the two calls it used to measure are commented out, so
# nothing runs between the timestamps and the printed duration is near zero.
time1 = time.time()

#similarities = buildSimilarityMatrix(data)
#g = buildGraph(data, similarities)
time2 = time.time()

print("time taken", time2-time1)
#similarities

def getDataHist(data, aspectName=None):
    """Count how many rows carry each rating value of one aspect column.

    Args:
        data: DataFrame holding the rating column.
        aspectName: Column to histogram. Defaults to the global ``aspect``
            so existing one-argument calls keep working.

    Returns:
        A list ``nums`` where ``nums[r - 1]`` is the number of rows rated
        ``r``; its length is the largest rating present. Rows with a missing
        (NaN) rating are ignored. An empty list is returned when no row has
        a rating.
    """
    column = data[aspect if aspectName is None else aspectName]
    scores = column.dropna()
    if len(scores) == 0:
        return []

    # The maximum is a float; a list can only be repeated an integer number of times.
    nums = [0] * int(scores.max())
    # Iterate over the values directly so the result does not depend on the
    # frame having a 0..n-1 index.
    for score in scores:
        nums[int(score) - 1] += 1
    return nums


def getBalancedSubsample(data, numUnlabeled=100):
    """Draw a class-balanced, shuffled subsample of the reviews.

    For each rating 1.0 to 5.0 of the global ``aspect`` column the first
    ``m`` rows are taken, where ``m`` is the smallest count reported by
    ``getDataHist``. The first ``numUnlabeled`` rows without a rating are
    appended as well.

    Args:
        data: DataFrame with the ``aspect`` rating column.
        numUnlabeled: Maximum number of unrated rows to include.

    Returns:
        A new DataFrame with the selected rows in random order and a fresh
        0..n-1 index.
    """
    perClass = np.amin(getDataHist(data))
    ratings = data[aspect]

    parts = [data[ratings == stars][0:perClass]
             for stars in (1.0, 2.0, 3.0, 4.0, 5.0)]
    parts.append(data[pd.isnull(ratings)][0:numUnlabeled])

    combined = pd.concat(parts, ignore_index=True)
    # shuffle the rows, then renumber them
    shuffledOrder = np.random.permutation(combined.index)
    return combined.reindex(shuffledOrder).reset_index(drop=True)




('time taken', 3.409385681152344e-05)


In [14]:

def calcMinLossPredictions(data):
    """Predict ratings for unlabeled rows by solving a graph-regularised linear system.

    Rows whose global ``aspect`` value is NaN are treated as unlabeled. A
    similarity matrix is built from the ``psp`` column, a sparse neighbour
    graph is derived from it, and the predictions are obtained as
    ``inv(C + constant * (D - W)) . C . y``.

    Args:
        data: DataFrame with a ``psp`` column and the ``aspect`` rating column.

    Returns:
        Array of rounded predictions, one per row of ``data``.

    NOTE(review): ``isLabeled[i]``, ``sim[i]`` and ``candidates[j]`` look like
    label-based pandas lookups, whereas ``kNN``, ``W_p``, ``C`` and ``y`` are
    positional NumPy arrays - this presumably requires ``data`` to have an
    ordered 0..n-1 index; confirm against the callers.
    """
    #define constants
    #sim = buildSimilarityMatrix(data)
    sim = buildPSPSimilarityMatrix(data)
    # Centre by the mean and scale by the range (per column if sim is a DataFrame,
    # pandas' default axis); centred values can be negative.
    sim = (sim - sim.mean()) / (sim.max() - sim.min())
    n = data.shape[0]
    print("Num data", n)
    # weighting constants
    a = 5
    b = 1
    
    # k: index into the sorted labeled candidates; k_p: same for unlabeled candidates
    k = 3
    k_p = 1
    # M: extra diagonal weight given to labeled rows in C
    M = 300
    beta = a*1.0/b
    alpha = a*k + b*k_p
    # True where a rating is present
    isLabeled = np.isnan(data[aspect]) == False 
    #k = int(.2*sum(isLabeled))
    
    print("num labeled", sum(isLabeled))
    #print(isLabeled)
    # Start vector: known ratings; missing ones become 0 and are then replaced by 3.
    y = np.nan_to_num(data[aspect])
    np.place(y, y==0, 3)
    #print(y)
    # Indicator matrices: kNN[i][j] marks a chosen labeled neighbour j of unlabeled i,
    # k_pNN[i][j] a chosen unlabeled neighbour.
    kNN = np.zeros((n, n), float)
    k_pNN = np.zeros((n, n), float)
    #build kNN matrix
    for i in range(n):
        if not isLabeled[i]:
            #LABELED NEIGHBORS
            # Multiplying by the mask zeroes the similarities of unlabeled rows.
            # NOTE(review): a masked 0 can outrank negative centred similarities - confirm intent.
            candidates = np.multiply(isLabeled,sim[i])
            # Threshold is the value at index k of the descending order, i.e. the (k+1)-th largest;
            # with >= below, every candidate at or above it is selected.
            kthClosest = sorted(candidates, reverse=True)[k]
            for j in range(n):
                thisSimilar = candidates[j]
                if thisSimilar >= kthClosest and i != j:
                    #print("similar",i,j, data.loc[i][aspect], data.loc[j][aspect],data.loc[i]["psp"], data.loc[j]["psp"])
                    #print("similar",i,j, data.loc[i][aspect], data.loc[j][aspect])
                    kNN[i][j] = 1

            # UNLABELED NEIGHBORS: same procedure with the inverted mask and k_p.
            candidates = np.multiply(np.invert(isLabeled),sim[i])
            kthClosest = sorted(candidates, reverse=True)[k_p]
            for j in range(n):
                thisSimilar = candidates[j]
                if thisSimilar >= kthClosest and i != j:
                    #print("similar",i,j, data.loc[i][aspect], data.loc[j][aspect],data.loc[i]["psp"], data.loc[j]["psp"])
                    #print("similar, notLabled",i,j, data.loc[i][aspect], data.loc[j][aspect])
                    k_pNN[i][j] = 1
    #pp.pprint( kNN)

    
    # C: diagonal matrix with 1 for unlabeled rows and 1 + M for labeled rows.
    c_diag = np.ones(len(data)) + M*isLabeled
    C = np.zeros((n, n), int)
    
    np.fill_diagonal(C, c_diag)
    #print(C)
    # W_p: directed edge weights. Labeled rows get no outgoing edges; an unlabeled row i
    # gets sim[i][j] towards chosen labeled neighbours and beta*sim[i][j] towards chosen
    # unlabeled neighbours.
    W_p = np.zeros((n,n), float)
    for i in range(n):
        for j in range(n):
            if isLabeled[i]:
                W_p[i][j] = 0
            elif isLabeled[j] and kNN[i][j] == 1:
                W_p[i][j] = sim[i][j]
            elif not isLabeled[j] and k_pNN[i][j] == 1:
                W_p[i][j] = beta*sim[i][j]
    #print("W_p",W_p)        
    # Symmetrise by taking the larger of the two directed weights.
    W = np.maximum(W_p, np.transpose(W_p))
    #print("W ", W)
    # D: diagonal matrix of the row sums of W.
    D = np.zeros((n, n), float)
    d_diag = np.zeros(n)
    for i in range(n):
        d_diag[i] =  sum(W[i])
    np.fill_diagonal(D, d_diag)
    #pp.pprint(D)
    # Graph Laplacian of the weighted neighbour graph.
    delta = D - W
    
    constant = alpha*1.0/(k + k_p*beta)
    toInv = C + constant*delta
    inverse = inv(toInv)
    
    # Solve (C + constant*delta) . preds = C . y via the explicit inverse.
    C_y = C.dot(y)
    preds = inverse.dot(C_y)
    
    
    #return np.ceil(preds)
    return np.round(preds)


In [None]:
# Small run (one input file): score every review and build the
# pairwise psp similarity matrix.
data = proj_base.getStandardData(numFiles=1)
data["psp"] = data["Content"].apply(getPSP)
#data["psp"]
sim = buildPSPSimilarityMatrix(data)

#sim

In [None]:
# Reload the larger data set (30 input files) and replace `data` with a
# class-balanced, shuffled subsample of it.
firstSample = proj_base.getStandardData(numFiles=30)

data = getBalancedSubsample(firstSample)
# Displayed as the cell output: per-rating counts of the subsample.
getDataHist(data)



In [None]:
# Snapshot the labels before any are blanked out, then switch the aspect
# (here and in proj_base) to "Rooms".
unchanged = data.copy()
aspect = "Rooms"
proj_base.aspect = aspect
# Displayed as the cell output: per-rating counts for the new aspect.
getDataHist(data)

In [None]:
# (rows, columns) of the current working frame.
data.shape

In [None]:
numTrain = 700
numTest = 80
dat = data[0:numTrain]

# Split data into train and test: the numTest rows after the training rows are
# held out. The explicit copy makes `moreData` an independent frame, so the NaN
# assignment below does not write into a slice of `data` (no
# SettingWithCopyWarning, no ambiguity about what gets modified).
moreData = data[numTrain:numTrain+numTest].copy()
truth = unchanged[aspect][numTrain:numTrain+numTest]
moreData[aspect] = np.nan  # hide the held-out labels from the model
fulldat = pd.concat([dat, moreData])

# Features are computed from the full review text. The alternative would be the
# aspect sentences: fulldat["aspectSentences"].apply(getFeatureVec, args=(True,))
fulldat["featureVec"] = fulldat["Content"].apply(getFeatureVec)
fulldat["psp"] = fulldat["Content"].apply(getPSP)

# Keep only reviews whose feature vector has no null entries
# (isFeatureVecNull returns True for fully populated vectors).
# NOTE(review): if this drops any row, the positional slice of fullpreds below
# no longer lines up with `truth` - confirm that no row is dropped.
fvNull = fulldat["featureVec"].apply(isFeatureVecNull)
fulldat = fulldat[fvNull]


fullpreds = calcMinLossPredictions(fulldat)

print("DONE")
# share of held-out reviews whose rounded prediction equals the true rating
print(np.mean(fullpreds[numTrain:numTrain+numTest] == truth))

In [None]:
# Predictions for the held-out rows (positions numTrain .. numTrain+numTest-1).
predicted = fullpreds[numTrain:numTrain+numTest]

# `predicted` is indexed by position within the slice, `truth` by the original row label.
for p in range(len(truth)):
    print("pred v truth:",numTrain+p,predicted[p], truth[numTrain+p])

# for p in range(len(fullpreds)):
#     print("pred v truth:",fullpreds[p], data.loc[p][aspect])

In [None]:
# Display the held-out true ratings.
truth

In [None]:
# Second entry of the "aspectSentences" value for the row labelled 1.
fulldat.loc[1]["aspectSentences"][1]


def calcPSP(rev):
    """Return the share of sentences labelled positive by the Liu & Hu opinion lexicon.

    ``nltk.sentiment.util.demo_liu_hu_lexicon`` only prints its label and
    returns None, so its result cannot be compared with "Positive". The same
    word-counting rule is therefore applied here directly: a sentence is
    positive when it contains more positive than negative lexicon words.

    Args:
        rev: Iterable of sentence strings.

    Returns:
        Number of positive sentences divided by the number of sentences, as
        a float; 0.0 for an empty input (which used to divide by zero).
    """
    sentenceList = list(rev)
    if not sentenceList:
        return 0.0

    # Imported locally so the function carries its own dependencies.
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    wordTokenizer = treebank.TreebankWordTokenizer()
    # Sets make the per-word membership tests cheap.
    positiveWords = set(opinion_lexicon.positive())
    negativeWords = set(opinion_lexicon.negative())

    ps = 0
    for s in sentenceList:
        nPos = 0
        nNeg = 0
        for token in wordTokenizer.tokenize(s):
            word = token.lower()
            if word in positiveWords:
                nPos += 1
            elif word in negativeWords:
                nNeg += 1

        if nPos > nNeg:
            label = "Positive"
        elif nPos < nNeg:
            label = "Negative"
        else:
            label = "Neutral"
        print(label)

        if label == "Positive":
            ps += 1
    return ps*1.0/len(sentenceList)

# Try the sentence-level sentiment score on the aspect sentences of the row labelled 1.
calcPSP(fulldat.loc[1]["aspectSentences"])

In [None]:
from textblob import TextBlob

In [None]:
# Wrap one aspect sentence in a TextBlob for interactive inspection.
blob = TextBlob(fulldat.loc[1]["aspectSentences"][1])

In [None]:
# Show the rows labelled 0 to 4.
data.loc[range(0,5)]

In [17]:
# Prepare the cross-validation input: reload (20 input files), select the
# "Rooms" aspect and draw a balanced subsample with 10 unlabeled reviews.
firstSample = proj_base.getStandardData(numFiles=20)
aspect = "Rooms"
proj_base.aspect = aspect
data = getBalancedSubsample(firstSample, numUnlabeled=10)
# Not the last expression of the cell, so this histogram is computed but not displayed.
getDataHist(data)
# Label-based selection of the rows labelled 0..399.
cross=data.loc[range(0,400)]

In [18]:
def doCrossValidation(data, nfolds):
    """Run n-fold cross-validation of ``calcMinLossPredictions``.

    The rows are cut into ``nfolds`` consecutive folds of
    ``len(data) // nfolds`` rows each (any remainder rows are never tested).
    For every fold the ratings of its rows are hidden, predictions are
    computed for the whole frame, and the fold's predictions are compared
    with the hidden ratings.

    The working frame keeps the original row order and gets a fresh 0..n-1
    index, so position ``p`` of the prediction array always belongs to row
    ``p``. (Previously test and train rows were re-concatenated with the test
    rows first, so for every fold after the first the slice
    ``fullpreds[test_ix]`` returned predictions of other rows.)

    Args:
        data: DataFrame with a ``Content`` column and the global ``aspect``
            rating column. As before, a ``psp`` column is added to it.
        nfolds: Number of folds; must be between 1 and ``len(data)``.

    Returns:
        A list with the accuracy of each fold. Test rows without a true
        rating (NaN) can never match and count as misses.

    Raises:
        ValueError: If ``nfolds`` does not yield at least one test row per fold.
    """
    total = data.shape[0]
    if nfolds < 1 or nfolds > total:
        raise ValueError("nfolds must be between 1 and the number of rows")
    # Integer division: range() needs ints on Python 3 as well.
    num_test = total // nfolds

    data["psp"] = data["Content"].apply(getPSP)

    accuracies = []
    for f in range(nfolds):
        test_ix = list(range(f*num_test, (f+1)*num_test))
        print("cross fold nr ", f, "testSize", len(test_ix))

        # Fresh copy per fold, renumbered so labels and positions coincide.
        fulldat = data.reset_index(drop=True)
        # Fancy indexing copies, so the truth survives the blanking below.
        truth = fulldat[aspect].values[test_ix]
        fulldat.loc[test_ix, aspect] = np.nan  # hide this fold's labels

        fullpreds = calcMinLossPredictions(fulldat)
        predicted = fullpreds[test_ix]

        accuracy = np.mean(predicted == truth)
        accuracies.append(accuracy)
        print("DONE with fold", f+1, accuracy)
        for p in range(len(truth)):
            print("pred v truth:", test_ix[p], predicted[p], truth[p])

    return accuracies


In [19]:
# 10-fold cross-validation on the prepared 400-row sample.
doCrossValidation(cross, 10)

('cross fold nr ', 0, 'testSize', 40)
('Num data', 400)
('num labeled', 356)
('DONE with fold', 1, 0.22500000000000001)
('pred v truth:', 0, 3.0, 1.0)
('pred v truth:', 1, 3.0, 3.0)
('pred v truth:', 2, 4.0, 4.0)
('pred v truth:', 3, 3.0, nan)
('pred v truth:', 4, 3.0, 4.0)
('pred v truth:', 5, 3.0, 5.0)
('pred v truth:', 6, 2.0, 1.0)
('pred v truth:', 7, 4.0, 4.0)
('pred v truth:', 8, 2.0, 4.0)
('pred v truth:', 9, 4.0, 3.0)
('pred v truth:', 10, 2.0, 3.0)
('pred v truth:', 11, 4.0, 2.0)
('pred v truth:', 12, 4.0, 5.0)
('pred v truth:', 13, 3.0, 4.0)
('pred v truth:', 14, 2.0, 1.0)
('pred v truth:', 15, 3.0, 5.0)
('pred v truth:', 16, 2.0, 2.0)
('pred v truth:', 17, 3.0, 1.0)
('pred v truth:', 18, 2.0, 1.0)
('pred v truth:', 19, 3.0, 3.0)
('pred v truth:', 20, 2.0, 5.0)
('pred v truth:', 21, 3.0, 1.0)
('pred v truth:', 22, 2.0, 1.0)
('pred v truth:', 23, 2.0, 4.0)
('pred v truth:', 24, 4.0, 4.0)
('pred v truth:', 25, 3.0, 3.0)
('pred v truth:', 26, 2.0, 2.0)
('pred v truth:', 27, 3.0,

In [None]:

# def buildGraph(data, sims):
    
#     G = nx.Graph()
#     neighborsForNodes = 6
#     nodes = data.shape[0]
#     # build nodes
#     for i in range(nodes):
#         rev = data.loc[i]
#         content = rev["Content"]
#         #todo only add relevant sentences or take whole content?
#         hasHardTruth = pd.notnull(rev[aspect])
#         G.add_node(i, {'author' : rev["Author"], 'content' :content, "labeled": hasHardTruth, 'truth':False})
        
#         #if i has a hard-truth rating add a node with that dongle
#         if hasHardTruth:
#             #print(rev[aspect])
#             G.add_node(i+nodes, truth=True, rating=rev[aspect], dongle=True)
#             G.add_edge(i, i+nodes, weight=1)
    
#     # build edges
#     #sims = buildSimilarityMatrix(data)
#     for i in range(nodes):
#         kthClosest = sorted(sims[i], reverse=True)[5]
#         for j in range(nodes):
#             thisSimilar = sims[i][j]
#             if thisSimilar > kthClosest:
#                 G.add_edge(i, j, weight=thisSimilar)
    
    

#     # add separate learner scores
    
#     return G

In [None]:
# Sanity check: SciPy's cosine *distance* of a vector with itself.
x = [1,2,3,4]

scpd.cosine(x,x)