In [1]:
data_dir = "/Users/ainalopez/Desktop/Text Mining Project/Data/"
stopwords_dir = '/Users/ainalopez/Downloads/text_mining-master-2/data/stopwords/stopwords.txt'
dict_dir = "/Users/ainalopez/Downloads/dictionary1.xlsx"

import numpy as np
import json
import pandas as pd
import os
import codecs
import nltk
import re
import math 
from nltk.tokenize import wordpunct_tokenize
from nltk import PorterStemmer
from itertools import repeat
from scipy import stats
import pylab


class Corpus():
    
    """ 
    The Corpus class represents a document collection
     
    """
    def __init__(self, doc_data, stopword_file, clean_length):
        """
        Notice that the __init__ method is invoked everytime an object of the class
        is instantiated
        """
        

        #Initialise documents by invoking the appropriate class
        self.docs = [Document(doc[0], doc[1], doc[2], doc[3]) for doc in doc_data] 
        
        self.N = len(self.docs)
        self.clean_length = clean_length
        
        #get a list of stopwords
        self.create_stopwords(stopword_file, clean_length)
        
        #stopword removal, token cleaning and stemming to docs
        self.clean_docs(2)
        
        #create vocabulary
        self.corpus_tokens()
        
    def clean_docs(self, length):
        """ 
        Applies stopword removal, token cleaning and stemming to docs
        """
        for doc in self.docs:
            doc.token_clean(length)
            doc.stopword_remove(self.stopwords)
            doc.stem()        
    
    def create_stopwords(self, stopword_file, length):
        """
        description: parses a file of stowords, removes words of length 'length' and 
        stems it
        input: length: cutoff length for words
               stopword_file: stopwords file to parse
        """
        
        with codecs.open(stopword_file,'r','utf-8') as f: raw = f.read()
        
        self.stopwords = (np.array([PorterStemmer().stem(word) 
                                    for word in list(raw.splitlines()) if len(word) > length]))
        
     
    def corpus_tokens(self):
        """
        description: create a set of all all tokens or in other words a vocabulary
        """
        
        #initialise an empty set
        self.token_set = set()
        for doc in self.docs:
            self.token_set = self.token_set.union(doc.tokens) 
     
    
    
    def document_term_matrix(self):
        """
        description:  returns a D by V array of frequency counts
        """
        # subroutine: computes the counts of each vocabulary in the document
        def counts(doc):
            # initialize a matrix
            term_mat = [0]*len(self.token_set)
            for token in doc.tokens:
                term_mat[list(self.token_set).index(token)] = term_mat[list(self.token_set).index(token)] + 1
            return term_mat;
        
        self.doc_term_matrix = [list(self.token_set)]
        
        for doc in self.docs:
            self.doc_term_matrix.append(counts(doc))
            
         
        
    def tf_idf(self):
        """
        description:  returns a D by V array of tf-idf scores
        """
        # Compute inverse document frequency
        idf = [0]*len(self.token_set)
        for token in self.token_set:
            ind = 0
            for doc in self.docs:
                if token in doc.tokens:
                    ind += 1
                    idf[list(self.token_set).index(token)] = math.log(self.N/ind)

        # Create a subroutine that computes tf_idf for one document
        def tfidf(doc):
            term_mat = [0]*len(self.token_set)
            for token in doc.tokens:
                term_mat[list(self.token_set).index(token)] = term_mat[list(self.token_set).index(token)] + 1
            
            for i,term in enumerate(term_mat):
                if term != 0:
                    term_mat[i] = (1 + math.log(term)) * idf[i]
            return term_mat;
    
        #tf_idf
        self.tf_idf_matrix = [list(self.token_set)]
        for doc in self.docs:
            self.tf_idf_matrix.append( tfidf(doc))

            
            
            
class Document():
    
    """ The Doc class rpresents a class of individul documents
    
    """
    
    def __init__(self, restaurant_name, restaurant_price,review_score, review_text):
        self.name = restaurant_name
        self.price = restaurant_price
        self.score = review_score
        self.text = review_text.lower()
        self.tokens = np.array(wordpunct_tokenize(self.text))
        
        
        
    def token_clean(self,length):

        """ 
        description: strip out non-alpha tokens and tokens of length > 'length'
        input: length: cut off length 
        """

        self.tokens = np.array([t for t in self.tokens if (t.isalpha() and len(t) > length)])


    def stopword_remove(self, stopwords):

        """
        description: Remove stopwords from tokens.
        input: stopwords: a suitable list of stopwords
        """
        self.tokens = np.array([t for t in self.tokens if t not in stopwords])


    def stem(self):

        """
        description: Stem tokens with Porter Stemmer.
        """
        
        self.tokens = np.array([PorterStemmer().stem(t) for t in self.tokens])


print "1.1 Loading the data and creating the corpus"

# Load the yelp data
yelp_reviews = list()
restaurant_tags = list()
restaurant_type = list()
restaurant_score = list()

for file in os.listdir(data_dir):
   f = open(data_dir + file)
   f = json.load(f)
   try: 
        yelp_reviews.append(list([f[6], f[9], f[1], f[0] ]))
        restaurant_tags.append(f[8])
        restaurant_score.append(f[1])
        restaurant_type.append(f[9])
   except:
        next 
        
# Create the Corpus     
corpus = Corpus(yelp_reviews, stopwords_dir, 2)


print "1.2 Computing the document term matrix and the tf idf matrix"
# Create the matrices: document term and tf_idf. 
corpus.document_term_matrix()
corpus.tf_idf()


print "1.3 Applying the dictionary method"
# Load Dictionaries
df = pd.read_excel(dict_dir, skiprows=0)
w = df['Word']
words = [str(x).lower() for x in df['Word']]
words = [PorterStemmer().stem(t) for t in words]

score1 = [str(x).lower() for x in df['Positive']] 
dictionary1 = dict(zip(words,score1))

score2 = [str(x).lower() for x in df['Negative']] 
dictionary2 = dict(zip(words,score2))

score_pos = []
x = corpus.tf_idf_matrix[0]
X = corpus.tf_idf_matrix[1:]
for token in x: 
    try:
        score_pos.append(dictionary1[token])
    except: 
        score_pos.append(0)
        
# get a vector with all the scores in order
score_pos=[int(x) for x in score_pos]
rank_pos = {}
elements=range(len(X))
   
for i in elements:
    rank_pos[i] = np.dot(score_pos,X[i])
    
x = corpus.tf_idf_matrix[0]    
score_neg = []
for token in x: 
    try:
        score_neg.append(dictionary2[token])
    except: 
        score_neg.append(0)
        
# get a vector with all the scores in order
score_neg =[int(x) for x in score_neg]
rank_neg = {}
elements=range(len(X))
   
for i in elements:
    rank_neg[i] = np.dot(score_neg,X[i])
    
scores =  map(float, np.asarray(restaurant_score))

print " "
print "1.4 Regression results"

slope, intercept, r_value, p_value, std_err = stats.linregress(scores,np.asarray(rank_neg.values()))
print "1.4.1 Negative Score"
print "r-squared:", r_value**2
print "slope:", slope
print "intercept:", intercept


slope2, intercept2, r_value2, p_value2, std_err2 = stats.linregress(scores,np.asarray(rank_pos.values()))
print "1.4.2 Positive Score"
print "r-squared:", r_value2**2
print "slope:", slope2
print "intercept:", intercept2


predict_neg = intercept + slope * np.array([1,2,3,4,5])
pylab.figure()
# Plotting
pylab.plot(scores, np.asarray(rank_neg.values()), 'o')
pylab.plot(np.array([1,2,3,4,5]), predict_neg, 'k-')
pylab.xlabel('Review Score')
pylab.ylabel('Negative Dictionary Score')
pylab.title('Negative Dictionary Regression')
pylab.savefig('negative.png')


predict_pos = intercept2 + slope2 * np.array([1,2,3,4,5])

# Plotting
pylab.figure()
pylab.plot(scores, np.asarray(rank_pos.values()), 'o')
pylab.plot(np.array([1,2,3,4,5]), predict_pos, 'k-')
pylab.xlabel('Review Score')
pylab.ylabel('Positive Dictionary Score')
pylab.title('Positive Dictionary Regression')
pylab.savefig('positive.png')

scores =  map(float, np.asarray(restaurant_type))

slope, intercept, r_value, p_value, std_err = stats.linregress(scores,np.asarray(rank_neg.values()))
print "1.4.3 Type - Negative Score"
print "r-squared:", r_value**2
print "slope:", slope
print "intercept:", intercept


slope2, intercept2, r_value2, p_value2, std_err2 = stats.linregress(scores,np.asarray(rank_pos.values()))
print "1.4.4 Type - Positive Score"
print "r-squared:", r_value2**2
print "slope:", slope2
print "intercept:", intercept2


predict_neg = intercept + slope * np.array([1,2,3,4])

# Plotting
pylab.figure()
pylab.plot(scores, np.asarray(rank_neg.values()), 'o')
pylab.plot(np.array([1,2,3,4]), predict_neg, 'k-')
pylab.xlabel('Restaurant Type')
pylab.ylabel('Negative Dictionary Score')
pylab.title('Negative Dictionary Regression')
pylab.savefig('negative_2.png')
        

predict_pos = intercept2 + slope2 * np.array([1,2,3,4,5])

# Plotting
pylab.figure()
pylab.plot(scores, np.asarray(rank_pos.values()), 'o')
pylab.plot(np.array([1,2,3,4,5]), predict_pos, 'k-')
pylab.xlabel('Restaurant Type')
pylab.ylabel('Positive Dictionary Score')
pylab.title('Positive Dictionary Regression')
pylab.savefig('positive_2.png')

print " "
print " "
print "SECOND PART: Cosine Similarity"

# Load the yelp data
yelp_reviews = list()
restaurant_score = list()
i = 0



for file in os.listdir(data_dir):
   f = open(data_dir + file)
   f = json.load(f)
   try: 
        if f[1] == "1.0":
            yelp_reviews.append(list([f[6], f[9], f[1], f[0] ]))
            restaurant_score.append(f[1])
   except:
        next 
                
for file in os.listdir(data_dir):
   f = open(data_dir + file)
   f = json.load(f)
   try: 
        if i >= 500:
            break
        if f[1] == "5.0":
            yelp_reviews.append(list([f[6], f[9], f[1], f[0] ]))
            restaurant_score.append(f[1]) 
            i = i+1

   except:
        next 

print "2.1 Load Corpus"
        
# Create the Corpus     
corpus2 = Corpus(yelp_reviews, stopwords_dir, 2)


print "2.2 Computing TF IDF matrix"
# Create the matrices: document term and tf_idf. 
corpus2.document_term_matrix()
corpus2.tf_idf()



print "2.3 SVD Decomposition"
X = corpus2.tf_idf_matrix[1:]

# Compute svd 
sing_values_nb = 120
 
U, s, V = np.linalg.svd(X)
X_hat = np.dot(U[:,0:(sing_values_nb-1)] * s[0:(sing_values_nb-1)], V[0:(sing_values_nb-1),:])

print "2.4 Computing Cosine Similarity"

def cosine_similarity(doc1, doc2):
    return np.dot(doc1, doc2) / ( math.sqrt(np.dot(doc1, doc1))* math.sqrt(np.dot(doc2, doc2)) )

similarity_X = np.zeros((len(X), len(X)))
similarity_X_hat = np.zeros((len(X_hat), len(X_hat)))

for i in range(len(X)):
    for j in range(len(X)):
        similarity_X[i][j] = cosine_similarity(X[i], X[j])
        
for i in range(len(X_hat)):
    for j in range(len(X_hat)):
        similarity_X_hat[i][j] = cosine_similarity(X_hat[i], X_hat[j])

        
bad_index = []
good_index = []
D = []

for i in range(len(X_hat)):
    if restaurant_score[i] == "1.0":
        good_index.append(i)
    if restaurant_score[i] == "5.0":
        bad_index.append(i)


good_good = []
bad_bad = []
bad_good = []

good_good_svd = []
bad_bad_svd = []
bad_good_svd = []


for r in bad_index:
    bad_bad.extend(similarity_X[r, bad_index])
    bad_bad_svd.extend(similarity_X_hat[r, bad_index])
    bad_good.extend(similarity_X[r, good_index])
    bad_good_svd.extend(similarity_X_hat[r, good_index])
    
for d in good_index:
    good_good.extend(similarity_X[d, good_index])
    good_good_svd.extend(similarity_X_hat[d, good_index])
    bad_good.extend(similarity_X[d, bad_index])
    bad_good_svd.extend(similarity_X_hat[d, bad_index])

print " "
print "2.5 Results"
print "Good Reviews"
print "Without SVD: ", np.mean(good_good)
print "With SVD: ", np.mean(good_good_svd)

print(" ")

print "Bad Reviews"
print "Without SVD: ",np.mean(bad_bad) 
print "With SVD: ",np.mean(bad_bad_svd)

print(" ")

print "Good vs Bad Reviews"
print "Without SVD: ", np.mean(bad_good)
print "With SVD: ", np.mean(bad_good_svd)



print ' '
print 'Top Words'

one = []
two = []
three = []
four = []
five = []

for doc in corpus.docs:
    if doc.score =="1.0":
        for token in doc.tokens:
            one.append(token)
    if doc.score =="2.0":
        for token in doc.tokens:
            two.append(token)
    if doc.score =="3.0":
        for token in doc.tokens:
            three.append(token)
    if doc.score =="4.0":
        for token in doc.tokens:
            four.append(token)
    if doc.score =="5.0":
        for token in doc.tokens:
            five.append(token)


         
fd1 = nltk.FreqDist(one)
d1 = {}

for token in fd1:
     d1[token]= fd1[token]


fd2 = nltk.FreqDist(two)
d2 = {}

for token in fd2:
     d2[token]= fd2[token]
        
fd3 = nltk.FreqDist(three)
d3 = {}

for token in fd3:
     d3[token]= fd3[token]
        
        
fd4 = nltk.FreqDist(four)
d4 = {}

for token in fd4:
     d4[token]= fd4[token]
        
fd5 = nltk.FreqDist(five)
d5 = {}

for token in fd5:
     d5[token]= fd5[token]

print 'Top 1.0 words'
print sorted(d1, key=d1.get, reverse=True)[1:10] 

print 'Top 2.0 words'
print sorted(d2, key=d2.get, reverse=True)[1:10] 

print 'Top 3.0 words'
print sorted(d3, key=d3.get, reverse=True)[1:10] 

print 'Top 4.0 words'
print sorted(d4, key=d4.get, reverse=True)[1:10] 

print 'Top 5.0 words'
print sorted(d5, key=d5.get, reverse=True)[1:10] 


1.1 Loading the data and creating the corpus
1.2 Computing the document term matrix and the tf idf matrix
1.3 Applying the dictionary method
 
1.4 Regression results
1.4.1 Negative Score
r-squared: 0.0442410148601
slope: -1746.58980694
intercept: 11836.9743377
1.4.2 Positive Score
r-squared: 0.0170780228909
slope: 919.649383248
intercept: 1214.24362952
1.4.3 Type - Negative Score
r-squared: 0.00822748121046
slope: 1100.45205202
intercept: 2387.14906056
1.4.4 Type - Positive Score
r-squared: 0.022356671439
slope: 1537.32735334
intercept: 2229.78078655
 
 
SECOND PART: Cosine Similarity
2.1 Load Corpus
2.2 Computing TF IDF matrix
2.3 SVD Decomposition
2.4 Computing Cosine Similarity
 
2.5 Results
Good Reviews
Without SVD:  0.0297159073674
With SVD:  0.158796085077
 
Bad Reviews
Without SVD:  0.030714154158
With SVD:  0.169015542019
 
Good vs Bad Reviews
Without SVD:  0.0199511013632
With SVD:  0.114315738359
 
Top Words
Top 1.0 words
[u'thi', u'food', u'place', u'order', u'servic', u'res