In [347]:
%reset
import nltk

class Splitter(object):
    def __init__(self):
        self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self, text):
        """
        input format: a paragraph of text
        output format: a list of lists of words.
            e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        """
        sentences = self.nltk_splitter.tokenize(text)
        tokenized_sentences = [self.nltk_tokenizer.tokenize(sent) for sent in sentences]
        return tokenized_sentences


class POSTagger(object):
    def __init__(self):
        pass
        
    def pos_tag(self, sentences):
        """
        input format: list of lists of words
            e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        output format: list of lists of tagged tokens. Each tagged tokens has a
        form, a lemma, and a list of tags
            e.g: [[('this', 'this', ['DT']), ('is', 'be', ['VB']), ('a', 'a', ['DT']), ('sentence', 'sentence', ['NN'])],
                    [('this', 'this', ['DT']), ('is', 'be', ['VB']), ('another', 'another', ['DT']), ('one', 'one', ['CARD'])]]
        """

        pos = [nltk.pos_tag(sentence) for sentence in sentences]
        #adapt format
        pos = [[(word, word, [postag]) for (word, postag) in sentence] for sentence in pos]
        return pos


Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [348]:
#from yaml import load, dump
import yaml
class DictionaryTagger(object):
    def __init__(self, dictionary_paths):
        files = [open(path, 'r') for path in dictionary_paths]
            
        dictionaries = [yaml.load(dict_file) for dict_file in files]
        #dictionaries = [dict_positive,dict_negative]

        map(lambda x: x.close(), files)
        self.dictionary = {}
        self.max_key_size = 0
        for curr_dict in dictionaries:
            for key in curr_dict:
                if key in self.dictionary:
                    self.dictionary[key].extend(curr_dict[key])
                else:
                    self.dictionary[key] = curr_dict[key]
                    self.max_key_size = max(self.max_key_size, len(key))

    def tag(self, postagged_sentences):
        return [self.tag_sentence(sentence) for sentence in postagged_sentences]

    def tag_sentence(self, sentence, tag_with_lemmas=False):
        """
        the result is only one tagging of all the possible ones.
        The resulting tagging is determined by these two priority rules:
            - longest matches have higher priority
            - search is made from left to right
        """
        tag_sentence = []
        N = len(sentence)
        if self.max_key_size == 0:
            self.max_key_size = N
        i = 0
        while (i < N):
            j = min(i + self.max_key_size, N) #avoid overflow
            tagged = False
            while (j > i):
                expression_form = ' '.join([word[0] for word in sentence[i:j]]).lower()
                expression_lemma = ' '.join([word[1] for word in sentence[i:j]]).lower()
                if tag_with_lemmas:
                    literal = expression_lemma
                else:
                    literal = expression_form
                if literal in self.dictionary:
                    #self.logger.debug("found: %s" % literal)
                    is_single_token = j - i == 1
                    original_position = i
                    i = j
                    taggings = [tag for tag in self.dictionary[literal]]
                    tagged_expression = (expression_form, expression_lemma, taggings)
                    if is_single_token: #if the tagged literal is a single token, conserve its previous taggings:
                        original_token_tagging = sentence[original_position][2]
                        tagged_expression[2].extend(original_token_tagging)
                    tag_sentence.append(tagged_expression)
                    tagged = True
                else:
                    j = j - 1
            if not tagged:
                tag_sentence.append(sentence[i])
                i += 1
        return tag_sentence


In [349]:
def value_of(sentiment):
    if sentiment == 'positive': return 1
    if sentiment == 'negative': return -1
    return 0

def value_of_pos(sentiment):
    if sentiment == 'positive': return 1
    return 0

def value_of_neg(sentiment):
    if sentiment == 'negative': return -1
    return 0
    
def sentiment_score_basic_pos(review):    
    return sum ([value_of_pos(tag) for sentence in dict_tagged_sentences for token in sentence for tag in token[2]])

def sentiment_score_basic_neg(review):    
    return sum ([value_of_neg(tag) for sentence in dict_tagged_sentences for token in sentence for tag in token[2]])

def sentiment_score_basic(review):    
    return sum ([value_of(tag) for sentence in dict_tagged_sentences for token in sentence for tag in token[2]])

def sentence_score(sentence_tokens, previous_token, acum_score):    
    if not sentence_tokens:
        return acum_score
    else:
        current_token = sentence_tokens[0]
        tags = current_token[2]
        token_score = sum([value_of(tag) for tag in tags])
        if previous_token is not None:
            previous_tags = previous_token[2]
            if 'inc' in previous_tags:
                token_score *= 2.0
            elif 'dec' in previous_tags:
                token_score /= 2.0
            elif 'inv' in previous_tags:
                token_score *= -1.0
        return sentence_score(sentence_tokens[1:], current_token, acum_score + token_score)

def sentence_score_pos(sentence_tokens, previous_token, acum_score):    
    if not sentence_tokens:
        return acum_score
        
    else:
        current_token = sentence_tokens[0]
        tags = current_token[2]
        token_score = sum([value_of(tag) for tag in tags])
        
        if previous_token is not None:
            previous_tags = previous_token[2]
            if 'inc' in previous_tags:
                token_score *= 2.0
            elif 'dec' in previous_tags:
                token_score /= 2.0
            elif 'inv' in previous_tags:
                token_score *= -1.0
        if token_score<0:
            token_score=0
        return sentence_score_pos(sentence_tokens[1:], current_token, acum_score+token_score)

def sentence_score_neg(sentence_tokens, previous_token, acum_score):    
    if not sentence_tokens:
        return acum_score
    else:
        current_token = sentence_tokens[0]
        tags = current_token[2]
        token_score = sum([value_of_neg(tag) for tag in tags])
        
        if previous_token is not None:
            previous_tags = previous_token[2]
            if 'inc' in previous_tags:
                token_score *= 2.0
            elif 'dec' in previous_tags:
                token_score /= 2.0
            elif 'inv' in previous_tags:
                token_score *= -1.0
        if token_score>0:
            token_score=0
        return sentence_score_neg(sentence_tokens[1:], current_token, acum_score + token_score)

def sentiment_score(review):
    return sum([sentence_score(sentence, None, 0.0) for sentence in review])

def sentiment_score_pos(review):
    return sum([sentence_score_pos(sentence, None, 0.0) for sentence in review])

def sentiment_score_neg(review):
    return sum([sentence_score_neg(sentence, None, 0.0) for sentence in review])

In [350]:
import pandas as pd
import numpy as np

cats = ['Rooms', 'Date', 'Location', 'Service', 'Business service', 'Author', 'Check in / front desk', 'No. Helpful', 
        'Cleanliness', 'Content', 'Value', 'No. Reader', 'Overall','sentiment','score','score_basic_pos','score_basic_neg',
        'score_basic','score_pos','score_neg','length']

def getBlankFrame():
    
    data = pd.DataFrame(columns=cats)
    
    return data


def addFileToData(filename, data):
    intColumns = ['No. Reader', 'No. Helpful', 'Cleanliness','Check in / front desk', 'Value', 'Overall', 'Service', 'Business service', 'Rooms', 'Location']
    characterThreshold = 60
    with open(filename, 'r') as content_file:
        content = content_file.read()
     
    reviews = content.split("\n\n")
    for r in reviews:
        thisReview = pd.Series([None]*len(cats), cats)
        splt = r.split("\n")
        for s in splt:
            for c in cats:
                if "<"+c+">" in s:
                    value = s.replace('<'+c+'>', '')
                    if c in intColumns:
                        value = int(value)
                    if value == -1: #we dont want -1 as this is going to mess up averaging, take np.nan
                        value = np.nan

                    if c == "Content":
                        value = value.lower()

                    thisReview[c] = value
        thisReview["score"]=0
        thisReview["score_basic_pos"]=0
        thisReview["score_basic_neg"]=0
        thisReview["score_basic"]=0
        thisReview["score_pos"]=0
        thisReview["score_neg"]=0
        thisReview["length"]=0

        if not thisReview["Content"] == None and len(thisReview["Content"]) > characterThreshold:
            #only add if theres content and its long enough
            data = data.append(thisReview, ignore_index=True)
    return data


In [351]:
data = pd.DataFrame(columns=cats)

data = addFileToData('C:/Users/Abhay Pawar/Documents/GitHub/data/Review_Texts/hotel_72579.dat', data)
data = addFileToData('C:/Users/Abhay Pawar/Documents/GitHub/data/Review_Texts/hotel_72572.dat', data)
data = addFileToData('C:/Users/Abhay Pawar/Documents/GitHub/data/Review_Texts/hotel_73855.dat', data)
data = addFileToData('C:/Users/Abhay Pawar/Documents/GitHub/data/Review_Texts/hotel_73821.dat', data)
data = addFileToData('C:/Users/Abhay Pawar/Documents/GitHub/data/Review_Texts/hotel_73985.dat', data)
data = addFileToData('C:/Users/Abhay Pawar/Documents/GitHub/data/Review_Texts/hotel_75662.dat', data)
data = addFileToData('C:/Users/Abhay Pawar/Documents/GitHub/data/Review_Texts/hotel_76061.dat', data)
data = addFileToData('C:/Users/Abhay Pawar/Documents/GitHub/data/Review_Texts/hotel_77638.dat', data)
data = addFileToData('C:/Users/Abhay Pawar/Documents/GitHub/data/Review_Texts/hotel_80083.dat', data)
data = addFileToData('C:/Users/Abhay Pawar/Documents/GitHub/data/Review_Texts/hotel_80087.dat', data)
data = addFileToData('C:/Users/Abhay Pawar/Documents/GitHub/data/Review_Texts/hotel_80797.dat', data)
data = addFileToData('C:/Users/Abhay Pawar/Documents/GitHub/data/Review_Texts/hotel_80808.dat', data)
data = addFileToData('C:/Users/Abhay Pawar/Documents/GitHub/data/Review_Texts/hotel_80864.dat', data)
data = addFileToData('C:/Users/Abhay Pawar/Documents/GitHub/data/Review_Texts/hotel_80912.dat', data)
data = addFileToData('C:/Users/Abhay Pawar/Documents/GitHub/data/Review_Texts/hotel_80930.dat', data)

print(data.shape)
#data.head()

(1458, 21)


In [352]:
data['Content'][2]

'reasonably priced hotel in downtown seattle we stayed at the loyal inn because we wanted to be in downtown seattle, since we only had two nights there. it exceeded our expectations in many respects. the rooms were large and comfortable, with many extra amenities. the breakfast was very extensive, with many fresh fruits, eggs, breads and even waffles.the hotel staff was very friendly and helpful, especially with suggestions of places to see during our rather limited time in the city. (highlights were the columbia center, the second tallest building on the west coast, with marvelous views of the surrounding area, and also the pike street market.)we also appreciated the location, as it is within two blocks of the area of free bus transportation from early morning to late evening on all buses in the downtown area. this was a marvelous idea, and did much to cut down on traffic congestion.i plan to return to this hotel in the spring of 2009. '

In [353]:
dicttagger = DictionaryTagger(['dicts/f_negative.yml', 'dicts/f_positive.yml','dicts/f_inc.yml','dicts/f_dec.yml','dicts/f_inv.yml'])
splitter = Splitter()
postagger = POSTagger()

for i in range(0,len(data)):
    review=data['Content'][i].decode('utf-8')
    splitted_sentences = splitter.split(review)
    pos_tagged_sentences = postagger.pos_tag(splitted_sentences)
    dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)
    #data['sentiment'][i]=dict_tagged_sentences
    data['score'][i]=sentiment_score(dict_tagged_sentences)
    data["score_basic_pos"][i]=sentiment_score_basic_pos(dict_tagged_sentences)
    data["score_basic_neg"][i]=sentiment_score_basic_neg(dict_tagged_sentences)
    data["score_basic"][i]=sentiment_score_basic(dict_tagged_sentences)
    data["score_pos"][i]=sentiment_score_pos(dict_tagged_sentences)
    data["score_neg"][i]=sentiment_score_neg(dict_tagged_sentences)

data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a

Unnamed: 0,Rooms,Date,Location,Service,Business service,Author,Check in / front desk,No. Helpful,Cleanliness,Content,...,No. Reader,Overall,sentiment,score,score_basic_pos,score_basic_neg,score_basic,score_pos,score_neg,length
0,5.0,"Jan 7, 2009",5.0,5.0,5.0,JeanMars,4.0,,5.0,a really good choice ! when returning from haw...,...,,5.0,,15.0,10.0,-1.0,9.0,16.0,-1.0,0.0
1,5.0,"Jan 5, 2009",5.0,4.0,5.0,kareemtownes,5.0,,5.0,everything you could hope for my friends and i...,...,,5.0,,16.5,17.0,-3.0,14.0,19.0,-2.5,0.0
2,,"Nov 14, 2008",,,,maestrolms,,,,reasonably priced hotel in downtown seattle we...,...,,5.0,,11.0,11.0,-1.0,10.0,12.0,-1.0,0.0
3,1.0,"Oct 28, 2008",1.0,3.0,,seychgo,3.0,,1.0,"stay away!!!! not worth it... dirty, run down,...",...,,1.0,,-7.0,11.0,-14.0,-3.0,9.0,-14.0,0.0
4,4.0,"Oct 27, 2008",3.0,4.0,,fudgemaker,2.0,1.0,4.0,worth the cost the room that i was given had n...,...,1.0,3.0,,5.0,7.0,-2.0,5.0,7.0,-2.0,0.0


In [383]:
#data['Overall'][data['Overall']!=0]


KeyError: 1

In [386]:
#'score','score_basic_pos','score_basic_neg',
#        'score_basic','score_pos','score_neg',

score=data['score'][data['Overall']!=0].reshape(-1,1)
score_basic_pos=data['score_basic_pos'][data['Overall']!=0].reshape(-1,1)
score_basic_neg=data['score_basic_neg'][data['Overall']!=0].reshape(-1,1)
score_basic=data['score_basic'][data['Overall']!=0].reshape(-1,1)
score_pos=data['score_pos'][data['Overall']!=0].reshape(-1,1)
score_neg=data['score_neg'][data['Overall']!=0].reshape(-1,1)

X=[]
for i in range(0,len(score)):
    #X.extend([[score[i][0],score_basic_pos[i][0],score_basic_neg[i][0],score_basic[i][0],score_pos[i][0],score_neg[i][0]]])
    X.extend([[score[i][0]]])


In [387]:
#'score','score_basic_pos','score_basic_neg',
#        'score_basic','score_pos','score_neg',

import numpy as np
from sklearn.svm import SVC
clf = SVC()
#clf.fit(X[0:1000],data['Overall'][0:1000])
clf.fit(X,data['Overall'][data['Overall']!=0]) 

#score(X, y, sample_weight=None)
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
pred=clf.predict(X)
#score(data['score'].reshape(-1, 1), data['Overall'])

In [388]:
sum=float(0)
error=float(0)
#actual=data['Overall'].reshape(-1,1)[1001:len(X)]
actual=data['Overall'][data['Overall']!=0].reshape(-1,1)

#print len(pred)
#print len(actual)
for i in range(0,len(pred)):
    if pred[i]==actual[i][0]:
        sum+=1
    error_curr=abs(actual[i][0]-pred[i])
    error+=error_curr
accuracy=sum/len(pred)
mean_error=error/len(pred)

print accuracy
print mean_error

0.454982817869
0.699656357388


In [None]:
#code to create file in desired format

file=open('C:/Users/Abhay Pawar/Documents/GitHub/data/opinion-lexicon-English/positive-words.txt')
lines=file.readlines()
dict_positive={}
for line in lines:
    dict_positive[line.strip()]=['positive']
with open('dicts/f_positive.yml', 'w') as outfile:
    yaml.dump(dict_positive, outfile, default_flow_style=True)
    
file=open('C:/Users/Abhay Pawar/Documents/GitHub/data/opinion-lexicon-English/negative-words.txt')
lines=file.readlines()
dict_negative={}
for line in lines:
    dict_negative[line.strip()]=['negative']
with open('dicts/f_negative.yml', 'w') as outfile:
    yaml.dump(dict_negative, outfile, default_flow_style=True)

dict_inc={}
dict_inc['too']= ['inc']
dict_inc['very']= ['inc']
dict_inc['sorely']= ['inc']
dict_inc['extremely']= ['inc']
dict_inc['really']= ['inc']
with open('dicts/f_dec.yml', 'w') as outfile:
    yaml.dump(dict_inc, outfile, default_flow_style=True)

dict_dec={}
#dict_dec['barely']= ['dec']
dict_dec['little']= ['dec']
#dict_dec['hardly']= ['dec']
with open('dicts/f_inc.yml', 'w') as outfile:
    yaml.dump(dict_dec, outfile, default_flow_style=True)

dict_inv={}
dict_inv['lack of']= ['inv']
dict_inv['not']= ['inv']
dict_inv['lack']= ['inv']
with open('dicts/f_inv.yml', 'w') as outfile:
    yaml.dump(dict_inv, outfile, default_flow_style=True)


In [286]:
text = """What can I say about this place. The staff of the restaurant is nice and the eggplant is not bad. Apart from that, very uninspired food, lack of atmosphere and too expensive. I am a staunch vegetarian and was sorely dissapointed with the veggie options on the menu. Will be the last time I visit, I recommend others to avoid."""

splitter = Splitter()
postagger = POSTagger()

splitted_sentences = splitter.split(text)
pos_tagged_sentences = postagger.pos_tag(splitted_sentences)

dicttagger = DictionaryTagger(['dicts/f_negative.yml','dicts/f_positive.yml','dicts/f_inc.yml','dicts/f_dec.yml','dicts/f_inv.yml'])
dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)
#print dict_tagged_sentences
#print sentiment_score(dict_tagged_sentences)

#print sentiment_score_basic_pos(dict_tagged_sentences)
#print sentiment_score_basic_neg(dict_tagged_sentences)
#print sentiment_score_basic(dict_tagged_sentences)
print sentiment_score_pos(dict_tagged_sentences)
print sentiment_score_neg(dict_tagged_sentences)
#print dict_tagged_sentences

4.0
-5.0


In [284]:

print sentiment_score_pos(dict_tagged_sentences)

4.0


In [270]:
print dict_tagged_sentences

[[('What', 'What', ['WP']), ('can', 'can', ['MD']), ('I', 'I', ['PRP']), ('say', 'say', ['VBP']), ('about', 'about', ['IN']), ('this', 'this', ['DT']), ('place', 'place', ['NN']), ('.', '.', ['.'])], [('The', 'The', ['DT']), ('staff', 'staff', ['NN']), ('of', 'of', ['IN']), ('the', 'the', ['DT']), ('restaurant', 'restaurant', ['NN']), ('is', 'is', ['VBZ']), ('nice', 'nice', ['positive', 'JJ']), ('and', 'and', ['CC']), ('the', 'the', ['DT']), ('eggplant', 'eggplant', ['NN']), ('is', 'is', ['VBZ']), ('not', 'not', ['inv', 'RB']), ('bad', 'bad', ['negative', 'JJ']), ('.', '.', ['.'])], [('Apart', 'Apart', ['RB']), ('from', 'from', ['IN']), ('that', 'that', ['IN']), (',', ',', [',']), ('very', 'very', ['inc', 'RB']), ('uninspired', 'uninspired', ['JJ']), ('food', 'food', ['NN']), (',', ',', [',']), ('lack of', 'lack of', ['inv']), ('atmosphere', 'atmosphere', ['NN']), ('and', 'and', ['CC']), ('too', 'too', ['inc', 'RB']), ('expensive', 'expensive', ['negative', 'JJ']), ('.', '.', ['.'])], 

In [344]:
x=xrange(0,len(data))
print x

xrange(1458)
