In [1]:
%reset
import nltk.data
import nltk.tokenize
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

class Splitter(object):
    """Splits raw text into sentences and each sentence into word tokens."""

    def __init__(self):
        # Sentence splitter loaded from the NLTK data directory, plus a
        # Treebank-style word tokenizer for the individual sentences.
        self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self, text):
        """
        Tokenize a paragraph.

        input format: a paragraph of text
        output format: a list of lists of words, one inner list per sentence.
            e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        """
        words_per_sentence = []
        for raw_sentence in self.nltk_splitter.tokenize(text):
            words_per_sentence.append(self.nltk_tokenizer.tokenize(raw_sentence))
        return words_per_sentence


class POSTagger(object):
    """Part-of-speech tagger that emits (form, lemma, [tags]) triples."""

    def __init__(self):
        """Nothing to set up; tagging is delegated to nltk.pos_tag."""

    def pos_tag(self, sentences):
        """
        Tag every word of every sentence.

        input format: list of lists of words
            e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        output format: list of lists of tagged tokens. Each tagged token is a
        (form, lemma, tags) triple; no lemmatisation is performed here, so the
        lemma slot simply repeats the form.
            e.g: [[('this', 'this', ['DT']), ('is', 'is', ['VBZ'])], ...]
        """
        tagged_sentences = []
        for words in sentences:
            triples = []
            for form, tag in nltk.pos_tag(words):
                # adapt nltk's (word, tag) pairs to the triple format
                triples.append((form, form, [tag]))
            tagged_sentences.append(triples)
        return tagged_sentences


Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [2]:
#from yaml import load, dump
import yaml
class DictionaryTagger(object):
    """Tags tokens and multi-word expressions using YAML dictionaries.

    Every dictionary file maps an expression (one or more space-separated
    words) to a list of tags. All files are merged into a single lookup
    table; tags for an expression that occurs in several files are
    concatenated in file order.
    """

    def __init__(self, dictionary_paths):
        """Load and merge the dictionaries found at ``dictionary_paths``.

        dictionary_paths: iterable of paths to YAML files, each containing a
            mapping of expression -> list of tags. Empty files are ignored.
        """
        self.dictionary = {}
        # Largest number of tokens any dictionary expression can span.
        self.max_key_size = 0
        for path in dictionary_paths:
            # `with` closes the handle on every interpreter; closing through
            # a lazy map() never runs on Python 3.
            with open(path, 'r') as dict_file:
                # safe_load only builds plain YAML types. A bare
                # yaml.load(stream) is rejected by current PyYAML and can
                # construct arbitrary objects on older releases.
                # NOTE(review): files carrying python-specific tags would
                # need yaml.load(f, Loader=yaml.Loader) instead -- confirm
                # the dictionaries are plain mappings.
                curr_dict = yaml.safe_load(dict_file) or {}
            for key in curr_dict:
                if key in self.dictionary:
                    self.dictionary[key].extend(curr_dict[key])
                else:
                    self.dictionary[key] = curr_dict[key]
                    # Candidates are tokens joined by single spaces, so a key
                    # with k spaces can match at most k + 1 tokens. Counting
                    # characters instead only made the search window larger
                    # than it needs to be.
                    self.max_key_size = max(self.max_key_size, key.count(' ') + 1)

    def tag(self, postagged_sentences):
        """Apply tag_sentence to every sentence and return the list of results."""
        return [self.tag_sentence(sentence) for sentence in postagged_sentences]

    def tag_sentence(self, sentence, tag_with_lemmas=False):
        """
        Tag one sentence of (form, lemma, tags) tokens.

        the result is only one tagging of all the possible ones.
        The resulting tagging is determined by these two priority rules:
            - longest matches have higher priority
            - search is made from left to right

        sentence: list of (form, lemma, tags) tuples.
        tag_with_lemmas: look expressions up by their lemmas instead of
            their surface forms.
        Returns a new list. Matched spans are collapsed into a single
        (form, lemma, tags) tuple with lower-cased form and lemma; a
        single-token match keeps the token's previous tags after the
        dictionary tags. Unmatched tokens are passed through unchanged.
        """
        tag_sentence = []
        N = len(sentence)
        # Fall back to the whole sentence when no window size is known, but
        # keep that fallback local: writing it back to self.max_key_size would
        # freeze the window at the first sentence's length for all later calls.
        window = self.max_key_size if self.max_key_size > 0 else N
        i = 0
        while i < N:
            j = min(i + window, N)  # avoid overflow
            tagged = False
            while j > i:
                span = sentence[i:j]
                expression_form = ' '.join([word[0] for word in span]).lower()
                expression_lemma = ' '.join([word[1] for word in span]).lower()
                literal = expression_lemma if tag_with_lemmas else expression_form
                if literal in self.dictionary:
                    # Copy so the shared dictionary entry is never extended.
                    taggings = list(self.dictionary[literal])
                    if j - i == 1:
                        # a single tagged token conserves its previous taggings
                        taggings.extend(sentence[i][2])
                    tag_sentence.append((expression_form, expression_lemma, taggings))
                    i = j
                    tagged = True
                    break
                j -= 1
            if not tagged:
                tag_sentence.append(sentence[i])
                i += 1
        return tag_sentence


In [3]:
def value_of(sentiment):
    """Return the polarity of a tag: 1 for 'positive', -1 for 'negative', else 0."""
    if sentiment == 'positive':
        polarity = 1
    elif sentiment == 'negative':
        polarity = -1
    else:
        polarity = 0
    return polarity

def value_of_pos(sentiment):
    """Return 1 when the tag is 'positive' and 0 for every other tag."""
    return 1 if sentiment == 'positive' else 0

def value_of_neg(sentiment):
    """Return -1 when the tag is 'negative' and 0 for every other tag."""
    return -1 if sentiment == 'negative' else 0
    
def sentiment_score_basic_pos(sentences):
    """Count the 'positive' tags across all tokens of all sentences.

    sentences: list of sentences, each a list of (form, lemma, tags) tokens.
    Modifier tags are ignored; every tag contributes value_of_pos(tag).
    """
    total = 0
    every_tag = (tag for sentence in sentences for token in sentence for tag in token[2])
    for tag in every_tag:
        total += value_of_pos(tag)
    return total

def sentiment_score_basic_neg(sentences):
    """Sum value_of_neg over every tag of every token of every sentence.

    sentences: list of sentences, each a list of (form, lemma, tags) tokens.
    The result is zero or negative: each 'negative' tag contributes -1.
    """
    total = 0
    every_tag = (tag for sentence in sentences for token in sentence for tag in token[2])
    for tag in every_tag:
        total += value_of_neg(tag)
    return total

def sentiment_score_basic(sentences):
    """Net polarity of a review: positive tags minus negative tags.

    sentences: list of sentences, each a list of (form, lemma, tags) tokens.
    Modifier tags are ignored; every tag contributes value_of(tag).
    """
    total = 0
    every_tag = (tag for sentence in sentences for token in sentence for tag in token[2])
    for tag in every_tag:
        total += value_of(tag)
    return total

def sentence_score(sentence_tokens, previous_token, acum_score):
    """Score one sentence, letting modifier tokens adjust the token after them.

    sentence_tokens: list of (form, lemma, tags) tokens.
    previous_token: the token preceding the first one in ``sentence_tokens``,
        or None when there is none. It is honoured rather than discarded, so
        a modifier handed in by the caller affects the first token.
    acum_score: running score that this sentence's contribution is added to.

    Each token scores the sum of value_of(tag) over its tags. If the token
    before it carries 'inc' the score is doubled, 'dec' halves it and 'inv'
    flips its sign; only the first of these that applies is used.
    Returns the updated accumulated score.
    """
    for current_token in sentence_tokens:
        token_score = 0
        for tag in current_token[2]:
            token_score += value_of(tag)
        if previous_token is not None:
            previous_tags = previous_token[2]
            if 'inc' in previous_tags:
                token_score *= 2.0
            elif 'dec' in previous_tags:
                token_score /= 2.0
            elif 'inv' in previous_tags:
                token_score *= -1.0

        acum_score += token_score
        previous_token = current_token

    return acum_score
    
def sentence_score_pos(sentence_tokens, previous_token, acum_score):
    """Score the positive tags of one sentence, applying preceding modifiers.

    sentence_tokens: list of (form, lemma, tags) tokens.
    previous_token: the token preceding the first one in ``sentence_tokens``,
        or None when there is none. It is honoured rather than discarded, so
        a modifier handed in by the caller affects the first token.
    acum_score: running score that this sentence's contribution is added to.

    Each token scores the sum of value_of_pos(tag) over its tags. If the
    token before it carries 'inc' the score is doubled, 'dec' halves it and
    'inv' flips its sign; only the first of these that applies is used.
    Returns the updated accumulated score.
    """
    for current_token in sentence_tokens:
        token_score = 0
        for tag in current_token[2]:
            token_score += value_of_pos(tag)
        if previous_token is not None:
            previous_tags = previous_token[2]
            if 'inc' in previous_tags:
                token_score *= 2.0
            elif 'dec' in previous_tags:
                token_score /= 2.0
            elif 'inv' in previous_tags:
                token_score *= -1.0

        acum_score += token_score
        previous_token = current_token

    return acum_score

def sentence_score_neg(sentence_tokens, previous_token, acum_score):
    """Score the negative tags of one sentence, applying preceding modifiers.

    sentence_tokens: list of (form, lemma, tags) tokens.
    previous_token: the token preceding the first one in ``sentence_tokens``,
        or None when there is none. It is honoured rather than discarded, so
        a modifier handed in by the caller affects the first token.
    acum_score: running score that this sentence's contribution is added to.

    Each token scores the sum of value_of_neg(tag) over its tags. If the
    token before it carries 'inc' the score is doubled, 'dec' halves it and
    'inv' flips its sign; only the first of these that applies is used.
    Returns the updated accumulated score.
    """
    for current_token in sentence_tokens:
        token_score = 0
        for tag in current_token[2]:
            token_score += value_of_neg(tag)
        if previous_token is not None:
            previous_tags = previous_token[2]
            if 'inc' in previous_tags:
                token_score *= 2.0
            elif 'dec' in previous_tags:
                token_score /= 2.0
            elif 'inv' in previous_tags:
                token_score *= -1.0

        acum_score += token_score
        previous_token = current_token

    return acum_score

def sentiment_score(review):
    """Add up sentence_score over every sentence of a tagged review.

    Each sentence is scored independently: it starts with no previous token
    and an accumulated score of 0.0.
    """
    total = 0
    for tokens in review:
        partial = sentence_score(tokens, None, 0.0)
        total = total + partial
    return total

def sentiment_score_pos(review):
    """Add up sentence_score_pos over every sentence of a tagged review.

    Each sentence is scored independently: it starts with no previous token
    and an accumulated score of 0.0.
    """
    total = 0
    for tokens in review:
        partial = sentence_score_pos(tokens, None, 0.0)
        total = total + partial
    return total

def sentiment_score_neg(review):
    """Add up sentence_score_neg over every sentence of a tagged review.

    Each sentence is scored independently: it starts with no previous token
    and an accumulated score of 0.0.
    """
    total = 0
    for tokens in review:
        partial = sentence_score_neg(tokens, None, 0.0)
        total = total + partial
    return total

In [4]:
import pandas as pd
import numpy as np
import proj_base



#TO CHANGE ASPECT uncomment this
#proj_base.aspect = "Location"

In [5]:
# Load the standard data set. If this takes too long, lower numFiles to 10, but
# restart the kernel first, because aspectSegmentationBayes has probably
# already run in the current one.

data = proj_base.getStandardData(numFiles=20)
# NOTE(review): the return value is discarded, so this call is presumably run
# for its side effects on `data` or on proj_base state -- confirm in proj_base.
proj_base.aspectSegmentationBayes(data["Content"])
data.head()

Unnamed: 0,Rooms,Date,Location,Service,Business service,Author,Check in / front desk,No. Helpful,Cleanliness,Content,Value,No. Reader,Overall
0,5.0,"Dec 23, 2008\r",5.0,5.0,,selizabethm\r,5.0,,5.0,wonderful time- even with the snow! what a gre...,4.0,,5.0
1,4.0,"Nov 13, 2008\r",5.0,5.0,,IndieLady\r,5.0,,4.0,"lovely hotel, unique decor, friendly front des...",5.0,,4.0
2,4.0,"Nov 11, 2008\r",3.0,,4.0,Hilobb\r,5.0,,4.0,"nice hotel, expensive parking we got a good de...",4.0,,4.0
3,5.0,"Nov 4, 2008\r",5.0,5.0,5.0,Chianti_girl24\r,5.0,,5.0,fabulous hotel location and service are great....,5.0,,5.0
4,,"Oct 18, 2008\r",,,,hothearted\r,,2.0,,"loved the monaco! staff was amazing, with a sm...",,2.0,5.0


In [None]:
# Smoke test: score only the first three rows of the training data.
# NOTE(review): `trainingData` and `addSentimentScores` are first assigned in
# cells further down this notebook, so on a fresh kernel this cell raises a
# NameError unless those cells are run before it.
td = trainingData[0:3].apply(addSentimentScores, axis = 1)
td

In [None]:
# Derive the training frame from the loaded data and print the shape before
# and after, to show how proj_base.getTrainingData changes the frame's size.
print(data.shape)
trainingData = proj_base.getTrainingData(data)
print(trainingData.shape)
trainingData.head()

In [None]:
# Shared tagging pipeline used by addSentimentScores below.
# NOTE(review): the dicts/f_*.yml files are written by the "create file in
# desired format" cell later in this notebook; they must exist before this runs.
dicttagger = DictionaryTagger(['dicts/f_negative.yml', 'dicts/f_positive.yml','dicts/f_inc.yml','dicts/f_dec.yml','dicts/f_inv.yml'])
# NOTE(review): `splitter` is not used by addSentimentScores, which tokenizes
# the already-split review["aspectSentences"] with wordTokenizer instead.
splitter = Splitter()
postagger = POSTagger()
wordTokenizer = nltk.tokenize.TreebankWordTokenizer()

def addSentimentScores(review):
    """Return a copy of ``review`` with the six sentiment score fields added.

    review: a mapping-like row (e.g. a DataFrame row passed by
        ``DataFrame.apply(..., axis=1)``) whose "aspectSentences" entry is an
        iterable of sentence strings.

    The sentences are word-tokenized, POS-tagged and dictionary-tagged, and
    the result is scored with each of the sentiment_score* functions. The
    input row is left untouched: pandas does not support apply() callbacks
    that mutate the object they are handed, so the scores are written to a
    copy and the copy is returned.
    """
    tokenizedSentences = [wordTokenizer.tokenize(sent) for sent in review["aspectSentences"]]
    pos_tagged_sentences = postagger.pos_tag(tokenizedSentences)
    dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)

    # (output field, scoring function), in the order the fields are added.
    scorers = (
        ('score', sentiment_score),
        ('score_basic_pos', sentiment_score_basic_pos),
        ('score_basic_neg', sentiment_score_basic_neg),
        ('score_basic', sentiment_score_basic),
        ('score_pos', sentiment_score_pos),
        ('score_neg', sentiment_score_neg),
    )
    scored = review.copy()
    for field, scorer in scorers:
        scored[field] = scorer(dict_tagged_sentences)
    return scored

# NOTE(review): the apply below is commented out, so the score fields produced
# by addSentimentScores are not added to trainingData by this cell.
#trainingData = trainingData.apply(addSentimentScores, axis = 1)

# Preview the first 20 rows of the training data.
trainingData.head(20)

In [None]:
#data['Overall'][data['Overall']!=0]


In [None]:
#'score','score_basic_pos','score_basic_neg',
#        'score_basic','score_pos','score_neg',

# NOTE(review): the score columns are read from `data`, while the cells above
# only show addSentimentScores being applied to trainingData -- confirm that
# `data` carries these columns before running this cell.

# Rows whose 'Overall' value is non-zero; the same mask selects every column.
rated_rows = data['Overall'] != 0

def _as_column(column_name):
    """Return data[column_name] for rated_rows as an (n, 1) NumPy array."""
    # Reshape the underlying ndarray: pandas no longer provides Series.reshape.
    return data[column_name][rated_rows].values.reshape(-1, 1)

score = _as_column('score')
score_basic_pos = _as_column('score_basic_pos')
score_basic_neg = _as_column('score_basic_neg')
score_basic = _as_column('score_basic')
score_pos = _as_column('score_pos')
score_neg = _as_column('score_neg')

# Feature matrix: one row per review, currently holding only the 'score' value.
# All-features variant:
#X = [[score[i][0],score_basic_pos[i][0],score_basic_neg[i][0],score_basic[i][0],score_pos[i][0],score_neg[i][0]] for i in range(len(score))]
X = [[row[0]] for row in score]


In [None]:
#'score','score_basic_pos','score_basic_neg',
#        'score_basic','score_pos','score_neg',

import numpy as np
from sklearn.svm import SVC

# Fit a default SVC on the feature matrix X; the targets are the 'Overall'
# values of the rows with a non-zero 'Overall' (the rows X was built from).
clf = SVC()
#clf.fit(X[0:1000],data['Overall'][0:1000])
clf.fit(X,data['Overall'][data['Overall']!=0])

# NOTE(review): predictions are made on the same rows the model was fitted on,
# so any accuracy computed from `pred` is training accuracy, not a held-out
# estimate.
pred=clf.predict(X)
#score(data['score'].reshape(-1, 1), data['Overall'])

In [None]:
# Compare the predictions with the actual 'Overall' values of the same rows.
# The hit counter is called num_correct rather than `sum`: rebinding the
# builtin at notebook level breaks every later sum(...) call in this kernel,
# including the ones inside the sentence_score* functions.
num_correct = float(0)
error = float(0)
#actual=data['Overall'].reshape(-1,1)[1001:len(X)]
# Reshape the underlying ndarray: pandas no longer provides Series.reshape.
actual = data['Overall'][data['Overall']!=0].values.reshape(-1,1)

for i in range(0, len(pred)):
    if pred[i] == actual[i][0]:
        num_correct += 1
    error += abs(actual[i][0] - pred[i])

# Share of exact matches, and mean absolute difference between the two.
accuracy = num_correct / len(pred)
mean_error = error / len(pred)

print(accuracy)
print(mean_error)

In [None]:
#code to create file in desired format
# Writes the dicts/f_*.yml files that DictionaryTagger loads in other cells.

# TODO: machine-specific absolute path; point this at the local copy of the
# opinion lexicon before running.
LEXICON_DIR = 'C:/Users/Abhay Pawar/Documents/GitHub/data/opinion-lexicon-English'


def _load_word_list(path, tag):
    """Read one word per line from ``path`` and map each word to ``[tag]``."""
    tagged_words = {}
    # `with` closes the handle even if reading fails.
    with open(path) as lexicon_file:
        for line in lexicon_file:
            word = line.strip()
            # Skip blank lines and ';' lines so they do not become entries.
            # NOTE(review): presumably the lexicon files start with a
            # ';'-prefixed header -- confirm against the actual files.
            if not word or word.startswith(';'):
                continue
            tagged_words[word] = [tag]
    return tagged_words


def _dump_dictionary(dictionary, path):
    """Write ``dictionary`` to ``path`` as flow-style YAML."""
    with open(path, 'w') as outfile:
        yaml.dump(dictionary, outfile, default_flow_style=True)


dict_positive = _load_word_list(LEXICON_DIR + '/positive-words.txt', 'positive')
_dump_dictionary(dict_positive, 'dicts/f_positive.yml')

dict_negative = _load_word_list(LEXICON_DIR + '/negative-words.txt', 'negative')
_dump_dictionary(dict_negative, 'dicts/f_negative.yml')

# Incrementers: double the score of the token that follows them.
dict_inc = {
    'too': ['inc'],
    'very': ['inc'],
    'sorely': ['inc'],
    'extremely': ['inc'],
    'really': ['inc'],
}
# Each dictionary goes to the file named after its tag (inc -> f_inc.yml,
# dec -> f_dec.yml), so the file names describe their contents.
_dump_dictionary(dict_inc, 'dicts/f_inc.yml')

# Decrementers: halve the score of the token that follows them.
dict_dec = {
    #'barely': ['dec'],
    'little': ['dec'],
    #'hardly': ['dec'],
}
_dump_dictionary(dict_dec, 'dicts/f_dec.yml')

# Inverters: flip the sign of the score of the token that follows them.
dict_inv = {
    'lack of': ['inv'],
    'not': ['inv'],
    'lack': ['inv'],
}
_dump_dictionary(dict_inv, 'dicts/f_inv.yml')


In [None]:
# End-to-end demo on a hand-written review: split, POS-tag, dictionary-tag,
# then print the modifier-aware positive and negative scores.
text = """What can I say about this place. The staff of the restaurant is nice and the eggplant is not bad. Apart from that, very uninspired food, lack of atmosphere and too expensive. I am a staunch vegetarian and was sorely dissapointed with the veggie options on the menu. Will be the last time I visit, I recommend others to avoid."""

splitter = Splitter()
postagger = POSTagger()

splitted_sentences = splitter.split(text)
pos_tagged_sentences = postagger.pos_tag(splitted_sentences)

dicttagger = DictionaryTagger(['dicts/f_negative.yml','dicts/f_positive.yml','dicts/f_inc.yml','dicts/f_dec.yml','dicts/f_inv.yml'])
dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)
#print(dict_tagged_sentences)
#print(sentiment_score(dict_tagged_sentences))

#print(sentiment_score_basic_pos(dict_tagged_sentences))
#print(sentiment_score_basic_neg(dict_tagged_sentences))
#print(sentiment_score_basic(dict_tagged_sentences))
# Call form of print: same output as the statement form on Python 2, and
# valid on Python 3 as well.
print(sentiment_score_pos(dict_tagged_sentences))
print(sentiment_score_neg(dict_tagged_sentences))
#print(dict_tagged_sentences)

In [None]:

# Positive score of the demo review. print is written as a call so the cell
# runs on both Python 2 and Python 3.
print(sentiment_score_pos(dict_tagged_sentences))

In [None]:
# Inspect the dictionary-tagged demo sentences. print is written as a call so
# the cell runs on both Python 2 and Python 3.
print(dict_tagged_sentences)

In [None]:
# Lazy range over the row positions of `data`.
# NOTE(review): xrange exists only on Python 2; on Python 3 use range here.
x=xrange(0,len(data))
# print written as a call, matching the print(...) calls used elsewhere.
print(x)