# Tokenizing Text

In [1]:
import os
# Switch the working directory to the project's helper folder before the import.
# NOTE(review): presumably this is what makes the local `tokenizers` module
# importable in this session -- confirm.
os.chdir('/Users/Bya/git/predictEPL/MyFunctions/')
import tokenizers

In [18]:
# Sample sentence to tokenize.
sampleText = "I love you, Munjuu. I want you"
# Split the sentence into tokens with the project's own tokenizer.
tokenizedText = tokenizers.bya_token(sampleText)

# Show the raw sentence next to the resulting tokens.
print("Before Tokenize: \n\n %s \n" % sampleText)
print("After Tokenizer: \n\n %s \n" % tokenizedText)

Before Tokenize: 

 I love you, Munjuu. I want you 

After Tokenizer: 

 ['I', 'love', 'you', 'Munjuu', 'I', 'want', 'you'] 



# Part-of-Speech Tagging

In [19]:
# load the tagger
import pickle

# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
with open('/Users/Bya/nltk_data/taggers/treebank_aubt.pickle', 'rb') as f:
    tagger_aubt = pickle.load(f)

In [20]:
# Tag every token; the printed result is a list of (token, tag) pairs.
taggedAUBTwords = tagger_aubt.tag(tokenizedText)

print("AUBT tagger: \n\n %s \n" % taggedAUBTwords)

AUBT tagger: 

 [('I', 'PRP'), ('love', 'VB'), ('you', 'PRP'), ('Munjuu', 'NN'), ('I', 'PRP'), ('want', 'VBP'), ('you', 'PRP')] 



# SentiWordNet

In [21]:
def wordnet_sanitize(word):
    """
    Normalize a tagged token into the (string, pos) form WordNet expects.

    Argument: word (str, str) -- a (string, tag) pair

    Value: (lowercased string, pos), where pos is one of 'a', 'n', 'r',
    'v', or None when the tag has no WordNet counterpart.
    """
    # Lowercased tag prefixes, checked in order, and the WordNet POS letter
    # each one maps to.
    prefix_to_pos = (('v', 'v'), ('n', 'n'), ('j', 'a'), ('rb', 'r'))

    text, raw_tag = word
    text = text.lower()
    lowered = raw_tag.lower()

    for prefix, wn_pos in prefix_to_pos:
        if lowered.startswith(prefix):
            return (text, wn_pos)

    # A tag that already is a bare WordNet letter passes through unchanged.
    if lowered in ('a', 'n', 'r', 'v'):
        return (text, lowered)
    return (text, None)

# Convert every tagged token into its WordNet-compatible (string, pos) pair.
wordsWNtag = [wordnet_sanitize(tagged) for tagged in taggedAUBTwords]

print("With WordNet tags: \n \n %s \n" % wordsWNtag)

With WordNet tags: 
 
 [('i', None), ('love', 'v'), ('you', None), ('munjuu', 'n'), ('i', None), ('want', 'v'), ('you', None)] 



In [23]:
from nltk.corpus import sentiwordnet as swn

def senti_word_net(word):
    """
    Average the SentiWordNet scores over every sense found for a word.

    Argument: word (str, pos) -- a (string, pos) pair; pos may be None

    Value: (positive, negative, objective) mean scores, or
    (0.0, 0.0, 0.0) when pos is None or no synsets are found.
    """
    lemma, pos = word

    # Without a WordNet part of speech there is nothing to look up.
    if pos is None:
        return 0.0, 0.0, 0.0

    synsets = list(swn.senti_synsets(lemma, pos))
    count = len(synsets)
    if not count:
        return 0.0, 0.0, 0.0

    pos_total = 0.0
    neg_total = 0.0
    obj_total = 0.0
    for synset in synsets:
        pos_total += synset.pos_score()
        neg_total += synset.neg_score()
        obj_total += synset.obj_score()

    return pos_total / count, neg_total / count, obj_total / count

def pos_neg_score(words):
    """
    Average the SentiWordNet scores over the counted words of a sentence.

    Argument: words -- iterable of (string, pos) pairs, each passed to
    senti_word_net

    Value: (positive, negative, objective) mean scores over the words whose
    positive score is non-zero, or (0.0, 0.0, 0.0) when no word qualifies
    (including empty input).
    """
    pos_score = 0.0
    neg_score = 0.0
    obj_score = 0.0

    senti_words = 0

    for word in words:
        p, n, o = senti_word_net(word)

        # Only words with a non-zero positive score are counted.
        # NOTE(review): words that are purely negative are skipped here --
        # confirm this is intended.
        if p:
            pos_score += p
            neg_score += n
            obj_score += o

            senti_words += 1

    # Avoid a ZeroDivisionError when nothing was counted.
    if not senti_words:
        return 0.0, 0.0, 0.0

    return pos_score/senti_words, neg_score/senti_words, obj_score/senti_words

# Mean (positive, negative, objective) scores for the sample sentence.
pos_neg_score(wordsWNtag)

(0.35, 0.090625, 0.559375)