## Sentiment Analysis

In [1]:
import nltk

def format_sentence(sent):
    return({word: True for word in nltk.word_tokenize(sent)})

print(nltk.word_tokenize("Batman is also cute!"))

['Batman', 'is', 'also', 'cute', '!']


In [2]:
#with open("nlp_data/pos_tweets.txt") as f:
#    print([i for i in f])

#with open("nlp_data/neg_tweets.txt") as f:
#    for i in f:
#        print(i)

In [3]:
# positive tweets
pos = []
with open("nlp_data/pos_tweets.txt") as f:
    for i in f:
        pos.append([format_sentence(i), 'pos']) # labelling pos
#print(pos)
len(pos)

617

In [4]:
# negative tweets
neg = []
with open("nlp_data/neg_tweets.txt") as f:
    for i in f:
        neg.append([format_sentence(i), 'neg']) # labelling neg
#print(neg)
len(neg)

1387

In [5]:
# Training data
training = pos[:int((0.9)*len(pos))] + neg[:int((0.9)*len(neg))]

In [6]:
testing = pos[int((0.1)*len(pos)):] + neg[int((0.1)*len(neg)):]

In [7]:
print(len(training))
print(len(testing))

1803
1805


In [8]:
from nltk.classify import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(training)

In [9]:
classifier.show_most_informative_features()

Most Informative Features
                      no = True              neg : pos    =     20.6 : 1.0
                 awesome = True              pos : neg    =     18.7 : 1.0
                headache = True              neg : pos    =     18.0 : 1.0
               beautiful = True              pos : neg    =     14.2 : 1.0
                    love = True              pos : neg    =     14.2 : 1.0
                      Hi = True              pos : neg    =     12.7 : 1.0
                    glad = True              pos : neg    =      9.7 : 1.0
                   Thank = True              pos : neg    =      9.7 : 1.0
                     fan = True              pos : neg    =      9.7 : 1.0
                    lost = True              neg : pos    =      9.3 : 1.0


In [10]:
example0 = "I hate that bitch!"
print(classifier.classify(format_sentence(example0)))

neg


In [11]:
example1 = "I have no headache"
print(classifier.classify(format_sentence(example1)))

neg


In [12]:
from nltk.classify.util import accuracy
print(accuracy(classifier, testing))

0.9562326869806094


### Case Sensitivity

In [13]:
import re
rel = re.compile('python')
print(bool(rel.match('Python')))

False


### NLTK Parts of Speech Tagger

In [14]:
import nltk

text = nltk.word_tokenize("I saw two sexy girls walking on the beach.")
print(text[2])
nltk.pos_tag(text)

two


[('I', 'PRP'),
 ('saw', 'VBD'),
 ('two', 'CD'),
 ('sexy', 'JJ'),
 ('girls', 'NNS'),
 ('walking', 'VBG'),
 ('on', 'IN'),
 ('the', 'DT'),
 ('beach', 'NN'),
 ('.', '.')]

In [15]:
nltk.help.upenn_tagset('CD')

CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...


### Unigram Models

In [16]:
from nltk.corpus import brown

brown_tagged_sents = brown.tagged_sents(categories='news')
#print(brown_tagged_sents)
brown_sents = brown.sents(categories='news')
#print(brown_sents)

unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
unigram_tagger.tag(brown_sents[2007])

[('Various', 'JJ'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('apartments', 'NNS'),
 ('are', 'BER'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('terrace', 'NN'),
 ('type', 'NN'),
 (',', ','),
 ('being', 'BEG'),
 ('on', 'IN'),
 ('the', 'AT'),
 ('ground', 'NN'),
 ('floor', 'NN'),
 ('so', 'QL'),
 ('that', 'CS'),
 ('entrance', 'NN'),
 ('is', 'BEZ'),
 ('direct', 'JJ'),
 ('.', '.')]

### Bigram Models

In [17]:
bigram_tagger = nltk.BigramTagger(brown_tagged_sents)
bigram_tagger.tag(brown_sents[2007])

[('Various', 'JJ'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('apartments', 'NNS'),
 ('are', 'BER'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('terrace', 'NN'),
 ('type', 'NN'),
 (',', ','),
 ('being', 'BEG'),
 ('on', 'IN'),
 ('the', 'AT'),
 ('ground', 'NN'),
 ('floor', 'NN'),
 ('so', 'CS'),
 ('that', 'CS'),
 ('entrance', 'NN'),
 ('is', 'BEZ'),
 ('direct', 'JJ'),
 ('.', '.')]

## Normalizing Text

In [18]:
raw = "OMG, Natural Language Processing is so cool man! I am Awesome."
tokens = nltk.word_tokenize(raw)
tokens = [i.lower() for i in tokens]
print(tokens)

['omg', ',', 'natural', 'language', 'processing', 'is', 'so', 'cool', 'man', '!', 'i', 'am', 'awesome', '.']


## Stemming

In [20]:
lancaster = nltk.LancasterStemmer()
stems = [lancaster.stem(i) for i in tokens]
print(stems)

['omg', ',', 'nat', 'langu', 'process', 'is', 'so', 'cool', 'man', '!', 'i', 'am', 'awesom', '.']


In [22]:
porter = nltk.PorterStemmer()
stem = [porter.stem(i) for i in tokens]
print(stem)

['omg', ',', 'natur', 'languag', 'process', 'is', 'so', 'cool', 'man', '!', 'i', 'am', 'awesom', '.']


## Lemmatization

In [35]:
from nltk import WordNetLemmatizer

lemma = nltk.WordNetLemmatizer()
text = "Women in technologies are sexy and amusing at coding too."
ex = [i.lower() for i in text.split()]

lemmas = [lemma.lemmatize(i) for i in ex]
print(lemmas)

['woman', 'in', 'technology', 'are', 'sexy', 'and', 'amusing', 'at', 'coding', 'too.']
