In [None]:
import numpy as np
import pandas as pd

from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer

## Defining the preprocessing methods

In [None]:
# Shared preprocessing helpers, created once and reused by the cells below:
# a Porter stemmer and a Treebank word tokenizer.
stemmer = PorterStemmer()
tokenizer = TreebankWordTokenizer()

In [None]:
# Try the Treebank tokenizer on a short sample sentence; the returned tokens
# are displayed as the cell output.
tokenizer.tokenize("The input text.")


In [None]:
# Try the Porter stemmer on a single sample word; the stem is displayed as
# the cell output.
stemmer.stem("documents")


In [None]:

# The stop-word list is a separate NLTK resource. When it has not been
# downloaded yet, the lookup raises LookupError; fetch it once and retry so
# the notebook runs on a fresh environment without editing this cell by hand.
import nltk

try:
    stop_words = stopwords.words("english")
except LookupError:
    nltk.download("stopwords")
    stop_words = stopwords.words("english")

# Show the first 100 English stop words as the cell output
stop_words[:100]

## Setting up the "corpus"

In [None]:
# Four-sentence toy corpus, kept as one newline-separated string
# (no trailing newline after the last sentence).
sentences = "\n".join([
    "Thomas Jefferson began building Monticello at the age of 26.",
    "Construction was done mostly by local masons and carpenters.",
    "He moved into the South Pavilion in 1770.",
    "Turning Monticello into a neoclassical masterpiece was Jefferson's obsession.",
])

# Case-folded copy of the whole corpus, displayed as the cell output
sentence = sentences.lower()
sentence

# Bag of words representation

Let us compute the bag-of-words (BoW) representation for our toy corpus.

In [None]:
# Build a binary bag of words: one entry per sentence, keyed "sent<i>",
# whose value maps every stem occurring in that sentence to 1.
corpus = {}
for i, sent in enumerate(sentences.split('\n')):
    sentence = sent.lower()                    # case folding
    tokens = tokenizer.tokenize(sentence)      # tokenisation
    stems = list(map(stemmer.stem, tokens))    # stemming

    # Repeated stems collapse into a single key, so this records presence only
    corpus['sent' + str(i)] = dict.fromkeys(stems, 1)

print(corpus)


In [None]:
# Turn the nested dict into a table of integer flags: a stem missing from a
# sentence is filled with 0 before the cast, then the frame is transposed.
df = (
    pd.DataFrame.from_records(corpus)
    .fillna(0)
    .astype(int)
    .T
)

# To inspect only the first ten columns: df[df.columns[:10]]

print(df)


## Dot product


In [None]:
# Two small example vectors
v1 = np.array([1, 2, 3])
v2 = np.array([2, 4, 6])

# Dot product by hand: accumulate the pairwise products and print the
# running total after each pair.
sum_dot = 0
for left, right in zip(v1, v2):
    sum_dot += left * right
    print(sum_dot)

print("Result", sum_dot)


In [None]:

# Vectorised version: one element-wise product followed by a single sum,
# with no explicit Python loop.
dot = np.sum(v1 * v2)
print(dot)

In [None]:
# The same dot product again, this time through the array's own `dot` method
v1.dot(v2)

The dot product can be used to measure the overlap between two documents.

In [None]:
# The sentence vectors must be columns so that `df.sent0` works in the cells
# below. A bare `df = df.T` flips the frame back every time this cell is
# re-run, so transpose only while the sentence labels are still on the rows.
if "sent0" in df.index:
    df = df.T

# How can I print it?
print(df)

In [None]:
# Dot product of the sent0 and sent1 columns. The entries are 0/1 flags,
# so the result counts the stems present in both sentences.
df.sent0.dot(df.sent1)


In [None]:
# Same overlap count, now between sent0 and sent2
df.sent0.dot(df.sent2)

In [None]:
# Same overlap count, now between sent0 and sent3
df.sent0.dot(df.sent3)


In [None]:
# Where do these numbers come from?
print(sentences)

# AND the two sentence columns element-wise and list the entries that are
# set in both of them.
shared = df.sent0 & df.sent3
[(stem, flag) for stem, flag in shared.items() if flag]

This is your first **vector space model**!




# Rule-based sentiment analysis

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Rule-based analyzer; its `lexicon` pairs each token with a score
sa = SentimentIntensityAnalyzer()

# The full lexicon is available as sa.lexicon; here we only look at the
# entries whose token starts with "c".
[entry for entry in sa.lexicon.items() if entry[0].startswith("c")]

In [None]:
# Look for multi-word entries: lexicon tokens that contain a space
[entry for entry in sa.lexicon.items() if " " in entry[0]]


In [None]:
# Finally, let's score!!
# polarity_scores returns the analyzer's scores for the given text; they are
# displayed as the cell output.
sa.polarity_scores(text="Python is very readable and it's great for NLP.")


In [None]:
# Score a sentence that combines a negation with a negative word ("not a bad choice")
sa.polarity_scores(text="Python is not a bad choice for most applications.")


In [None]:
# Three short reviews. Note that this rebinds `corpus`, which held the
# bag-of-words dict earlier in the notebook.
corpus = [
    "Absolutely perfect! Love it! :-) :-) :-)",
    "Horrible! Completely useless. :(",
    "It was OK. Some good and some bad things.",
]

# Print the signed compound score in front of each review
for doc, scores in zip(corpus, map(sa.polarity_scores, corpus)):
    print('{:+}: {}'.format(scores['compound'], doc))


In [None]:
# Scoring the Amazon review

# NOTE(review): the literal starts with an extra `"` and ends with `")`.
# This looks like a paste artifact, but the text is left unchanged so the
# scores are not affected.
text = """"This monitor is denitely a good value. Does it have superb color and 
contrast? No. Does it boast the best refresh rate on the market? No. 
But if you're tight on money, this thing looks and preforms great for the money. 
It has a Matte screen which does a great job at eliminating glare. The chassis it's enclosed 
within is absolutely stunning.")"""

# Split once and reuse. The word count is printed explicitly because a bare
# expression in the middle of a cell is never displayed.
words = text.split()
print(len(words))

# Score growing prefixes of the review, once as-is and once repeated three
# times, to compare the scores of the same content at different lengths.
for i in [10, 20, 45, 60]:
    t = " ".join(words[:i])
    print(i, "\t", t)
    print("ONE TIME", sa.polarity_scores(t))
    print("THREE TIME", sa.polarity_scores(" ".join([t, t, t])))
    print()  # blank line between prefixes (a bare `print` is a no-op in Python 3)


In [None]:
# Compare the scores of a negated phrase with and without the trailing "at all"
for phrase in ("this is not good", "this is not good at all"):
    print(sa.polarity_scores(phrase))

In [None]:
# Scoring the tweet: informal, profane text with a missing apostrophe ("didnt").
# The scores are displayed as the cell output.
sa.polarity_scores("His ass didnt concede until July 12, 2016. Because he was throwing a tantrum. I can't say this enough: Fuck Bernie Sanders")