In [175]:
import requests
import nltk
import re
from collections import Counter
import numpy as np

In [201]:
r = requests.get("https://github.com/alexmill/techcamp_2017/raw/master/session7/movies.json")
all_movies = r.json()

In [202]:
movies = all_movies[0:10]
titles = [m["title"] for m in movies]
summaries = [m["summary"] for m in movies]

In [203]:
titles

['The Godfather',
 'The Shawshank Redemption',
 "Schindler's List",
 'Raging Bull',
 'Casablanca',
 "One Flew Over the Cuckoo's Nest",
 'Gone with the Wind',
 'Citizen Kane',
 'The Wizard of Oz',
 'Titanic']

In [204]:
def sanitize(raw_string):
    # Removes all non-alphanumeric characters
    s = re.sub(r"[^a-zA-Z0-9]+", ' ', raw_string)
    out = " ".join(s.split()).lower()
    return(out)

In [206]:
cleaned = sanitize(summaries[0])
cleaned[0:100]

'in late summer 1945 guests are gathered for the wedding reception of don vito corleone s daughter co'

In [207]:
def get_tokens(text):
    tokens = nltk.word_tokenize(text)
    return(tokens)

In [208]:
tokens = get_tokens(cleaned)
count = Counter(tokens)
print(count.most_common(10))

[('the', 176), ('to', 98), ('and', 96), ('a', 69), ('s', 64), ('in', 60), ('is', 59), ('his', 59), ('he', 58), ('michael', 56)]


# Important!

Before you proceed, make sure you download the NLTK data that contains English-language stopwords. To do so, run the following commands after import nltk into your Python session:

```python
nltk.download('punkt')
nltk.download('stopwords')
```

In [209]:
from nltk.corpus import stopwords
filtered = [w for w in tokens if not w in stopwords.words('english')]

In [210]:
from nltk.stem import porter
stemmer = porter.PorterStemmer()

def stem_tokens(tokens):
    stemmed = [stemmer.stem(t) for t in tokens]
    return(stemmed)

In [211]:
stemmed = stem_tokens(filtered)

In [212]:
count = Counter(stemmed)
print(count.most_common(10))

[('michael', 56), ('famili', 31), ('corleon', 30), ('sonni', 28), ('carlo', 24), ('conni', 19), ('meet', 18), ('sollozzo', 16), ('kill', 15), ('hagen', 13)]


In [213]:
# All at once!
def pre_process_docs(docs):
    documents = []
    for i, doc in enumerate(docs):
        cleaned = sanitize(doc)
        tokens = get_tokens(cleaned)
        filtered = [w for w in tokens if not w in stopwords.words('english')]
        stemmed = stem_tokens(filtered)
        processed = " ".join(stemmed)
        documents.append(processed)
        print("{: >3} {: <35}: {: <5} tokens".format(i, titles[i], len(stemmed)))
    return(documents)


In [214]:
# Construct set of sanitized/tokenized/stemmed documents
# This might take a while!
documents = pre_process_docs(summaries)

  0 The Godfather                      : 1686  tokens
  1 The Shawshank Redemption           : 1322  tokens
  2 Schindler's List                   : 661   tokens
  3 Raging Bull                        : 477   tokens
  4 Casablanca                         : 490   tokens
  5 One Flew Over the Cuckoo's Nest    : 625   tokens
  6 Gone with the Wind                 : 2225  tokens
  7 Citizen Kane                       : 737   tokens
  8 The Wizard of Oz                   : 588   tokens
  9 Titanic                            : 407   tokens


# Important!

Make sure you install sklearn. (It comes bundled with Anaconda, so no need to re-install if you installed the full distribution.)

```bash
pip install scikit-learn
```


# Term Frequency

Term frequency simply refers to the counts of each word in a given document. In NLP, it is common to represent every document as a vector, with every entry in the vector corresponding to a word in the the vocabulary of your corpus.


# TF-IDF

We sometimes want to characterize documents by the words that are *most* distinctive to those documents, among all the documents in our corpus. Note that any given term is more likely to appear the longer a document is. So sometimes, mere term frequency doesn't tell us everything we need to know about the relationship between a term and its document. In these situations, we use "term frequency, inverse document frequency" or TF-IDF.

$$ \text{score} = \left( \text{# occurrences of } term \text{ in } doc \right) \times \log \frac{\text{# documents}}{\text{# documents containing } term} $$

In [215]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

tf_vectorizer = CountVectorizer(max_features=10000, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

tfidf_vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

In [216]:
# Shape of your data matrix
tf.shape

(10, 2762)

In [None]:
# = (number of documents, number of unique words in your corpus)

In [217]:
# View the raw matrix itself.
# But don't print the whole thing!!!
num_rows = 5
num_words = 20
words = tf_feature_names[0:num_words]
print(words)
tf[0:num_rows, 1:num_words].todense()

['000', '100', '14', '15', '16', '17', '1861', '1862', '1863', '1865', '1866', '1871', '1900', '1912', '1929', '1939', '1940', '1941', '1945', '1947']


matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]])

In [218]:
def display_top(vectorizer, tfidf_result, top_n=10):
    scores = zip(vectorizer.get_feature_names(),
                 np.asarray(tfidf_result.sum(axis=0)).ravel())
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    for item in sorted_scores[:top_n]:
        print("{0:20}: {1}".format(item[0], round(item[1], 2)))

In [219]:
display_top(tf_vectorizer, tf)

scarlett            : 113
andi                : 75
rhett               : 67
michael             : 57
tell                : 56
ashley              : 42
kane                : 42
melani              : 41
dorothi             : 40
red                 : 39


In [220]:
display_top(tfidf_vectorizer, tfidf)

andi                : 0.79
kane                : 0.78
scarlett            : 0.72
rick                : 0.7
dorothi             : 0.69
schindler           : 0.63
mcmurphi            : 0.58
rose                : 0.56
jake                : 0.54
michael             : 0.53


# Similarity

There are many ways to measure "similarity" among word count vectors. A simple and common one is called **cosine similarity**. If $\mathbf{A}$ and $\mathbf{B}$ are two n-dimensional vectors, their cosine simialirty is defined as:

$$ \text{similarity} = \cos(\theta) = {\mathbf{A} \cdot \mathbf{B} \over \|\mathbf{A}\| \|\mathbf{B}\|} = \frac{ \sum\limits_{i=1}^{n}{A_i  B_i} }{ \sqrt{\sum\limits_{i=1}^{n}{A_i^2}}  \sqrt{\sum\limits_{i=1}^{n}{B_i^2}} } $$

One could also think of similarity as the opposite of distance. In this case, we could take a simple distance function, e.g., $n$-dimensional Euclidean distance, and define similarity as it's negative or inverse.

$$ \text{similarity} = - \|\mathbf{A - B}\| $$

In [221]:
# Calculate similarity between documents
from sklearn.metrics.pairwise import cosine_similarity
sim_mat = cosine_similarity(tfidf)

In [222]:
def get_most_similar(movie_title, n=2, similarity_matrix=sim_mat, movie_names=titles):
    focal_index = movie_names.index(movie_title)
    row = similarity_matrix[focal_index,:]
    sorted_row = sorted([m for m in enumerate(row) if m[0]!=focal_index], key=lambda x: x[1], reverse=True)
    print("Most similar movies:")
    for i in range(n):
        entry = sorted_row[i]
        title_index = entry[0]
        title = movie_names[title_index]
        print("{}: {}".format(i+1,title))
    

In [223]:
get_most_similar("The Godfather", n=1)

Most similar movies:
1: Gone with the Wind
