In [1]:
import pandas as pd
import numpy as np
import json
from pprint import pprint

In [2]:
from pymongo import MongoClient

In [17]:
from copy import deepcopy

In [3]:
client = MongoClient()

In [5]:
list(client.list_databases())

[{'name': 'admin', 'sizeOnDisk': 32768.0, 'empty': False},
 {'name': 'config', 'sizeOnDisk': 73728.0, 'empty': False},
 {'name': 'local', 'sizeOnDisk': 73728.0, 'empty': False},
 {'name': 'movies', 'sizeOnDisk': 117374976.0, 'empty': False}]

In [6]:
db = client.movies
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'movies')

In [8]:
db.list_collection_names()

['movies']

In [10]:
movies = db.movies
movies

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'movies'), 'movies')

In [50]:
movies.find_one()

{'_id': ObjectId('5e4ef8ff7a577357a4db83c3'),
 'wikipedia_id': 975900,
 'freebase_id': '/m/03vyhn',
 'title': 'Ghosts of Mars',
 'release_date': '2001-08-24',
 'box_office_revenue': 14010832.0,
 'runtime': 98.0,
 'languages': ['English Language'],
 'countries': ['United States of America'],
 'genres': ['Thriller',
  'Science Fiction',
  'Horror',
  'Adventure',
  'Supernatural',
  'Action',
  'Space western'],
 'summary': 'Set in the second half of the 22nd century, the film depicts Mars as a planet that has been 84% terraformed, allowing humans to walk on the surface without wearing pressure suits. The Martian society has become largely matriarchal, with women in most positions of authority. The story concerns a police officer, Melanie Ballard , second in command of a small team alongside Sergeant Jericho  sent to pick up and transport a prisoner named Desolation Williams . Arriving at the remote mining town where Williams is being held, Ballard finds virtually all of the people missi

In [13]:
movies.count_documents({})

81840

In [49]:
test = movies.find_one()
test

{'_id': ObjectId('5e4ef8ff7a577357a4db83c3'),
 'wikipedia_id': 975900,
 'freebase_id': '/m/03vyhn',
 'title': 'Ghosts of Mars',
 'release_date': '2001-08-24',
 'box_office_revenue': 14010832.0,
 'runtime': 98.0,
 'languages': ['English Language'],
 'countries': ['United States of America'],
 'genres': ['Thriller',
  'Science Fiction',
  'Horror',
  'Adventure',
  'Supernatural',
  'Action',
  'Space western'],
 'summary': 'Set in the second half of the 22nd century, the film depicts Mars as a planet that has been 84% terraformed, allowing humans to walk on the surface without wearing pressure suits. The Martian society has become largely matriarchal, with women in most positions of authority. The story concerns a police officer, Melanie Ballard , second in command of a small team alongside Sergeant Jericho  sent to pick up and transport a prisoner named Desolation Williams . Arriving at the remote mining town where Williams is being held, Ballard finds virtually all of the people missi

True

In [None]:
list(test['languages'].values())

In [31]:
for m in movies.find({}):
    updates = {}
    
    if "languages" in m and isinstance(m['languages'],dict):
        updates['languages'] = list(m['languages'].values())
        
    if "countries" in m and isinstance(m['countries'],dict):
        updates['countries'] = list(m['countries'].values())
        
    if "genres" in m and isinstance(m['genres'],dict):
        updates['genres'] = list(m['genres'].values())
    if len(updates):
        movies.update_one(
            {"_id": m['_id']},
            {"$set": updates}
        )

In [47]:
print("%% w/ languages: %.5f" % (movies.count_documents({"languages":{"$exists":True}}) / movies.count_documents({})))
print("%% w/ countries: %.5f" % (movies.count_documents({"countries":{"$exists":True}}) / movies.count_documents({})))
print("%% w/ genres:    %.5f" % (movies.count_documents({"genres":{"$exists":True}}) / movies.count_documents({})))

% w/ languages: 0.99879
% w/ countries: 0.99879
% w/ genres:    0.99879


In [48]:
list(movies.find({"languages":{"$exists":False}}))

[{'_id': ObjectId('5e4efda326d8a3264deeb769'),
  'wikipedia_id': 2862137,
  'summary': 'Sathyanarayan Murthy  is a rough but kind man who lives in Amalapuram with his niece, mother, sister and her husband. Sathya and his niece Lavanya share a special bond and are the best of friends. One day, Lavanya dies while playing on the swing. Sathya later finds out that she has been murdered by gun dealer Raj Malhotra ([[Arbaaz Khan . He meets and falls in love with Shilaza . Sathyanarayan challenges Raj Malhotra and wants to go the US but cannot get a permanent visa because he was not educated. He meets and marries Neelu , then goes to the US and kills Malhotra.'},
 {'_id': ObjectId('5e4efdad26d8a3264deeb8a1'),
  'wikipedia_id': 33334420,
  'summary': '{{more plot}} Mural tells the story of Zhu Xiaolian, a poor scholar who travels to the capital to take an examination. During this trip, Zhu becomes distracted by a mural and enters into the fantasy land depicted therein. There he meets the fairi

In [61]:
def avg(vals):
    i = 0
    total = 0
    for v in vals:
        i += 1
        total += v
    if i > 0:
        return total / i
    else:
        return None

print("Maximum summary length (in characters)",max(map(
    lambda e: e['summary_length'],
    movies.aggregate([
    {
      "$match": {
          "summary": {"$exists":True}
      }  
    },
    {
        "$project":{
            "summary_length": { "$strLenCP": "$summary" }
        }
    }
]))))
print("Average summary length (in characters)",avg(map(
    lambda e: e['summary_length'],
    movies.aggregate([
    {
      "$match": {
          "summary": {"$exists":True}
      }  
    },
    {
        "$project":{
            "summary_length": { "$strLenCP": "$summary" }
        }
    }
]))))
print("Minimum summary length (in characters)",min(map(
    lambda e: e['summary_length'],
    movies.aggregate([
    {
      "$match": {
          "summary": {"$exists":True}
      }  
    },
    {
        "$project":{
            "summary_length": { "$strLenCP": "$summary" }
        }
    }
]))))

Maximum summary length (in characters) 28159
Average summary length (in characters) 1784.0342292508806
Minimum summary length (in characters) 99


***

Now let's try to analyze the summaries

In [81]:
movies.count_documents({"summary":{"$exists":False}})

39537

In [63]:
movies.find_one({})

{'_id': ObjectId('5e4ef8ff7a577357a4db83c3'),
 'wikipedia_id': 975900,
 'freebase_id': '/m/03vyhn',
 'title': 'Ghosts of Mars',
 'release_date': '2001-08-24',
 'box_office_revenue': 14010832.0,
 'runtime': 98.0,
 'languages': ['English Language'],
 'countries': ['United States of America'],
 'genres': ['Thriller',
  'Science Fiction',
  'Horror',
  'Adventure',
  'Supernatural',
  'Action',
  'Space western'],
 'summary': 'Set in the second half of the 22nd century, the film depicts Mars as a planet that has been 84% terraformed, allowing humans to walk on the surface without wearing pressure suits. The Martian society has become largely matriarchal, with women in most positions of authority. The story concerns a police officer, Melanie Ballard , second in command of a small team alongside Sergeant Jericho  sent to pick up and transport a prisoner named Desolation Williams . Arriving at the remote mining town where Williams is being held, Ballard finds virtually all of the people missi

In [68]:
next(movies.find({},{"_id":0,"wikipedia_id":1,"summary":1,"title":1}))

{'wikipedia_id': 975900,
 'title': 'Ghosts of Mars',
 'summary': 'Set in the second half of the 22nd century, the film depicts Mars as a planet that has been 84% terraformed, allowing humans to walk on the surface without wearing pressure suits. The Martian society has become largely matriarchal, with women in most positions of authority. The story concerns a police officer, Melanie Ballard , second in command of a small team alongside Sergeant Jericho  sent to pick up and transport a prisoner named Desolation Williams . Arriving at the remote mining town where Williams is being held, Ballard finds virtually all of the people missing. She learns that the miners had discovered an underground doorway created by an ancient Martian civilization. When the door was opened it released "ghosts," disembodied spirits which possessed the miners. Violence ensues, as the possessed miners commit horrific acts of death and destruction, as well as self-mutilation. With their team leader Helena Bradock

In [82]:
summary_df = pd.DataFrame(movies.find({"summary":{"$exists":True}},{"_id":0,"wikipedia_id":1,"summary":1,"title":1}))
summary_df = summary_df[["wikipedia_id","title","summary"]]
summary_df.head()

Unnamed: 0,wikipedia_id,title,summary
0,975900,Ghosts of Mars,"Set in the second half of the 22nd century, th..."
1,9363483,White Of The Eye,A series of murders of rich young women throug...
2,261236,A Woman in Flames,"Eva, an upper class housewife, becomes frustra..."
3,18998739,The Sorcerer's Apprentice,"Every hundred years, the evil Morgana returns..."
4,6631279,Little city,"Adam, a San Francisco-based artist who works a..."


In [83]:
summary_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42303 entries, 0 to 42302
Data columns (total 3 columns):
wikipedia_id    42303 non-null int64
title           42204 non-null object
summary         42303 non-null object
dtypes: int64(1), object(2)
memory usage: 991.6+ KB


In [84]:
import string

import nltk
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, NMF, LatentDirichletAllocation

In [221]:
ls

[1m[36mdata[m[m/                   movie_lens_setup.ipynb  [1m[36mpickles[m[m/
mongo_movies.ipynb      names.csv               [1m[36mx_ideas[m[m/
movie_data_eda.ipynb    names.json


In [85]:
with open("names.json") as f:
    name_list = json.loads(f.read())
name_list[:5]

['aaban', 'aabha', 'aabid', 'aabriella', 'aadam']

In [94]:
"–" in string.punctuation

False

In [180]:
letters = list("abcdefg")
n = 4
for i in range(len(letters)-n+1):
    print(" ".join(letters[i:i+n]))

a b c d
b c d e
c d e f
d e f g


In [199]:
import re

In [206]:
re.sub(r"[^\w\s]",""," Test 123 - ' a,.'").strip()

'Test 123   a'

In [200]:
r = re.compile(r"[^\w\s]")
r

re.compile(r'[^\w\s]', re.UNICODE)

In [207]:
r.sub(""," Test 123 - ' a,.'").strip()

'Test 123   a'

In [217]:
class WordCleaner:
    def __init__(self,ngram_range=(1,1)):
        self.ngram_range = ngram_range
        self.stemmer = SnowballStemmer("english")
        self.stop_words = set(nltk.corpus.stopwords.words("english"))
        self.names = set(name_list)
        self.punctuation = set(string.punctuation).union({"’","–","“","”","—","—"})
        self.digits = set(string.digits)
        
        self.BAD_CHARS = re.compile(r"[^\w\s]")
        self.EXTRA_SPACE = re.compile(r"\s+")
        return
    
    def __call__(self,text):
        return list(self.to_ngram(text))
    
    def sent_gen(self,text):
        for w in nltk.word_tokenize(text):
            w = self.BAD_CHARS.sub("",w).strip()
            w = self.EXTRA_SPACE.sub(" ",w)
            if w in self.stop_words or w in self.names: 
                continue
            sw = self.stemmer.stem(w)
            if sw in self.stop_words or w in self.names: 
                continue
            if not w: continue
            yield sw
    
    def to_ngram(self,text):
        ng_start, ng_end = self.ngram_range
        words = list(self.sent_gen(text))
        for n in range(ng_start,ng_end+1):
            for ng in self.make_ngram(words,n):
                yield ng
    
    def make_ngram(self,word_list,n):
        for i in range(len(word_list)-(n-1)):
            yield " ".join(word_list[i:i+n])
            
wc = WordCleaner((1,1))
wc(summary_df['summary'][0])[-10:]

['wrong',
 'year',
 'year ago',
 'year later',
 'year old',
 'yearold',
 'yet',
 'youth',
 'zombi',
 '—']

In [152]:
tfidf = TfidfVectorizer(
    tokenizer=wc,
    max_features=1000
)
tfidf_summ = tfidf.fit_transform(summary_df['summary'])
tfidf_summ

<42303x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 2570625 stored elements in Compressed Sparse Row format>

In [153]:
tfidf.vocabulary_

{'second': 759,
 'half': 396,
 'film': 335,
 'planet': 631,
 'allow': 31,
 'human': 426,
 'walk': 950,
 'without': 977,
 'wear': 959,
 'suit': 849,
 'societi': 810,
 'becom': 90,
 'larg': 480,
 'women': 979,
 'posit': 644,
 'author': 76,
 'concern': 187,
 'polic': 638,
 'polic offic': 639,
 'offic': 584,
 'command': 179,
 'small': 807,
 'team': 871,
 'sent': 770,
 'pick': 625,
 'transport': 906,
 'prison': 656,
 'name': 558,
 'arriv': 61,
 'mine': 539,
 'town': 902,
 'held': 404,
 'find': 339,
 'peopl': 617,
 'miss': 542,
 'learn': 488,
 'discov': 257,
 'creat': 214,
 'door': 264,
 'open': 592,
 'releas': 703,
 'ghost': 373,
 'possess': 645,
 'ensu': 294,
 'commit': 180,
 'act': 9,
 'death': 233,
 'well': 963,
 'leader': 487,
 'murder': 555,
 'must': 556,
 'fight': 332,
 'attack': 69,
 'escap': 297,
 'destroy': 247,
 'possibl': 646,
 'unfortun': 930,
 'intent': 451,
 'fact': 320,
 'kill': 470,
 'anoth': 45,
 'eventu': 300,
 'decid': 234,
 'blow': 109,
 'tri': 911,
 'sever': 779,
 'poin

In [154]:
tfidf_summ.shape

(42303, 1000)

In [155]:
words = np.array(tfidf.get_feature_names())

In [156]:
nmf = NMF(
    100
)
nmf_words = nmf.fit_transform(tfidf_summ.T)
print("NMF Shape: %d words x %d categories" % nmf_words.shape)

(1000, 100)

In [157]:
n_words = 10
for i in range(nmf.n_components):
    print(f"Topic {i:2d}: " + ", ".join(words[nmf_words[:,i].argsort()[::-1][:n_words]] ))

Topic  0: tri, tri get, keep, save, convinc, stop, fail, desper, meanwhil, struggl
Topic  1: escap, plan, attempt, captur, discov, rescu, howev, manag, arriv, reveal
Topic  2: film, follow, director, scene, featur, base, interview, narrat, includ, film end
Topic  3: marriag, arrang, wed, propos, accept, agre, love, divorc, engag, also
Topic  4: murder, investig, detect, suspect, crime, wit, accus, found, evid, involv
Topic  5: school, high school, high, teacher, class, student, attend, board, teach, miss
Topic  6: villag, small, mountain, land, nativ, teacher, return, water, fish, reach
Topic  7: father, wish, dad, die, want, also, childhood, learn, never, whose
Topic  8: death, die, dead, accid, bodi, mysteri, sentenc, investig, respons, funer
Topic  9: cite, web, cite web, news, base, book, travel, polit, follow, teenag
Topic 10: famili, member, support, problem, relat, cousin, poor, busi, grandfath, move
Topic 11: human, use, planet, robot, scientist, destroy, space, machin, anim, c

In [158]:
nmf_words = pd.Series(
    [", ".join(words[nmf_words[:,i].argsort()[::-1][:n_words]]) for i in range(nmf.n_components)]
)
nmf_words.head()

0    tri, tri get, keep, save, convinc, stop, fail,...
1    escap, plan, attempt, captur, discov, rescu, h...
2    film, follow, director, scene, featur, base, i...
3    marriag, arrang, wed, propos, accept, agre, lo...
4    murder, investig, detect, suspect, crime, wit,...
dtype: object

In [159]:
import re

In [160]:
nmf_words[nmf_words.apply(re.compile(r"[^,] ").search).notna()]

0     tri, tri get, keep, save, convinc, stop, fail,...
2     film, follow, director, scene, featur, base, i...
5     school, high school, high, teacher, class, stu...
9     cite, web, cite web, news, base, book, travel,...
13    polic, inspector, arrest, crimin, polic offic,...
15    come, know, come back, start, across, also, lo...
17    war, world war, nazi, second, resist, join, co...
26    back, bring, go, get back, come back, go back,...
32    new, new citi, move, career, learn, includ, me...
34    year, old, year old, year later, later, ago, y...
37    end, film end, eventu, final, turn, first, wel...
41    home, return, return home, back home, visit, a...
46    men, two men, four, gun, shoot, shot, wound, o...
48    take, place, take place, reveng, part, turn, a...
49    get, start, goe, get marri, tri get, thing, tr...
52    one, night, one night, anoth, last, seem, pers...
55    citi, new citi, big, street, build, search, lo...
56    two, two men, togeth, sister, lover, diffe

In [161]:
nmf_labels = pd.Series(
    nmf.components_.argmax(0)
)
nmf_labels.head()

0    11
1    27
2    15
3    34
4    20
dtype: int64

In [162]:
nmf_movies = pd.DataFrame({
    "category": nmf_labels,
    "label": nmf_words.iloc[nmf_labels].reset_index(drop=True),
    "title": summary_df.title,
    "summary": summary_df['summary']
}).sort_values("category")
nmf_movies.head()

Unnamed: 0,category,label,title,summary
19290,0,"tri, tri get, keep, save, convinc, stop, fail,...",Calendar Girl Murders,The plot revolves around the relationship betw...
38228,0,"tri, tri get, keep, save, convinc, stop, fail,...",Dil Ka Rishta,Dil Ka Rishta is about a young man named Jai ....
5775,0,"tri, tri get, keep, save, convinc, stop, fail,...",Penelope,{{plot}} Penelope Elcott is the wife of wealt...
5771,0,"tri, tri get, keep, save, convinc, stop, fail,...",Five Fingers,"Martijn , a gifted Dutch jazz pianist, flies t..."
29146,0,"tri, tri get, keep, save, convinc, stop, fail,...",I Can Hardly Wait,"The Stooges, who are defense workers, prepare ..."


In [163]:
nmf_movies.groupby("category").head(5)

Unnamed: 0,category,label,title,summary
19290,0,"tri, tri get, keep, save, convinc, stop, fail,...",Calendar Girl Murders,The plot revolves around the relationship betw...
38228,0,"tri, tri get, keep, save, convinc, stop, fail,...",Dil Ka Rishta,Dil Ka Rishta is about a young man named Jai ....
5775,0,"tri, tri get, keep, save, convinc, stop, fail,...",Penelope,{{plot}} Penelope Elcott is the wife of wealt...
5771,0,"tri, tri get, keep, save, convinc, stop, fail,...",Five Fingers,"Martijn , a gifted Dutch jazz pianist, flies t..."
29146,0,"tri, tri get, keep, save, convinc, stop, fail,...",I Can Hardly Wait,"The Stooges, who are defense workers, prepare ..."
27270,1,"escap, plan, attempt, captur, discov, rescu, h...",Highlander: The Final Dimension,16th century Some time after the death of his ...
32073,1,"escap, plan, attempt, captur, discov, rescu, h...",Naruto the Movie: Snow Princess' Book of Ninja...,"The film opens with Naruto Uzumaki, Sakura Har..."
22041,1,"escap, plan, attempt, captur, discov, rescu, h...",Rob Roy,Robert Roy MacGregor is a cattle drover and t...
34195,1,"escap, plan, attempt, captur, discov, rescu, h...",The Lives of a Bengal Lancer,On the northwest frontier of India during the ...
41676,1,"escap, plan, attempt, captur, discov, rescu, h...",British Intelligence,A master German spy named Franz Strendler has ...
