In [46]:
import os
import json
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# import cpi
import pymongo
from tqdm import tqdm
from scipy import stats
import math
import numpy as np
import pandas as pd
import json

In [2]:
# Connect to the MongoDB server on localhost and resolve the `imdb_cleaned`
# collection of the `imdb` database; `imdb` is the handle every later cell uses.
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
db = myclient.get_database("imdb")
imdb = db.get_collection('imdb_cleaned')

In [3]:
# Count every document in the collection (an empty filter matches all of them).
num_movies = imdb.count_documents({})
num_movies

8181

In [4]:
def getMovie(imdb_id):
    """Fetch a single movie document by its IMDb identifier.

    Args:
        imdb_id: Value matched against the document's ``imdb_id`` field,
            e.g. ``'tt2406422'``.

    Returns:
        The first matching document, or ``None`` when nothing matches.
    """
    # find_one() is the library call for "first match or None"; it replaces
    # returning out of a `for x in imdb.find(...)` loop, which abandoned a
    # partially consumed cursor.
    return imdb.find_one({'imdb_id': imdb_id})

In [260]:
# Load one known document to use as a sample input for the helpers below.
# test_movie = getMovie('tt0371746')  # alternative sample id
test_movie = getMovie('tt2406422')
test_movie

{'_id': ObjectId('60a66671ef235d48e48a2710'),
 '@context': 'http://schema.org',
 '@type': 'Movie',
 'url': '/title/tt2406422/',
 'name': 'Walking the Camino: Six Ways to Santiago',
 'image': 'https://m.media-amazon.com/images/M/MV5BMTU4ODIyNTc4Ml5BMl5BanBnXkFtZTgwMTg0ODM3MDE@._V1_.jpg',
 'genre': 'Documentary',
 'contentRating': 'Not Rated',
 'actor': [{'@type': 'Person',
   'url': '/name/nm12387145/',
   'name': "Annie O'Neil"},
  {'@type': 'Person', 'url': '/name/nm12387146/', 'name': 'Anna-Marie Misser'},
  {'@type': 'Person', 'url': '/name/nm12387147/', 'name': 'Tomas Moreno'},
  {'@type': 'Person', 'url': '/name/nm8681670/', 'name': 'Wayne Emde'}],
 'director': {'@type': 'Person',
  'url': '/name/nm0809166/',
  'name': 'Lydia B. Smith'},
 'description': "Walking the Camino: Six Ways to Santiago is a movie starring Annie O'Neil, Anna-Marie Misser, and Tomas Moreno. A profile of six pilgrims taking the Camino De Santiago pilgrimage.",
 'datePublished': '2013-02-27',
 'keywords': 'pi

In [333]:
def getInformation(movie):
    """Extract the per-movie figures consumed by the feature aggregation.

    Args:
        movie: Movie document (dict) providing ``imdb_id``, ``cleaned_year``,
            ``cleaned_Revenue``, ``cleaned_Budget`` and an ``aggregateRating``
            sub-dict with ``ratingValue`` and ``ratingCount``.

    Returns:
        dict with ``movie_id``, ``year`` (int), ``rating`` (float),
        ``num_raters`` (int), plus ``revenue`` and ``budget`` passed through
        unchanged.

    Raises:
        KeyError: if one of the fields listed above is missing.
    """
    # Read every field first, then convert, so a missing key is reported
    # before any conversion error.
    release_year = movie['cleaned_year']
    aggregate = movie['aggregateRating']
    score = aggregate['ratingValue']
    votes = aggregate['ratingCount']
    gross = movie['cleaned_Revenue']
    cost = movie['cleaned_Budget']

    summary = {'movie_id': movie['imdb_id']}
    summary['year'] = int(release_year)
    summary['rating'] = float(score)
    summary['num_raters'] = int(votes)
    summary['revenue'] = gross
    summary['budget'] = cost
    return summary

In [334]:
# Sanity check: show the record extracted from the sample document.
getInformation(test_movie)

{'movie_id': 'tt2406422',
 'year': 2013,
 'rating': 7.2,
 'num_raters': 297,
 'revenue': 1254162,
 'budget': 416618}

In [335]:
def updateDict(dct, info):
    """Add one movie's figures to the running totals of its release year.

    Args:
        dct: Mapping of year -> totals dict with the counters ``num_movies``,
            ``sum_rating``, ``num_rating_givers``, ``sum_revenue`` and
            ``sum_budget``. Updated in place.
        info: Per-movie record with ``year``, ``rating``, ``num_raters``,
            ``revenue`` and ``budget``.

    Raises:
        KeyError: if ``info['year']`` has no entry in ``dct``.
    """
    year_totals = dct[info['year']]
    year_totals['num_movies'] += 1

    # (counter in the totals dict, field of the movie record that feeds it)
    contributions = (
        ('sum_rating', 'rating'),
        ('num_rating_givers', 'num_raters'),
        ('sum_revenue', 'revenue'),
        ('sum_budget', 'budget'),
    )
    for counter, field in contributions:
        year_totals[counter] += info[field]

In [336]:
def getEntityID(entity):
    """Return the identifier embedded in an entity's ``url``.

    The identifier is the last path segment of the URL, e.g.
    ``'/name/nm0905579/'`` -> ``'nm0905579'``. A query string is ignored and
    the trailing slash is optional.

    Args:
        entity: dict with a ``url`` entry such as ``'/name/nm0905579/'``.

    Returns:
        str: the last path segment of the URL.

    Raises:
        KeyError: if ``entity`` has no ``url`` key.
    """
    url_path = entity['url'].split('?')[0]
    # Drop one trailing slash and take the last segment. Indexing [-2] on the
    # raw split returned the parent segment ('name') whenever the URL had no
    # trailing slash.
    if url_path.endswith('/'):
        url_path = url_path[:-1]
    return url_path.split('/')[-1]

In [337]:
# Sample person entity used to check getEntityID.
entity = {'@type': 'Person', 'url': '/name/nm0905579/', 'name': 'Michael Wadleigh'}

In [338]:
# Should display the bare identifier taken from the entity's url.
getEntityID(entity)

'nm0905579'

In [339]:
# Module-level accumulators filled by processMovie(). Each maps an entity key
# to that entity's record (its info, "year-wise-performance" and "movie_arr").
genre_features = {}       # keyed by genre name
star_features = {}        # keyed by the cast member's entity id
director_features = {}    # keyed by the director's entity id
prod_co_features = {}     # keyed by production company name
creator_features = {}     # keyed by the creator's entity id

def addEntityToFeatureDictionary(entity_id, entity_info, feature_dict,
                                 first_year=1967, last_year=2020):
    """Create a zero-initialised record for ``entity_id`` if it has none yet.

    The new record holds a copy of ``entity_info`` plus two extra entries:
    ``"year-wise-performance"`` (one dict of zeroed counters per year) and
    ``"movie_arr"`` (an empty list). An existing record is left untouched.

    Args:
        entity_id: Key under which the record is stored in ``feature_dict``.
        entity_info: dict of descriptive fields for the entity (e.g. ``name``).
            It is copied, not stored, so the caller's dict is never modified.
        feature_dict: Accumulator to insert into. Mutated in place.
        first_year: First year (inclusive) to create counters for.
        last_year: Last year (inclusive) to create counters for.

    Returns:
        None.
    """
    if entity_id in feature_dict:
        return

    features = ("num_movies", "sum_rating", "sum_revenue", "sum_budget", "num_rating_givers")

    # Shallow-copy so the bookkeeping keys added below do not leak into the
    # caller's dict (for cast members that is the movie document's own
    # `actor` sub-dict).
    record = dict(entity_info)
    record["year-wise-performance"] = {
        year: dict.fromkeys(features, 0)
        for year in range(first_year, last_year + 1)
    }
    record['movie_arr'] = []
    feature_dict[entity_id] = record

In [340]:
def _as_list(value):
    """Return ``value`` unchanged if it is a list, otherwise wrap it in one."""
    return value if isinstance(value, list) else [value]


def _accumulate(feature_dict, entity_id, entity_info, info, track_movie=False):
    """Register the entity on first sight, then add this movie to its yearly totals."""
    if entity_id not in feature_dict:
        addEntityToFeatureDictionary(entity_id, entity_info, feature_dict)
    if track_movie:
        feature_dict[entity_id]['movie_arr'].append(info['movie_id'])
    updateDict(feature_dict[entity_id]['year-wise-performance'], info)


def processMovie(movie, printLog=False):
    """Fold one movie document into the module-level feature dictionaries.

    Updates ``genre_features``, ``star_features``, ``director_features``,
    ``creator_features`` and ``prod_co_features`` with the figures returned by
    ``getInformation(movie)``.

    Fields that hold either a single value or a list (``genre``, ``director``,
    ``creator``, ``details['Production Co']``) are normalised to a list first.
    ``genre`` was previously iterated as-is, so a document with a single
    string genre such as ``'Documentary'`` was split into one "genre" per
    character.

    Args:
        movie: Movie document (dict). Must contain ``genre``,
            ``cast_and_character``, ``director`` and ``details``; ``creator``
            and ``details['Production Co']`` are optional.
        printLog: When truthy, print a line for each optional or malformed
            field that is skipped.

    Returns:
        None. All results are written to the feature dictionaries.

    Raises:
        KeyError: if a required field is missing, or if the movie's year has
            no bucket in an entity's year-wise table.
    """
    info = getInformation(movie)

    # NOTE(review): only genres and cast members record the movie id in
    # `movie_arr`; directors, creators and production companies keep an empty
    # list. That matches the previous behaviour -- confirm whether it is intended.

    # genre
    for genre_name in _as_list(movie['genre']):
        _accumulate(genre_features, genre_name, {'name': genre_name}, info, track_movie=True)

    # cast -- anything other than a list is skipped, not wrapped
    cast_ensemble = movie['cast_and_character']
    if isinstance(cast_ensemble, list):
        for cast in cast_ensemble:
            star = cast['actor']
            _accumulate(star_features, getEntityID(star), star, info, track_movie=True)
    elif printLog:
        print(movie['imdb_id'], cast_ensemble)

    # directors
    for director in _as_list(movie['director']):
        _accumulate(director_features, getEntityID(director), director, info)

    # creators (optional field)
    if 'creator' in movie:
        for creator in _as_list(movie['creator']):
            _accumulate(creator_features, getEntityID(creator), creator, info)
    elif printLog:
        print(movie['imdb_id'], 'error loading creator information')

    # production companies (optional field, keyed by name)
    if 'Production Co' in movie['details']:
        for company in _as_list(movie['details']['Production Co']):
            _accumulate(prod_co_features, company, {'name': company}, info)
    elif printLog:
        print(movie['imdb_id'], "This movie does not have production company information")

In [341]:
# processMovie(test_movie, printLog=True)

In [342]:
# director_features

In [343]:
# Aggregate every document of the collection into the feature dictionaries.
# NOTE: those dictionaries are module-level accumulators, so running this cell
# twice without re-creating them counts every movie twice.
cnt = 0
print_step = 500
total_movies = imdb.count_documents({})

for movie in imdb.find():
    try:
        processMovie(movie)
    except Exception as exc:
        # Catch Exception instead of a bare `except:` so a kernel interrupt
        # still propagates, and print the cause instead of hiding it.
        # Processing still stops at the first failing document.
        print("Error >> ", movie.get('imdb_id'), repr(exc))
        break
    cnt += 1
    if(cnt % print_step == 0):
        print('processed {} out of {}'.format(cnt, total_movies))

processed 500 out of 8181
processed 1000 out of 8181
processed 1500 out of 8181
processed 2000 out of 8181
processed 2500 out of 8181
processed 3000 out of 8181
processed 3500 out of 8181
processed 4000 out of 8181
processed 4500 out of 8181
processed 5000 out of 8181
processed 5500 out of 8181
processed 6000 out of 8181
processed 6500 out of 8181
processed 7000 out of 8181
processed 7500 out of 8181
processed 8000 out of 8181


In [344]:
# Directory that the feature JSON output is written into.
path = 'SavedFeatures'

# Commented-out re-initialisation of the accumulators (presumably kept for
# resetting them by hand before re-running the aggregation -- confirm).
# genre_features      = {}
# star_features       = {}
# director_features   = {}
# prod_co_features  = {}
# creator_features    = {}

In [345]:
# Persist the genre features as JSON. The target directory is created on
# demand so a fresh checkout does not fail with FileNotFoundError.
# Note: json.dump writes the integer year keys of "year-wise-performance" as
# strings, so they are str when the file is loaded back.
os.makedirs(path, exist_ok=True)
with open(os.path.join(path, 'genre_features.json'), 'w') as f:
    json.dump(genre_features, f)
# Report success only after the file has been closed (and thus flushed).
print("Genre Feature saved successfully")

Genre Feature saved successfully
