In [1]:
import os
import json
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# import cpi
import pymongo
from tqdm import tqdm
from scipy import stats
import math
import numpy as np
import pandas as pd
import json

In [2]:
# Connect to the MongoDB server on localhost and keep handles to the "imdb"
# database and its 'imdb_cleaned' collection for the cells below.
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
db = myclient.get_database("imdb")
imdb = db.get_collection('imdb_cleaned')

In [3]:
# Count every document in the collection (the empty filter matches all of them).
num_movies = imdb.count_documents({})
num_movies

8181

In [4]:
def getMovie(imdb_id):
    """Fetch the document for a single title from the ``imdb`` collection.

    Parameters
    ----------
    imdb_id : str
        Value matched against the document's ``imdb_id`` field,
        e.g. ``'tt1230215'``.

    Returns
    -------
    dict or None
        The first matching document, or ``None`` when nothing matches.
    """
    # find_one() stops at the first match and cleans up its own cursor, rather
    # than leaving a partially consumed find() cursor behind.
    return imdb.find_one({'imdb_id': imdb_id})

In [5]:
# Fetch one known title so the shape of a stored document can be inspected.
# test_movie = getMovie('tt0371746')  # alternative title to try
test_movie = getMovie('tt1230215')
test_movie

{'_id': ObjectId('60a664e6ef235d48e488dcca'),
 '@context': 'http://schema.org',
 '@type': 'Movie',
 'url': '/title/tt1230215/',
 'name': 'Not Fade Away',
 'image': 'https://m.media-amazon.com/images/M/MV5BMzU2NDU2MzIyMl5BMl5BanBnXkFtZTcwMDk2NTY1OA@@._V1_.jpg',
 'genre': 'Drama',
 'contentRating': 'R',
 'actor': [{'@type': 'Person',
   'url': '/name/nm1910274/',
   'name': 'John Magaro'},
  {'@type': 'Person', 'url': '/name/nm1658935/', 'name': 'Jack Huston'},
  {'@type': 'Person', 'url': '/name/nm2939030/', 'name': 'Will Brill'},
  {'@type': 'Person', 'url': '/name/nm4326044/', 'name': 'Brahm Vaccarella'}],
 'director': {'@type': 'Person',
  'url': '/name/nm0153740/',
  'name': 'David Chase'},
 'creator': [{'@type': 'Person',
   'url': '/name/nm0153740/',
   'name': 'David Chase'},
  {'@type': 'Organization', 'url': '/company/co0005072/'},
  {'@type': 'Organization', 'url': '/company/co0071947/'},
  {'@type': 'Organization', 'url': '/company/co0215519/'},
  {'@type': 'Organization', 'u

In [9]:
def getInformation(movie):
    """Pull the fields used for aggregation out of one movie document.

    Reads ``cleaned_year``, ``aggregateRating`` (``ratingValue`` and
    ``ratingCount``), ``cleaned_Revenue``, ``cleaned_Budget`` and ``imdb_id``
    from ``movie``. Year and rater count are converted with ``int`` and the
    rating with ``float``; revenue and budget are passed through unchanged.

    Returns
    -------
    dict
        Keys ``movie_id``, ``year``, ``rating``, ``num_raters``, ``revenue``
        and ``budget``.

    Raises
    ------
    KeyError
        If any of the fields listed above is missing from ``movie``.
    """
    # Look everything up first (same order as the fields are listed above),
    # then convert, so a missing field is reported before any conversion error.
    release_year = movie['cleaned_year']
    aggregate = movie['aggregateRating']
    score = aggregate['ratingValue']
    vote_count = aggregate['ratingCount']
    gross = movie['cleaned_Revenue']
    cost = movie['cleaned_Budget']

    summary = {'movie_id': movie['imdb_id']}
    summary['year'] = int(release_year)
    summary['rating'] = float(score)
    summary['num_raters'] = int(vote_count)
    summary['revenue'] = gross
    summary['budget'] = cost
    return summary

In [10]:
# Sanity check: show the summary extracted from the sample document.
getInformation(test_movie)

{'movie_id': 'tt1230215',
 'year': 2013,
 'rating': 6.0,
 'num_raters': 4354,
 'revenue': 707027,
 'budget': 22219637}

In [12]:
def updateDict(dct, info):
    """Add one movie's numbers to the running totals of its release year.

    Parameters
    ----------
    dct : dict
        Year-wise table mapping a year to a dict holding the counters
        ``num_movies``, ``sum_rating``, ``num_raters``, ``sum_revenue`` and
        ``sum_budget``. Modified in place.
    info : dict
        Movie summary providing ``year``, ``rating``, ``num_raters``,
        ``revenue`` and ``budget``.

    Notes
    -----
    A year that is not yet present in ``dct`` gets a zeroed entry first, so a
    movie from an unexpected year is counted instead of raising ``KeyError``.
    """
    year = info['year']
    if year not in dct:
        dct[year] = dict.fromkeys(
            ('num_movies', 'sum_rating', 'sum_revenue', 'sum_budget', 'num_raters'), 0
        )

    # Resolve the year's bucket once instead of on every line.
    bucket = dct[year]
    bucket['num_movies'] += 1
    bucket['sum_rating'] += info['rating']
    bucket['num_raters'] += info['num_raters']
    bucket['sum_revenue'] += info['revenue']
    bucket['sum_budget'] += info['budget']

In [13]:
def getEntityID(entity):
    """Return the identifier at the end of an entity's URL path.

    Parameters
    ----------
    entity : dict
        Entity record with a ``url`` entry such as ``'/name/nm0905579/'`` or
        ``'/company/co0005072/'``. A query string (``?...``) is ignored.

    Returns
    -------
    str
        The last path segment, e.g. ``'nm0905579'``.

    Raises
    ------
    KeyError
        If ``entity`` has no ``url`` entry.
    """
    url_path = entity['url'].split('?')[0]
    # Strip trailing slashes before taking the last segment so that
    # '/name/nm0905579' and '/name/nm0905579/' both yield 'nm0905579'.
    return url_path.rstrip('/').split('/')[-1]

In [14]:
# Sample person record used to try out getEntityID().
entity = {
    '@type': 'Person',
    'url': '/name/nm0905579/',
    'name': 'Michael Wadleigh',
}

In [15]:
# Should display the bare identifier taken from the sample entity's URL.
getEntityID(entity)

'nm0905579'

In [12]:
# Module-level accumulators filled by processMovie(). Each maps an entity key
# to that entity's record (its info, "year-wise-performance" and 'movie_arr').
genre_features      = {}  # keyed by genre name
star_features       = {}  # keyed by the ID returned by getEntityID()
director_features   = {}  # keyed by the ID returned by getEntityID()
prod_co_features  = {}  # keyed by the 'Production Co' value itself
creator_features    = {}  # keyed by the ID returned by getEntityID()

def addEntityToFeatureDictionary(entity_id, entity_info, feature_dict,
                                 start_year=1967, end_year=2021):
    """Create an empty record for ``entity_id`` unless one already exists.

    Parameters
    ----------
    entity_id : hashable
        Key under which the record is stored in ``feature_dict``.
    entity_info : dict
        Descriptive fields copied into the new record. The caller's dict is
        not modified.
    feature_dict : dict
        Feature dictionary to update in place.
    start_year, end_year : int, optional
        Years ``range(start_year, end_year)`` are pre-seeded with zeroed
        counters. The defaults keep the original 1967-2020 span.

    The new record holds the copied ``entity_info`` fields plus
    ``"year-wise-performance"`` (year -> zeroed counters) and an empty
    ``'movie_arr'`` list. An existing record is left untouched.
    """
    if entity_id in feature_dict:
        return

    counters = ("num_movies", "sum_rating", "sum_revenue", "sum_budget", "num_raters")
    # Copy so the bookkeeping keys added below do not leak into the caller's
    # dict (for cast and crew that is a sub-document of the movie being read).
    record = dict(entity_info)
    record["year-wise-performance"] = {
        year: dict.fromkeys(counters, 0) for year in range(start_year, end_year)
    }
    record['movie_arr'] = []
    feature_dict[entity_id] = record

In [13]:
def _asList(value):
    """Return ``value`` unchanged if it is a list, otherwise wrap it in one."""
    return value if isinstance(value, list) else [value]


def _creditEntity(entity_key, entity_info, feature_dict, info):
    """Register the entity on first sight, then add one movie to its record."""
    # addEntityToFeatureDictionary() leaves an already registered key alone.
    addEntityToFeatureDictionary(entity_key, entity_info, feature_dict)
    record = feature_dict[entity_key]
    record['movie_arr'].append(info['movie_id'])
    updateDict(record['year-wise-performance'], info)


def processMovie(movie, printLog=False):
    """Credit one movie to every genre, star, director, creator and
    production company attached to it.

    Updates the module-level ``genre_features``, ``star_features``,
    ``director_features``, ``creator_features`` and ``prod_co_features``
    dictionaries in place: the movie id is appended to each entity's
    ``'movie_arr'`` and the movie's numbers are added to the entity's
    ``'year-wise-performance'`` table.

    Parameters
    ----------
    movie : dict
        Movie document. ``genre``, ``cast_and_character``, ``director`` and
        ``details`` are required (a missing one raises ``KeyError``);
        ``creator`` and ``details['Production Co']`` are optional.
    printLog : bool, optional
        When true, print a line for each skipped section.
    """
    info = getInformation(movie)

    # genre: a single value or a list; keyed by the genre string itself
    for g in _asList(movie['genre']):
        _creditEntity(g, {'name': g}, genre_features, info)

    # cast: unlike the other sections, a non-list value is skipped, not wrapped
    cast_ensamble = movie['cast_and_character']
    if isinstance(cast_ensamble, list):
        for cast in cast_ensamble:
            star = cast['actor']
            _creditEntity(getEntityID(star), star, star_features, info)
    elif printLog:
        print(movie['imdb_id'], cast_ensamble)

    # directors
    for dr in _asList(movie['director']):
        _creditEntity(getEntityID(dr), dr, director_features, info)

    # creator
    if 'creator' in movie:
        for cr in _asList(movie['creator']):
            _creditEntity(getEntityID(cr), cr, creator_features, info)
    elif printLog:
        print(movie['imdb_id'], 'error loading creator information')

    # Production Co: keyed by the stored value itself
    if 'Production Co' in movie['details']:
        for pr in _asList(movie['details']['Production Co']):
            _creditEntity(pr, {'name': pr}, prod_co_features, info)
    elif printLog:
        print(movie['imdb_id'], "This movie does not have production company information")

In [14]:
# processMovie(test_movie, printLog=True)

In [15]:
# director_features

In [16]:
cnt = 0
print_step = 500
total_movies = imdb.count_documents({})

# NOTE: processMovie() adds to the module-level *_features dictionaries, so
# running this cell again without resetting them counts every movie twice.
for movie in imdb.find():
    try:
        processMovie(movie)
    except Exception as err:
        # Stop at the first bad document, but report what went wrong and not
        # only where. Catching Exception (not a bare except) lets Ctrl-C /
        # kernel interrupts through.
        print("Error >> ", movie.get('imdb_id'), repr(err))
        break
    cnt += 1
    if(cnt % print_step == 0):
        print('processed {} out of {}'.format(cnt, total_movies))

processed 500 out of 8181
processed 1000 out of 8181
processed 1500 out of 8181
processed 2000 out of 8181
processed 2500 out of 8181
processed 3000 out of 8181
processed 3500 out of 8181
processed 4000 out of 8181
processed 4500 out of 8181
processed 5000 out of 8181
processed 5500 out of 8181
processed 6000 out of 8181
processed 6500 out of 8181
processed 7000 out of 8181
processed 7500 out of 8181
processed 8000 out of 8181


In [17]:
def calculateIndividualTotal(feature_dct):
    """Roll one entity's year-wise counters up into ``total_*`` fields.

    Resets ``total_movies``, ``total_rating``, ``total_raters``,
    ``total_revenue`` and ``total_budget`` on ``feature_dct`` to 0 and then
    adds up, over every year in ``feature_dct['year-wise-performance']``, the
    counters ``num_movies``, ``sum_rating``, ``num_raters``, ``sum_revenue``
    and ``sum_budget`` respectively. The record is modified in place and
    nothing is returned.
    """
    # (total field, per-year counter) pairs, in the order the totals are stored.
    rollups = (
        ('total_movies', 'num_movies'),
        ('total_rating', 'sum_rating'),
        ('total_raters', 'num_raters'),
        ('total_revenue', 'sum_revenue'),
        ('total_budget', 'sum_budget'),
    )

    for total_key, _ in rollups:
        feature_dct[total_key] = 0

    for yearly_stats in feature_dct['year-wise-performance'].values():
        for total_key, counter_key in rollups:
            feature_dct[total_key] += yearly_stats[counter_key]

In [18]:
def updateTotal(feature_dct):
    """Recompute the ``total_*`` fields of every entity in ``feature_dct``.

    Calls ``calculateIndividualTotal`` on each stored record, in the
    dictionary's iteration order. Records are modified in place.
    """
    for entity_record in feature_dct.values():
        calculateIndividualTotal(entity_record)

In [19]:
# Add the total_* roll-ups to every genre record.
updateTotal(genre_features)

In [20]:
# Show how many movies were counted for each genre.
for genre, genre_record in genre_features.items():
    print(genre, genre_record['total_movies'])

Action 2031
Adventure 1442
Sci-Fi 937
Comedy 3066
Fantasy 980
Drama 4397
Mystery 902
Thriller 2516
Romance 1726
Crime 1623
Horror 982
Family 831
Animation 397
Musical 193
Music 310
Biography 535
Western 136
Sport 295
War 344
History 365
Documentary 167
News 5
Short 2


In [21]:
# Add the total_* roll-ups to every star record.
updateTotal(star_features)

In [22]:
# Add the total_* roll-ups to every director record.
updateTotal(director_features)

In [23]:
# Add the total_* roll-ups to every production-company record.
updateTotal(prod_co_features)

In [24]:
# Add the total_* roll-ups to every creator record.
updateTotal(creator_features)

In [18]:
# Directory that receives the exported feature JSON files.
path = 'SavedFeatures'
# Create it up front so the open(..., 'w') calls in the export cells do not
# fail with FileNotFoundError when the directory is missing.
os.makedirs(path, exist_ok=True)

# genre_features      = {}
# star_features       = {}
# director_features   = {}
# prod_co_features  = {}
# creator_features    = {}

In [26]:
# Export the genre features. The JSON encoder writes the integer year keys of
# "year-wise-performance" as strings, so they come back as str when reloaded.
with open(os.path.join(path, 'genre_features.json'), 'w') as f:
    json.dump(genre_features, f)
# Report success only after the file has been flushed and closed.
print("Genre Feature saved successfully")

Genre Feature saved successfully


In [27]:
# Export the star features. The JSON encoder writes the integer year keys of
# "year-wise-performance" as strings, so they come back as str when reloaded.
with open(os.path.join(path, 'star_features.json'), 'w') as f:
    json.dump(star_features, f)
# Report success only after the file has been flushed and closed.
print("star_features saved successfully")

star_features saved successfully


In [28]:
# Export the director features. The JSON encoder writes the integer year keys
# of "year-wise-performance" as strings, so they come back as str when reloaded.
with open(os.path.join(path, 'director_features.json'), 'w') as f:
    json.dump(director_features, f)
# Report success only after the file has been flushed and closed.
print("director_features saved successfully")

director_features saved successfully


In [29]:
# Export the production-company features. The JSON encoder writes the integer
# year keys of "year-wise-performance" as strings, so they come back as str
# when reloaded.
with open(os.path.join(path, 'prod_co_features.json'), 'w') as f:
    json.dump(prod_co_features, f)
# Report success only after the file has been flushed and closed.
print("prod_co_features saved successfully")

prod_co_features saved successfully


In [30]:
# Export the creator features. The JSON encoder writes the integer year keys
# of "year-wise-performance" as strings, so they come back as str when reloaded.
with open(os.path.join(path, 'creator_features.json'), 'w') as f:
    json.dump(creator_features, f)
# Report success only after the file has been flushed and closed.
print("creator_features saved successfully")

creator_features saved successfully


# Star familiarity

In [6]:
# Nested accumulator filled by updateStarFamiliarity():
#   familiarity_feature[star_id][co_star_id][year] -> running totals.
# Every cast member is also paired with themselves.
familiarity_feature = {}

def updateFamiliarityMatrix(dct, info):
    """Fold one movie's numbers into a running-totals dict.

    ``dct`` is modified in place. When it has no ``num_movies`` entry yet, the
    five counters (``num_movies``, ``sum_rating``, ``num_raters``,
    ``sum_revenue``, ``sum_budget``) are initialised from ``info``; otherwise
    ``num_movies`` is incremented and the other four grow by ``info``'s
    ``rating``, ``num_raters``, ``revenue`` and ``budget``.
    """
    # (counter in dct, field in info) for everything except the movie count.
    pairs = (
        ('sum_rating', 'rating'),
        ('num_raters', 'num_raters'),
        ('sum_revenue', 'revenue'),
        ('sum_budget', 'budget'),
    )

    if 'num_movies' in dct:
        dct['num_movies'] += 1
        for counter, field in pairs:
            dct[counter] += info[field]
    else:
        dct['num_movies'] = 1
        for counter, field in pairs:
            dct[counter] = info[field]



def updateStarFamiliarity(movie, printLog=False):
    """Credit one movie to every ordered pair of its cast members.

    For each pair ``(i, j)`` of entries in ``movie['cast_and_character']``
    (a member paired with themselves included) the movie's numbers are added
    to ``familiarity_feature[i_id][j_id][year]`` via
    ``updateFamiliarityMatrix``. The module-level ``familiarity_feature``
    dictionary is modified in place.

    Parameters
    ----------
    movie : dict
        Movie document; ``cast_and_character`` is required. When it is not a
        list the movie is skipped.
    printLog : bool, optional
        When true, print a line for a skipped movie.
    """
    info = getInformation(movie)
    # Key the buckets by the int-normalised year from getInformation(), the
    # same value updateDict() uses, instead of the raw 'cleaned_year' field.
    year = info['year']

    cast_ensamble = movie['cast_and_character']
    if not isinstance(cast_ensamble, list):
        if printLog:
            print(movie['imdb_id'], cast_ensamble)
        return

    # Resolve each cast member's ID once rather than once per pair.
    star_ids = [getEntityID(cast['actor']) for cast in cast_ensamble]

    for i_star_id in star_ids:
        co_stars = familiarity_feature.setdefault(i_star_id, {})
        for j_star_id in star_ids:
            per_year = co_stars.setdefault(j_star_id, {})
            updateFamiliarityMatrix(per_year.setdefault(year, {}), info)

In [16]:
# Feed every document in the collection through updateStarFamiliarity().
# NOTE: that function adds to the module-level familiarity_feature dict, so
# re-running this cell without resetting it counts every movie again.
cnt, print_step = 0, 500
total_movies = imdb.count_documents({})

for movie in imdb.find():
    updateStarFamiliarity(movie)
    cnt += 1
    # Emit a progress line after every `print_step` documents.
    if not cnt % print_step:
        print('processed {} out of {}'.format(cnt, total_movies))

processed 500 out of 8181
processed 1000 out of 8181
processed 1500 out of 8181
processed 2000 out of 8181
processed 2500 out of 8181
processed 3000 out of 8181
processed 3500 out of 8181
processed 4000 out of 8181
processed 4500 out of 8181
processed 5000 out of 8181
processed 5500 out of 8181
processed 6000 out of 8181
processed 6500 out of 8181
processed 7000 out of 8181
processed 7500 out of 8181
processed 8000 out of 8181


In [19]:
# Export the co-star familiarity table as JSON.
with open(os.path.join(path, 'familiarity_feature.json'), 'w') as f:
    json.dump(familiarity_feature, f)
# Report success only after the file has been flushed and closed.
print("familiarity_feature saved successfully")

familiarity_feature saved successfully
