In [2]:
import os
import json
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# import cpi
import pymongo
from tqdm import tqdm
from scipy import stats
import math
import numpy as np
import pandas as pd
import json

In [3]:
# Connect to the MongoDB server on localhost and keep a handle to the
# 'imdb_cleaned' collection of the 'imdb' database; later cells query it
# through the `imdb` name.
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
db = myclient["imdb"]
imdb = db['imdb_cleaned']

In [4]:
# Number of documents in the collection (the empty filter matches everything).
num_movies = imdb.count_documents({})
num_movies

8181

In [5]:
def getMovie(imdb_id):
    """Return the first document whose 'imdb_id' field equals `imdb_id`.

    Parameters
    ----------
    imdb_id : str
        Identifier to look up, e.g. 'tt0371746'.

    Returns
    -------
    dict or None
        The matching document from the `imdb` collection, or None when no
        document matches.
    """
    # find_one expresses "first match or None" directly, replacing the
    # hand-written loop that returned from inside an open cursor.
    return imdb.find_one({'imdb_id': imdb_id})

In [6]:
# Sample document used to sanity-check the helper functions defined below.
test_movie = getMovie('tt0371746')
test_movie

{'_id': ObjectId('60a6633eef235d48e488a31c'),
 '@context': 'http://schema.org',
 '@type': 'Movie',
 'url': '/title/tt0371746/',
 'name': 'Iron Man',
 'image': 'https://m.media-amazon.com/images/M/MV5BMTczNTI2ODUwOF5BMl5BanBnXkFtZTcwMTU0NTIzMw@@._V1_.jpg',
 'genre': ['Action', 'Adventure', 'Sci-Fi'],
 'contentRating': 'PG-13',
 'actor': [{'@type': 'Person',
   'url': '/name/nm0000375/',
   'name': 'Robert Downey Jr.'},
  {'@type': 'Person', 'url': '/name/nm0000569/', 'name': 'Gwyneth Paltrow'},
  {'@type': 'Person', 'url': '/name/nm0005024/', 'name': 'Terrence Howard'},
  {'@type': 'Person', 'url': '/name/nm0000313/', 'name': 'Jeff Bridges'}],
 'director': {'@type': 'Person',
  'url': '/name/nm0269463/',
  'name': 'Jon Favreau'},
 'creator': [{'@type': 'Person',
   'url': '/name/nm1318843/',
   'name': 'Mark Fergus'},
  {'@type': 'Person', 'url': '/name/nm1319757/', 'name': 'Hawk Ostby'},
  {'@type': 'Person', 'url': '/name/nm1436466/', 'name': 'Art Marcum'},
  {'@type': 'Person', 'url'

In [7]:
def getInformation(movie):
    """Collect the headline numbers of one movie document.

    Parameters
    ----------
    movie : dict
        Must provide 'imdb_id', 'cleaned_year', 'cleaned_Revenue',
        'cleaned_Budget' and an 'aggregateRating' mapping holding
        'ratingValue' and 'ratingCount'.

    Returns
    -------
    dict
        Keys 'movie_id', 'year' (int), 'rating' (float), 'num_raters' (int),
        'revenue' and 'budget' (the last two are passed through unchanged).
    """
    release_year = movie['cleaned_year']
    score = movie['aggregateRating']['ratingValue']
    votes = movie['aggregateRating']['ratingCount']
    gross = movie['cleaned_Revenue']
    cost = movie['cleaned_Budget']

    summary = {'movie_id': movie['imdb_id']}
    summary['year'] = int(release_year)
    summary['rating'] = float(score)
    summary['num_raters'] = int(votes)
    summary['revenue'] = gross
    summary['budget'] = cost
    return summary

In [8]:
# Show the summary fields extracted from the sample movie.
getInformation(test_movie)

{'movie_id': 'tt0371746',
 'year': 2008,
 'rating': 7.9,
 'num_raters': 956424,
 'revenue': 704172781,
 'budget': 168290920}

In [9]:
# Release-month lookup: month name -> (month number 1-12, float).
# getFeatureVector unpacks each pair as (month_idx, month_avg_rev); the float is
# presumably the mean revenue of movies released in that month — TODO confirm
# where these constants were computed.
month_track = {
        'January': (1, 57402403.634036146),
        'February': (2, 74235592.49102773),
        'March': (3, 82181438.64876632),
        'April': (4, 59965216.39048991),
        'May': (5, 156523313.91325694),
        'June': (6, 170183081.491654),
        'July': (7, 152447106.1206349),
        'August': (8, 68545242.55675675),
        'September': (9, 49954451.7446198),
        'October': (10, 60086038.029336736),
        'November': (11, 130818934.29671897),
        'December': (12, 163332990.71816882)
    }

In [10]:
# Genre lookup: genre name -> (index 0-22, float).
# getGenreVector reads only element [1] and uses it as the first feature of each
# genre slot; presumably this is the mean revenue for the genre — TODO confirm
# where these constants were computed.
genre_track = {
        'Action': (0, 169388692.13589364),
        'Adventure': (1, 255502542.74965325),
        'Sci-Fi': (2, 205304741.40341514),
        'Comedy': (3, 98024501.8939987),
        'Fantasy': (4, 200222315.72959185),
        'Drama': (5, 67705931.90470776),
        'Mystery': (6, 86606633.73725055),
        'Thriller': (7, 90004692.5918124),
        'Romance': (8, 80306682.65585168),
        'Crime': (9, 70616080.00800985),
        'Horror': (10, 58328445.15274949),
        'Family': (11, 202603250.5475331),
        'Animation': (12, 253666143.74559194),
        'Musical': (13, 192324158.4041451),
        'Music': (14, 77118085.2032258),
        'Biography': (15, 63399900.22803738),
        'Western': (16, 93041041.61764705),
        'Sport': (17, 71139162.43728814),
        'War': (18, 81059365.2005814),
        'History': (19, 66552163.24931507),
        'Documentary': (20, 15596859.45508982),
        'News': (21, 12874080.0),
        'Short': (22, 740558.0)
    }
# Per-genre history loaded from disk, keyed by genre name. getValues reads
# entry['year-wise-performance'][<year as string>] from each entry.
genre_features = {}
with open('SavedFeatures/genre_features.json', 'r') as f:
    genre_features = json.load(f)
# Number of genres in the file.
len(genre_features.keys())

23

In [11]:
# Per-star history loaded from disk; getStarVector looks entries up by the id
# that getEntityID extracts from the actor's url.
star_features = {}
with open('SavedFeatures/star_features.json', 'r') as f:
    star_features = json.load(f)
# Number of stars in the file.
len(star_features.keys())

57064

In [12]:
# Per-director history loaded from disk; getDirectorVector looks entries up by
# the id that getEntityID extracts from the director's url.
director_features = {}
with open('SavedFeatures/director_features.json', 'r') as f:
    director_features = json.load(f)
# Number of directors in the file.
len(director_features.keys())

4078

In [13]:
# Per-production-company history loaded from disk; getProdVector looks entries
# up by the raw value stored under movie['details']['Production Co'].
prod_features = {}
with open('SavedFeatures/prod_co_features.json', 'r') as f:
    prod_features = json.load(f)
# Number of production companies in the file.
len(prod_features.keys())

7422

In [14]:
# Per-creator history loaded from disk; getCreatorVector looks entries up by the
# id that getEntityID extracts from the creator's url.
creator_features = {}
with open('SavedFeatures/creator_features.json', 'r') as f:
    creator_features = json.load(f)
# Number of creators in the file.
len(creator_features.keys())

21666

In [15]:
def getValues(st_year, nd_year, entity):
    """Sum an entity's yearly statistics over the inclusive range [st_year, nd_year].

    Parameters
    ----------
    st_year, nd_year : int
        First and last year to include. An empty range (nd_year < st_year)
        yields all-zero totals.
    entity : dict
        Must hold a 'year-wise-performance' mapping from year strings to dicts
        with 'sum_revenue', 'sum_budget', 'num_movies', 'num_raters' and
        'sum_rating'.

    Returns
    -------
    dict
        Keys 'total_revenue', 'total_budget', 'num_movies', 'num_raters' and
        'sum_rating'.

    Notes
    -----
    Years absent from the entity's history are skipped instead of raising
    KeyError, matching how processFamiliarityOf__StarPair treats missing years.
    """
    info = {
        'total_revenue': 0,
        'total_budget': 0,
        'num_movies': 0,
        'num_raters': 0,
        'sum_rating': 0,
    }
    for year in range(st_year, nd_year + 1):
        # Looked up inside the loop so an empty range never touches `entity`.
        yearly = entity['year-wise-performance'].get(str(year))
        if yearly is None:
            continue
        info['total_revenue'] += yearly['sum_revenue']
        info['total_budget'] += yearly['sum_budget']
        info['num_movies'] += yearly['num_movies']
        info['num_raters'] += yearly['num_raters']
        info['sum_rating'] += yearly['sum_rating']
    return info

In [16]:
def getValuesBefore(entity_id, year, dictionary):
    """Return getValues totals for `dictionary[entity_id]` from 1967 up to `year - 1`.

    The given `year` itself is excluded, so only earlier years are summed.
    """
    last_included_year = year - 1
    entity = dictionary[entity_id]
    return getValues(1967, last_included_year, entity)

In [17]:
# Totals for the 'Action' genre over 1967-2006 (years strictly before 2007).
getValuesBefore('Action', 2007, genre_features)

{'total_revenue': 153972358202,
 'total_budget': 59622506498,
 'num_movies': 992,
 'num_raters': 101879326,
 'sum_rating': 6066.6}

In [18]:
def getGenreVector(movie):
    """Build the fixed-width genre feature block for one movie.

    At most the first 5 genres are used. Each contributes 6 values: the float
    stored in genre_track, then the genre's prior-year num_movies, num_raters
    and total_revenue, then raters-per-movie and revenue-per-movie (both 0 when
    the genre has no prior movies). The result is zero-padded to 30 entries.

    Parameters
    ----------
    movie : dict
        Needs 'genre' (a list, or a single value that is wrapped in a list)
        and 'cleaned_year'.

    Returns
    -------
    list
        30 numbers.
    """
    max_genres = 5
    slot_width = 6

    genres = movie['genre']
    release_year = movie['cleaned_year']
    if type(genres) is not list:
        genres = [genres]

    features = []
    for name in genres[:max_genres]:
        baseline = genre_track[name][1]
        history = getValuesBefore(name, int(release_year), genre_features)
        movie_count = history['num_movies']
        features += [baseline, movie_count, history['num_raters'], history['total_revenue']]
        if movie_count == 0:
            features += [0, 0]
        else:
            features.append(history['num_raters'] / movie_count)
            features.append(history['total_revenue'] / movie_count)

    # Zero-fill the slots of genres the movie does not have.
    features += [0] * (max_genres * slot_width - len(features))
    return features

# Smoke test on the sample movie: show the vector length and its contents.
tst = getGenreVector(test_movie)
len(tst), tst

(30,
 [169388692.13589364,
  1048,
  111126976,
  164993501728,
  106037.19083969465,
  157436547.45038167,
  255502542.74965325,
  761,
  94382434,
  161879199656,
  124024.22339027595,
  212719053.42444152,
  205304741.40341514,
  508,
  63094864,
  79559120917,
  124202.48818897638,
  156612442.75,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0])

In [19]:
def getEntityID(entity):
    """Return the second-to-last '/'-separated segment of entity['url'].

    Anything from the first '?' onwards is dropped before splitting, so
    '/name/nm0000375/' and '/name/nm0000375/?x=1' both give 'nm0000375'.
    """
    path_only = entity['url'].partition('?')[0]
    segments = path_only.split('/')
    return segments[-2]

In [20]:
def getStarVector(movie, printLog=False):
    """Build the fixed-width cast feature block for one movie.

    At most the first 10 entries of movie['cast_and_character'] are used. Each
    contributes 6 values taken from the star's history before the movie's year:
    num_movies, num_raters, total_revenue, raters-per-movie, mean rating and
    revenue-per-movie (the last three are 0 when the star has no prior movies).
    The result is zero-padded to 60 entries.

    Parameters
    ----------
    movie : dict
        Needs 'cleaned_year' and 'cast_and_character'; 'imdb_id' is read only
        for logging.
    printLog : bool
        When True and the cast field is not a list, print the movie id and the
        offending value.

    Returns
    -------
    list
        60 numbers; all zeros when the cast field is not a list.
    """
    max_stars = 10
    slot_width = 6
    release_year = movie['cleaned_year']
    cast = movie['cast_and_character']

    features = []
    if type(cast) is list:
        for member in cast[:max_stars]:
            actor = member['actor']
            history = getValuesBefore(getEntityID(actor), int(release_year), star_features)
            movie_count = history['num_movies']

            features.append(movie_count)
            features.append(history['num_raters'])
            features.append(history['total_revenue'])
            if movie_count == 0:
                features.extend((0, 0, 0))
            else:
                features.append(history['num_raters'] / movie_count)
                features.append(history['sum_rating'] / movie_count)
                features.append(history['total_revenue'] / movie_count)
    elif printLog == True:
        print(movie['imdb_id'], cast)

    # Zero-fill the slots of missing cast members.
    features.extend([0] * (max_stars * slot_width - len(features)))
    return features

# Smoke test on the sample movie: show the vector length and its contents.
tst = getStarVector(test_movie)
len(tst), tst

(60,
 [32,
  1831093,
  1684379189,
  57221.65625,
  6.271874999999999,
  52636849.65625,
  32,
  1761620,
  1960190448,
  55050.625,
  6.640625,
  61255951.5,
  4,
  263857,
  349566015,
  65964.25,
  6.250000000000001,
  87391503.75,
  2,
  33674,
  112207008,
  16837.0,
  6.1,
  56103504.0,
  6,
  129954,
  85611449,
  21659.0,
  5.8500000000000005,
  14268574.833333334,
  7,
  1851799,
  1981163805,
  264542.71428571426,
  7.142857142857144,
  283023400.71428573,
  2,
  134640,
  59376678,
  67320.0,
  6.550000000000001,
  29688339.0,
  1,
  80446,
  203299006,
  80446.0,
  6.5,
  203299006.0,
  18,
  876637,
  1250786359,
  48702.055555555555,
  6.199999999999999,
  69488131.05555555,
  21,
  2604833,
  1774427577,
  124039.66666666667,
  6.457142857142857,
  84496551.28571428])

In [21]:
def getDirectorVector(movie, printLog=False):
    """Build the fixed-width director feature block for one movie.

    At most the first 3 directors are used. Each contributes 6 values taken
    from the director's history before the movie's year: num_movies,
    num_raters, total_revenue, raters-per-movie, mean rating and
    revenue-per-movie (the last three are 0 when there are no prior movies).
    The result is zero-padded to 18 entries.

    Parameters
    ----------
    movie : dict
        Needs 'cleaned_year' and 'director' (a list, or a single entry that is
        wrapped in a list).
    printLog : bool
        Accepted for signature parity with the sibling builders; not used here.

    Returns
    -------
    list
        18 numbers.
    """
    max_directors = 3
    slot_width = 6
    release_year = movie['cleaned_year']

    people = movie['director']
    if type(people) is not list:
        people = [people]

    features = []
    for person in people[:max_directors]:
        totals = getValuesBefore(getEntityID(person), int(release_year), director_features)
        movie_count = totals['num_movies']
        features += [movie_count, totals['num_raters'], totals['total_revenue']]
        if movie_count != 0:
            features.append(totals['num_raters'] / movie_count)
            features.append(totals['sum_rating'] / movie_count)
            features.append(totals['total_revenue'] / movie_count)
        else:
            features += [0, 0, 0]

    missing = max_directors * slot_width - len(features)
    return features + [0] * missing

# Smoke test on the sample movie: show the vector length and its contents.
tst = getDirectorVector(test_movie)
len(tst), tst

(18,
 [3,
  346564,
  409120505,
  115521.33333333333,
  6.533333333333334,
  136373501.66666666,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0])

In [22]:
def getCreatorVector(movie, printLog=False):
    """Build the fixed-width creator feature block for one movie.

    At most the first 5 creators are used. Each contributes 6 values taken
    from the creator's history before the movie's year: num_movies,
    num_raters, total_revenue, raters-per-movie, mean rating and
    revenue-per-movie (the last three are 0 when there are no prior movies).
    The result is zero-padded to 30 entries.

    Parameters
    ----------
    movie : dict
        Needs 'cleaned_year'; 'creator' is optional (a list, or a single entry
        that is wrapped in a list). 'imdb_id' is read only for logging.
    printLog : bool
        When True and the movie has no 'creator' key, print a notice.

    Returns
    -------
    list
        30 numbers; all zeros when the movie has no 'creator' key.
    """
    max_creators = 5
    slot_width = 6
    release_year = movie['cleaned_year']
    features = []

    if 'creator' in movie:
        creators = movie['creator']
        if type(creators) is not list:
            creators = [creators]

        for position, creator in enumerate(creators):
            if position == max_creators:
                break
            totals = getValuesBefore(getEntityID(creator), int(release_year), creator_features)
            movie_count = totals['num_movies']
            features.extend([movie_count, totals['num_raters'], totals['total_revenue']])
            if movie_count == 0:
                features.extend([0, 0, 0])
            else:
                features.append(totals['num_raters'] / movie_count)
                features.append(totals['sum_rating'] / movie_count)
                features.append(totals['total_revenue'] / movie_count)
    elif printLog == True:
        print(movie['imdb_id'], 'error loading creator information')

    # Zero-fill the slots of missing creators.
    while len(features) < max_creators * slot_width:
        features.append(0)
    return features

# Smoke test on the sample movie: show the vector length and its contents.
tst = getCreatorVector(test_movie)
len(tst), tst

(30,
 [2,
  478039,
  88832890,
  239019.5,
  7.050000000000001,
  44416445.0,
  2,
  478039,
  88832890,
  239019.5,
  7.050000000000001,
  44416445.0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  6,
  2577885,
  4548904821,
  429647.5,
  6.283333333333334,
  758150803.5])

In [23]:
def getProdVector(movie, printLog=False):
    """Build the fixed-width production-company feature block for one movie.

    At most the first 5 entries of movie['details']['Production Co'] are used.
    Each contributes 6 values taken from the company's history before the
    movie's year: num_movies, num_raters, total_revenue, raters-per-movie,
    mean rating and revenue-per-movie (the last three are 0 when there are no
    prior movies). The result is zero-padded to 30 entries.

    Parameters
    ----------
    movie : dict
        Needs 'cleaned_year' and a 'details' mapping; 'Production Co' inside it
        is optional (a list, or a single value that is wrapped in a list).
        'imdb_id' is read only for logging.
    printLog : bool
        When True and 'Production Co' is missing, print a notice.

    Returns
    -------
    list
        30 numbers; all zeros when 'Production Co' is missing.
    """
    limit = 5
    vector = []
    year = movie['cleaned_year']

    if('Production Co' not in movie['details']):
        if(printLog == True):
            # Previously this printed `cast_ensamble`, a name that does not
            # exist in this function, so logging raised NameError.
            print(movie['imdb_id'], 'error loading production company information')
    else:
        prod_arr = movie['details']['Production Co']
        if(type(prod_arr) is not list):
            prod_arr = [prod_arr]

        # The company value itself is the lookup key in prod_features.
        for pr_id in prod_arr[:limit]:
            info = getValuesBefore(pr_id, int(year), prod_features)
            num_movies = info['num_movies']
            vector.append(num_movies)
            vector.append(info['num_raters'])
            vector.append(info['total_revenue'])

            if(num_movies != 0):
                vector.append(info['num_raters']/num_movies)
                vector.append(info['sum_rating']/num_movies)
                vector.append(info['total_revenue']/num_movies)
            else:
                vector.extend([0, 0, 0])

    # Zero-fill the slots of missing companies.
    vector.extend([0] * (limit*6 - len(vector)))
    return vector

# Smoke test on the sample movie: show the vector length and its contents.
tst = getProdVector(test_movie)
len(tst), tst

(30,
 [294,
  31916135,
  53247754417,
  108558.28231292516,
  6.392517006802722,
  181114810.94217688,
  13,
  4305562,
  5500158143,
  331197.07692307694,
  6.276923076923078,
  423089087.9230769,
  2,
  759710,
  1494004981,
  379855.0,
  5.9,
  747002490.5,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0])

In [24]:
# Position of each rating bucket in the one-hot vector.
mpaa_track = {'PG': 0, 'R': 1, 'TV': 2, 'G': 3, 'Unrated': 4, 'Others': 5}

# Raw 'contentRating' labels grouped into buckets. Labels not listed here
# (for example 'G') are treated as their own bucket name by getCluster.
clusters = {
    'PG': ['PG-13', 'PG'],
    'R': ['R', 'NC-17'],
    'Others': ["Approved", "X", "M", "M/PG", "GP", "Passed", "Passed"],
    'TV': ["TV-MA", "TV-PG", "TV-14", "TV-Y7", "TV-G"],
    'Unrated': ["Unrated", "Not Rated"],
}


def getCluster(key):
    """Return the bucket whose label list contains `key`, or `key` itself if none does."""
    for bucket, labels in clusters.items():
        if key in labels:
            return bucket
    return key


def getMPAAonehot(movie):
    """One-hot encode movie['contentRating'] over the buckets in mpaa_track.

    Raises KeyError when the rating maps to a bucket that mpaa_track does not
    list.
    """
    bucket = getCluster(movie['contentRating'])
    onehot = [0] * len(mpaa_track)
    onehot[mpaa_track[bucket]] = 1
    return onehot

In [25]:
# One-hot rating vector for the sample movie.
getMPAAonehot(test_movie)

[1, 0, 0, 0, 0, 0]

In [32]:
# Pairwise co-star history loaded from disk. processFamiliarityOf__StarPair
# indexes it as familiarity_features[star1][star2][<year as string>].
familiarity_features = {}
with open('SavedFeatures/familiarity_feature.json', 'r') as f:
    familiarity_features = json.load(f)
# Number of first-level (star) keys in the file.
len(familiarity_features.keys())

57064

In [33]:
# Inspect every co-star entry stored for one star id (large output).
familiarity_features['nm0000375']

 'sum_budget': 38700710}},
 'nm0465664': {'1997': {'num_movies': 1,
   'sum_rating': 5.8,
   'num_raters': 6589,
   'sum_revenue': 4261888,
   'sum_budget': 38700710}},
 'nm0000176': {'1997': {'num_movies': 1,
   'sum_rating': 5.8,
   'num_raters': 6589,
   'sum_revenue': 4261888,
   'sum_budget': 38700710}},
 'nm0001840': {'1997': {'num_movies': 1,
   'sum_rating': 5.8,
   'num_raters': 6589,
   'sum_revenue': 4261888,
   'sum_budget': 38700710}},
 'nm0667207': {'1997': {'num_movies': 1,
   'sum_rating': 5.8,
   'num_raters': 6589,
   'sum_revenue': 4261888,
   'sum_budget': 38700710}},
 'nm0130492': {'1997': {'num_movies': 1,
   'sum_rating': 5.8,
   'num_raters': 6589,
   'sum_revenue': 4261888,
   'sum_budget': 38700710}},
 'nm0000372': {'1997': {'num_movies': 1,
   'sum_rating': 5.8,
   'num_raters': 6589,
   'sum_revenue': 4261888,
   'sum_budget': 38700710}},
 'nm0002006': {'1997': {'num_movies': 1,
   'sum_rating': 5.8,
   'num_raters': 6589,
   'sum_revenue': 4261888,
   'sum_

In [54]:
def processFamiliarityOf__StarPair(star1, star2, year):
    """Summarise the shared history of two stars in the years before `year`.

    Reads familiarity_features[star1][star2] and sums the entries for 1967 up
    to `year - 1`; years without an entry are skipped.

    Parameters
    ----------
    star1, star2 : str
        Keys into familiarity_features.
    year : int or str
        Cut-off year; it is converted with int() and excluded from the sums.

    Returns
    -------
    dict
        'total_movies', 'avg_rating', 'avg_raters' and 'avg_revenue'.

    Notes
    -----
    When the pair has no shared movies in range, the count is bumped to 1 to
    avoid dividing by zero, and that bumped value is what 'total_movies'
    reports (the averages are then 0).
    """
    shared_history = familiarity_features[star1][star2]
    movie_count = 0
    rating_sum = 0
    revenue_sum = 0
    rater_sum = 0
    for past_year in range(1967, int(year)):
        label = str(past_year)
        if label in shared_history:
            movie_count += shared_history[label]['num_movies']
            rating_sum += shared_history[label]['sum_rating']
            revenue_sum += shared_history[label]['sum_revenue']
            rater_sum += shared_history[label]['num_raters']

    if movie_count == 0:
        movie_count += 1

    return dict(
        total_movies=movie_count,
        avg_rating=rating_sum / movie_count,
        avg_raters=rater_sum / movie_count,
        avg_revenue=revenue_sum / movie_count,
    )

In [55]:
# Year-keyed shared history stored for one specific pair of stars.
familiarity_features['nm0000375']['nm0000569']

{'2012': {'num_movies': 1,
  'sum_rating': 8.0,
  'num_raters': 1278885,
  'sum_revenue': 1712092486,
  'sum_budget': 247996114},
 '2008': {'num_movies': 1,
  'sum_rating': 7.9,
  'num_raters': 956424,
  'sum_revenue': 704172781,
  'sum_budget': 168290920},
 '2017': {'num_movies': 1,
  'sum_rating': 7.4,
  'num_raters': 528704,
  'sum_revenue': 929328009,
  'sum_budget': 184774498},
 '2013': {'num_movies': 1,
  'sum_rating': 7.1,
  'num_raters': 769509,
  'sum_revenue': 1349633258,
  'sum_budget': 222196371},
 '2010': {'num_movies': 1,
  'sum_rating': 7.0,
  'num_raters': 736440,
  'sum_revenue': 740547425,
  'sum_budget': 237380305}}

In [56]:
# Pair summary for years before 2017; the pair's 2017 entry is excluded.
processFamiliarityOf__StarPair('nm0000375', 'nm0000569', 2017)

{'total_movies': 4,
 'avg_rating': 7.5,
 'avg_raters': 935314.5,
 'avg_revenue': 1126611487.5}

In [60]:
from scipy.spatial import distance

def calculate_familiarity(arr2D):
    """Return the mean cosine distance over all unordered pairs of rows.

    Parameters
    ----------
    arr2D : sequence of 1-D vectors
        One row per cast member.

    Returns
    -------
    number
        0 when there are fewer than two rows; otherwise the sum of
        scipy.spatial.distance.cosine over every row pair (i < j), divided by
        the number of pairs.
    """
    row_count = len(arr2D)
    if row_count < 2:
        return 0

    pair_count = row_count * (row_count - 1) / 2
    total_distance = 0
    for first in range(row_count):
        second = first + 1
        while second < row_count:
            total_distance += distance.cosine(arr2D[first], arr2D[second])
            second += 1
    return total_distance / pair_count

In [68]:
def processFamiliarityOf__Movie(movie, printLog=False):
    """Summarise how much the cast of a movie has worked together before.

    For every ordered pair of cast members (including each member with
    itself) the pair summary from processFamiliarityOf__StarPair is computed
    for the years before the movie's year. The 'total_movies' values fill a
    square matrix that is reduced by calculate_familiarity; the maxima of the
    pair averages are taken over pairs of distinct cast positions only.

    Parameters
    ----------
    movie : dict
        Needs 'cleaned_year' and 'cast_and_character'; 'imdb_id' is read only
        for logging.
    printLog : bool
        When True and the cast field is not a list, print the movie id and the
        offending value.

    Returns
    -------
    list
        [familiarity_val, max_avg_rating, max_avg_raters, max_avg_revenue], or
        [0, 0, 0, 0] when the cast field is not a list.
    """
    year = movie['cleaned_year']

    cast_ensamble = movie['cast_and_character']
    if(type(cast_ensamble) is not list):
        if(printLog == True):
            print(movie['imdb_id'], cast_ensamble)
        return [0, 0, 0, 0]

    # Resolve every cast member's id once instead of once per matrix cell.
    star_ids = [getEntityID(cast['actor']) for cast in cast_ensamble]
    num_stars = len(star_ids)

    familiarity_matrix = np.zeros((num_stars, num_stars))
    max_avg_rating = 0
    max_avg_raters = 0
    max_avg_revenue = 0
    for i, id_1 in enumerate(star_ids):
        for j, id_2 in enumerate(star_ids):
            familiarity_info = processFamiliarityOf__StarPair(id_1, id_2, year)
            familiarity_matrix[i][j] = familiarity_info['total_movies']
            # The diagonal (a member paired with itself) feeds the matrix but
            # not the pair maxima.
            if(i != j):
                max_avg_rating = max(max_avg_rating, familiarity_info['avg_rating'])
                max_avg_raters = max(max_avg_raters, familiarity_info['avg_raters'])
                max_avg_revenue = max(max_avg_revenue, familiarity_info['avg_revenue'])

    familiarity_val = calculate_familiarity(familiarity_matrix)
    return [familiarity_val, max_avg_rating, max_avg_raters, max_avg_revenue]

In [69]:
# Re-fetch the sample movie and compute its cast-familiarity features.
test_movie = getMovie('tt0371746')
processFamiliarityOf__Movie(test_movie)

[0.5476092503602503, 6.5, 80446.0, 203299006.0]

In [70]:
def getFeatureVector(movie, printLog=False):
    """Concatenate every feature block into one flat vector for a movie.

    Layout, in order: [budget, month index, month value from month_track,
    runtime], then the outputs of getGenreVector, getMPAAonehot,
    getStarVector, processFamiliarityOf__Movie, getDirectorVector,
    getCreatorVector and getProdVector.

    Parameters
    ----------
    movie : dict
        Needs 'cleaned_Budget', 'cleaned_month' (a key of month_track) and
        'cleaned_Runtime_min', plus whatever the block builders read.
    printLog : bool
        Accepted but not forwarded to the block builders.

    Returns
    -------
    list
        The flat feature vector.
    """
    budget = movie['cleaned_Budget']
    month_number, month_mean_revenue = month_track[movie['cleaned_month']]
    runtime_minutes = movie['cleaned_Runtime_min']

    # Builders are called in the same order in which their output is laid out.
    blocks = [
        [budget, month_number, month_mean_revenue, runtime_minutes],
        getGenreVector(movie),
        getMPAAonehot(movie),
        getStarVector(movie),
        processFamiliarityOf__Movie(movie),
        getDirectorVector(movie),
        getCreatorVector(movie),
        getProdVector(movie),
    ]

    feature_vector = blocks[0]
    for block in blocks[1:]:
        feature_vector = feature_vector + block
    return feature_vector

In [71]:
# Total feature-vector length for the sample movie.
len(getFeatureVector(test_movie))

182

In [72]:
# Build one {'feature', 'target'} record per movie in the collection.
xy = []

step = 500  # print a progress line every `step` movies
cnt = 0
# Count the collection once up front; the progress message previously issued a
# fresh count_documents query for every line it printed.
total_movies = imdb.count_documents({})
for cnt, movie in enumerate(imdb.find(), start=1):
    feature_vector = getFeatureVector(movie)
    target = movie['cleaned_Revenue']
    xy.append({
        'feature'   : feature_vector,
        'target'    : target
    })

    if(cnt % step == 0):
        print('processed {} out of {}'.format(cnt, total_movies))

processed 500 out of 8181
processed 1000 out of 8181
processed 1500 out of 8181
processed 2000 out of 8181
processed 2500 out of 8181
processed 3000 out of 8181
processed 3500 out of 8181
processed 4000 out of 8181
processed 4500 out of 8181
processed 5000 out of 8181
processed 5500 out of 8181
processed 6000 out of 8181
processed 6500 out of 8181
processed 7000 out of 8181
processed 7500 out of 8181
processed 8000 out of 8181


In [73]:
# Persist the feature/target records so later work can load them from disk.
with open('SavedFeatures/final_feature_vector___with_familiarity.json', 'w') as f:
    json.dump(xy, f)

In [74]:
# Number of records built by the loop above.
len(xy)

8181