In [1]:
import os
import json
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# import cpi
import pymongo
from tqdm import tqdm
from scipy import stats
import math
import numpy as np
import pandas as pd
import json

In [2]:
# Connect to the local MongoDB server, then select the "imdb" database and
# its "imdb_cleaned" collection, which the rest of the notebook queries.
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
db = myclient.get_database("imdb")
imdb = db.get_collection("imdb_cleaned")

In [3]:
num_movies = imdb.count_documents({})
num_movies

8181

In [4]:
def getMovie(imdb_id):
    """Fetch a single movie document by its IMDb id.

    Args:
        imdb_id: Value matched against the ``imdb_id`` field of documents in
            the ``imdb`` collection (e.g. ``'tt0371746'``).

    Returns:
        The first matching document, or ``None`` when nothing matches.
    """
    # find_one yields the first match (or None) directly, instead of
    # returning from inside a find() loop and abandoning an open cursor.
    return imdb.find_one({'imdb_id': imdb_id})

In [5]:
# Load the precomputed per-star feature table (star id -> feature dict).
star_features = {}
# JSON text is UTF-8; pin the encoding so the load does not depend on the
# platform's default locale encoding.
with open('SavedFeatures/star_features.json', 'r', encoding='utf-8') as f:
    star_features = json.load(f)

In [6]:
len(star_features.keys())

57064

In [7]:
rdj = star_features['nm0000288']

In [23]:
# rdj

In [10]:
rolling_year = 5          # width, in years, of the rolling window used below
present = 2020            # last year considered by the analysis
first_movie_year = 1967   # fallback when the star has no active year at all

# Earliest year in which the star has at least one movie. Taking the minimum
# over all active years avoids relying on the (string) year keys being stored
# in chronological order.
active_years = [
    int(year)
    for year, perf in rdj['year-wise-performance'].items()
    if perf['num_movies'] != 0
]
if active_years:
    first_movie_year = min(active_years)
first_movie_year

1987

In [24]:
def getValues(st_year, nd_year, entity):
    """Aggregate an entity's yearly statistics over an inclusive year range.

    Args:
        st_year: First year of the range (int, inclusive).
        nd_year: Last year of the range (int, inclusive).
        entity: Mapping with a ``'year-wise-performance'`` dict keyed by the
            year as a string; each value provides ``sum_revenue``,
            ``sum_budget``, ``num_movies`` and ``num_raters``.

    Returns:
        dict with ``total_revenue``, ``total_budget``, ``num_movies`` and
        ``num_raters`` summed over the range, plus ``max_revenue``: the
        largest single-year ``sum_revenue`` in the range (a yearly total, not
        necessarily the revenue of one movie). Every value is 0 when the
        range is empty or contains no recorded year.
    """
    info = {
        'total_revenue': 0,
        'total_budget': 0,
        'num_movies': 0,
        'num_raters': 0,
        'max_revenue': 0,
    }
    yearly = entity['year-wise-performance']
    for year in range(st_year, nd_year + 1):
        perf = yearly.get(str(year))
        if perf is None:
            # A year missing from the table is treated as "no activity"
            # rather than aborting the whole aggregation with a KeyError.
            continue
        info['total_revenue'] += perf['sum_revenue']
        info['total_budget'] += perf['sum_budget']
        info['num_movies'] += perf['num_movies']
        info['num_raters'] += perf['num_raters']
        info['max_revenue'] = max(info['max_revenue'], perf['sum_revenue'])
    return info

In [12]:
getValues(first_movie_year, present, rdj)

{'total_revenue': 6711286432,
 'total_budget': 2527211438,
 'num_movies': 34,
 'num_raters': 11991070}

In [13]:
year_label = []
revenue_arr = []
budget_arr = []
num_movie_arr = []

# Slide a `rolling_year`-wide window over the star's career and record the
# per-movie average revenue and budget inside each window.
# NOTE(review): range() stops before `present`, so the window ending in
# `present` itself is never produced -- confirm this is intended.
for year in range(first_movie_year + rolling_year - 1, present):
    st_year, nd_year = year - rolling_year + 1, year
    info = getValues(st_year, nd_year, rdj)
    year_label.append('{} - {}'.format(st_year, nd_year))

    # Windows without any movie contribute zeros instead of dividing by zero.
    has_movies = info['num_movies'] != 0
    revenue_arr.append(info['total_revenue'] / info['num_movies'] if has_movies else 0)
    budget_arr.append(info['total_budget'] / info['num_movies'] if has_movies else 0)
    num_movie_arr.append(info['num_movies'] if has_movies else 0)

In [14]:
def plotFigure(xrr, yrr, zrr):
    """Build a figure with two line+marker series sharing the same x values.

    Args:
        xrr: x values used for both series.
        yrr: y values of the series named "Revenue".
        zrr: y values of the series named "Budget".

    Returns:
        The figure created by ``make_subplots()`` with both traces added.
    """
    # (y values, legend name, line colour) for each series, in draw order.
    series = (
        (yrr, "Revenue", 'rgba(102, 0, 204, .8)'),
        (zrr, "Budget", 'rgba(153, 0, 51, .8)'),
    )

    fig = make_subplots()
    for values, label, colour in series:
        trace = go.Scatter(
            x=xrr,
            y=values,
            name=label,
            mode='lines+markers',
            line_color=colour,
        )
        fig.add_trace(trace)
    return fig

In [15]:
fig = plotFigure(year_label, revenue_arr, budget_arr)
fig.show()

# Star Power -- KS Test

In [16]:
def getEntityID(entity):
    """Extract the identifier from an entity's profile URL.

    The identifier is the last path segment of ``entity['url']`` once any
    query string has been dropped (presumably URLs look like
    ``/name/nm0000288/?ref_=...`` -- verify against the stored data).

    Args:
        entity: Mapping with a ``'url'`` string.

    Returns:
        The last non-empty path segment, e.g. ``'nm0000288'``; an empty
        string when the URL has no path segment.
    """
    path = entity['url'].split('?')[0]
    # Strip the trailing slash first so the id is found whether or not the
    # URL ends with "/" (a fixed [-2] index only works with a trailing slash).
    return path.rstrip('/').split('/')[-1]

In [31]:
def getStarInfo(star_id):
    """Look up a star's career totals in ``star_features``.

    Args:
        star_id: Key into ``star_features`` (e.g. ``'nm0000375'``).

    Returns:
        Tuple ``(total_movies, total_raters)`` read from the star's record.
    """
    record = star_features[star_id]
    movie_count = record['total_movies']
    rater_count = record['total_raters']
    return movie_count, rater_count

In [32]:
movie = getMovie('tt0371746')
# movie

In [33]:
def getStarInfoBefore(star_id, year, start_year=1967):
    """Aggregate a star's statistics over the years strictly before ``year``.

    Args:
        star_id: Key into ``star_features``.
        year: Exclusive upper bound; the aggregation stops at ``year - 1``.
        start_year: First year included in the aggregation. Defaults to 1967,
            the value previously hard-coded here (presumably the earliest
            year in the feature table -- TODO confirm).

    Returns:
        The dict produced by ``getValues(start_year, year - 1, ...)`` for the
        star's feature record.
    """
    return getValues(start_year, year - 1, star_features[star_id])

In [34]:
getStarInfo('nm0000375')

(50, 11559434)

In [36]:
getStarInfoBefore('nm0000375', 2017)

{'total_revenue': 10908647800,
 'total_budget': 3301492644,
 'num_movies': 46,
 'num_raters': 9241622,
 'max_revenue': 1712092486}

In [101]:
# Thresholds used to decide whether a cast member counts as a "star".
th_num_movie = 40              # more prior movies than this => star
th_avg_raters = 200000         # defined for experiments; not applied by getMovieCriteria
th_revenue = 1000*1000000      # prior max_revenue above this => star

def getMovieCriteria(movie, printLog=False):
    """Decide whether a movie features at least one "star" cast member.

    A cast member counts as a star when, over the years strictly before the
    movie's ``cleaned_year``, ``getStarInfoBefore`` reports either more than
    ``th_num_movie`` movies or a ``max_revenue`` above ``th_revenue``.

    Args:
        movie: Movie document providing ``cast_and_character`` (a list of
            dicts with an ``'actor'`` entity), ``cleaned_year`` and
            ``imdb_id``.
        printLog: When truthy, print a message for movies whose
            ``cast_and_character`` is not a list.

    Returns:
        ``True`` if any cast member satisfies the star criterion, ``False``
        otherwise (including when the cast list is missing or empty).
    """
    cast_ensamble = movie['cast_and_character']
    if not isinstance(cast_ensamble, list):
        if printLog:
            print(movie['imdb_id'], "could not find cast_ensamble")
        return False

    for cast in cast_ensamble:
        star_id = getEntityID(cast['actor'])
        # Only the star's track record *before* this movie's year is used,
        # so the movie itself cannot make its own cast a "star".
        info = getStarInfoBefore(star_id, int(movie['cleaned_year']))
        if info['num_movies'] > th_num_movie or info['max_revenue'] > th_revenue:
            return True
    return False

In [102]:
movie = getMovie('tt0974015')

getMovieCriteria(movie)
# movie

True

In [103]:
has_star = []
no_star = []

# Partition every movie in the collection by whether its cast contains a
# star; each entry is an (imdb_id, name, revenue) tuple.
for movie in imdb.find():
    target = has_star if getMovieCriteria(movie) else no_star
    target.append((movie['imdb_id'], movie['name'], movie['cleaned_Revenue']))

In [104]:
len(has_star), len(no_star)

(3230, 4951)

In [105]:
# Keep only the revenue (third field) of each (imdb_id, name, revenue) tuple.
has_star_revenue = [revenue for _imdb_id, _name, revenue in has_star]
no_star_revenue = [revenue for _imdb_id, _name, revenue in no_star]

In [106]:
len(has_star_revenue), len(no_star_revenue)

(3230, 4951)

In [107]:
from scipy.stats import ks_2samp
import numpy as np

# Convert the revenue lists to NumPy arrays; the cells below call array
# methods such as .mean() on them. Note that the names are rebound in place.
has_star_revenue = np.array(has_star_revenue)
no_star_revenue = np.array(no_star_revenue)

In [108]:
# Two-sample Kolmogorov-Smirnov test comparing the revenue distribution of
# movies with a star against movies without one.
ks_2samp(has_star_revenue, no_star_revenue)

KstestResult(statistic=0.21792982998087138, pvalue=5.306261778550361e-82)

In [99]:
has_star_revenue.mean(), no_star_revenue.mean()

(155171694.37785423, 65836210.00702247)

In [100]:
has_star_revenue.mean()-no_star_revenue.mean()

89335484.37083176