In [72]:
import pandas as pd 
from sklearn.datasets import load_iris
import plotly.express as px
import numpy as np

In [73]:
# Load the full movie table and print, per column, its dtype and how many
# values are non-null.
movies_csv_path = "tmdb_5000_movies.csv"
data = pd.read_csv(movies_csv_path)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 23 columns):
budget                  4803 non-null int64
genres                  4803 non-null object
homepage                1712 non-null object
id                      4803 non-null int64
keywords                4803 non-null object
original_language       4803 non-null object
original_title          4803 non-null object
overview                4800 non-null object
popularity              4803 non-null float64
production_companies    4803 non-null object
production_countries    4803 non-null object
release_date            4802 non-null object
revenue                 4803 non-null int64
runtime                 4801 non-null float64
spoken_languages        4803 non-null object
status                  4803 non-null object
tagline                 3959 non-null object
title                   4803 non-null object
vote_average            4803 non-null float64
vote_count              4803 non-null 

In [80]:
# Build a two-column (budget, revenue) frame and summarise the revenue column.
# The columns are selected by name instead of by their position in
# `data.values`, so the cell no longer depends on the CSV column order.
from pandas import DataFrame
budget_revenue_columns = ['budget', 'revenue']
list_budget_revenue = data[budget_revenue_columns].values.tolist()
df_budget_revenue = DataFrame(list_budget_revenue, columns=budget_revenue_columns)
df_budget_revenue['revenue'].describe()

count    4.803000e+03
mean     8.226064e+07
std      1.628571e+08
min      0.000000e+00
25%      0.000000e+00
50%      1.917000e+07
75%      9.291719e+07
max      2.787965e+09
Name: revenue, dtype: float64

# At least 25% of the films have a recorded revenue of 0

In [85]:
# Plot revenue against budget: a histogram over budget with revenue on the
# y axis, coloured by budget, with a rug plot in the margin.
import plotly.express as px
histogram_options = dict(
    x="budget",
    y="revenue",
    color="budget",
    marginal="rug",
    hover_data=df_budget_revenue.columns,
)
fig = px.histogram(df_budget_revenue, **histogram_options)
fig.show()

# Revenue is not proportional to budget

In [89]:
#correlation matrix
from pandas import DataFrame
# `go` is used below and is not imported by any cell above this one.
import plotly.graph_objects as go

# Select the columns by name rather than by position in `data.values`.
train_columns = ['budget','id','original_language','popularity', 'revenue','runtime','vote_average','vote_count']
list_data = data[train_columns].values.tolist()
df_train = DataFrame(list_data, columns=train_columns)
# `original_language` holds text. Restrict to numeric columns before
# correlating: recent pandas raises on non-numeric input instead of silently
# dropping it.
corrmat = df_train.select_dtypes(include='number').corr()

#create the chart
sns_colorscale = [[0.0, '#3f7f93'], #cmap = sns.diverging_palette(220, 10, as_cmap = True)
 [0.071, '#5890a1'],
 [0.143, '#72a1b0'],
 [0.214, '#8cb3bf'],
 [0.286, '#a7c5cf'],
 [0.357, '#c0d6dd'],
 [0.429, '#dae8ec'],
 [0.5, '#f2f2f2'],
 [0.571, '#f7d7d9'],
 [0.643, '#f2bcc0'],
 [0.714, '#eda3a9'],
 [0.786, '#e8888f'],
 [0.857, '#e36e76'],
 [0.929, '#de535e'],
 [1.0, '#d93a46']]

# Axis labels and hover text must come from the correlation matrix itself;
# `df_budget_revenue.columns` only has two names and does not match `corrmat`.
axis_labels = list(corrmat.columns)
hover_text = [['{} vs {}: {:.2f}'.format(row_name, col_name, corrmat.loc[row_name, col_name])
               for col_name in axis_labels]
              for row_name in axis_labels]

heat = go.Heatmap(z=corrmat.values,
                  x=axis_labels,
                  y=axis_labels,
                  xgap=1, ygap=1,
                  # Pin the range so the neutral middle colour means "no correlation".
                  zmin=-1, zmax=1,
                  colorscale=sns_colorscale,
                  colorbar_thickness=20,
                  colorbar_ticklen=3,
                  hovertext=hover_text,
                  hoverinfo='text'
                   )


title = 'Correlation Matrix'

layout = go.Layout(title_text=title, title_x=0.5,
                   width=600, height=600,
                   xaxis_showgrid=False,
                   yaxis_showgrid=False,
                   yaxis_autorange='reversed')

fig=go.Figure(data=[heat], layout=layout)
fig.show()

# Original language and popularity are linearly related to vote average

# Question 1: What areas have the most influence on revenue?

In [22]:
# Total revenue per production country, shown as a pie chart.
import json

fields = ["production_countries","revenue"]
data = pd.read_csv("tmdb_5000_movies.csv", skipinitialspace=True, usecols=fields)

# Each `production_countries` cell is a JSON list of objects with a "name" key.
# Parse it once per movie instead of splitting the raw text on quotes and
# picking a fixed position. Duplicates inside one movie are removed so a movie
# contributes its revenue to a country at most once.
countries_per_movie = [
    list(dict.fromkeys(entry["name"] for entry in json.loads(raw_countries)))
    for raw_countries in data["production_countries"]
]

# Distinct country names in order of first appearance.
countries = list(dict.fromkeys(
    name for movie_countries in countries_per_movie for name in movie_countries
))

# Sum revenue per country in a single pass over the movies.
country_position = {name: position for position, name in enumerate(countries)}
revenue_by_countries = np.zeros(len(countries))
for movie_countries, revenue in zip(countries_per_movie, data["revenue"]):
    for name in movie_countries:
        revenue_by_countries[country_position[name]] += revenue


from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected = True)
import plotly.graph_objs as go
trace = go.Pie(labels = countries, values = revenue_by_countries)
data_trace = [trace]
fig = go.Figure(data = data_trace)
iplot(fig)

Oneway ANOVA Statistical test:

In [23]:
# One-way ANOVA on revenue for five production countries.
# Only movies with revenue > 0 are used. A movie that lists several of these
# countries appears in each matching group.
from scipy.stats import f_oneway

# Read the two columns once; indexing `data.values[i]` inside the loop rebuilt
# the whole array on every access.
positive_revenue_rows = [
    (raw_countries, revenue)
    for raw_countries, revenue in zip(data["production_countries"], data["revenue"])
    if revenue > 0
]

anova_countries = ["United States of America", "United Kingdom", "Germany", "Canada", "France"]
usa_revenue, uk_revenue, ger_revenue, ca_revenue, fra_revenue = [
    [revenue for raw_countries, revenue in positive_revenue_rows
     if '"' + country_name + '"' in raw_countries]
    for country_name in anova_countries
]

f_oneway(usa_revenue, uk_revenue, ger_revenue, ca_revenue, fra_revenue)


F_onewayResult(statistic=9.232704853330217, pvalue=1.967802896508718e-07)

p_value < 0.05: reject the null hypothesis and conclude that mean revenue differs between countries

# The areas with the most influence on revenue are the USA, UK, Germany, Canada and France

# Question 2: How is a movie’s revenue and average score affected by its
# genre?

In [35]:
# Total revenue per genre, shown as a pie chart.
import json

fields = ["genres","revenue"]
data = pd.read_csv("tmdb_5000_movies.csv", skipinitialspace=True, usecols=fields)

# Each `genres` cell is a JSON list of objects with a "name" key. Parse it once
# per movie instead of splitting the raw text on quotes and picking a fixed
# position. Duplicates inside one movie are removed so a movie contributes its
# revenue to a genre at most once.
genres_per_movie = [
    list(dict.fromkeys(entry["name"] for entry in json.loads(raw_genres)))
    for raw_genres in data["genres"]
]

# Distinct genre names in order of first appearance.
genres = list(dict.fromkeys(
    name for movie_genres in genres_per_movie for name in movie_genres
))

# Sum revenue per genre in a single pass over the movies.
genre_position = {name: position for position, name in enumerate(genres)}
revenue_by_genres = np.zeros(len(genres))
for movie_genres, revenue in zip(genres_per_movie, data["revenue"]):
    for name in movie_genres:
        revenue_by_genres[genre_position[name]] += revenue

from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected = True)
import plotly.graph_objs as go
trace = go.Pie(labels = genres, values = revenue_by_genres)
data_trace = [trace]
fig = go.Figure(data = data_trace)
iplot(fig)

In [27]:
# One-way ANOVA on revenue for five genres.
# Only movies with revenue > 0 are used. A movie tagged with several of these
# genres appears in each matching group.
from scipy.stats import f_oneway

# Read the two columns once; indexing `data.values[i]` inside the loop rebuilt
# the whole array on every access.
positive_revenue_rows = [
    (raw_genres, revenue)
    for raw_genres, revenue in zip(data["genres"], data["revenue"])
    if revenue > 0
]

anova_genres = ["Adventure", "Action", "Comedy", "Drama", "Thriller"]
adventure_revenue, action_revenue, comedy_revenue, drama_revenue, thriller_revenue = [
    [revenue for raw_genres, revenue in positive_revenue_rows
     if '"' + genre_name + '"' in raw_genres]
    for genre_name in anova_genres
]

f_oneway(adventure_revenue, action_revenue, comedy_revenue, drama_revenue, thriller_revenue)

F_onewayResult(statistic=112.58046462622667, pvalue=3.0691387425871625e-92)

p_value < 0.05: reject the null hypothesis and conclude that mean revenue differs between genres

# Revenue depends on genre

In [28]:
# extract data
# Sum of `vote_average` per genre, for the genre list built by an earlier cell.
# Note that this is a sum, not a mean.
fields = ["genres", "vote_average"]
data = pd.read_csv("tmdb_5000_movies.csv", skipinitialspace=True, usecols=fields)

# Build the quoted search tokens once, and walk the two columns directly
# instead of rebuilding `data.values` for every (row, genre) pair.
genre_tokens = ['"' + genre_name + '"' for genre_name in genres]
vote_average_by_genres = np.zeros(len(genres))
for raw_genres, vote_average in zip(data["genres"], data["vote_average"]):
    for position, token in enumerate(genre_tokens):
        if token in raw_genres:
            vote_average_by_genres[position] += vote_average
# create the chart
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected = True)
import plotly.graph_objs as go
trace = go.Pie(labels = genres, values = vote_average_by_genres)
data_trace = [trace]
fig = go.Figure(data = data_trace)
iplot(fig)

In [30]:
# One-way ANOVA on vote average for five genres.
# Only movies with vote_average > 0 are used.
# NOTE(review): the `*_revenue` names are kept from the revenue ANOVA cell so
# that anything reading them keeps working, but in this cell each list holds
# vote averages, not revenue.
from scipy.stats import f_oneway

# Read the two columns once; indexing `data.values[i]` inside the loop rebuilt
# the whole array on every access.
rated_rows = [
    (raw_genres, vote_average)
    for raw_genres, vote_average in zip(data["genres"], data["vote_average"])
    if vote_average > 0
]

anova_genres = ["Adventure", "Action", "Comedy", "Drama", "Thriller"]
adventure_revenue, action_revenue, comedy_revenue, drama_revenue, thriller_revenue = [
    [vote_average for raw_genres, vote_average in rated_rows
     if '"' + genre_name + '"' in raw_genres]
    for genre_name in anova_genres
]

f_oneway(adventure_revenue, action_revenue, comedy_revenue, drama_revenue, thriller_revenue)

F_onewayResult(statistic=85.75244792713407, pvalue=2.862715260883635e-71)

p_value < 0.05: reject the null hypothesis and conclude that mean vote average differs between genres

# Vote average depends on genre

# Question 3: What influence does release date have on revenue?

In [45]:
# Revenue totals by release year, quarter, half of month and odd/even day,
# followed by a line chart of revenue per year.
from datetime import datetime
import plotly.graph_objects as go
fields = ["release_date", "revenue","year","month","day"]
data = pd.read_csv("tmdb_5000_movies.csv", skipinitialspace=True, usecols=fields)
quarters = ["Quarter 1", "Quarter 2", "Quarter 3", "Quarter 4"]

# Distinct release years in ascending order. Columns are read by name rather
# than by position, and missing years are dropped because a NaN in the list
# makes the sort order unreliable.
years = sorted(data["year"].dropna().unique().tolist())

# Lookup tables so each movie is assigned to its bucket in one step.
year_position = {year: position for position, year in enumerate(years)}
quarter_of_month = {month: (month - 1) // 3 for month in range(1, 13)}

# Extract data and count sum revenue by year, quarter, the first and second half of month, the odd and even days
revenue_by_years = np.zeros(len(years))
revenue_by_quarters = np.zeros(len(quarters))
revenue_by_first_half_of_month,revenue_by_second_half_of_month, odd_days, even_days = 0, 0, 0, 0

for year, month, day, revenue in zip(data["year"], data["month"], data["day"], data["revenue"]):
    # Movies without a positive revenue are left out of every total.
    if not revenue > 0:
        continue

    position = year_position.get(year)
    if position is not None:
        revenue_by_years[position] += revenue

    quarter = quarter_of_month.get(month)
    if quarter is not None:
        revenue_by_quarters[quarter] += revenue

    # "First half" means days 1-14; everything else counts as the second half.
    if day < 15:
        revenue_by_first_half_of_month += revenue
    else:
        revenue_by_second_half_of_month += revenue

    if (day % 2) == 0:
        even_days += revenue
    else:
        odd_days += revenue
# Create Chart Revenue by Year

fig = go.Figure(data=go.Scatter(x=years, y=revenue_by_years))
fig.update_layout(title_text='Revenue by release year',
                  xaxis_title='Release year',
                  yaxis_title='Total revenue')
fig.show()

Oneway ANOVA test:
    

In [None]:
# Extract data revenue by year
# Builds one list of revenues per entry of `years`, in the same order.
# Unlike the other ANOVA cells, movies with zero revenue are included here.
# The movies are grouped in a single pass instead of rescanning every row
# (and rebuilding `data.values`) once per year.
revenues_of_year = {}
for year, revenue in zip(data["year"], data["revenue"]):
    revenues_of_year.setdefault(year, []).append(revenue)

data_test_by_year = [revenues_of_year.get(year, []) for year in years]
    



In [63]:
# One-way ANOVA with one group per release year.
revenue_groups_by_year = tuple(data_test_by_year)
f_oneway(*revenue_groups_by_year)

F_onewayResult(statistic=0.9343745458379351, pvalue=0.6545284333236787)

pvalue > 0.05: fail to reject the null hypothesis


# There is no statistically significant difference in revenue between years

In [46]:
# Pie chart: share of total revenue per release quarter.
quarter_pie = go.Pie(labels=quarters, values=revenue_by_quarters)
fig = go.Figure(data=[quarter_pie])
iplot(fig)


Oneway ANOVA test:

In [66]:
# Extract data revenue by quarter
# One-way ANOVA on revenue across the four release quarters; only movies with
# revenue > 0 are used. Columns are read by name, and the filtering is done on
# whole columns instead of indexing `data.values` row by row.
movies_with_revenue = data[data["revenue"] > 0]
months_of_quarter = ([1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12])
data_test_by_quarter_1, data_test_by_quarter_2, data_test_by_quarter_3, data_test_by_quarter_4 = [
    movies_with_revenue.loc[movies_with_revenue["month"].isin(months), "revenue"].tolist()
    for months in months_of_quarter
]
f_oneway(data_test_by_quarter_1, data_test_by_quarter_2, data_test_by_quarter_3, data_test_by_quarter_4)

F_onewayResult(statistic=39.45903572877659, pvalue=4.68220986185768e-25)

pvalue < 0.05: reject H0 and conclude that revenue differs between quarters

In [47]:
# Pie chart: total revenue for releases in the first vs. second half of a month.
half_month_labels = ["First Half Of Months", "Second Half Of Months"]
half_month_totals = [revenue_by_first_half_of_month, revenue_by_second_half_of_month]
fig = go.Figure(data=[go.Pie(labels=half_month_labels, values=half_month_totals)])
iplot(fig)


In [67]:
# Extract data revenue by first and second half of month
# One-way ANOVA on revenue for releases before day 15 vs. all other releases;
# only movies with revenue > 0 are used. Columns are read by name, and the
# filtering is done on whole columns instead of indexing `data.values` row by row.
movies_with_revenue = data[data["revenue"] > 0]
released_in_first_half = movies_with_revenue["day"] < 15
data_test_first_half_of_month = movies_with_revenue.loc[released_in_first_half, "revenue"].tolist()
# Everything that is not in the first half (including a missing day) goes here.
data_test_second_half_of_month = movies_with_revenue.loc[~released_in_first_half, "revenue"].tolist()
f_oneway(data_test_first_half_of_month, data_test_second_half_of_month)

F_onewayResult(statistic=4.214072225166447, pvalue=0.04016694473812119)

In [None]:
# pvalue < 0.05: reject H0 and conclude that revenue differs between the two halves of the month

In [48]:
# Pie chart: total revenue for releases on odd vs. even days of the month.
day_parity_labels = ["Odd Days", "Even Days"]
day_parity_totals = [odd_days, even_days]
fig = go.Figure(data=[go.Pie(labels=day_parity_labels, values=day_parity_totals)])
iplot(fig)

In [68]:
# Extract data revenue by odd and even days
# One-way ANOVA on revenue for releases on even vs. odd days of the month;
# only movies with revenue > 0 are used. Columns are read by name, and the
# filtering is done on whole columns instead of indexing `data.values` row by row.
movies_with_revenue = data[data["revenue"] > 0]
released_on_even_day = (movies_with_revenue["day"] % 2) == 0
data_test_even_days = movies_with_revenue.loc[released_on_even_day, "revenue"].tolist()
# Everything that is not an even day (including a missing day) goes here.
data_test_odd_days = movies_with_revenue.loc[~released_on_even_day, "revenue"].tolist()
f_oneway(data_test_even_days, data_test_odd_days)

F_onewayResult(statistic=0.005072563399748989, pvalue=0.9432253487627192)

pvalue > 0.05: fail to reject H0; revenue does not differ significantly between odd and even days