In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import scipy
scipy.__version__
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import pymysql
pymysql.install_as_MySQLdb()
from urllib.parse import quote_plus as urlquote
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [None]:
# Read the gzip-compressed 2001 TMDB extract and preview its first two rows.
df_2001 = pd.read_csv(filepath_or_buffer="data/final_tmdb_data_2001.csv.gz")
df_2001.head(n=2)

In [None]:
# Read the gzip-compressed 2002 TMDB extract and preview its first two rows.
df_2002 = pd.read_csv(filepath_or_buffer="data/final_tmdb_data_2002.csv.gz")
df_2002.head(n=2)

In [None]:
# Stack the two yearly extracts into one frame with a fresh 0..n-1 index.
# The frames loaded earlier are df_2001 and df_2002; the previously referenced
# df_2011/df_2012 do not exist in this notebook and raised a NameError.
df = pd.concat([df_2001, df_2002], ignore_index=True, sort=False)

In [None]:
# Index the rows by IMDB identifier, then print column dtypes and non-null counts.
# NOTE(review): re-running this cell fails because 'imdb_id' is no longer a column.
df = df.set_index(keys='imdb_id')
df.info()

In [None]:
# Keep only rows with no missing value in any column.
# NOTE(review): this drops a row if *any* column is null; consider restricting to
# the columns analysed below via `subset=` — confirm intent.
df = df.dropna(axis=0, how='any')

Q1: Does rating affect revenue?

Null = Rating has no effect on revenue.

Alternative = Rating does affect revenue.

In [None]:
# Number of movies per certification level.
df.loc[:, 'certification'].value_counts()

In [None]:
# Summary statistics of the revenue column.
df.loc[:, 'revenue'].describe()

Revenue is numeric and is compared across more than two groups, which points to a one-way ANOVA followed by Tukey's post-hoc test.

In [None]:
# Bar chart of revenue by certification; tick labels are right-aligned.
ax = sns.barplot(x='certification', y='revenue', data=df)
ax.set_xticklabels(labels=ax.get_xticklabels(), ha='right');

In [None]:
# Preview only the two columns this hypothesis test uses.
cols_used = ['certification', 'revenue']
df.loc[:, cols_used]

In [None]:
## Map each certification to the revenue Series of the movies carrying it
groups = {
    rating: df.loc[df['certification'] == rating, 'revenue']
    for rating in df['certification'].unique()
}

groups.keys()

In [None]:
## Drop revenue outliers (|z-score| > 3) from every certification group
for certification in list(groups):
    data = groups[certification]

    ## boolean mask flagging the observations more than 3 SDs from the group mean
    outliers = np.abs(stats.zscore(data)) > 3

    ## report how many observations are being discarded for this group
    print(f"There were {outliers.sum()} outliers in the {certification} group.")

    ## store the filtered Series back under the same key
    data = data.loc[~outliers]
    groups[certification] = data

Outliers have been removed. Next, check the assumptions of normality and equal variance.

In [None]:
## D'Agostino-Pearson normality test per group, plus each group's size

## header row first, one result row per group after it
header = ['group','n','pval','sig?']
norm_results = [header]

for certification, data in groups.items():
    ## only the p-value of the test result is needed
    p = stats.normaltest(data).pvalue
    norm_results.append([certification, len(data), p, p < .05])

## tabulate: every row after the header becomes a record
normal_results = pd.DataFrame(data=norm_results[1:], columns=header)
normal_results

In [None]:
## Levene's test: do the certification groups share a common variance?
result = stats.levene(*(groups[key] for key in groups))
print(result)

In [None]:
# A significant Levene p-value means the equal-variance assumption is violated.
print(
    "The groups do NOT have equal variance."
    if result.pvalue < .05
    else "The groups DO have equal variance."
)

In [None]:
## Kruskal-Wallis H-test for the original hypothesis
result = stats.kruskal(*(groups[key] for key in groups))
print(result)
## True when the result is significant at alpha = .05
result.pvalue < .05

Post-hoc multiple comparison tests

In [None]:
## Pull out one group (certification "G") to prototype the reshaping below
temp = groups["G"]
temp

In [None]:
## Prototype: revenue values plus a constant certification label column
pd.DataFrame({'revenue': temp}).assign(certification='G')

In [None]:
## One labelled frame per certification: revenue values + the group name
tukeys_dfs = [
    pd.DataFrame({'revenue': sample, 'certification': rating})
    for rating, sample in groups.items()
]

## Stack them into the long-format table Tukey's test expects
tukeys_data = pd.concat(tukeys_dfs)
tukeys_data

In [None]:
## Observations (revenue) and their group labels (certification)
values, labels = tukeys_data['revenue'], tukeys_data['certification']

## Tukey HSD pairwise comparison across certifications; show the summary table
tukeys_results = pairwise_tukeyhsd(endog=values, groups=labels)
tukeys_results.summary()

In [None]:
## Bar chart of the outlier-filtered data to accompany the Tukey results.
## Uses `errorbar=('ci', 68)` instead of the deprecated `ci=68`, matching the
## popularity plot later in this notebook.
ax = sns.barplot(data=tukeys_data, x='certification', y='revenue', errorbar=('ci', 68))
ax.set_xticklabels(ax.get_xticklabels(), rotation='vertical', ha='right');

In [None]:
## Pairs that are NOT significantly different.
## `tukeys_df` was never defined, so build it from the Tukey summary table:
## the first row of `.data` holds the column names, the rest are the comparisons.
tukeys_summary = tukeys_results.summary()
tukeys_df = pd.DataFrame(tukeys_summary.data[1:], columns=tukeys_summary.data[0])
tukeys_df[tukeys_df['reject'] == False]

In [None]:
## Plot the group means with their simultaneous confidence intervals
tukeys_results.plot_simultaneous();

Certification does impact revenue.

In [None]:
# Bar chart of revenue for each distinct budget value; tick labels rotated 90 degrees.
# NOTE(review): budget looks continuous, so this draws one bar per unique value —
# confirm whether a scatter/regression plot was intended.
ax = sns.barplot(x='budget', y='revenue', data=df)
ax.set_xticklabels(labels=ax.get_xticklabels(), rotation=90, ha='right');

In [None]:
## Map each distinct budget value to the revenue Series of the movies with that budget
groups = {
    amount: df.loc[df['budget'] == amount, 'revenue']
    for amount in df['budget'].unique()
}

groups.keys()

In [None]:
## Drop revenue outliers (|z-score| > 3) from every budget group
for budget, data in groups.items():

    ## boolean mask flagging observations more than 3 SDs from the group mean
    outliers = np.abs(stats.zscore(data)) > 3

    ## report per group; the message previously referenced an undefined `sector`
    ## variable (NameError) instead of the loop variable `budget`
    print(f"There were {outliers.sum()} outliers in the {budget} group.")

    ## overwrite the group's data with the filtered Series
    data = data.loc[~outliers]
    groups[budget] = data

In [None]:
## D'Agostino-Pearson normality test per budget group, plus each group's size

## header row first, one result row per group after it
header = ['group','n','pval','sig?']
norm_results = [header]

for budget, data in groups.items():
    ## only the p-value of the test result is needed
    p = stats.normaltest(data).pvalue
    norm_results.append([budget, len(data), p, p < .05])

## tabulate: every row after the header becomes a record
normal_results = pd.DataFrame(data=norm_results[1:], columns=header)
normal_results

Groups are large enough that we can ignore assumption of normality.

In [None]:
## Levene's test: do the budget groups share a common variance?
result = stats.levene(*(groups[key] for key in groups))
print(result)

In [None]:
# A significant Levene p-value means the equal-variance assumption is violated.
print(
    "The groups do NOT have equal variance."
    if result.pvalue < .05
    else "The groups DO have equal variance."
)

In [None]:
## Kruskal-Wallis H-test for the original hypothesis
result = stats.kruskal(*(groups[key] for key in groups))
print(result)
## True when the result is significant at alpha = .05
result.pvalue < .05

P-Value less than .05. Null hypothesis rejected.

Multiple Comparison Test

In [None]:
## Pull out one budget group to prototype the reshaping below
## (raises KeyError if no movie has exactly this budget)
temp = groups[165_000_000.0]
temp

In [None]:
## Prototype frame for one budget group.
## `temp` holds revenue values, so it belongs under 'revenue'; the group key is the
## budget and stays numeric (the two column names were previously swapped and the
## budget was stored as a string), matching the loop that builds tukeys_data.
pd.DataFrame({'revenue': temp, 'budget': 165000000.0})

In [None]:
## One labelled frame per budget value: revenue values + the budget they belong to
tukeys_dfs = [
    pd.DataFrame({'revenue': sample, 'budget': amount})
    for amount, sample in groups.items()
]

## Stack them into the long-format table Tukey's test expects
tukeys_data = pd.concat(tukeys_dfs)
tukeys_data

In [None]:
## Observations are the revenues; group labels are the budgets.
## (These were previously swapped, which compared budgets across revenue "groups".)
values = tukeys_data['revenue']
labels = tukeys_data['budget']

## Tukey HSD pairwise comparison across budget groups
tukeys_results = pairwise_tukeyhsd(values,labels)
summary_table = tukeys_results.summary()

## The summary is a statsmodels SimpleTable, which has no `.head()`; convert it to a
## DataFrame (first row = column names) before keeping the first 10 comparisons.
limited_summary = pd.DataFrame(summary_table.data[1:], columns=summary_table.data[0]).head(10)
limited_summary

In [None]:
## Bar chart of the outlier-filtered data to accompany the Tukey results.
## Uses `errorbar=('ci', 68)` instead of the deprecated `ci=68`, matching the
## popularity plot later in this notebook.
ax = sns.barplot(data=tukeys_data, x='budget', y='revenue', errorbar=('ci', 68))
ax.set_xticklabels(ax.get_xticklabels(), rotation='vertical', ha='right');

In [None]:
## Pairs that are NOT significantly different.
## `tukeys_df` was never defined, so build it from the Tukey summary table:
## the first row of `.data` holds the column names, the rest are the comparisons.
tukeys_summary = tukeys_results.summary()
tukeys_df = pd.DataFrame(tukeys_summary.data[1:], columns=tukeys_summary.data[0])
tukeys_df[tukeys_df['reject'] == False]

In [None]:
## Plot the group means with their simultaneous confidence intervals
tukeys_results.plot_simultaneous();

Q: Does rating affect popularity?

Null: rating does not affect popularity.
Alternative: Rating does affect popularity.

In [None]:
# Bar chart of popularity by certification; tick labels are right-aligned.
ax = sns.barplot(data=df, x='certification', y='popularity')
# The closing parenthesis was missing (SyntaxError); the trailing `;` hides the repr.
ax.set_xticklabels(ax.get_xticklabels(), ha='right');

In [None]:
# Preview only the two columns this hypothesis test uses.
called_cols = ['certification', 'popularity']
df.loc[:, called_cols]

In [None]:
## Map each certification to the popularity Series of the movies carrying it
groups = {
    rating: df.loc[df['certification'] == rating, 'popularity']
    for rating in df['certification'].unique()
}

groups.keys()

In [None]:
## Drop popularity outliers (|z-score| > 3) from every certification group
for certification in list(groups):
    data = groups[certification]

    ## boolean mask flagging the observations more than 3 SDs from the group mean
    outliers = np.abs(stats.zscore(data)) > 3

    ## report how many observations are being discarded for this group
    print(f"There were {outliers.sum()} outliers in the {certification} group.")

    ## store the filtered Series back under the same key
    data = data.loc[~outliers]
    groups[certification] = data

In [None]:
## D'Agostino-Pearson normality test per group, plus each group's size

## header row first, one result row per group after it
header = ['group','n','pval','sig?']
norm_results = [header]

for certification, data in groups.items():
    ## only the p-value of the test result is needed
    p = stats.normaltest(data).pvalue
    norm_results.append([certification, len(data), p, p < .05])

## tabulate: every row after the header becomes a record
normal_results = pd.DataFrame(data=norm_results[1:], columns=header)
normal_results

In [None]:
## Levene's test: do the certification groups share a common variance?
result = stats.levene(*(groups[key] for key in groups))
print(result)

In [None]:
# A significant Levene p-value means the equal-variance assumption is violated.
print(
    "The groups do NOT have equal variance."
    if result.pvalue < .05
    else "The groups DO have equal variance."
)

In [None]:
# One-way ANOVA of popularity across the certification groups.
result = stats.f_oneway(*(groups[key] for key in groups))
result

In [None]:
## Pull out one group (certification "G") to prototype the reshaping below
temp = groups['G']
temp

In [None]:
## Prototype frame for the G group; it is not the cell's last expression, so it is
## built but not displayed
pd.DataFrame({'popularity': temp}).assign(certification='G')

## One labelled frame per certification: popularity values + the group name
tukeys_dfs = [
    pd.DataFrame({'popularity': sample, 'certification': rating})
    for rating, sample in groups.items()
]

## Stack them into the long-format table Tukey's test expects
tukeys_data = pd.concat(tukeys_dfs)
tukeys_data

In [None]:
## Observations (popularity) and their group labels (certification)
values, labels = tukeys_data['popularity'], tukeys_data['certification']

## Tukey HSD pairwise comparison across certifications; show the summary table
tukeys_results = pairwise_tukeyhsd(endog=values, groups=labels)
tukeys_results.summary()

In [None]:
## Bar chart of the outlier-filtered popularity data (68% CI error bars)
ax = sns.barplot(x='certification', y='popularity', data=tukeys_data, errorbar=('ci', 68))
ax.set_xticklabels(labels=ax.get_xticklabels(), rotation='vertical', ha='right');

In [None]:
## Plot the group means with their simultaneous confidence intervals
tukeys_results.plot_simultaneous();

In [None]:
# Null hypothesis rejected.