# Cuestionamiento 3: Existe una relacion entre el exito ($ y ratings) de una pelicula y el director y/o budget?

Data exploration

In [1]:
# Import libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as st

In [2]:
# Import csv files
movies = pd.read_csv('IMDb movies.csv')
# actors = pd.read_csv('IMDb names.csv')
# ratings = pd.read_csv('IMDb ratings.csv')
title_principals = pd.read_csv('IMDb title_principals.csv')

In [3]:
movies = pd.DataFrame(movies)
# actors = pd.DataFrame(actors)
# ratings = pd.DataFrame(ratings)
title_principals = pd.DataFrame(title_principals)

In [4]:
movies.count()

imdb_title_id            81273
title                    81273
original_title           81273
year                     81273
date_published           81273
genre                    81273
duration                 81273
country                  81234
language                 80518
director                 81200
writer                   79780
production_company       76948
actors                   81207
description              78843
avg_vote                 81273
votes                    81273
budget                   22804
usa_gross_income         15094
worlwide_gross_income    29892
metascore                12722
reviews_from_users       74196
reviews_from_critics     70286
dtype: int64

Cleaning Data

In [5]:
# Delet columns that we don't need, drop all NaN and reset index
movies = movies.drop(['reviews_from_users','reviews_from_critics','original_title','duration','description'],axis=1)
movies = movies.dropna(how='any')
movies = movies.reset_index(drop=True)

# Delet budget and gross income in other currencies
movies = movies[~movies['budget'].str.match(r'([A-Z]+)')]
movies = movies[~movies['usa_gross_income'].str.match(r'($[A-Z]+\w)')]
movies = movies[~movies['worlwide_gross_income'].str.match(r'($[A-Z]+\w)')]

# Delet '$' symbol from table
movies['budget'] = movies['budget'].str.replace('$','')
movies['usa_gross_income'] = movies['usa_gross_income'].str.replace('$','')
movies['worlwide_gross_income'] = movies['worlwide_gross_income'].str.replace('$','')

movies

Unnamed: 0,imdb_title_id,title,year,date_published,genre,country,language,director,writer,production_company,actors,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore
1,tt0021749,City Lights,1931,1931-08-21,"Comedy, Drama, Romance",USA,English,Charles Chaplin,Charles Chaplin,Charles Chaplin Productions,"Virginia Cherrill, Florence Lee, Harry Myers, ...",8.5,152716,1500000,19181,32609,99.0
2,tt0027977,Modern Times,1936,1936-10-16,"Comedy, Drama, Family",USA,English,Charles Chaplin,Charles Chaplin,Charles Chaplin Productions,"Charles Chaplin, Paulette Goddard, Henry Bergm...",8.5,197969,1500000,163577,445226,96.0
3,tt0029583,Snow White and the Seven Dwarfs,1937,1938-07-08,"Animation, Family, Fantasy",USA,English,"William Cottrell, David Hand","Jacob Grimm, Wilhelm Grimm",Walt Disney Productions,"Roy Atwell, Stuart Buchanan, Adriana Caselotti...",7.6,168735,1499000,184925486,184925486,95.0
4,tt0031381,Gone with the Wind,1939,1942-09-04,"Drama, History, Romance",USA,English,"Victor Fleming, George Cukor","Margaret Mitchell, Sidney Howard",Selznick International Pictures,"Thomas Mitchell, Barbara O'Neil, Vivien Leigh,...",8.1,269664,3977000,200852579,402352579,97.0
5,tt0031679,Mr. Smith Goes to Washington,1939,1940-05-03,"Comedy, Drama",USA,English,Frank Capra,"Sidney Buchman, Lewis R. Foster",Columbia Pictures,"Jean Arthur, James Stewart, Claude Rains, Edwa...",8.1,100206,1900000,144738,144738,73.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6459,tt8772262,Midsommar,2019,2019-07-03,"Drama, Horror, Mystery","USA, Sweden, Hungary","English, Swedish",Ari Aster,Ari Aster,Proton Cinema,"Florence Pugh, Jack Reynor, Vilhelm Blomgren, ...",7.2,78830,10000000,27426361,41123770,72.0
6460,tt9024106,Unplanned,2019,2019-10-04,"Biography, Drama",USA,English,"Chuck Konzelman, Cary Solomon","Abby Johnson, Chuck Konzelman",Unplanned Movie,"Ashley Bratcher, Brooks Ryan, Robia Scott, Jar...",5.8,10783,6000000,19005109,19320481,10.0
6461,tt9082020,Cold Blood Legacy,2019,2019-05-15,"Action, Thriller","France, Ukraine, Belgium","French, English",Frédéric Petitjean,Frédéric Petitjean,Ascot Elite Entertainment Group,"Jean Reno, Sarah Lind, Joe Anderson, David Gya...",4.5,2193,2700000,5083,1009888,25.0
6462,tt9134216,Playing with Fire,2019,2019-11-08,"Comedy, Family",USA,English,Andy Fickman,"Dan Ewen, Matt Lieberman",Broken Road Productions,"John Cena, Keegan-Michael Key, John Leguizamo,...",4.5,1067,29900000,18847824,21788746,24.0


In [6]:
# Convert budget and gross income to number
movies['budget'] = pd.to_numeric(movies['budget'])
movies['usa_gross_income'] = pd.to_numeric(movies['usa_gross_income'])
movies['worlwide_gross_income'] = pd.to_numeric(movies['worlwide_gross_income'])

# Transform values to k
movies['budget'] = movies['budget']/100
movies['usa_gross_income'] = movies['usa_gross_income']/100
movies['worlwide_gross_income'] = movies['worlwide_gross_income']/100

In [9]:
# Create revenues from income minus budget
revenue_us = movies['usa_gross_income'] - movies['budget']
revenue_ww = movies['worlwide_gross_income'] - movies['budget']

# Add values to dataframe
movies['revenue_usa'] = revenue_us
movies['revenue_worlwide'] = revenue_ww

# Display preview of new columns
movies[['title','budget','usa_gross_income','worlwide_gross_income','revenue_usa','revenue_worlwide']]

Unnamed: 0,title,budget,usa_gross_income,worlwide_gross_income,revenue_usa,revenue_worlwide
1,City Lights,15000.0,191.81,326.09,-14808.19,-14673.91
2,Modern Times,15000.0,1635.77,4452.26,-13364.23,-10547.74
3,Snow White and the Seven Dwarfs,14990.0,1849254.86,1849254.86,1834264.86,1834264.86
4,Gone with the Wind,39770.0,2008525.79,4023525.79,1968755.79,3983755.79
5,Mr. Smith Goes to Washington,19000.0,1447.38,1447.38,-17552.62,-17552.62
...,...,...,...,...,...,...
6459,Midsommar,100000.0,274263.61,411237.70,174263.61,311237.70
6460,Unplanned,60000.0,190051.09,193204.81,130051.09,133204.81
6461,Cold Blood Legacy,27000.0,50.83,10098.88,-26949.17,-16901.12
6462,Playing with Fire,299000.0,188478.24,217887.46,-110521.76,-81112.54


Summary stats

In [10]:
# Group data base by director and count movies
director = movies.groupby('director')
director_movies = movies['director'].value_counts()

# Stats information of budget
director_budget_sum = director['budget'].sum()
director_budget_mean = director['budget'].mean()
director_budget_median = director['budget'].median()
director_budget_var = director['budget'].var()
director_budget_sem = director['budget'].sem()

# # Stats information of revenue in us
director_usrevenue_sum = director['revenue_usa'].sum()
director_usrevenue_mean = director['revenue_usa'].mean()
director_usrevenue_median = director['revenue_usa'].median()
director_usrevenue_var = director['revenue_usa'].var()
director_usrevenue_sem = director['revenue_usa'].sem()

# Stats information of revenue
director_wwrevenue_sum = director['revenue_worlwide'].sum()
director_wwrevenue_mean = director['revenue_worlwide'].mean()
director_wwrevenue_median = director['revenue_worlwide'].median()
director_wwrevenue_var = director['revenue_worlwide'].var()
director_wwrevenue_sem = director['revenue_worlwide'].sem()

# Stats information of ratings
director_ratings_sum = director['avg_vote'].sum()
director_ratings_mean = director['avg_vote'].mean()
director_ratings_median = director['avg_vote'].median()
director_ratings_var = director['avg_vote'].var()
director_ratings_sem = director['avg_vote'].sem()