# Predicting Movie Revenue

## Features  

**Variable to Predict:** Revenue


- Budget
- Runtime
- Genre
- Release Date
  - Season
  - Holiday
- Production Companies
  - Number of companies involved
  - Number of movies a company has made (previously)
    - Histogram of how often the top 10% of production companies make movies
- Crew
  - Number of crew members
- Cast
  - Gender ratio for the top 2/5/10/25 cast members
  - Number of previous movies the top 10 cast members have appeared in
- Collection
  - Is a sequel


In [1]:
import pandas as pd
import sqlite3

In [2]:
# Open the SQLite database that the queries below read from (relative path).
db_path = r'../../../Data Science Data/Unit 3/db.sqlite'
db = sqlite3.connect(db_path)

In [3]:
## Load the raw tables from SQLite into dataframes


def _read_by_movie_id(query):
    """Run `query` on the `db` connection and return a frame indexed by integer movie_id."""
    frame = pd.read_sql(query, db, index_col='movie_id')
    # Cast the index labels to int so every feature table joins on the same key type.
    frame.index = frame.index.map(int)
    return frame


# Base dataframe that the other feature tables are joined onto.
# Keeps only movies with a positive budget and a positive revenue from at least one source.
base_df = _read_by_movie_id("""SELECT movie_id, budget, release_date, revenue, runtime, imdb_budget, imdb_revenue, title 
                         FROM movies 
                         WHERE (budget > 0 OR imdb_budget > 0) AND (revenue > 0 OR imdb_revenue > 0)
                      """)

# Full genre table (not indexed here); it is pivoted into per-genre columns later.
genre_df = pd.read_sql("""SELECT * FROM genres""", db)

# One-column frame: number of production companies credited on each movie_id.
number_of_prod_companies = _read_by_movie_id("""SELECT movie_id, COUNT(company_name) AS prod_company_count
                                          FROM production_companies
                                          GROUP BY 1
                                          """)

# One-column frame: number of crew members credited on each movie_id.
number_of_crewmembers = _read_by_movie_id("""SELECT movie_id, COUNT(crew_member_id) AS crewmember_count
                                       FROM crew
                                       GROUP BY 1
                                       """)

# One row per cast credit (so movie_id repeats in the index), carrying the member's
# gender and the movie's release date; feeds the gender-ratio and prior-movie features.
cast_data = _read_by_movie_id("""SELECT cc.movie_id, cc.cast_member_id, cc.cast_order, c.gender, m.release_date
                                  FROM cast_credit cc
                                  JOIN cast c ON cc.cast_member_id = c.cast_member_id
                                  JOIN movies m ON cc.movie_id = m.movie_id
                                  """)

In [4]:
# Create per-genre indicator columns: one row per movie_id, one column per genre_name.
# pd.crosstab counts (movie_id, genre_name) pairs directly and fills missing
# combinations with 0, yielding integer columns. This replaces a pivot_table call that
# used `genre_name` as both the column key and the aggregated value, which is fragile
# across pandas versions, and makes the follow-up `apply(pd.to_numeric)` unnecessary.
# Note: the cells are counts, so a genre listed twice for one movie would show 2.
pivot_genre = pd.crosstab(genre_df['movie_id'], genre_df['genre_name'])
# Cast index labels to int so the table joins cleanly with the other movie_id-indexed frames.
pivot_genre.index = pivot_genre.index.map(int)

In [5]:
# Create the per-movie gender feature: the mean of the cast's `gender` code per movie_id.
# The column is selected *before* aggregating. Averaging the whole frame also tries to
# average the non-numeric `release_date` column, which raises TypeError on pandas >= 2.0
# and is wasted work on older versions.
# NOTE(review): this is the mean of the raw gender codes; if the encoding includes an
# "unknown" value it is not a true ratio -- confirm against the `cast` table.
gender_data = cast_data.groupby(level=0)['gender'].mean()
gender_data.index = gender_data.index.map(int)

In [None]:
# Count, for every cast credit, how many earlier credits the same cast member has.
#
# Order each member's credits chronologically, then number them 0, 1, 2, ... within
# the member: the first credit has 0 previous movies, the next has 1, and so on.
# This replaces a row-by-row loop that:
#   * relied on DataFrame.set_value, which has been removed from pandas,
#   * addressed rows by the non-unique movie_id index label, so each write hit every
#     cast row of that movie rather than the single row being visited,
#   * overwrote `cast_member_id` (the very key being compared) with the counter, and
#   * ended by displaying `cast_data_selection`, a name that is never defined.
# The count is stored in a new column so `cast_member_id` stays intact, and the cell
# gives the same result when re-run.
# Note: this counts credit rows, and credits sharing a release date are ordered arbitrarily.
# TODO: restrict to the movies in `base_df` if only those should be kept
# (e.g. cast_data[cast_data.index.isin(base_df.index)]).
cast_data = cast_data.sort_values(['cast_member_id', 'release_date'])
cast_data['previous_movie_count'] = cast_data.groupby('cast_member_id').cumcount()

cast_data.head()

In [30]:
# Assemble the modelling table: start from the base movie frame and inner-join each
# feature table on the shared movie_id index, so only movies present in every
# table are kept.
df = (
    base_df
    .join(pivot_genre, how='inner')
    .join(number_of_prod_companies, how='inner')
    .join(number_of_crewmembers, how='inner')
    .join(gender_data, how='inner')
)

df.head()

Unnamed: 0_level_0,budget,release_date,revenue,runtime,imdb_budget,imdb_revenue,title,Action,Adventure,Animation,...,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western,prod_company_count,crewmember_count,gender
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
819,44000000,1996-10-18,165615285,147.0,,,Sleepers,0,0,0,...,0,0,0,0,1,0,0,3,16,1.571429
8195,55000000,1998-09-25,41610884,122.0,,,Ronin,1,1,0,...,0,0,0,0,1,0,0,2,75,1.066667
184341,20000000,2016-08-26,0,105.0,20000000.0,4711736.0,Hands of Stone,0,0,0,...,0,0,0,0,0,0,0,4,4,1.545455
376866,9000000,2016-12-02,13960394,100.0,,,Jackie,0,0,0,...,0,0,0,0,0,0,0,9,97,0.5
45094,12500000,2010-09-11,6732980,107.0,,,Conviction,0,0,0,...,0,0,0,0,0,0,0,6,16,1.066667
