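# NOTE: the next lines are GitHub web-page residue captured together with the
# source; they are kept only as comments so the file parses as Python.
#   Skip to content
#   Permalink
#   Branch: master
#   Find file Copy path
#   Find file Copy path
#   Fetching contributors…
#   Cannot retrieve contributors at this time
#   189 lines (110 sloc) 7.3 KB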
import pandas as pd
import numpy as np
# Reference values used to express money in constant dollars.
_REF_TICKET_PRICE = 8.58  # presumably the reference-year average ticket price -- TODO confirm
_REF_CPI = 238  # 2016 CPI

# Default locations of the lookup tables (unchanged from the original script).
_BOMOJO_ADJ_PATH = './bomojo_ticket_price_adjuster.csv'
_CPI_PATH = './cpi_by_year.csv'
_CULT_LIST_PATH = '/Users/ash/Downloads/cult movie list.csv'


def _load_table(table, default_path):
    """Return a private copy of *table*, reading *default_path* when it is None."""
    if table is None:
        table = pd.read_csv(default_path, header=0)
    else:
        table = table.copy()
    # Strip "dot + whitespace" from headers (a header like "Avg. Price" would
    # become "AvgPrice"). regex=True is explicit because the pandas default
    # for this argument changed between major versions.
    table.columns = table.columns.str.replace(r'\.\s', '', regex=True)
    return table


def _budget_to_float(value):
    """Convert one scraped production-budget entry to a float.

    Handles plain numbers, "$"/"," decoration and a "million" suffix
    (so "1.5 million" -> 1500000.0). "N/A"/"NA" map to 0.0, matching the
    original script; missing values and unparseable text become NaN.
    """
    if pd.isna(value):
        return np.nan
    if not isinstance(value, str):
        return float(value)
    text = value.strip().lower()
    if text in ('', 'n/a', 'na'):
        return 0.0
    multiplier = 1e6 if 'million' in text else 1.0
    text = text.replace('million', '').replace('$', '').replace(',', '').strip()
    try:
        return float(text) * multiplier
    except ValueError:
        return np.nan


def movie_clean_df(dat, bomojo_adj=None, cpi=None, cultlist=None):
    """Clean a scraped movie data frame so it is ready for EDA.

    Objective of the downstream analysis: the cult movie phenomenon.

    Args:
        dat: Data frame of movies and scraped data points. Must contain the
            columns director, cast, genre, studios, distributor, genre_bomojo,
            rating, title, releaseDate, canontitle, rev_totalGross,
            rev_opening, num_theaters, runtime and prod_budget.
        bomojo_adj: Optional data frame with ``Year`` and ``AvgPrice`` columns
            (average ticket price per year). Read from
            ``./bomojo_ticket_price_adjuster.csv`` when omitted.
        cpi: Optional data frame with ``year`` and ``CPI`` columns. Read from
            ``./cpi_by_year.csv`` when omitted.
        cultlist: Optional data frame with a ``title`` column listing cult
            movies. Read from the original hard-coded CSV path when omitted.

    Returns:
        A new data frame (``dat`` is not modified) holding only the rows with
        a non-null ``rev_totalGross``, plus the derived columns releaseYear,
        AvgPrice, CPI, rev_opening_ADJ, rev_totalGross_ADJ, prod_budget_ADJ,
        rev_postOpening, leadActor, runtime_mins and isCult. The columns
        studios, cast and genre are dropped.

    Note:
        Earlier revisions replaced ``dat`` with a pickled frame loaded from a
        hard-coded file, which silently ignored the argument. A caller who
        wants that cached frame should unpickle it and pass it in as ``dat``.
    """
    # Step 1: keep only the rows for which BOMOJO returned a match.
    dat_eda = dat.loc[dat.rev_totalGross.notnull(), :].copy()

    # Step 2: coerce dates first -- the release year drives both adjustments.
    dat_eda['releaseDate'] = pd.to_datetime(dat_eda.releaseDate, errors='coerce')
    dat_eda['releaseYear'] = dat_eda.releaseDate.dt.year

    # Step 3: production budget must be numeric before it can be adjusted.
    dat_eda['prod_budget'] = dat_eda.prod_budget.apply(_budget_to_float).astype('float')

    # Step 4: attach the per-year ticket price and CPI.
    bomojo_adj = _load_table(bomojo_adj, _BOMOJO_ADJ_PATH)
    if not pd.api.types.is_numeric_dtype(bomojo_adj['AvgPrice']):
        # regex=False: "$" must be treated as a literal character, not as the
        # end-of-string anchor.
        bomojo_adj['AvgPrice'] = (
            bomojo_adj['AvgPrice'].astype(str)
            .str.replace('$', '', regex=False)
            .astype('float')
        )
    cpi = _load_table(cpi, _CPI_PATH)
    cpi['CPI'] = cpi['CPI'].astype('float')
    cpi['year'] = cpi['year'].astype('int')

    # validate='m:1' raises if a year appears twice in a lookup table, which
    # would otherwise duplicate movie rows.
    dat_eda = dat_eda.merge(bomojo_adj[['Year', 'AvgPrice']], how='left',
                            left_on='releaseYear', right_on='Year',
                            validate='m:1')
    dat_eda = dat_eda.merge(cpi[['year', 'CPI']], how='left',
                            left_on='releaseYear', right_on='year',
                            validate='m:1')

    # Step 5: inflation-adjust. Revenues are scaled by ticket price, the
    # budget by CPI. The raw rev_totalGross column is kept untouched.
    dat_eda['rev_opening_ADJ'] = (dat_eda.rev_opening / dat_eda.AvgPrice) * _REF_TICKET_PRICE
    dat_eda['rev_totalGross_ADJ'] = (dat_eda.rev_totalGross / dat_eda.AvgPrice) * _REF_TICKET_PRICE
    dat_eda['prod_budget_ADJ'] = (dat_eda.prod_budget / dat_eda.CPI) * _REF_CPI

    # Step 6: revenue earned after the opening weekend (adjusted dollars).
    dat_eda['rev_postOpening'] = dat_eda.rev_totalGross_ADJ - dat_eda.rev_opening_ADJ

    # Step 7: leading cast member = text before the first comma. Missing cast
    # entries stay missing instead of raising.
    dat_eda['leadActor'] = dat_eda['cast'].str.split(',').str.get(0).astype('category')

    # Step 8: drop wiki's "studios" (distributor is used instead), the raw
    # cast list and wiki's "genre" (genre_bomojo is preferred).
    dat_eda = dat_eda.drop(['studios', 'cast', 'genre'], axis=1)

    # Step 9: remaining descriptive columns become categoricals.
    for col in ('director', 'distributor', 'genre_bomojo', 'rating'):
        dat_eda[col] = dat_eda[col].astype('category')

    # Step 10: runtime -> minutes. The code takes the 1st and 3rd
    # space-separated tokens as hours and minutes (e.g. "1 hrs. 45 min.");
    # "N/A" becomes "0", which has no 3rd token and so yields NaN minutes.
    dat_eda['runtime'] = dat_eda.runtime.str.replace('N/A', '0', case=False, regex=True)
    runtime_parts = dat_eda.runtime.str.split(' ')
    dat_eda['runtime_mins'] = (runtime_parts.str.get(0).astype('float') * 60
                               + runtime_parts.str.get(2).astype('float'))

    # Step 11: label cult movies.
    # NOTE(review): assumes the cult list's `title` column holds canonical
    # titles comparable to `canontitle` -- confirm against the CSV.
    cultlist = _load_table(cultlist, _CULT_LIST_PATH)
    dat_eda['isCult'] = dat_eda.canontitle.isin(cultlist['title']).astype(int)

    return dat_eda
#==============================================================================
#not run
def _fill_from_group_stat(frame, column, keys, stat):
    """Fill NaNs in frame[column] with the per-group `stat` over `keys`."""
    # observed=True: only existing key combinations matter for a transform,
    # and it avoids expanding unused categorical combinations.
    group_stat = frame.groupby(keys, observed=True)[column].transform(stat)
    frame[column] = frame[column].fillna(group_stat)


def movie_eda(mov, plot=True):
    """Impute missing values, compute the cult index and plot its distribution.

    Exploratory steps and manipulations on the movie dataset before training.

    Args:
        mov: Cleaned movie data frame. Must contain year, distributor,
            genre_bomojo, rev_totalGross_ADJ, rev_opening_ADJ,
            prod_budget_ADJ, prod_budget, rev_postOpening, rev_opening and
            num_theaters; ``isCult`` is needed only when plotting.
        plot: When True (default, the original behaviour) draw the
            distribution/box plots. Plotting uses the names ``plt`` and
            ``sns``, which this file does not import.
            NOTE(review): they look like ``matplotlib.pyplot`` and
            ``seaborn`` -- make them available at module scope.

    Returns:
        ``mov`` itself, modified in place: rows that could not be imputed are
        removed, ``rev_fraction`` and ``CULT_INDEX`` are added, and the rows
        are sorted by ``CULT_INDEX`` in descending order.

    Cult index as implemented:
        CULT_INDEX = exp( ln(rev_totalGross_ADJ)
                          / ( ln(rev_opening_ADJ ** 2) * (num_theaters + 1) ) )
        Design intent per the original notes: penalise a large opening
        weekend and a wide release, reward a large lifetime gross.
    """
    # --- Impute missing values -------------------------------------------
    # Rows without an adjusted total gross cannot be used at all. dropna must
    # be called on the frame: calling it on the column does not remove rows.
    mov.dropna(subset=['rev_totalGross_ADJ'], inplace=True)

    # Opening revenue: mean of the same year/distributor/genre first, then
    # fall back to distributor/genre; whatever is still missing is dropped.
    _fill_from_group_stat(mov, 'rev_opening_ADJ',
                          ['year', 'distributor', 'genre_bomojo'], 'mean')
    _fill_from_group_stat(mov, 'rev_opening_ADJ',
                          ['distributor', 'genre_bomojo'], 'mean')
    mov.dropna(subset=['rev_opening_ADJ'], inplace=True)

    # Production budget: a zero means "unknown", so treat it as missing and
    # impute with progressively coarser groups before dropping the rest.
    mov.loc[mov.prod_budget_ADJ == 0, 'prod_budget_ADJ'] = np.nan
    _fill_from_group_stat(mov, 'prod_budget_ADJ',
                          ['year', 'distributor', 'genre_bomojo'], 'mean')
    _fill_from_group_stat(mov, 'prod_budget_ADJ',
                          ['distributor', 'genre_bomojo'], 'median')
    _fill_from_group_stat(mov, 'prod_budget_ADJ', ['distributor'], 'median')
    mov.dropna(subset=['prod_budget_ADJ'], inplace=True)

    # --- Outlier inspection ----------------------------------------------
    if plot:
        plt.figure(figsize=(20, 10))
        sns.distplot(mov.rev_opening_ADJ, rug=True)

    mov.sort_index(inplace=True)

    # --- Cult index ------------------------------------------------------
    # NOTE(review): rev_fraction divides by the raw rev_opening; confirm that
    # rev_postOpening is expressed in the same (unadjusted) dollars.
    mov['rev_fraction'] = mov.rev_postOpening / mov.rev_opening
    # .loc replaces the removed DataFrame.ix indexer.
    mov.loc[mov.prod_budget == 0, 'prod_budget'] = np.nan
    mov['CULT_INDEX'] = np.exp(
        np.log(mov.rev_totalGross_ADJ)
        / (np.log(mov.rev_opening_ADJ ** 2) * (mov.num_theaters + 1))
    )
    mov.sort_values(by='CULT_INDEX', ascending=False, inplace=True)

    # --- Plots -----------------------------------------------------------
    if plot:
        # Box plot of the index, split by cult label.
        plt.figure(figsize=(15, 7))
        sns.set_style('whitegrid')
        sns.boxplot(x='isCult', y='CULT_INDEX', data=mov)
        plt.ylim(1, 1.003)
        plt.title('Cult Index Variation in Movies Dataset\n(0=not cult, 1=is cult)')

        # Distribution of the index.
        # NOTE(review): the original referenced an undefined name `cult`;
        # the rows labelled isCult == 1 are assumed here -- confirm.
        cult = mov[mov.isCult == 1]
        plt.figure(figsize=(15, 7))
        sns.set_style('whitegrid')
        sns.distplot(cult.CULT_INDEX,
                     rug=False, axlabel='Cult Index...',
                     color='r')

    return mov
# CATEGORICALS!!!!!!!!!!!!!!
# (GitHub web-page residue, not source: "You can't perform that action at this time.")