In [2]:
import math
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from imdb import IMDb
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
%matplotlib inline


df = pd.read_csv("complete_movies.csv")
df.head()

Unnamed: 0,year,imdb,title,clean_test,binary,genre,age_rating,budget_2013$,star_rating,director,cast,writer
0,2013,tt1711425,21 &amp; Over,notalk,FAIL,Comedy,R,13000000,5.9,Jon Lucas,"['Miles Teller', 'Skylar Astin', 'Justin Chon'...",
1,2012,tt1343727,Dredd 3D,ok,PASS,Action,R,45658735,7.0,Pete Travis,"['Karl Urban', 'Rachel Wood', 'Andile Mngadi',...","['John Wagner', 'Carlos Ezquerra', 'Alex Garla..."
2,2013,tt2024544,12 Years a Slave,notalk,FAIL,Biography,R,20000000,8.1,Steve McQueen,"['Chiwetel Ejiofor', 'Dwight Henry', 'Dickie G...","['John Ridley', 'Solomon Northup']"
3,2013,tt1272878,2 Guns,notalk,FAIL,Action,R,61000000,6.8,Baltasar Kormákur,"['Denzel Washington', 'Mark Wahlberg', 'Paula ...","['Blake Masters', 'Steven Grant']"
4,2013,tt0453562,42,men,FAIL,Biography,PG13,40000000,7.5,Brian Helgeland,"['Chadwick Boseman', 'Harrison Ford', 'Nicole ...",['Brian Helgeland']


Now as you can see we have data on the imdb star rating of the movie, as well as the writer, director, and cast. I'll now use a decision tree to try to predict based on the rest of the data whether or not a movie will pass the bechdel test.

Categorizing the splits will not be so straightforward for each label

years - split into 5 year periods

ids and titles - keep, but don't analyze

clean_test, binary, genre, rating, and director - use sci-kit's LabelEncoder()

budgets - maybe check the distribution, section into 5 or so levels

star_ratings - make integer cutoffs

cast - going to have to do something special

writer - use the same technique as for cast

In [3]:
#making clean_test, binary, genre, age_rating, and director into ints

lab = preprocessing.LabelEncoder()
categoricals = ['clean_test','binary','genre','age_rating','director']
for categorical in categoricals: df[categorical] = lab.fit_transform(df[categorical])
df.head()

  flag = np.concatenate(([True], aux[1:] != aux[:-1]))


Unnamed: 0,year,imdb,title,clean_test,binary,genre,age_rating,budget_2013$,star_rating,director,cast,writer
0,2013,tt1711425,21 &amp; Over,2,0,5,5,13000000,5.9,439,"['Miles Teller', 'Skylar Astin', 'Justin Chon'...",
1,2012,tt1343727,Dredd 3D,4,1,1,5,45658735,7.0,671,"['Karl Urban', 'Rachel Wood', 'Andile Mngadi',...","['John Wagner', 'Carlos Ezquerra', 'Alex Garla..."
2,2013,tt2024544,12 Years a Slave,2,0,4,5,20000000,8.1,828,"['Chiwetel Ejiofor', 'Dwight Henry', 'Dickie G...","['John Ridley', 'Solomon Northup']"
3,2013,tt1272878,2 Guns,2,0,1,5,61000000,6.8,65,"['Denzel Washington', 'Mark Wahlberg', 'Paula ...","['Blake Masters', 'Steven Grant']"
4,2013,tt0453562,42,1,0,4,4,40000000,7.5,102,"['Chadwick Boseman', 'Harrison Ford', 'Nicole ...",['Brian Helgeland']


In [4]:
#makes star_ratings into ints

for i in range (len(df)):
    star = df.ix[i,8]
    if not math.isnan(star):
        id = int(star)
        df.ix[i, 8] = id
    else:
        df.ix[i, 8] = -1
        
df
        

Unnamed: 0,year,imdb,title,clean_test,binary,genre,age_rating,budget_2013$,star_rating,director,cast,writer
0,2013,tt1711425,21 &amp; Over,2,0,5,5,13000000,5,439,"['Miles Teller', 'Skylar Astin', 'Justin Chon'...",
1,2012,tt1343727,Dredd 3D,4,1,1,5,45658735,7,671,"['Karl Urban', 'Rachel Wood', 'Andile Mngadi',...","['John Wagner', 'Carlos Ezquerra', 'Alex Garla..."
2,2013,tt2024544,12 Years a Slave,2,0,4,5,20000000,8,828,"['Chiwetel Ejiofor', 'Dwight Henry', 'Dickie G...","['John Ridley', 'Solomon Northup']"
3,2013,tt1272878,2 Guns,2,0,1,5,61000000,6,65,"['Denzel Washington', 'Mark Wahlberg', 'Paula ...","['Blake Masters', 'Steven Grant']"
4,2013,tt0453562,42,1,0,4,4,40000000,7,102,"['Chadwick Boseman', 'Harrison Ford', 'Nicole ...",['Brian Helgeland']
5,2013,tt1335975,47 Ronin,1,0,1,4,225000000,6,118,"['Keanu Reeves', 'Hiroyuki Sanada', 'Ko Shibas...","['Chris Morgan', 'Hossein Amini', 'Chris Morga..."
6,2013,tt1606378,A Good Day to Die Hard,2,0,1,5,92000000,5,425,"['Bruce Willis', 'Jai Courtney', 'Sebastian Ko...","['Skip Woods', 'Roderick Thorp']"
7,2013,tt2194499,About Time,4,1,8,5,12000000,7,715,"['Domhnall Gleeson', 'Rachel McAdams', 'Bill N...",['Richard Curtis']
8,2013,tt1814621,Admission,4,1,5,4,13000000,5,665,"['Tina Fey', 'Ann Harada', 'Ben Levin', 'Dan L...","['Karen Croner', 'Jean Hanff Korelitz']"
9,2013,tt1815862,After Earth,2,0,1,4,130000000,4,527,"['Jaden Smith', 'Will Smith', 'Sophie Okonedo'...",


In [5]:
#puts years into year ranges

for i in range (len(df)):
    year = df.ix[i,0]
    #gets decade
    new_year = (year % 100) - (year % 10)
    #rounds new_year to the nearest half decade
    if year%10 >= 5:
        new_year += 5
    df.ix[i,0] = new_year

In [6]:
# makes actor/writer strings back into arrays of actors/writers

def string_to_arr(thing):
    thing = string.replace(thing,"'", "")
    thing = string.replace(thing,"[","")
    thing = string.replace(thing,"]","")
    new = string.split(thing, ',')
    return new

#first set all the nans to ""

df.cast.fillna("", inplace=True)
df.writer.fillna("", inplace=True)

#dataframes don't store arrays well, so i'll make a separate array of arrays

actor_array = []
writer_array = [] 
for i in range (len(df)):
    actor_array.append(string_to_arr(df.ix[i,10]))
    writer_array.append(string_to_arr(df.ix[i,11]))


Next I'll tackle how to tell how influential an actor or writer might be in the passing of a movie. I'm going to make a dictionary with actors as keys, and an array of 1s and 0s as values. Each 1 represents a passing movie the actor has appeared in, while each 0 represents a fail. Then I can easily check for any given actor what percentage of their movies has passed the test. I'll do the same for writers

In [7]:
def make_into_dict(arr, df):
    dictionary = {}
    for i in range(len(arr)):
        #verdict is 1 for pass, 0 for fail 
        verdict = df.ix[i,4]
        #gets the cast list for movie index i 
        people = arr[i]
        for person in people:
            #puts a person in the dictionary and/or adds their passing/failing movie
            if person in dictionary:
                dictionary[person].append(verdict)
            else:
                dictionary[person] = [verdict]
    return dictionary
            
    


In [8]:
# Making the actual dictionaries for actors and writers

actor_pass_dict = make_into_dict(actor_array, df)
writer_pass_dict = make_into_dict(writer_array, df)


In [9]:
# Dividing the budget_2013$ by 1000 so it can fit into int32 format

for i in range (len(df)):
    budget = df.ix[i,7]
    if not math.isnan(budget):
        df.ix[i,7] = budget/1000
    else:
        df.ix[i,7] = 10

In [10]:
#Checking to make sure all NaN's are filled

df.isnull().any()

year            False
imdb            False
title           False
clean_test      False
binary          False
genre           False
age_rating      False
budget_2013$    False
star_rating     False
director        False
cast            False
writer          False
dtype: bool

Actually Analyzing the Data, Constructing a Decision Tree

In [11]:
df_new = df.drop(['imdb','title','clean_test','cast','writer'], axis = 1)
feature_cols = df_new.drop(['binary'], axis = 1).columns 
#all but the imdb, title, binary, cast, writer columns
X = df_new[feature_cols] # have to add in the other columns from arrays
y = df_new.binary

from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print X_train
print y_train
print X_test
print y_test
df_new

[[  5.00000000e+00   3.00000000e+00   0.00000000e+00   1.08573000e+05
    7.00000000e+00   6.90000000e+02]
 [  5.00000000e+00   3.00000000e+00   3.00000000e+00   2.08780000e+04
    6.00000000e+00   1.57000000e+02]
 [  1.00000000e+01   1.00000000e+00   4.00000000e+00   1.81599000e+05
    7.00000000e+00   4.38000000e+02]
 ..., 
 [  0.00000000e+00   6.00000000e+00   4.00000000e+00   5.43970000e+04
    6.00000000e+00   1.17000000e+02]
 [  1.00000000e+01   3.00000000e+00   3.00000000e+00   1.55353000e+05
    7.00000000e+00   3.76000000e+02]
 [  0.00000000e+00   2.00000000e+00   5.00000000e+00   8.22950000e+04
    6.00000000e+00   7.61000000e+02]]
[0 1 1 ..., 1 1 1]
[[  0.00000000e+00   8.00000000e+00   0.00000000e+00   9.90910000e+04
    7.00000000e+00   5.27000000e+02]
 [  5.00000000e+00   3.00000000e+00   0.00000000e+00   1.94806000e+05
    8.00000000e+00   4.30000000e+01]
 [  5.00000000e+00   5.00000000e+00   4.00000000e+00   4.04520000e+04
    6.00000000e+00   1.90000000e+02]
 ..., 
 [ 

Unnamed: 0,year,binary,genre,age_rating,budget_2013$,star_rating,director
0,10,0,5,5,13000,5,439
1,10,1,1,5,45658,7,671
2,10,0,4,5,20000,8,828
3,10,0,1,5,61000,6,65
4,10,0,4,4,40000,7,102
5,10,0,1,4,225000,6,118
6,10,0,1,5,92000,5,425
7,10,1,8,5,12000,7,715
8,10,1,5,4,13000,5,665
9,10,0,1,4,130000,4,527


In [12]:
from sklearn.tree import DecisionTreeRegressor
treereg = DecisionTreeRegressor(random_state=1)
treereg.fit(X_train, y_train)

DecisionTreeRegressor(compute_importances=None, criterion='mse',
           max_depth=None, max_features=None, max_leaf_nodes=None,
           min_density=None, min_samples_leaf=1, min_samples_split=2,
           random_state=1, splitter='best')

In [13]:
preds = treereg.predict(X_test)

print preds
print y_test

[ 0.  0.  1.  0.  0.  0.  0.  0.  1.  0.  1.  1.  0.  0.  0.  1.  0.  0.
  0.  1.  0.  1.  0.  1.  1.  0.  1.  0.  0.  1.  1.  0.  0.  1.  0.  0.
  1.  1.  1.  1.  0.  1.  0.  0.  0.  1.  0.  0.  1.  0.  1.  1.  1.  0.
  1.  0.  1.  1.  0.  0.  1.  1.  0.  1.  1.  0.  1.  1.  0.  0.  0.  0.
  0.  1.  1.  0.  1.  0.  0.  1.  0.  0.  1.  1.  0.  1.  0.  0.  1.  0.
  1.  0.  0.  1.  1.  0.  1.  0.  1.  0.  0.  0.  0.  0.  0.  0.  1.  0.
  1.  1.  1.  0.  1.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  1.
  1.  0.  0.  0.  0.  0.  1.  1.  0.  0.  0.  1.  1.  0.  0.  0.  0.  1.
  0.  0.  1.  1.  1.  0.  0.  0.  0.  1.  1.  0.  0.  1.  1.  0.  0.  0.
  0.  0.  1.  1.  1.  1.  1.  0.  0.  1.  0.  0.  1.  1.  1.  1.  0.  0.
  0.  0.  0.  0.  0.  0.  1.  1.  1.  1.  0.  0.  0.  1.  1.  0.  0.  1.
  1.  1.  1.  1.  0.  1.  0.  0.  1.  0.  0.  1.  1.  1.  1.  0.  1.  0.
  1.  0.  1.  1.  0.  1.  1.  1.  0.  0.  1.  1.  0.  0.  1.  1.  1.  0.
  0.  0.  0.  1.  0.  1.  0.  1.  1.  0.  0.  0.  1

In [14]:
from sklearn import metrics
import numpy as np
np.sqrt(metrics.mean_squared_error(y_test, preds))

0.2004459314343183

In [32]:
from sklearn.cross_validation import cross_val_score
treereg = DecisionTreeRegressor(max_depth=14, random_state=1)
scores = cross_val_score(treereg, X, y, cv=3, scoring='mean_squared_error')
np.mean(np.sqrt(-scores))

0.16925112066535353

In [63]:
treereg = DecisionTreeRegressor(max_depth=14, random_state=1)
treereg.fit(X, y)

DecisionTreeRegressor(compute_importances=None, criterion='mse', max_depth=14,
           max_features=None, max_leaf_nodes=None, min_density=None,
           min_samples_leaf=1, min_samples_split=2, random_state=1,
           splitter='best')

In [64]:
pd.DataFrame({'feature':feature_cols, 'importance':treereg.feature_importances_})

Unnamed: 0,feature,importance
0,year,0.007426
1,genre,0.004572
2,age_rating,0.933641
3,budget_2013$,0.026643
4,star_rating,0.011545
5,director,0.016174


In [65]:
#Seems like age_rating, budget, director, and 
#star_rating matter more

!ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
!brew install libtool
!brew install graphviz

It appears Homebrew is already installed. If your intent is to reinstall you
should do the following before running this installer again:
    rm -rf /usr/local/Cellar /usr/local/.git && brew cleanup


In [66]:
from sklearn.tree import export_graphviz
with open("bechdel.dot", 'wb') as f:
    f = export_graphviz(treereg, out_file=f, feature_names=feature_cols)
