In [1]:
#1)import all the stuff
import numpy as np
import pandas as pd
from  sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.preprocessing import FunctionTransformer, LabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.model_selection import StratifiedKFold

In [2]:
#2)import data scraped from web and saved to csv
movie_data = pd.read_csv("IMDBTop250.csv")

In [3]:
#3)operationalize y into two categories: "amazing"=1 and "good"=0
#this is the target variable
is_amazing = movie_data["imdbRating"].apply(lambda x: 0 if x <= np.median(movie_data["imdbRating"]) else 1)

In [7]:
#4)make X (doing this as a pipeline in case I want to scrape more data later)

#4a)write all my definitions
#column extractors:
def actors_col(df):
    return df["Actors"]

def director_col(df):
    return df["Director"]

def genre_col(df):
    return df["Genre"]

def plot_col(df):
    return df["Plot"]

def rated_col(df):
    return df["Rated"]

def runtime_col(df):
    return df["Runtime"]

def title_col(df):
    return df["Title"]

def year_col(df):
    return df["Year"].values.reshape(-1, 1)

#clean runtime col
def runtime_fix(df):
    df = df.str.replace(" min","")
    df = df.apply(int)
    return df.values.reshape(-1, 1)

#todense for CountVectorizer pipes
def dense(sparse):
    return sparse.todense()

#returns array when needed
def return_array(column):
    return column.values.reshape(-1, 1)

#fill nan for rated col
def not_rated(column):
    column.fillna("NOT RATED", inplace=True)
    return column

In [8]:
#5b) make pipelines:
actors_pipe = make_pipeline(FunctionTransformer(actors_col, validate=False),
                         CountVectorizer(ngram_range=(2, 3), min_df=2),
                         FunctionTransformer(dense, validate=False))

director_pipe = make_pipeline(FunctionTransformer(director_col, validate=False),
                              LabelBinarizer())

genre_pipe = make_pipeline(FunctionTransformer(genre_col, validate=False), 
                           CountVectorizer())

rated_pipe = make_pipeline(FunctionTransformer(rated_col, validate=False),
                           FunctionTransformer(not_rated, validate=False),
                           LabelBinarizer())

runtime_pipe = make_pipeline(FunctionTransformer(runtime_col, validate=False),
                             FunctionTransformer(runtime_fix, validate=False))

year_pipe = make_pipeline(FunctionTransformer(year_col, validate=False), LabelBinarizer())

In [10]:
#4c) make feature union:
# union = make_union(actors_pipe, director_pipe, genre_pipe, rated_pipe, runtime_pipe, year_pipe)
union = make_union(actors_pipe, director_pipe, genre_pipe, runtime_pipe, year_pipe, rated_pipe)
X = union.fit_transform(movie_data)


In [11]:
X.todense()[0:5]

matrix([[0, 0, 0, ..., 0, 1, 0],
        [0, 0, 1, ..., 0, 1, 0],
        [0, 0, 1, ..., 0, 1, 0],
        [0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [16]:
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

In [18]:
kfolds = StratifiedKFold()
for train, test in kfolds.split(X, is_amazing):
    rf = RandomForestClassifier()
    rf.fit(X[train], is_amazing[train])
    print (rf.score(X[test], is_amazing[test]))
    actual_values = is_amazing[test]
    predicted_values = rf.predict(X[test])
    #add confusion matrices and classification reports
    print(confusion_matrix(actual_values, predicted_values))
    print(classification_report(actual_values, predicted_values))

0.642857142857
[[48  4]
 [26  6]]
             precision    recall  f1-score   support

          0       0.65      0.92      0.76        52
          1       0.60      0.19      0.29        32

avg / total       0.63      0.64      0.58        84

0.571428571429
[[46  6]
 [30  2]]
             precision    recall  f1-score   support

          0       0.61      0.88      0.72        52
          1       0.25      0.06      0.10        32

avg / total       0.47      0.57      0.48        84

0.609756097561
[[46  5]
 [27  4]]
             precision    recall  f1-score   support

          0       0.63      0.90      0.74        51
          1       0.44      0.13      0.20        31

avg / total       0.56      0.61      0.54        82



In [20]:
rf_fd = RandomForestClassifier()
rf_fd.fit(X, is_amazing)
print (rf.score(X, is_amazing))
actual_values_fd = is_amazing
predicted_values_fd = rf.predict(X)
print(confusion_matrix(actual_values_fd, predicted_values_fd))
print(classification_report(actual_values_fd, predicted_values_fd))

0.852
[[149   6]
 [ 31  64]]
             precision    recall  f1-score   support

          0       0.83      0.96      0.89       155
          1       0.91      0.67      0.78        95

avg / total       0.86      0.85      0.85       250



In [31]:
X.todense()[0:5]

matrix([[0, 0, 0, ..., 0, 1, 0],
        [0, 0, 1, ..., 0, 1, 0],
        [0, 0, 1, ..., 0, 1, 0],
        [0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [38]:
#okay so I think the feature_importances is kind of useless here
#because the columns don't have names (remember need to add .todense() to retrieve all)
#I either need to figure out what data corresponds with what column
#or I need to redo this as a regular df
# and I could do what Richard did today with the classifiers and name each dummy column

feature_importances = pd.DataFrame(rf_fd.feature_importances_).sort_values(0, ascending=False)
feature_importances.head(20)

#see this code for a way to get the feature names (from )
#feature_importances = pd.DataFrame(dt.feature_importances_,
#                                   index = X.columns,
#                                    columns=['importance']).sort_values('importance',
#                                                                        ascending=False)


Unnamed: 0,0
344,0.076488
403,0.030796
408,0.019347
181,0.01457
429,0.014268
404,0.013325
298,0.013044
327,0.01247
350,0.012447
328,0.011717
