In [2]:
import pandas as pd
import numpy as np
import csv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

In [23]:
shows = pd.read_csv('good_shows_data2.csv')

In [24]:
pd.set_option('max_seq_items', 200)

In [25]:
shows.shape

(2515, 168)

In [26]:
## dropping old shows
shows = shows[shows['release_date']>=1980]

In [27]:
## dropping shows where I don't know whether they were cancelled
shows = shows[shows['seasons'] != 0]

In [28]:
shows.shape

(2061, 168)

In [29]:
shows.columns

Index([u'json', u'title_rough', u'check', u'title', u'imdb_id', u'big_json',
       u'seasons', u'cancelled', u'runtime', u'genres', u'imdb_rating',
       u'release_date', u'plot', u'year', u'type', u'votes', u'keywords',
       u'first_year', u'is_new', u'fixed_runtime', u'is_action',
       u'is_adventure', u'is_animation', u'is_biography', u'is_comedy',
       u'is_crime', u'is_documentary', u'is_drama', u'is_family',
       u'is_fantasy', u'is_game', u'is_history', u'is_horror', u'is_music',
       u'is_musical', u'is_mystery', u'is_news', u'is_reality', u'is_romance',
       u'is_sci', u'is_short', u'is_sport', u'is_talk', u'is_thriller',
       u'is_war', u'is_western', u'release_month', u'release_weekday',
       u'release_monthday', u'stemmed_plot', u'stemmed_keywords',
       u'keyword_adult', u'keyword_african', u'keyword_alien',
       u'keyword_american', u'keyword_angel', u'keyword_anim', u'keyword_base',
       u'keyword_best', u'keyword_black', u'keyword_book', u'keywor

In [30]:
shows.dtypes

json                  object
title_rough           object
check                  int64
title                 object
imdb_id               object
big_json              object
seasons                int64
cancelled              int64
runtime               object
genres                object
imdb_rating          float64
release_date          object
plot                  object
year                  object
type                  object
votes                 object
keywords              object
first_year             int64
is_new                 int64
fixed_runtime        float64
is_action              int64
is_adventure           int64
is_animation           int64
is_biography           int64
is_comedy              int64
is_crime               int64
is_documentary         int64
is_drama               int64
is_family              int64
is_fantasy             int64
                      ...   
from_CBS               int64
from_Fox               int64
from_Nickelodeon       int64
from_Cartoon  

In [31]:
shows = shows.drop(['json', u'title_rough', u'check', u'title', 'imdb_id', u'big_json',
       u'seasons', 'runtime', u'genres', u'imdb_rating',
       u'release_date', u'plot', u'year', u'type', u'votes', u'keywords', u'is_new', 
        'stemmed_plot', u'stemmed_keywords', 'plot_cleaned', u'network'], axis=1)

In [34]:
shows.drop('fixed_runtime', inplace=True, axis=1)

In [39]:
shows.drop('keyword_new', inplace=True, axis=1)

In [35]:
shows.shape

(2061, 146)

In [40]:
shows.columns

Index([u'cancelled', u'first_year', u'is_action', u'is_adventure',
       u'is_animation', u'is_biography', u'is_comedy', u'is_crime',
       u'is_documentary', u'is_drama', u'is_family', u'is_fantasy', u'is_game',
       u'is_history', u'is_horror', u'is_music', u'is_musical', u'is_mystery',
       u'is_news', u'is_reality', u'is_romance', u'is_sci', u'is_short',
       u'is_sport', u'is_talk', u'is_thriller', u'is_war', u'is_western',
       u'release_month', u'release_weekday', u'release_monthday',
       u'keyword_adult', u'keyword_african', u'keyword_alien',
       u'keyword_american', u'keyword_angel', u'keyword_anim', u'keyword_base',
       u'keyword_best', u'keyword_black', u'keyword_book', u'keyword_boy',
       u'keyword_boyfriend', u'keyword_brother', u'keyword_california',
       u'keyword_celebr', u'keyword_charact', u'keyword_child',
       u'keyword_citi', u'keyword_comedi', u'keyword_comedian',
       u'keyword_comic', u'keyword_cult', u'keyword_daughter',
       u'key

In [37]:
shows.shape

(2061, 146)

In [38]:
shows.dropna().shape

(2061, 146)

In [41]:
## defining dependent/independent variables
y = shows['cancelled']
x = shows.drop('cancelled', axis=1)

In [42]:
## splitting data
x_train, x_test, y_train, y_test = train_test_split(x,y)

In [51]:
x_train.shape

(1545, 144)

In [52]:
x_test.shape

(516, 144)

In [53]:
y_train.shape

(1545,)

In [54]:
y_test.shape

(516,)

In [84]:
def run_model(x_train, y_train, x_test, y_test, model):
    model.fit(x_train, y_train)
    print "Training set score: ", model.score(x_train, y_train)
    print "Test set score: ", model.score(x_test, y_test)
    predictions = model.predict(x_test)
    print "\nConfusion Matrix:\n", confusion_matrix(y_test, predictions), "\n"
    print "Classification Report:\n",classification_report(y_test, predictions)

## Random Forest

In [43]:
rf = RandomForestClassifier()

In [65]:
run_model(x_train, y_train, x_test, y_test, rf)

Training set score:  0.981877022654
Test set score:  0.643410852713

Confusion Matrix:
[[263  54]
 [130  69]]


             precision    recall  f1-score   support

          0       0.67      0.83      0.74       317
          1       0.56      0.35      0.43       199

avg / total       0.63      0.64      0.62       516



## Random Forest with grid search

In [68]:
parameters = {
    "n_estimators" : [10, 25, 50, 100],
    "max_depth" : [2,3,5,7,10],
    "max_features" : [0.25, 0.5, 0.75],
    "min_samples_leaf" : [2,4]
}

In [67]:
gs_rf = RandomForestClassifier()
grid_search = GridSearchCV(gs_rf,param_grid=parameters, cv=7, n_jobs=-1)

In [69]:
## 12:21 - 12:22
grid_search.fit(x_train, y_train)

GridSearchCV(cv=7, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [10, 25, 50, 100], 'max_features': [0.25, 0.5, 0.75], 'max_depth': [2, 3, 5, 7, 10], 'min_samples_leaf': [2, 3]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [72]:
grid_search.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features=0.25, max_leaf_nodes=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [71]:
run_model(x_train, y_train, x_test, y_test, grid_search.best_estimator_)

Training set score:  0.815533980583
Test set score:  0.699612403101

Confusion Matrix:
[[278  39]
 [116  83]]


             precision    recall  f1-score   support

          0       0.71      0.88      0.78       317
          1       0.68      0.42      0.52       199

avg / total       0.70      0.70      0.68       516



## Decision Tree

In [80]:
## tune hyperparameters here
dt = DecisionTreeClassifier(
    max_depth=10, 
    min_samples_leaf=3
)

In [78]:
dt

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [85]:
run_model(x_train, y_train, x_test, y_test, dt)

Training set score:  0.780582524272
Test set score:  0.612403100775

Confusion Matrix:
[[222  95]
 [105  94]] 

Classification Report:
             precision    recall  f1-score   support

          0       0.68      0.70      0.69       317
          1       0.50      0.47      0.48       199

avg / total       0.61      0.61      0.61       516

