In [52]:
import pandas as pd
import numpy as np
import urllib2
from bs4 import BeautifulSoup
from time import sleep
import requests
import json
from datetime import datetime
import pickle

In [53]:
# Raise pandas' display limits so the wide one-row feature frames built below
# are not truncated when a cell shows them.
pd.set_option("display.max_columns", 200)  # show up to 200 columns
pd.set_option('max_seq_items', 200)  # show up to 200 items of long sequences (e.g. df.columns)

# Misc Functions

In [54]:
def bleach(string):
    """Extract the numeric part of a value and return it as a float.

    Every character other than the digits 0-9 and "." is discarded before
    parsing, so "45 min" becomes 45.0 and "1,234" becomes 1234.0.  Signs are
    discarded as well.

    Args:
        string: Any value; it is converted with ``str()`` before filtering.

    Returns:
        The remaining characters parsed as a float, or None when they do not
        form a number (nothing left, as for "N/A", or a malformed residue
        such as "..." or "1.2.3", which previously raised ValueError).
    """
    kept = "".join(ch for ch in str(string) if ch in "0123456789.")
    try:
        return float(kept)
    except ValueError:
        # Covers both the empty string and residues with stray dots.
        return None

def combine_list(list):
    """Concatenate the items of ``list``, putting one space before each item.

    An empty input gives ""; otherwise the result begins with a space,
    e.g. ["a", "b"] -> " a b".
    """
    # The parameter name shadows the builtin; it is kept so callers are unaffected.
    return "".join(" " + item for item in list)

# Get API info from IMDBid

In [55]:

## Gets the complete title record from the OMDb API (omdbapi.com) using the IMDb id

def get_api_from_id(title_id):
    """Fetch the full-plot JSON record for one title from omdbapi.com.

    Args:
        title_id: IMDb title id string, e.g. 'tt0303461'.

    Returns:
        The decoded JSON body of the response (the result of ``req.json()``).

    Notes:
        * The API key is taken from the ``OMDB_API_KEY`` environment variable.
          The key that used to be hard-coded here remains only as a fallback
          so existing runs keep working.
          NOTE(review): rotate that key and remove the fallback.
        * No delay happens in this function: the original ``sleep(2)`` was
          placed after ``return`` and never executed.  ``df_from_api`` sleeps
          after calling this function.
    """
    import os  # local import: the notebook's import cell does not load os

    api_key = os.environ.get("OMDB_API_KEY", "9f5296af")
    # Same parameters, in the same order, as the former hand-built query
    # string; requests takes care of URL-encoding the values.
    query = [("i", title_id), ("plot", "full"), ("r", "json"), ("apikey", api_key)]
    # Without a timeout a stalled connection would hang the notebook forever.
    req = requests.get("http://www.omdbapi.com/", params=query, timeout=30)
    return req.json()

In [56]:
def df_from_api(title_id):
    """Build a one-row DataFrame describing the title with the given IMDb id.

    Columns, in order: 'imdb_id', 'json' (raw API record), 'name', 'genres'
    (lower-cased), 'seasons' and 'runtime' (numbers pulled out by ``bleach``)
    and 'release_date' (the API's 'Released' string, unparsed).

    Pauses for two seconds before returning.
    """
    frame = pd.DataFrame(data=[title_id], columns=['imdb_id'])
    frame['json'] = frame['imdb_id'].apply(get_api_from_id)

    # TODO: check these column names against the column names in the dataset.
    # (column, extractor) pairs; a list keeps the column order fixed.
    extractors = [
        ('name', lambda record: record['Title']),
        ('genres', lambda record: str(record['Genre']).lower()),
        ('seasons', lambda record: bleach(record['totalSeasons'])),
        ('runtime', lambda record: bleach(record['Runtime'])),
        ('release_date', lambda record: record['Released']),
    ]
    for column, extract in extractors:
        frame[column] = frame['json'].apply(extract)

    sleep(2)
    return frame

# Parse Genres

In [57]:
def parse_genres(df):
    """Add one 0/1 ``is_<genre>`` column to ``df`` per known genre token.

    A column is 1 when the token occurs anywhere in the lower-cased
    ``df['genres']`` text.  This is a substring test, so e.g. 'music' also
    fires for 'musical'.  ``df`` is modified in place; nothing is returned.
    """
    genre_tokens = (
        'action', u'adventure', u'animation', u'biography', u'comedy',
        u'crime', u'documentary', u'drama', u'family', u'fantasy',
        u'game', u'history', u'horror', u'music', u'musical', u'mystery',
        u'news', u'reality', u'romance', u'sci', u'short', u'sport', u'talk',
        u'thriller', u'war', u'western',
    )

    # Lower-case once instead of once per genre.
    lowered = df['genres'].apply(lambda text: text.lower())
    for token in genre_tokens:
        df['is_%s' % token] = lowered.apply(
            lambda text, token=token: int(token in text))

# Scrape Network

In [58]:
def scrape_network(id):
    """Scrape a company name from a title's IMDb company-credits page.

    The value is the first line of the first link found inside the second
    ``<ul class="simpleList">`` on the page.
    NOTE(review): presumably that list holds the network/distributor;
    confirm against the live page markup.

    Args:
        id: IMDb title id string, e.g. 'tt0303461'.  (The name shadows the
            builtin; it is kept so existing callers are unaffected.)

    Returns:
        The scraped text, "" when that list contains no links, or "unknown"
        when the page has fewer than two such lists.

    Network errors from ``urlopen`` propagate, as they did before.
    """
    url = "http://www.imdb.com/title/" + id + "/companycredits?ref_=ttspec_sa_5"
    soup = BeautifulSoup(urllib2.urlopen(url))
    # Throttle between page fetches.  The original sleep(2) sat after the
    # return statements and therefore never ran.
    sleep(2)

    simpleLists = soup.find_all('ul', {'class': 'simpleList'})
    try:
        credit_list = simpleLists[1]
    except IndexError:
        # Only the missing-list case is treated as "unknown"; the former bare
        # ``except:`` also hid unrelated errors.
        return "unknown"

    link_texts = [a.get_text() for li in credit_list('li') for a in li('a')]
    if not link_texts:
        return ""
    return link_texts[0].split("\n")[0]

# Scrape Keywords

In [59]:
## Builds the URL of the IMDb keywords page for an IMDb id

def access_keyword_page(imdbID):
    """Return the URL of the IMDb keywords page for ``imdbID``.

    ``imdbID`` must already be a string (e.g. 'tt0303461'); nothing is fetched.
    """
    url_parts = ('http://www.imdb.com/title/', imdbID, '/keywords?ref_=tt_stry_kw')
    return ''.join(url_parts)

In [60]:
def scrape_keywords(imdbID):
    """Scrape the keyword strings listed on a title's IMDb keywords page.

    Args:
        imdbID: IMDb title id string, e.g. 'tt0303461'.

    Returns:
        A list with the text of every link found inside a
        ``<div class="sodatext">`` under ``<div id="keywords_content">``,
        in page order.  The list is empty when the page has no such markup.
    """
    soup_for_keywords = BeautifulSoup(urllib2.urlopen(access_keyword_page(imdbID)))
    # Throttle between page fetches.  The original sleep(2) sat after the
    # return statement and therefore never ran.
    sleep(2)
    return [
        a.get_text()
        for div in soup_for_keywords('div', {'id': 'keywords_content'})
        for text in div('div', {'class': 'sodatext'})
        for a in text('a')
    ]

In [61]:
def parse_keywords(df):
    """Add one 0/1 ``keyword_<stem>`` column to ``df`` per tracked stem.

    A column is 1 when the stem occurs in the row's ``df['keywords']`` value
    (an ``in`` test, so on text 'man' also fires for 'woman').  ``df`` is
    modified in place; nothing is returned.

    NOTE(review): the entries look like word stems ('famili', 'comedi');
    confirm the keyword text passed in is stemmed the same way, otherwise
    those columns can never be 1.
    """
    tracked_stems = (
        u'adult', u'african', u'alien',
        u'american', u'angel', u'anim', u'base', u'best', u'black', u'book',
        u'boy', u'boyfriend', u'brother', u'california', u'celebr', u'charact',
        u'child', u'citi', u'comedi', u'comedian', u'comic', u'cult',
        u'daughter', u'death', u'detect', u'doctor', u'evil', u'famili',
        u'father', u'femal', u'fiction', u'friend', u'friendship',
        u'gay', u'girl', u'girlfriend', u'hero', u'humor',
        u'husband', u'interraci', u'interview', u'investig', u'joke',
        u'life', u'live', u'love', u'male', u'man', u'marriag', u'mother',
        u'murder', u'offic', u'parent', u'parodi',
        u'play', u'polic', u'power', u'protagonist', u'relationship', u'satir',
        u'school', u'secret', u'sex', u'share', u'sister', u'sitcom',
        u'social', u'son', u'spoken', u'spoof', u'stand', u'student',
        u'superhero', u'supernatur', u'surreal', u'teenag',
        u'versu', u'villain', u'violenc', u'wife', u'woman',
        u'york',
    )

    for stem in tracked_stems:
        df['keyword_%s' % stem] = df['keywords'].apply(
            lambda found, stem=stem: int(stem in found))

# Parse Dates

In [62]:
def parse_dates(df):
    """Derive calendar features from ``df['release_date']``, in place.

    ``release_date`` must hold strings such as '20 Sep 2002' (format
    '%d %b %Y'); the column is replaced by the parsed dates.  Columns added,
    in this order:

    * release_month, release_weekday, release_monthday: zero-padded month,
      weekday number ('0' = Sunday) and day of month, all as strings.
    * started_sunday .. started_saturday: 0/1 weekday flags.
    * started_january .. started_december: 0/1 month flags.
    * first_year: release year as an int.
    * started_on_first: 1 when the release fell on the 1st of a month.

    Nothing is returned.  A value that does not match the date format makes
    ``strptime`` raise ValueError.
    """
    df['release_date'] = df['release_date'].apply(
        lambda raw: datetime.strptime(raw, '%d %b %Y'))
    released = df['release_date']
    df['release_month'] = released.apply(lambda day: day.strftime('%m'))
    df['release_weekday'] = released.apply(lambda day: day.strftime('%w'))
    ## day of the month
    df['release_monthday'] = released.apply(lambda day: day.strftime('%d'))

    ## days of the week; '%w' numbers them from 0 = Sunday
    weekday_names = ['sunday', 'monday', 'tuesday', 'wednesday', 'thursday',
                     'friday', 'saturday']
    for number, label in enumerate(weekday_names):
        df['started_' + label] = df['release_weekday'].apply(
            lambda code, number=number: 1 if int(code) == number else 0)

    ## months; '%m' numbers them from 1 = January
    month_names = ['january', 'february', 'march', 'april', 'may', 'june',
                   'july', 'august', 'september', 'october', 'november',
                   'december']
    for number, label in enumerate(month_names, 1):
        df['started_' + label] = df['release_month'].apply(
            lambda code, number=number: 1 if int(code) == number else 0)

    ## year
    df['first_year'] = released.apply(lambda day: int(day.strftime('%Y')))
    # release_monthday holds strings like '01', so it has to be converted
    # before comparing; the former ``x == 1`` test was always False.
    # NOTE(review): if the trained model saw this column as constant 0,
    # confirm the corrected values are what it should receive.
    df['started_on_first'] = df['release_monthday'].dropna().apply(
        lambda code: 1 if int(code) == 1 else 0)

# Parsing runtime

In [63]:
def parse_runtime(df):
    """Add 0/1 ``half_hour`` and ``full_hour`` columns to ``df``, in place.

    ``df['runtime']`` is truncated to whole minutes with ``int()``;
    half_hour is 1 for 20-30 minutes and full_hour is 1 for 40-60 minutes,
    both bounds inclusive.
    """
    def flag_between(low, high):
        # 1 where the truncated runtime lies in [low, high], else 0.
        return df['runtime'].apply(
            lambda minutes: 1 if low <= int(minutes) <= high else 0)

    df['half_hour'] = flag_between(20, 30)
    df['full_hour'] = flag_between(40, 60)

# Parse network

In [64]:
def parse_network(df):
    """Add one 0/1 ``from_<network>`` column to ``df`` per tracked network.

    A column is 1 when the network token occurs, case-sensitively, anywhere
    in ``df['network']``.  ``df`` is modified in place; nothing is returned.
    """
    network_tokens = ('ABC', 'NBC', 'CBS', 'Fox', 'Nickelodeon', 'Cartoon',
                      'Comedy', 'MTV', 'HBO', 'Disney', 'WB')
    for token in network_tokens:
        df['from_' + token] = df['network'].apply(
            lambda company, token=token: int(token in company))
    
# 'from_ABC', u'from_NBC', u'from_CBS', u'from_Fox', u'from_Nickelodeon',
#        u'from_Cartoon', u'from_Comedy', u'from_MTV', u'from_HBO',
#        u'from_Disney', u'from_WB'    

# Prepare DF

In [65]:
## Delete the columns that are not model features.
## Drop: name, runtime, imdb_id, json, genres, seasons, release_date, network, keywords,
## release_month, release_weekday.
## Keep the original dataframe so the script can output the show title and retrieve other
## information, such as the number of seasons.

In [66]:
def define_features(df):
    """Return a copy of ``df`` reduced to the model's feature columns.

    The identifying and raw-text columns listed below are removed.  ``df``
    itself is left untouched so the caller can still read the show title and
    the other dropped fields afterwards.  Previously ``df2 = df`` only
    aliased the frame and the in-place drop removed the columns from the
    caller's frame as well.

    Raises:
        KeyError: if any of the listed columns is missing from ``df``.
    """
    non_feature_columns = ['name', 'runtime', 'imdb_id', 'json', 'genres',
                           'seasons', 'release_date', 'network', 'keywords',
                           'release_month', 'release_weekday']
    return df.drop(non_feature_columns, axis=1)

# Run through the model

In [67]:
# import pickled model
# 

In [68]:
# model_pickle_path = 'ada_boost_pickle.pkl'
# model_unpickle = open(model_pickle_path, 'rb')

# Putting it all together

In [94]:
## TODO: write a separate function that opens the pickled model and runs the DataFrame through it

def get_tv_prediction(imdb_id):
    """Run the whole feature pipeline for one title and return its name.

    Steps: fetch the API record, scrape keywords and network, add the
    keyword / network / date / genre / runtime feature columns, then reduce
    the frame to model features.

    The prediction step itself is not wired in yet, so the function
    currently returns only the show's name (first row of the 'name' column).
    """
    show = df_from_api(imdb_id)
    show['keywords'] = show['imdb_id'].apply(
        lambda title: combine_list(scrape_keywords(title)))
    show['network'] = show['imdb_id'].apply(
        lambda title: str(scrape_network(title)))

    # Feature builders, applied in this order; each one adds columns in place.
    for add_features in (parse_keywords, parse_network, parse_dates,
                         parse_genres, parse_runtime):
        add_features(show)

    # Read the title before the frame is reduced to feature columns.
    name = show['name'].values[0]
    features = define_features(show)
    # TODO: load the pickled model ('ada_boost_pickle.pkl'), call
    # predict(features) and report "Cancelled" when it returns 1, otherwise
    # "Renewed".  Until then `features` is unused.
    return name

In [95]:
# Smoke test on a known title id; the recorded output below is the show's name.
get_tv_prediction('tt0303461')

u'Firefly'

In [71]:
# NOTE(review): this cell ran as In [71], before the current definition of
# get_tv_prediction (In [94]), which returns the show's name rather than a
# DataFrame.  On a fresh Restart & Run All, `test` is a string.
test = get_tv_prediction('tt0303461')


In [72]:
# Frame dimensions (rows, columns); requires `test` to be a DataFrame, a plain string has no .shape.
test.shape

(1, 143)

In [73]:
# Look up one column name by its position in the sorted column index; requires `test` to be a DataFrame.
test.columns.sort_values()[91]

u'keyword_offic'

In [79]:
# test.columns.sort_values()

# Dataset

In [75]:
# shows = pd.read_csv('../good_shows_data2.csv')

In [76]:
# pd.set_option('max_seq_items', 200)

In [78]:
# shows.columns.sort