In [1]:
import json
import os
import pickle
import urllib2
from datetime import datetime
from time import sleep

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [69]:
# Let pandas display up to 200 columns so the wide feature frames shown later are not truncated.
pd.set_option("display.max_columns", 200)

# Misc Functions

In [8]:
def bleach(string):
    """Keep only digits and dots from a value and return the result as a float.

    Parameters
    ----------
    string : any
        Value to clean. It is passed through ``str()`` first, so numbers are
        accepted as well as text.

    Returns
    -------
    float or None
        The float built from the surviving characters (``"44 min"`` -> ``44.0``,
        ``"1,234"`` -> ``1234.0``), or ``None`` when no digit or dot survives
        (``"N/A"``) or when the surviving characters are not a valid number
        (``"."``, ``"1.2.3"``).
    """
    kept = "".join(ch for ch in str(string) if ch in "0123456789.")
    try:
        return float(kept)
    except ValueError:
        # Raised for the empty string and for malformed leftovers such as "."
        # or "1.2.3"; both are reported as "no usable number".
        return None

def combine_list(list):
    """Concatenate the items of ``list`` into one string, each preceded by a space.

    The result starts with a space when ``list`` is non-empty and is the empty
    string when ``list`` is empty.
    """
    return "".join(" " + item for item in list)

# Get API info from IMDBid

In [9]:

## gets complete information from omdbapi using the imdb id

def get_api_from_id(title_id):
    """Fetch the OMDb record for an IMDb id and return the decoded JSON body.

    Parameters
    ----------
    title_id : str
        IMDb title id, sent as the ``i`` query parameter.

    Returns
    -------
    object
        Whatever ``Response.json()`` decodes from the reply.

    Notes
    -----
    The function pauses for two seconds after each request to throttle calls.
    """
    # NOTE(review): the fallback below is a credential committed to the
    # notebook. Set OMDB_API_KEY in the environment to override it, and
    # consider removing the literal once every caller does so.
    api_key = os.environ.get("OMDB_API_KEY", "9f5296af")
    # Passing the query as `params` lets requests URL-encode each value.
    query = {"i": title_id, "plot": "full", "r": "json", "apikey": api_key}
    req = requests.get("http://www.omdbapi.com/", params=query)
    payload = req.json()
    # The pause has to come before the return; placed after it, it is
    # unreachable and no throttling happens.
    sleep(2)
    return payload

In [10]:
def df_from_api(title_id):
    """Build a one-row DataFrame describing a title from its OMDb record.

    The frame starts with the ``imdb_id`` column, gains a ``json`` column
    holding the result of ``get_api_from_id``, and then one column per field
    pulled out of that record: ``name``, ``genres`` (lower-cased), ``seasons``
    and ``runtime`` (both passed through ``bleach``) and ``release_date``.
    The function sleeps for two seconds before returning.
    """
    frame = pd.DataFrame(data=[title_id], columns=['imdb_id'])
    frame['json'] = frame['imdb_id'].apply(get_api_from_id)

    ## check the column names here against the column names in the dataset
    # (column name, extractor) pairs, applied to the json column in this order.
    extractors = [
        ('name', lambda record: record['Title']),
        ('genres', lambda record: str(record['Genre']).lower()),
        ('seasons', lambda record: bleach(record['totalSeasons'])),
        ('runtime', lambda record: bleach(record['Runtime'])),
        ('release_date', lambda record: record['Released']),
    ]
    for column, extract in extractors:
        frame[column] = frame['json'].apply(extract)

    sleep(2)
    return frame

# Parse Genres

In [11]:
def parse_genres(df):
    """Add one 0/1 ``is_<genre>`` column to ``df`` for every tracked genre token.

    A column is 1 when the token occurs as a substring of the lower-cased
    ``genres`` value of that row, so ``'sci'`` matches ``'sci-fi'`` and
    ``'music'`` also matches ``'musical'``. ``df`` is modified in place and
    nothing is returned.
    """
    # Tokens used to build the dummy variables, in output-column order.
    genre_names = [
        'action', u'adventure', u'animation', u'biography', u'comedy',
        u'crime', u'documentary', u'drama', u'family', u'fantasy',
        u'game', u'history', u'horror', u'music', u'musical', u'mystery',
        u'news', u'reality', u'romance', u'sci', u'short', u'sport', u'talk',
        u'thriller', u'war', u'western',
    ]

    for genre in genre_names:
        column = 'is_%s' % genre
        # Bind the current token as a default so each lambda keeps its own value.
        df[column] = df['genres'].apply(
            lambda text, genre=genre: int(genre in text.lower()))

# Scrape Network

In [12]:
def scrape_network(id):
    """Scrape a title's IMDb company-credits page and return one company name.

    Parameters
    ----------
    id : str
        IMDb title id, spliced into the company-credits URL.

    Returns
    -------
    str
        The first line of the text of the first link inside the second
        ``<ul class="simpleList">`` on the page; ``""`` when that list holds
        no links; ``"unknown"`` when the list cannot be read (for instance when
        the page has fewer than two such lists).
        NOTE(review): presumably the second list is the one holding the
        network/distributor -- confirm against the live page layout.

    Notes
    -----
    Errors raised while fetching the page are not caught. The function pauses
    for two seconds before returning to throttle requests.
    """
    url = "http://www.imdb.com/title/" + id + "/companycredits?ref_=ttspec_sa_5"
    response = urllib2.urlopen(url)
    try:
        soup = BeautifulSoup(response)
    finally:
        # Release the connection once the markup has been parsed.
        response.close()
    simpleLists = soup.find_all('ul', {'class': 'simpleList'})
    try:
        words = "".join(a.get_text() + '\n'
                        for li in simpleLists[1]('li')
                        for a in li('a'))
        network = words.split("\n")[0]
    except Exception:
        # Best-effort fallback, but without swallowing KeyboardInterrupt or
        # SystemExit the way a bare `except:` would.
        network = "unknown"
    # The pause must precede the return; after it, it would never execute.
    sleep(2)
    return network

# Scrape Keywords

In [13]:
## Accesses page based on imdb id

def access_keyword_page(imdbID):
    """Return the IMDb keywords-page URL for ``imdbID`` (which must be a string)."""
    pieces = ['http://www.imdb.com/title/', imdbID, '/keywords?ref_=tt_stry_kw']
    return ''.join(pieces)

In [14]:
def scrape_keywords(imdbID):
    """Scrape the IMDb keywords page of a title and return its keyword texts.

    Parameters
    ----------
    imdbID : str
        IMDb title id; the page URL comes from ``access_keyword_page``.

    Returns
    -------
    list
        Text of every ``<a>`` found inside a ``<div class="sodatext">`` that is
        itself inside a ``<div id="keywords_content">``, in document order.
        Empty when the page has no such elements.

    Notes
    -----
    The function pauses for two seconds before returning to throttle requests.
    """
    response = urllib2.urlopen(access_keyword_page(imdbID))
    try:
        soup_for_keywords = BeautifulSoup(response)
    finally:
        # Release the connection once the markup has been parsed.
        response.close()
    temp_keywords = [
        a.get_text()
        for div in soup_for_keywords('div', {'id': 'keywords_content'})
        for text in div('div', {'class': 'sodatext'})
        for a in text('a')
    ]
    # The pause must precede the return; after it, it would never execute.
    sleep(2)
    return temp_keywords

In [57]:
def parse_keywords(df):
    """Add one 0/1 ``keyword_<token>`` column to ``df`` for every tracked token.

    A column is 1 when the token is contained in that row's ``keywords`` value
    (a plain, case-sensitive ``in`` test, so for a string value ``'man'`` also
    matches ``'woman'``). ``df`` is modified in place and nothing is returned.

    NOTE(review): the tokens look like word stems (``'comedi'``, ``'famili'``),
    while the test is run against whatever ``df['keywords']`` holds -- confirm
    that the column contains stemmed text where that matters.
    """
    # Tokens used to build the keyword dummy variables, in output-column order.
    keywords_to_use_2 = [
        u'adult', u'african', u'alien', u'american', u'angel', u'anim',
        u'base', u'best', u'black', u'book', u'boy', u'boyfriend',
        u'brother', u'california', u'celebr', u'charact', u'child', u'citi',
        u'comedi', u'comedian', u'comic', u'cult', u'daughter', u'death',
        u'detect', u'doctor', u'evil', u'famili', u'father', u'femal',
        u'fiction', u'friend', u'friendship', u'gay', u'girl', u'girlfriend',
        u'hero', u'humor', u'husband', u'interraci', u'interview',
        u'investig', u'joke', u'life', u'live', u'love', u'male', u'man',
        u'marriag', u'mother', u'murder', u'new', u'offic', u'parent',
        u'parodi', u'play', u'polic', u'power', u'protagonist',
        u'relationship', u'satir', u'school', u'secret', u'sex', u'share',
        u'sister', u'sitcom', u'social', u'son', u'spoken', u'spoof',
        u'stand', u'student', u'superhero', u'supernatur', u'surreal',
        u'teenag', u'versu', u'villain', u'violenc', u'wife', u'woman',
        u'york',
    ]

    for token in keywords_to_use_2:
        column = 'keyword_%s' % token
        # Bind the current token as a default so each lambda keeps its own value.
        df[column] = df['keywords'].apply(
            lambda text, token=token: int(token in text))

# Parse Dates

In [65]:
def parse_dates(df):
    """Expand ``df['release_date']`` into calendar feature columns, in place.

    ``release_date`` is parsed from strings such as ``'20 Sep 2002'`` and
    overwritten with the parsed values. Columns are then added in this order:
    ``release_month``, ``release_weekday``, ``release_monthday`` (all strftime
    strings), ``started_sunday`` .. ``started_saturday`` and
    ``started_january`` .. ``started_december`` (0/1 flags) and ``first_year``
    (the year as an int). Nothing is returned.
    """
    df['release_date'] = df['release_date'].apply(
        lambda raw: datetime.strptime(raw, '%d %b %Y'))

    # String-formatted pieces of the date: month, weekday, day of the month.
    for column, fmt in (('release_month', '%m'),
                        ('release_weekday', '%w'),
                        ('release_monthday', '%d')):
        df[column] = df['release_date'].apply(
            lambda when, fmt=fmt: when.strftime(fmt))

    ## days of the week -- %w numbers them 0 (Sunday) through 6 (Saturday)
    weekdays = ('sunday', 'monday', 'tuesday', 'wednesday',
                'thursday', 'friday', 'saturday')
    for number, day in enumerate(weekdays):
        df['started_' + day] = df['release_weekday'].apply(
            lambda code, number=number: 1 if int(code) == number else 0)

    ## months -- %m numbers them 01 through 12
    months = ('january', 'february', 'march', 'april', 'may', 'june', 'july',
              'august', 'september', 'october', 'november', 'december')
    for offset, month in enumerate(months):
        number = offset + 1
        df['started_' + month] = df['release_month'].apply(
            lambda code, number=number: 1 if int(code) == number else 0)

    ## year
    df['first_year'] = df['release_date'].apply(
        lambda when: int(when.strftime('%Y')))

# Parsing runtime

In [17]:
def parse_runtime(df):
    """Add 0/1 ``half_hour`` and ``full_hour`` columns derived from ``df['runtime']``.

    ``half_hour`` is 1 when the runtime, truncated to an int, lies in 20..30
    inclusive; ``full_hour`` is 1 when it lies in 40..60 inclusive. A missing
    runtime (``None`` or NaN, which is what ``bleach`` yields for a value with
    no digits such as ``"N/A"``) sets both flags to 0 instead of raising.
    ``df`` is modified in place and nothing is returned.
    """
    def _between(lower, upper):
        """Return a checker giving 1 when a runtime falls inside [lower, upper]."""
        def _check(minutes):
            if pd.isnull(minutes):
                # int(None) and int(nan) both raise, so missing values are
                # handled before the conversion.
                return 0
            return 1 if lower <= int(minutes) <= upper else 0
        return _check

    df['half_hour'] = df['runtime'].apply(_between(20, 30))
    df['full_hour'] = df['runtime'].apply(_between(40, 60))

# Parse network

In [86]:
def parse_network(df):
    """Add one 0/1 ``from_<network>`` column to ``df`` for every tracked network.

    A column is 1 when the network token is contained in that row's
    ``network`` value. The test is case-sensitive, so ``'Fox'`` does not match
    ``'fox film corporation'``. ``df`` is modified in place and nothing is
    returned.
    """
    networks = ['ABC', 'NBC', 'CBS', 'Fox', 'Nickelodeon', 'Cartoon',
                'Comedy', 'MTV', 'HBO', 'Disney', 'WB']
    for label in networks:
        # Bind the current token as a default so each lambda keeps its own value.
        df['from_' + label] = df['network'].apply(
            lambda owner, label=label: int(label in owner))
    
# 'from_ABC', u'from_NBC', u'from_CBS', u'from_Fox', u'from_Nickelodeon',
#        u'from_Cartoon', u'from_Comedy', u'from_MTV', u'from_HBO',
#        u'from_Disney', u'from_WB'    

# Prepare DF

In [None]:
## Delete errant columns.
## Drop: name, runtime, imdb_id, json, genres, seasons, release_date, network, keywords.
## Keep the original dataframe so the script can still output the show title and retrieve
## other information, such as the number of seasons.

In [91]:
def define_features(df):
    """Return a copy of ``df`` without the raw, non-feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed below plus the feature columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows minus ``name``, ``runtime``,
        ``imdb_id``, ``json``, ``genres``, ``seasons``, ``release_date``,
        ``network``, ``keywords``, ``release_month`` and ``release_weekday``.
        ``df`` itself is left untouched, so the title and season count stay
        available to the caller.

    Raises
    ------
    KeyError
        If any of the listed columns is missing from ``df``.
    """
    non_feature_columns = ['name', 'runtime', 'imdb_id', 'json', 'genres',
                           'seasons', 'release_date', 'network', 'keywords',
                           'release_month', 'release_weekday']
    # Without inplace=True, drop() returns a new frame. Binding another name
    # to `df` and dropping in place would strip the caller's frame as well.
    return df.drop(non_feature_columns, axis=1)

# Run through the model

In [None]:
# TODO: import the pickled model and run the prepared features through it.

# Putting it all together

In [99]:
## make a different function that opens the pickled model and runs the df through it

def get_tv_prediction(imdb_id):
    """Assemble the one-row feature frame for the title with this IMDb id.

    The OMDb record is fetched with ``df_from_api``, the scraped keywords
    (joined by ``combine_list``) and the scraped network are added as columns,
    each ``parse_*`` step adds its feature columns, and the result of
    ``define_features`` is returned. No model is applied here.
    """
    features = df_from_api(imdb_id)
    features['keywords'] = features['imdb_id'].apply(
        lambda show_id: combine_list(scrape_keywords(show_id)))
    features['network'] = features['imdb_id'].apply(
        lambda show_id: str(scrape_network(show_id)))

    # Each step adds its columns to `features` in place; the order fixes the
    # column order of the final frame.
    for add_columns in (parse_keywords, parse_network, parse_dates,
                        parse_genres, parse_runtime):
        add_columns(features)

    return define_features(features)

In [94]:
# Build the feature row for IMDb id 'tt0303461' (listed as "Firefly" in a later output of this notebook).
# This calls get_tv_prediction, which goes through the OMDb lookup and the IMDb scrapers defined above.
test_df = get_tv_prediction('tt0303461')


In [98]:
# Display the feature frame returned by get_tv_prediction.
test_df

Unnamed: 0,keyword_adult,keyword_african,keyword_alien,keyword_american,keyword_angel,keyword_anim,keyword_base,keyword_best,keyword_black,keyword_book,keyword_boy,keyword_boyfriend,keyword_brother,keyword_california,keyword_celebr,keyword_charact,keyword_child,keyword_citi,keyword_comedi,keyword_comedian,keyword_comic,keyword_cult,keyword_daughter,keyword_death,keyword_detect,keyword_doctor,keyword_evil,keyword_famili,keyword_father,keyword_femal,keyword_fiction,keyword_friend,keyword_friendship,keyword_gay,keyword_girl,keyword_girlfriend,keyword_hero,keyword_humor,keyword_husband,keyword_interraci,keyword_interview,keyword_investig,keyword_joke,keyword_life,keyword_live,keyword_love,keyword_male,keyword_man,keyword_marriag,keyword_mother,keyword_murder,keyword_new,keyword_offic,keyword_parent,keyword_parodi,keyword_play,keyword_polic,keyword_power,keyword_protagonist,keyword_relationship,keyword_satir,keyword_school,keyword_secret,keyword_sex,keyword_share,keyword_sister,keyword_sitcom,keyword_social,keyword_son,keyword_spoken,keyword_spoof,keyword_stand,keyword_student,keyword_superhero,keyword_supernatur,keyword_surreal,keyword_teenag,keyword_versu,keyword_villain,keyword_violenc,keyword_wife,keyword_woman,keyword_york,from_ABC,from_NBC,from_CBS,from_Fox,from_Nickelodeon,from_Cartoon,from_Comedy,from_MTV,from_HBO,from_Disney,from_WB,release_monthday,started_sunday,started_monday,started_tuesday,started_wednesday,started_thursday,started_friday,started_saturday,started_january,started_february,started_march,started_april,started_may,started_june,started_july,started_august,started_september,started_october,started_november,started_december,first_year,is_action,is_adventure,is_animation,is_biography,is_comedy,is_crime,is_documentary,is_drama,is_family,is_fantasy,is_game,is_history,is_horror,is_music,is_musical,is_mystery,is_news,is_reality,is_romance,is_sci,is_short,is_sport,is_talk,is_thriller,is_war,is_western,half_hour,full_hour
0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,1,0,1,1,0,0,0,1,1,1,1,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,20,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,2002,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1


In [95]:
# Check the (rows, columns) size of the feature frame.
test_df.shape

(1, 143)

In [96]:
# List the feature column names of the frame.
test_df.columns

Index([       u'keyword_adult',      u'keyword_african',
              u'keyword_alien',     u'keyword_american',
              u'keyword_angel',         u'keyword_anim',
               u'keyword_base',         u'keyword_best',
              u'keyword_black',         u'keyword_book',
                u'keyword_boy',    u'keyword_boyfriend',
            u'keyword_brother',   u'keyword_california',
             u'keyword_celebr',      u'keyword_charact',
              u'keyword_child',         u'keyword_citi',
             u'keyword_comedi',     u'keyword_comedian',
              u'keyword_comic',         u'keyword_cult',
           u'keyword_daughter',        u'keyword_death',
             u'keyword_detect',       u'keyword_doctor',
               u'keyword_evil',       u'keyword_famili',
             u'keyword_father',        u'keyword_femal',
            u'keyword_fiction',       u'keyword_friend',
         u'keyword_friendship',          u'keyword_gay',
               u'keyword_girl',

In [97]:
# Second run for the same IMDb id, kept under a separate name for the inspection cells below.
test = get_tv_prediction('tt0303461')

In [37]:
# Display the frame.
# NOTE(review): the output recorded under this cell still shows raw columns (imdb_id, json, name, ...),
# which define_features() as written above would drop -- it appears to come from an earlier version.
test

Unnamed: 0,imdb_id,json,name,genres,seasons,runtime,release_date,keywords,network,from_ABC,...,is_news,is_reality,is_romance,is_sci,is_short,is_sport,is_talk,is_thriller,is_war,is_western
0,tt0303461,{u'Plot': u'Captain Malcolm 'Mal' Reynolds is ...,Firefly,"adventure, drama, sci-fi",1.0,44.0,20 Sep 2002,future captain space psychic spacecraft preac...,fox film corporation,0,...,0,0,0,1,0,0,0,0,0,0


In [39]:
# Parse the 'release_date' strings (e.g. '20 Sep 2002') into datetimes, stored in a new 'date_parse' column.
# NOTE(review): define_features() as written above drops 'release_date' from the frame that
# get_tv_prediction() returns, so this cell appears to rely on a frame built by an earlier
# version of those functions -- confirm before re-running on a fresh kernel.
test['date_parse'] = test['release_date'].apply(lambda x: datetime.strptime(x, '%d %b %Y'))

In [40]:
# Display the frame again to inspect the new 'date_parse' column.
test

Unnamed: 0,imdb_id,json,name,genres,seasons,runtime,release_date,keywords,network,from_ABC,...,is_reality,is_romance,is_sci,is_short,is_sport,is_talk,is_thriller,is_war,is_western,date_parse
0,tt0303461,{u'Plot': u'Captain Malcolm 'Mal' Reynolds is ...,Firefly,"adventure, drama, sci-fi",1.0,44.0,20 Sep 2002,future captain space psychic spacecraft preac...,fox film corporation,0,...,0,0,1,0,0,0,0,0,0,2002-09-20


# Dataset

In [3]:
# Load the shows dataset from a CSV file one directory above the notebook's working directory.
shows = pd.read_csv('../good_shows_data2.csv')

In [5]:
# Allow up to 200 items when pandas prints a long sequence, such as the column Index shown below.
pd.set_option('max_seq_items', 200)

In [6]:
# List the dataset's column names, for comparison with the feature columns built above.
shows.columns

Index([u'json', u'title_rough', u'check', u'title', u'imdb_id', u'big_json',
       u'seasons', u'cancelled', u'runtime', u'genres', u'imdb_rating',
       u'release_date', u'plot', u'year', u'type', u'votes', u'keywords',
       u'first_year', u'is_new', u'fixed_runtime', u'is_action',
       u'is_adventure', u'is_animation', u'is_biography', u'is_comedy',
       u'is_crime', u'is_documentary', u'is_drama', u'is_family',
       u'is_fantasy', u'is_game', u'is_history', u'is_horror', u'is_music',
       u'is_musical', u'is_mystery', u'is_news', u'is_reality', u'is_romance',
       u'is_sci', u'is_short', u'is_sport', u'is_talk', u'is_thriller',
       u'is_war', u'is_western', u'release_month', u'release_weekday',
       u'release_monthday', u'stemmed_plot', u'stemmed_keywords',
       u'keyword_adult', u'keyword_african', u'keyword_alien',
       u'keyword_american', u'keyword_angel', u'keyword_anim', u'keyword_base',
       u'keyword_best', u'keyword_black', u'keyword_book', u'keywor