In [None]:
# Predict a movie's IMDB rating from information extracted from its script.
# This can be useful in two ways. First, a script could be analyzed
# automatically to estimate its chances of success, so producers could
# use the prediction as a bar for deciding which scripts to read. Second,
# by using metadata, other parameters of the movie (duration, actors,
# rating, among others) could be estimated and optimized during production.

In [394]:
# Libraries that will be used
import os
import string

import bs4  # already used by the scraping cell (bs4.BeautifulSoup) but was never imported
import numpy  as np
import pandas as pd
from sklearn.linear_model import LinearRegression as lr

import helper as hp

In [None]:
# Randomly select the starting letter(s) of the movie titles to scrape
# (later this will cover all letters). The seed keeps the choice reproducible.
np.random.seed(0)
n_letters = 1
letters   = list(np.random.choice(list(string.ascii_uppercase),size=n_letters,replace=False))
url       = ["http://www.springfieldspringfield.co.uk/movie_scripts.php?order=","&page="]

# Fetch each listing page and extract the links from it;
# these links contain the droids we are looking for... i mean the scripts.
# `pages` maps the link text (movie name) to the absolute URL of its script page.
prefix    = 'http://www.springfieldspringfield.co.uk'
pages     = {}
page_stop = 3  # exclusive upper bound: listing pages 1 .. page_stop-1 are visited
for letter in letters:
    for page_number in range(1, page_stop):
        # NOTE(review): assumes hp.fetch returns a response-like object with a
        # `.text` attribute holding the page HTML -- confirm against helper.py
        listing = bs4.BeautifulSoup(hp.fetch(url[0]+letter+url[1]+str(page_number)).text,'lxml')
        for link in listing.find_all('a',class_='script-list-item'):
            # hrefs are joined onto `prefix`, so they are expected to be site-relative
            pages[str(link.contents[0])] = prefix+link.get('href')

In [None]:
# With the URLs collected, visit each one and scrape its script.
# Each fetched page is handed to hp.springScrap; the results are kept in the
# same order as pages.values().
script = [hp.springScrap(hp.fetch(script_url)) for script_url in pages.values()]

In [45]:
# Build a second version of every script with the all-caps words (which
# indicate actions) removed, so that models trained with and without them
# can be compared later. scriptRC[k] corresponds to script[k].
scriptRC = [hp.removeCAPS(raw_text) for raw_text in script]

In [46]:
# Save the text of the scraped and the processed scripts to local disk.
# For every movie two files are written under scrapped/:
#   <name>.txt    the script as scraped
#   <name>RC.txt  the script with all-caps words removed
# zip() pairs each name with its texts without indexing pages.keys(), which
# is not subscriptable on Python 3; `with` closes the file even if a write fails.
# NOTE(review): the movie name is used verbatim as a file name; a name
# containing a path separator would fail or land elsewhere -- confirm.
for name, raw_text, cleaned_text in zip(pages.keys(), script, scriptRC):
    with open('scrapped/'+name+'.txt','w') as f:
        f.write(raw_text)
    with open('scrapped/'+name+'RC.txt','w') as f:
        f.write(cleaned_text)

In [48]:
# Username and password for Watson's API, obtained by creating an account
# with them (there are some free perks).
# Credentials are read from the WATSON_USERNAME / WATSON_PASSWORD environment
# variables so that real secrets never get committed with the notebook; the
# original placeholder values remain as defaults.
iusername = os.environ.get('WATSON_USERNAME', 'ABCDEF')
ipassword = os.environ.get('WATSON_PASSWORD', '123456')

# Submit the scripts (in this case those with actions removed) to Watson
# through the API. The API is Personality Insights; per the original note it
# returns a set of 30 parameters obtained by analysing the text supplied.
# insights[k] is the response for scriptRC[k].
insights = [hp.insight(text,iusername,ipassword) for text in scriptRC]

In [56]:
# Associate the names of the movie scripts with the insights returned.
# Each entry is [movie name, first element of hp.dToL(flattened insight)];
# that first element is what later fills the dataframe rows.
# zip() relies on `insights` being in the same order as pages.keys() and
# avoids pages.keys()[i], which raises TypeError on Python 3.
nins = [[name, hp.dToL(hp.flatten(insight))[0]]
        for name, insight in zip(pages.keys(), insights)]

In [None]:
# Create the feature dataframe: one row per movie script and one column per
# insight feature, initialised with zeros.
index   = pages.keys()
# The column labels come from the second element of hp.dToL for the first insight.
columns = hp.dToL(hp.flatten(insights[0]))[1]

empty_features = np.zeros(shape=(len(index),len(columns)))
df = pd.DataFrame(data=empty_features,index=index,columns=columns)

In [73]:
# Fill the dataframe with the information from the insights.
# Cells are addressed purely by position (row k <-> nins[k]), so .iloc is
# used: .ix was deprecated and later removed from pandas, and it silently
# switches between label and positional lookup depending on the index type.
for row_pos in range(len(index)):
    row_values = nins[row_pos][1]
    for col_pos in range(len(columns)):
        df.iloc[row_pos, col_pos] = row_values[col_pos]

In [192]:
# With the Watson data complete, the next step uses the OMDB API, which
# returns data about a movie given its name or its imdb tag.
# The movie names are available from the previous scraping, but a name is
# sometimes spelled or written differently and then cannot be found by the
# API, which is why the names were checked by hand.
#
# The curated titles below were written by hand (to be automated, somehow,
# later). They are assigned positionally as the dataframe index, so their
# order must match the order of the scraped scripts.
titles = [
    'Ca$h',
    'Caddyshack',
    'Cabin Fever 3: Patient Zero',
    'Caged',
    'Cairo Time',
    'Caddyshack II',
    'Cable Guy',
    'Caffeine',
    'Caligula',
    'Calendar Girls',
    'Cabin in the sky',
    'The White Horse',
    'Cadaver',
    'Cabin Fever',
    'Call Girl',
    'C.r.a.z.y',
    'Julius Caesar',
    'Caesar and Cleopatra',
    'Cake',
    'Cabin Fever 2',
    'California Solo',
    'Cabaret Desire',
    'C.O.G.',
    'Cafe Society',
    'Cadillac Records',
    'Cabin in the woods',
    'Cabaret',
    'Illustrious Corpses',
    'Cadillac Man',
    'Cahill',
    'Cake Eaters',
    'Calamity Jane',
    'The Dark Knight',
    'The Godfather',
    'Fight Club',
    'The Lord of the Rings: The Fellowship of the Ring',
    'Star Wars: Episode V - The Empire Strikes Back',
    'The Matrix',
    'The silence of the lambs',
]

In [None]:
# Relabel the dataframe rows with the curated names (assigned by position).
df.index = titles

# Request the information for every title from the OMDB API and keep the
# responses in a dictionary keyed by title.
# NOTE(review): `omdb` is not defined or imported in any visible cell --
# presumably it comes from the helper module or a deleted cell; confirm.
imdbDict = {title: omdb(name=title) for title in titles}

In [201]:
# Our target is in sight: 'imdbRating' is what we aim to predict through
# regression. Build the target frame with the same row labels as df.
# Rows are addressed by label, so .loc replaces the deprecated/removed .ix.
# The ratings are stored exactly as the API returned them; the next cell
# compares them against the string 'N/A'.
target = pd.DataFrame(index=df.index,columns=['imdbRating'])
for title in df.index:
    target.loc[title, 'imdbRating'] = imdbDict[title]['imdbRating']

In [206]:
# Check whether the rating was available; if not, remove that row from both
# the target and the features.
# Despite its name, `nas` is True for the rows that DO have a rating
# (i.e. the rows to keep). The mask is computed once and applied to both
# frames with .loc (boolean indexing), replacing the deprecated/removed .ix.
nas    = target['imdbRating']!='N/A'
target = target.loc[nas]
df     = df.loc[nas]

In [236]:
# By using the nltk library for natural language processing it is possible
# to obtain interesting statistics of the text that can give further power
# to the model. The analytics this time are: number of words, diversity of
# words (unique words / total) and #verbs / #nouns.
# NOTE(review): `words` is not defined or imported in any visible cell --
# presumably it comes from the helper module or a deleted cell; confirm.
# Each entry of nanalytics is [movie name, words(script text)].
nanalytics = [[name, words(text)] for name, text in zip(pages.keys(), scriptRC)]

# Store these new analytics in the training df
df['Words']      = 0.
df['DiversityW'] = 0.
df['Verb/Noun']  = 0.

# Rows without a rating were already dropped from df, so position k in
# nanalytics no longer matches row k of df. `titles` is still aligned with
# the scripts by position, so each result is written by label and the
# scripts whose rows were removed are skipped. Writing through df.loc also
# avoids chained assignment (df[col].iloc[k] = ...), which may update a copy.
for title, (_, stats) in zip(titles, nanalytics):
    if title not in df.index:
        continue
    df.loc[title, 'Words']      = stats[0]
    df.loc[title, 'DiversityW'] = stats[1]
    df.loc[title, 'Verb/Noun']  = stats[2]

In [286]:
# From the imdb API information only two fields are used for now:
# the year and the runtime of the movie.
# The columns are built in one step instead of with chained assignment
# (df['Year'][i] = ...), which may write to a temporary copy and stored
# strings in integer columns.
years    = [imdbDict[title]['Year'] for title in df.index]
# 'Runtime' is split on whitespace and only the first token is kept
runtimes = [imdbDict[title]['Runtime'].split()[0] for title in df.index]

# Convert to numbers so the columns can be used as regression features;
# values that cannot be parsed become NaN instead of raising.
df['Year']    = pd.to_numeric(pd.Series(years, index=df.index), errors='coerce')
df['Runtime'] = pd.to_numeric(pd.Series(runtimes, index=df.index), errors='coerce')

In [None]:
# Finally, persist the feature dataset and the target to disk so later
# work can start from these files.
dataset_path = 'scrapped/final.csv'
target_path  = 'scrapped/target.csv'

df.to_csv(dataset_path)
target.to_csv(target_path)

In [395]:
# The road to victory... is filled by rehearsals and mistakes.
#
#url1 = 'http://www.dailyscript.com/movie.html'
#url2 = 'http://www.dailyscript.com/movie_n-z.html'
#prefix = 'http://www.dailyscript.com/'

#list1 = fetch(url1)
#list2 = fetch(url2)

#soup1 = bs4.BeautifulSoup(list1.text,'lxml')
#soup2 = bs4.BeautifulSoup(list2.text,'lxml')
#prefix = 'http://www.dailyscript.com/'

#l1 = []
#l2 = []
#l4 = []

#for i in soup1.find_all('a')[8:-3]:
#    if i.contents[0].encode('utf8') != 'imdb':
#        l1.append(i.contents[0].encode('utf8'))
#        l2.append(prefix+i.get('href'))
#    else:
#        l3.append(i.contents[0].encode('utf8'))
#        l4.append(i.get('href'))

#l4.append('0') #There is somewhere something not cool, check it
        
#for i in soup2.find_all('a')[8:-3]:
#    if i.contents[0].encode('utf8') != 'imdb':
#        l1.append(i.contents[0].encode('utf8'))
#        l2.append(prefix+i.get('href'))
#    else:
#        l4.append(i.get('href'))

#temp = l1[0]
#nList = []
#pdfs  = []
#nimdb = []

# filter pdfs
#for i in range(len(l2)):
#    if l2[i][-3:].lower() != 'pdf':
#        pdfs.append(i)

# filter imdbs
#for i in range(len(l4)):
#    if l4[i] != '0':
#        nimdb.append(i)

#for i in range(2,11):
#    raw  = fetch('http://www.springfieldspringfield.co.uk/movie_scripts.php?order=A&page='+str(i))
#    soup = bs4.BeautifulSoup(temp.text,'lxml')
#    for i in soup.findall('a')

#soup1 = bs4.BeautifulSoup(list1.text,'lxml')
#soup2 = bs4.BeautifulSoup(list2.text,'lxml')

#for i in soup1.find_all('a')[8:-3]:
#    if i.contents[0].encode('utf8') != 'imdb':
#        l1.append(i.contents[0].encode('utf8'))
#        l2.append(prefix+i.get('href'))
#    else:
#        l3.append(i.contents[0].encode('utf8'))
#        l4.append(i.get('href'))

#l4.append('0') #There is somewhere something not cool, check it
        
#for i in soup2.find_all('a')[8:-3]:
#    if i.contents[0].encode('utf8') != 'imdb':
#        l1.append(i.contents[0].encode('utf8'))
#        l2.append(prefix+i.get('href'))
#    else:
#        l4.append(i.get('href'))

#url       = ['http://www.springfieldspringfield.co.uk/movie_script.php?movie=dark-knight-the-batman-the-dark-knight',
#             'http://www.springfieldspringfield.co.uk/movie_script.php?movie=godfather-the',
#             'http://www.springfieldspringfield.co.uk/movie_script.php?movie=fight-club',
#             'http://www.springfieldspringfield.co.uk/movie_script.php?movie=lord-of-the-rings-the-fellowship-of-the-ring-the',
#             'http://www.springfieldspringfield.co.uk/movie_script.php?movie=star-wars-episode-v-the-empire-strikes-back',
#             'http://www.springfieldspringfield.co.uk/movie_script.php?movie=matrix-the',
#             'http://www.springfieldspringfield.co.uk/movie_script.php?movie=silence-of-the-lambs-the']

#names     = ['darkknight','godfather','fightclub','lotrI','starwarsV','matrix','silencelambs']
#raw       = fetch(url)
#text      = springScrap(raw)
#analysis  = insight(text,iusername,ipassword) 
#flat      = flatten(analysis)
#saveText(text,name)

# Courier 12

#qual  = []

#for i in range(len(names)):
#    qual.append([i[1] for i in pd.read_csv('scrapped/'+names[i]+'insight.csv').values])

#matrix = pd.DataFrame(qual,columns=qualities,index=names)
#matrix['score'] = []
#meta data 
#matrix

#import pandas as pd
#import numpy as np

#qualities = ['Dutifulness','Cooperation','Self-consciousness','Orderliness','Achievement striving','Self-efficacy',
#             'Activity level','Self-discipline','Excitement-seeking','Cautiousness','Morality','Anxiety','Emotionality',
#             'Vulnerability','Immoderation','Sympathy','Friendliness','Modesty','Altruism','Assertiveness',
#             'Adventurousness','Gregariousness','Intellect','Imagination','Artistic interests','Depression','Anger',
#             'Trust','Cheerfulness','Liberalism']

#url       = ['http://www.springfieldspringfield.co.uk/movie_script.php?movie=dark-knight-the-batman-the-dark-knight',
#             'http://www.springfieldspringfield.co.uk/movie_script.php?movie=godfather-the',
#             'http://www.springfieldspringfield.co.uk/movie_script.php?movie=fight-club',
#             'http://www.springfieldspringfield.co.uk/movie_script.php?movie=lord-of-the-rings-the-fellowship-of-the-ring-the',
#             'http://www.springfieldspringfield.co.uk/movie_script.php?movie=star-wars-episode-v-the-empire-strikes-back',
#             'http://www.springfieldspringfield.co.uk/movie_script.php?movie=matrix-the',
#             'http://www.springfieldspringfield.co.uk/movie_script.php?movie=silence-of-the-lambs-the']

#names     = ['darkknight','godfather','fightclub','lotrI','starwarsV','matrix','silencelambs']

#df = pd.DataFrame(np.zeros(shape=(len(qualities),1)),index=qualities)
#df = df.transpose()

#for i in range(len(url)):
#    raw          = fetch(url[i])
#    text         = springScrap(raw)
#    analysis     = insight(text,iusername,ipassword) 
#    flat         = flatten(analysis)
#    df[names[i]] = flat.values()
#    saveText(text,names[i])
#    pd.DataFrame(flat.values()).transpose().to_csv(names[i]+'insight.csv')
#    saveText(flat,names[i]+'insight')

# Data collected yesterday for tests, add to the df
# names     = ['darkknight','godfather','fightclub','lotrI','starwarsV','matrix','silencelambs']

# nlist2 = []
# for i in names:
#    nlist2.append(pd.read_csv('scrapped/'+i+'insight.csv'))

#for i in range(len(names)):
#    x = pd.DataFrame(data=np.transpose(list(nlist2[i]['0']))).transpose()
#    x.columns = columns
#    x.index   = [names[i]]
#    df = df.append(x)
# There is a lot of information to be addressed; some of it may help the model.
# test.keys()
# Our target will be the imdbrating or the metascore; we want those.
# Further information that might be useful: rated, language, runtime; votes could be
# used to filter rows with few votes; year, actors and directors could be used for simulations
# of what impact an actor or a director could have on the ratings.
# For now, let's take all the information and add it cautiously to the model.
# df = df.drop(df.index[22])
# df.index
# To account for the rows i should have not deleted until now and add some missing:
# nanalytics = []
# for i in [0,1,2,3,4,5,6,7,8,9,10,12,14,15,16,17,18,19,20,21,22,24,25,26,27,28,29,30,31,32,33]:
#     nanalytics.append(analytics[i])

# tdk = words(encoder(hp.readTxt('scrapped/darkknight')))
# tgf = words(encoder(hp.readTxt('scrapped/godfather')))
# fca = words(encoder(hp.readTxt('scrapped/fightclub')))
# lor = words(encoder(hp.readTxt('scrapped/lotrI')))
# swv = words(encoder(hp.readTxt('scrapped/starwarsV')))
# tma = words(encoder(hp.readTxt('scrapped/matrix')))
# sol = words(encoder(hp.readTxt('scrapped/silencelambs')))

# nanalytics.append(['darkknight',tdk])
# nanalytics.append(['godfather',tgf])
# nanalytics.append(['fightclub',fca])
# nanalytics.append(['lordoftherings',lor])
# nanalytics.append(['starwarsv',swv])
# nanalytics.append(['matrix',tma])
# nanalytics.append(['silenceoflambs',sol])