In [337]:
import urllib.request
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime
import numpy as np

In [338]:
def read_url(url, timeout=30):
    """Download a web page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch with an HTTP GET.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up.
        ``requests`` applies no timeout unless one is passed, so without
        this a stalled connection would block the notebook indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with Python's built-in ``html.parser``.

    Raises
    ------
    requests.exceptions.RequestException
        If the request cannot be completed (connection error, timeout, ...).
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, "html.parser")

In [388]:
# obtain first 20 pages of movies under keyword "Action Hero"
# (1000 titles according to the original note)
# range(1, 21) yields pages 1..20 inclusive; the upper bound of range() is exclusive.
url_list = [
    'http://www.imdb.com/search/keyword?keywords=action-hero&mode=advanced&page={}&ref_=kw_nxt&sort=moviemeter,asc'.format(page)
    for page in range(1, 21)
]

In [389]:
# get IMDB scores
# Exactly one value is appended per "lister-item-content" block: the parsed
# score when a ratings bar with a numeric value is present, NaN otherwise.
IMDB_scores = []

for url in url_list:
    for item in read_url(url).find_all("div", {"class": "lister-item-content"}):
        score = np.nan
        ratings_bar = item.find('div', {'class': 'ratings-bar'})
        strong = ratings_bar.find('strong') if ratings_bar is not None else None
        if strong is not None:
            try:
                score = float(strong.get_text())
            except ValueError:
                # the text inside the ratings bar is not a number; keep NaN
                pass
        IMDB_scores.append(score)

In [390]:
import re
# get run times
# For every "text-muted" paragraph that contains at least one span, append the
# first integer found in its runtime span, or NaN when there is no runtime
# span (or it holds no digits).
runtimes = []

for url in url_list:
    for p in read_url(url).find_all("p", {"class": "text-muted"}):
        if p.find('span') is None:
            # no span at all: record nothing for this paragraph
            continue
        # Look the runtime up by its class rather than by position, so a
        # paragraph with fewer spans than expected cannot raise IndexError
        # and a span without a class attribute cannot raise KeyError.
        runtime_span = p.find('span', {'class': 'runtime'})
        digits = re.findall(r'\d+', str(runtime_span.get_text())) if runtime_span is not None else []
        runtimes.append(int(digits[0]) if digits else np.nan)


In [391]:
# Movie Ratings
# One entry per "text-muted" paragraph that has a span: the certificate text
# when the first span is the certificate, NaN otherwise.
ratings = []

for url in url_list:
    paragraphs = read_url(url).find_all("p", {"class": "text-muted"})
    for para in paragraphs:
        first_span = para.find('span')
        if first_span is None:
            # paragraphs without any span contribute nothing
            continue
        if first_span['class'] == ['certificate']:
            certificate = para.find('span', {'class': 'certificate'})
            ratings.append(str(certificate.get_text()))
        else:
            ratings.append(np.nan)

In [392]:
# Getting different genres
# One cleaned string (newlines removed, surrounding whitespace stripped)
# per span of class "genre" found on each page.
genres = []
for url in url_list:
    genre_spans = read_url(url).find_all('span', {"class": "genre"})
    genres.extend(
        str(span.get_text()).replace('\n', '').strip()
        for span in genre_spans
    )

In [393]:
# create categorical variable for each genre where 1 if movie contains genre, 0 otherwise.
gen_cat = ['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'History',
        'Horror', 'Music', 'Musical', 'Mystery', 'Reality-Tv', 'Romance', 'Sci-Fi', 'Short', 'Sport', 'Thriller', 'War', 'Western' ]

# one independent (initially empty) indicator list per genre name
Action, Adventure, Animation, Biography, Comedy, Crime, Documentary, Drama, Family, Fantasy, History, Horror, Music, Musical, Mystery, Reality_Tv, Romance, Sci_Fi, Short, Sport, Thriller, War, Western = ([] for _ in range(len(gen_cat)))

# same list objects, ordered exactly like gen_cat so the two can be zipped
gen_list = [Action, Adventure, Animation, Biography, Comedy, Crime, Documentary, Drama, Family, Fantasy, History, Horror, Music, Musical, Mystery, Reality_Tv, Romance, Sci_Fi, Short, Sport, Thriller, War, Western]

for g in genres:
    # Compare whole comma-separated genre names instead of substrings:
    # a plain `'Music' in g` is also true for "Musical". The comparison is
    # case-insensitive so a differently capitalised spelling in the scraped
    # text (e.g. "Reality-TV") still matches the name used here.
    present = {name.strip().lower() for name in g.split(',')}
    for cat, flags in zip(gen_cat, gen_list):
        flags.append(1 if cat.lower() in present else 0)

In [394]:
# movie titles
# Exactly one entry per "lister-item-header": the text of its first link.
# Appending every link in the header would add several entries for a header
# that carries more than one link, making this list longer than the number
# of result items.
titles = []
for url in url_list:
    for header in read_url(url).find_all('h3', {"class": "lister-item-header"}):
        link = header.find('a')
        titles.append(str(link.get_text()) if link is not None else np.nan)

In [395]:
titles

['Logan',
 'X-Men: Apocalypse',
 'John Wick: Chapter 2',
 'Doctor Strange',
 'The Wolverine',
 'Deadpool',
 'X-Men Origins: Wolverine',
 'X-Men',
 "Assassin's Creed",
 'Suicide Squad',
 'X-Men: Days of Future Past',
 'John Wick',
 'Legends of Tomorrow',
 'Gotham',
 'Rogue One',
 'Hawaii Five-0',
 'The Magnificent Seven',
 'X: First Class',
 'The Dark Knight',
 'The Legend of Tarzan',
 'X2',
 'Batman v Superman: Dawn of Justice',
 'Captain America: Civil War',
 'Guardians of the Galaxy',
 'Jack Reacher: Never Go Back',
 'The Nice Guys',
 'Mad Max: Fury Road',
 'Star Wars: Episode VII - The Force Awakens',
 'Kingsman: The Secret Service',
 'Star Trek Beyond',
 'Warcraft',
 'The Dark Knight Rises',
 'Pirates of the Caribbean: The Curse of the Black Pearl',
 'The Revenant',
 'Jurassic World',
 'The Lord of the Rings: The Fellowship of the Ring',
 'Star Wars: Episode IV - A New Hope',
 'Central Intelligence',
 'The Matrix',
 'Teenage Mutant Ninja Turtles: Out of the Shadows',
 "The Huntsman

In [396]:
# Assemble the scraped IMDB lists into one frame: descriptive columns first,
# followed by one 0/1 indicator column per genre (gen_cat holds the column
# names, gen_list the matching indicator lists, in the same order).
imdb_columns = {'Title': titles, 'Rating': ratings, 'Runtime': runtimes, 'IMDB_Score': IMDB_scores}
imdb_columns.update(zip(gen_cat, gen_list))
imdb_movies = pd.DataFrame(
    imdb_columns,
    columns=['Title', 'Rating', 'Runtime', 'IMDB_Score'] + gen_cat,
)

# Replace movie rating with value 1-4 based on typical box office performance.
# Series.map yields NaN for every value that is not a key of the mapping, so
# any other certificate (and already-missing values) becomes NaN in one step.
rating_codes = {'G': 1, 'PG': 2, 'PG-13': 4, 'R': 3}
imdb_movies['Rating'] = imdb_movies['Rating'].map(rating_codes)

imdb_movies.head()

Unnamed: 0,Title,Rating,Runtime,IMDB_Score,Action,Adventure,Animation,Biography,Comedy,Crime,...,Musical,Mystery,Reality-Tv,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
0,Logan,3.0,137.0,8.6,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,X-Men: Apocalypse,4.0,144.0,7.1,1,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,John Wick: Chapter 2,3.0,122.0,8.1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,Doctor Strange,4.0,115.0,7.7,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The Wolverine,4.0,126.0,6.7,1,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [397]:
url = 'http://www.the-numbers.com/movie/budgets/all'

# Take the first table on the page and flatten it into its td cells.
# The cells are read with a stride of 6: offsets 1-5 are used as release date,
# title, budget, domestic gross and worldwide gross (offset 0 is not used).
# NOTE(review): assumes every table row contributes exactly six td cells — confirm.
table1 = read_url(url).find_all('table')[0]
cells = table1.find_all('td')


def _to_dollars(cell):
    """Return the cell's text as an int after removing '$' and ',' characters."""
    return int(str(cell.get_text()).replace('$', '').replace(',', ''))


titles = [str(cell.find('b').find('a').get_text()) for cell in cells[2::6]]
budgets = [_to_dollars(cell) for cell in cells[3::6]]
d_grosses = [_to_dollars(cell) for cell in cells[4::6]]
w_grosses = [_to_dollars(cell) for cell in cells[5::6]]
release = [str(cell.get_text()) for cell in cells[1::6]]

In [398]:
# create binary for summer months and holiday season months
# date[i]     : parsed release date, or '' when the text is not m/d/Y
# x_factor[i] : 1 if the release month is May, June, July, November or
#               December, 0 for any other month, NaN when the date is unparseable
peak_months = {5, 6, 7, 11, 12}
date = []
x_factor = []
for rel in release:
    try:
        parsed = datetime.strptime(rel, '%m/%d/%Y')
    except (ValueError, TypeError):
        # only parsing failures are expected here; anything else should surface
        date.append('')
        x_factor.append(np.nan)
    else:
        date.append(parsed)
        x_factor.append(1 if parsed.month in peak_months else 0)

In [399]:
date

[datetime.datetime(2009, 12, 18, 0, 0),
 datetime.datetime(2015, 12, 18, 0, 0),
 datetime.datetime(2007, 5, 24, 0, 0),
 datetime.datetime(2015, 11, 6, 0, 0),
 datetime.datetime(2012, 7, 20, 0, 0),
 datetime.datetime(2013, 7, 2, 0, 0),
 datetime.datetime(2012, 3, 9, 0, 0),
 datetime.datetime(2010, 11, 24, 0, 0),
 datetime.datetime(2007, 5, 4, 0, 0),
 datetime.datetime(2015, 5, 1, 0, 0),
 datetime.datetime(2016, 5, 6, 0, 0),
 datetime.datetime(2016, 3, 25, 0, 0),
 datetime.datetime(2012, 12, 14, 0, 0),
 datetime.datetime(2009, 7, 15, 0, 0),
 datetime.datetime(2013, 12, 13, 0, 0),
 datetime.datetime(2014, 12, 17, 0, 0),
 datetime.datetime(2011, 5, 20, 0, 0),
 datetime.datetime(2006, 6, 28, 0, 0),
 datetime.datetime(2008, 11, 14, 0, 0),
 datetime.datetime(2012, 5, 4, 0, 0),
 datetime.datetime(2006, 7, 7, 0, 0),
 datetime.datetime(2013, 6, 14, 0, 0),
 datetime.datetime(2008, 5, 16, 0, 0),
 datetime.datetime(2012, 7, 3, 0, 0),
 datetime.datetime(2015, 6, 12, 0, 0),
 datetime.datetime(2012, 5

In [400]:
# Frame of the the-numbers data: one row per movie in the budget table.
budget_idx = pd.DataFrame({
    'Title': titles,
    'Budget': budgets,
    'DomesticGross': d_grosses,
    'WorldwideGross': w_grosses,
    'Release': release,
    'X_Factor': x_factor,
})

In [401]:
# Merge the-numbers and IMDB dataframes
# Inner join on the title, then drop (in place) every row that has a missing
# value in any column.
movies = imdb_movies.merge(budget_idx, how='inner', on='Title')
movies.dropna(inplace=True)
movies.head()

Unnamed: 0,Title,Rating,Runtime,IMDB_Score,Action,Adventure,Animation,Biography,Comedy,Crime,...,Short,Sport,Thriller,War,Western,Budget,DomesticGross,Release,WorldwideGross,X_Factor
0,Logan,3.0,137.0,8.6,1,0,0,0,0,0,...,0,0,0,0,0,127000000,163724809,3/3/2017,448355498,0
1,X-Men: Apocalypse,4.0,144.0,7.1,1,1,0,0,0,0,...,0,0,0,0,0,178000000,155442489,5/27/2016,542742489,1
2,Doctor Strange,4.0,115.0,7.7,1,1,0,0,0,0,...,0,0,0,0,0,165000000,232638727,11/4/2016,676338727,1
3,Doctor Strange,4.0,76.0,6.8,1,0,1,0,0,0,...,0,0,0,0,0,165000000,232638727,11/4/2016,676338727,1
4,The Wolverine,4.0,126.0,6.7,1,1,0,0,0,0,...,0,0,0,0,0,115000000,132556852,7/26/2013,416456852,1


In [402]:
# Parse the release strings into datetimes; errors='coerce' turns unparseable
# values into NaT. (The old `coerce=True` keyword is no longer accepted by pandas.)
movies['Release'] = pd.to_datetime(movies['Release'], errors='coerce')

  if __name__ == '__main__':


In [403]:
# Created a function to include additional data by searching for a title name retrieved in the previous dataframe
def scrape_imdb_listing(df, timeout=30):
    """
    Search IMDB for every title in ``df`` and scrape the first hit's page.

    For each value of ``df['Title']`` the IMDB "find" page is requested, the
    first link in the first row of the first ``findList`` table is followed,
    and a handful of fields are read from that page. Titles for which no
    usable search result is found are skipped (no row is produced).

    :df = DataFrame (or any mapping) with movie titles under the key 'Title'
    :timeout = seconds to wait for each HTTP request before giving up
    :return = tuple ``(listing_df, movie_list)`` where ``movie_list`` is a list
              of ``[title, rating_count, user_reviews, critic_reviews,
              director, country]`` rows and ``listing_df`` is the same data as
              a DataFrame with columns 'Title', 'Rating_Count',
              'User_Review_Count', 'Critic_Review_Count', 'Director', 'Country'.
              Fields that cannot be read are '' (counts, director, country)
              or [] (both review counts).
    """
    movie_list = []

    for movie in df['Title']:
        # Let requests build the query string so that spaces, '&', '#', ...
        # inside a title are URL-encoded instead of corrupting the URL.
        search_page = requests.get('http://www.imdb.com/find',
                                   params={'q': movie, 's': 'all'},
                                   timeout=timeout).text
        soup = BeautifulSoup(search_page, 'html.parser')

        try:
            results = soup('table', {'class': 'findList'})[0]
            link = results.find_all('tr')[0].find('a', href=True)['href']
        except (IndexError, TypeError, KeyError):
            # no result table, no rows, or no link in the first row
            continue

        detail_page = requests.get('http://www.imdb.com' + link, timeout=timeout).text
        soup = BeautifulSoup(detail_page, 'html.parser')

        try:
            count = soup.find('span', itemprop='ratingCount').text
        except AttributeError:
            count = ''

        try:
            review_links = soup.find('div', class_='titleReviewbarItemBorder').find_all('a')
            u_reviews = review_links[0].text.split(' ')[0]
            c_reviews = review_links[1].text.split(' ')[0]
        except (AttributeError, IndexError):
            # Both names must be reset here: otherwise c_reviews is undefined
            # on the first failure or silently keeps the previous movie's value.
            u_reviews = []
            c_reviews = []

        try:
            # first span with itemprop="name" — presumably the director; verify against the page layout
            director = soup.find('span', itemprop='name').text
        except AttributeError:
            director = ''

        try:
            # last word of the last titled link inside the "subtext" block,
            # with bracket characters removed
            country = soup.find('div', class_='subtext').find_all('a', title=True)[-1].text.split(' ')[-1]
            country = re.sub(r'[\(\)\{\}<>]', '', country)
        except (AttributeError, IndexError):
            country = ''

        movie_list.append([movie, count, u_reviews, c_reviews, director, country])

    df = pd.DataFrame(movie_list, columns=['Title', 'Rating_Count', 'User_Review_Count',
                                           'Critic_Review_Count', 'Director', 'Country'])
    return df, movie_list

In [404]:
# Look up every title of the merged frame on IMDB; the call returns the
# scraped rows both as a DataFrame (imdb_listings) and as a raw list (temp_list).
imdb_listings, temp_list = scrape_imdb_listing(movies)

In [405]:
imdb_listings.head()

Unnamed: 0,Title,Rating_Count,User_Review_Count,Critic_Review_Count,Director,Country
0,Logan,156083,662,442,James Mangold,USA\n\n
1,X-Men: Apocalypse,256653,710,447,Bryan Singer,USA\n\n
2,Doctor Strange,250924,614,491,Scott Derrickson,USA\n\n
3,Doctor Strange,250924,614,491,Scott Derrickson,USA\n\n
4,The Wolverine,348355,544,453,James Mangold,USA\n\n


In [406]:
# keep only the first scraped row for each title
imdb_listings = imdb_listings.drop_duplicates(subset=['Title'])

In [407]:
# Attach the scraped IMDB listing columns by title (inner join), drop rows with
# any missing value, then keep the first row for each title.
movies = movies.merge(imdb_listings, how='inner', on='Title')
movies.dropna(inplace=True)
movies = movies.drop_duplicates(subset=['Title'])
movies

Unnamed: 0,Title,Rating,Runtime,IMDB_Score,Action,Adventure,Animation,Biography,Comedy,Crime,...,Budget,DomesticGross,Release,WorldwideGross,X_Factor,Rating_Count,User_Review_Count,Critic_Review_Count,Director,Country
0,Logan,3.0,137.0,8.6,1,0,0,0,0,0,...,127000000,163724809,2017-03-03,448355498,0,156083,662,442,James Mangold,USA\n\n
1,X-Men: Apocalypse,4.0,144.0,7.1,1,1,0,0,0,0,...,178000000,155442489,2016-05-27,542742489,1,256653,710,447,Bryan Singer,USA\n\n
2,Doctor Strange,4.0,115.0,7.7,1,1,0,0,0,0,...,165000000,232638727,2016-11-04,676338727,1,250924,614,491,Scott Derrickson,USA\n\n
4,The Wolverine,4.0,126.0,6.7,1,1,0,0,0,0,...,115000000,132556852,2013-07-26,416456852,1,348355,544,453,James Mangold,USA\n\n
5,Deadpool,3.0,108.0,8.1,1,1,0,0,1,0,...,58000000,363070709,2016-02-12,783770709,0,603357,1166,624,Tim Miller,USA\n\n
6,X-Men Origins: Wolverine,4.0,107.0,6.7,1,1,0,0,0,0,...,150000000,179883157,2009-05-01,374825760,1,381799,653,355,Gavin Hood,USA\n\n
7,X-Men,4.0,104.0,7.4,1,1,0,0,0,0,...,75000000,157299717,2000-07-14,296339717,1,475462,1408,292,Bryan Singer,USA\n\n
8,Suicide Squad,4.0,123.0,6.3,1,1,0,0,0,0,...,175000000,325100054,2016-08-05,746100054,0,366561,1692,616,David Ayer,USA\n\n
9,X-Men: Days of Future Past,4.0,132.0,8.0,1,1,0,0,0,0,...,200000000,233921534,2014-05-23,747862775,1,543124,761,544,Bryan Singer,USA\n\n
10,The Magnificent Seven,4.0,133.0,7.0,1,1,0,0,0,0,...,90000000,93427848,2016-09-23,159762983,0,110831,324,333,Antoine Fuqua,USA\n\n


In [408]:
movies.columns

Index(['Title', 'Rating', 'Runtime', 'IMDB_Score', 'Action', 'Adventure',
       'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama',
       'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Musical', 'Mystery',
       'Reality-Tv', 'Romance', 'Sci-Fi', 'Short', 'Sport', 'Thriller', 'War',
       'Western', 'Budget', 'DomesticGross', 'Release', 'WorldwideGross',
       'X_Factor', 'Rating_Count', 'User_Review_Count', 'Critic_Review_Count',
       'Director', 'Country'],
      dtype='object')

In [411]:
# function to convert country by stripping
def convert_country(row):
    """Return ``row['Country']`` with leading/trailing whitespace removed.

    :row = mapping-like object (e.g. a DataFrame row) indexed by column name
    :return = the stripped string, or NaN when the 'Country' entry is missing
              or is not a string
    """
    try:
        return row['Country'].strip()
    except (AttributeError, KeyError, TypeError):
        # AttributeError: value has no .strip (not a string)
        # KeyError/TypeError: row has no usable 'Country' entry
        return np.nan

In [412]:
# Clean the Country column from the rows of `movies` itself. Computing it from
# imdb_listings would only give the right values while that frame's index
# labels happen to line up with the index labels of `movies`.
movies['Country'] = movies.apply(convert_country, axis=1)

In [413]:
movies.head()

Unnamed: 0,Title,Rating,Runtime,IMDB_Score,Action,Adventure,Animation,Biography,Comedy,Crime,...,Budget,DomesticGross,Release,WorldwideGross,X_Factor,Rating_Count,User_Review_Count,Critic_Review_Count,Director,Country
0,Logan,3.0,137.0,8.6,1,0,0,0,0,0,...,127000000,163724809,2017-03-03,448355498,0,156083,662,442,James Mangold,USA
1,X-Men: Apocalypse,4.0,144.0,7.1,1,1,0,0,0,0,...,178000000,155442489,2016-05-27,542742489,1,256653,710,447,Bryan Singer,USA
2,Doctor Strange,4.0,115.0,7.7,1,1,0,0,0,0,...,165000000,232638727,2016-11-04,676338727,1,250924,614,491,Scott Derrickson,USA
4,The Wolverine,4.0,126.0,6.7,1,1,0,0,0,0,...,115000000,132556852,2013-07-26,416456852,1,348355,544,453,James Mangold,USA
5,Deadpool,3.0,108.0,8.1,1,1,0,0,1,0,...,58000000,363070709,2016-02-12,783770709,0,603357,1166,624,Tim Miller,USA


In [414]:
# assigning 0 or 1 to movie for foreign feature
def check_foreign(row):
    """Flag a row as foreign (1) or not (0) from its 'Country' value.

    :row = mapping-like object (e.g. a DataFrame row) indexed by column name
    :return = 0 when the country (ignoring surrounding whitespace) is 'USA',
              'UK' or 'Canada'; 1 for any other non-blank country string;
              NaN when the 'Country' entry is missing, blank or not a string,
              so an unknown country is not counted as foreign
    """
    try:
        country = row['Country']
    except (KeyError, TypeError, IndexError):
        return np.nan

    if not isinstance(country, str):
        return np.nan

    country = country.strip()
    if not country:
        return np.nan

    return 0 if country in ('USA', 'UK', 'Canada') else 1

In [415]:
# row-wise (axis=1) foreign flag computed by check_foreign
movies['Foreign'] = movies.apply(check_foreign, axis=1)

In [416]:
# assign dummy variables to director:
# one indicator column per distinct value of movies['Director']
df_director = pd.get_dummies(movies['Director'])

In [417]:
# place the director indicator columns to the right of the movie columns
# (axis=1 concatenates column-wise, aligning rows on the index)
movies_new = pd.concat([movies, df_director], axis=1)

In [418]:
movies_new.head()

Unnamed: 0,Title,Rating,Runtime,IMDB_Score,Action,Adventure,Animation,Biography,Comedy,Crime,...,Tony Gilroy,Tony Scott,Uwe Boll,Walter Hill,Warren Beatty,William Friedkin,Wolfgang Petersen,Wych Kaosayananda,Xavier Gens,Zack Snyder
0,Logan,3.0,137.0,8.6,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,X-Men: Apocalypse,4.0,144.0,7.1,1,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Doctor Strange,4.0,115.0,7.7,1,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,The Wolverine,4.0,126.0,6.7,1,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Deadpool,3.0,108.0,8.1,1,1,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [427]:
# functions to convert rating, user and critic count to float
def _count_to_float(row, column):
    """Return ``row[column]`` as a float, or NaN when it cannot be converted.

    Strings may contain thousands separators ('1,166' -> 1166.0). Values that
    are already numeric are passed through ``float`` unchanged, so applying
    the conversion to an already-converted value does not produce NaN.
    """
    try:
        value = row[column]
        if isinstance(value, str):
            value = value.replace(',', '')
        return float(value)
    except (KeyError, TypeError, ValueError):
        # KeyError: column missing; TypeError: e.g. a list or None;
        # ValueError: text that is not a number (including '')
        return np.nan


def critic_review_count(row):
    """Return the row's 'Critic_Review_Count' as a float (NaN if not convertible)."""
    return _count_to_float(row, 'Critic_Review_Count')


def convert_rating_count(row):
    """Return the row's 'Rating_Count' as a float (NaN if not convertible)."""
    return _count_to_float(row, 'Rating_Count')


def user_review_count(row):
    """Return the row's 'User_Review_Count' as a float (NaN if not convertible)."""
    return _count_to_float(row, 'User_Review_Count')

In [428]:
# Convert the scraped count strings to floats. The values are read from
# `movies`, which has the same index as movies_new (movies_new was built by a
# column-wise concat of `movies` and its director dummies), instead of from
# imdb_listings, whose index only coincidentally lines up. Reading from
# `movies` also keeps the cell re-runnable: its count columns stay strings.
movies_new['Rating_Count'] = movies.apply(convert_rating_count, axis=1)
movies_new['User_Review_Count'] = movies.apply(user_review_count, axis=1)
movies_new['Critic_Review_Count'] = movies.apply(critic_review_count, axis=1)

In [429]:
# persist the final feature table to movies.csv in the working directory
# (with to_csv defaults the DataFrame index is written as the first column)
movies_new.to_csv('movies.csv')