# Web Scrapping Disney movie data from Wikipedia

### Task 1: Get info Box (store in python dictionary)

##### import necessary libraries

In [1]:
from bs4 import BeautifulSoup
import requests

##### Load the webpage

In [2]:
r=requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films").text
soup=BeautifulSoup(r,'lxml')
movies=soup.select(".wikitable.sortable i a")

In [3]:
def get_content_value(row_data):
    if row_data.find('li'):
        return [li.get_text(" ",strip=True).replace("\xa0"," ") for li in row_data.find_all('li')]
    elif row_data.find("br"):
        return [text for text in row_data.stripped_strings]
    else:
        return row_data.get_text(" ",strip=True).replace("\xa0"," ") 

In [4]:
def clean_tags(soup):
    for tag in soup.find_all(["sup","span"]):
        tag.decompose()

### Task 2 : Get info box for all movies

In [5]:
def get_info_box(url):
    movie_info={}
    html_text=requests.get(url).text
    soup=BeautifulSoup(html_text,'lxml')
    info_box=soup.find('table',class_='infobox vevent')
    info_rows=info_box.find_all('tr')
    clean_tags(soup)
   
    for index,row in enumerate(info_rows):
        if index==0:
            movie_info['title']=row.find('th').get_text(" ",strip=True)
        else:
            header=row.find('th')
            if header:
                content_key=row.find('th').get_text(" ",strip=True)
                content_data=get_content_value(row.find("td"))  
                movie_info[content_key]=content_data  
    return movie_info 

In [6]:
movie_info_list=[] 
base_path="https://en.wikipedia.org/"
for index,movie in enumerate(movies):
    if index % 10==0:
        print(index)
    try:
        relative_path=movie['href']
        full_path=base_path+relative_path
        title=movie['title'] 
        movie_info_list.append(get_info_box(full_path))
    except Exception as e:
        print(movie.get_text())
        print(e)  

0
10
20
30
40
Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
50
60
70
80
90
100
110
120
130
140
The London Connection
'NoneType' object has no attribute 'find'
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
The Beatles: Get Back – The Rooftop Concert
'NoneType' object has no attribute 'find'
490
500
61
'NoneType' object has no attribute 'find_all'
All Night Long
'NoneType' object has no attribute 'find'
510
Keeper of the Lost Cities
'NoneType' object has no attribute 'find_all'
Muppet Man
'NoneType' object has no attribute 'find_all'
520
Sister Act 3
'NoneType' object has no attribute 'find'
The Thief
'NoneType' object has no attribute 'find_all'
Tom Sawyer
'NoneType' object has no attribute 'find_all'
530
Tower of Terror
'NoneType' object has no attribute 'find_all'
Tron: Ares
'NoneType' object has no attribute 'find'
FC Barc

##### save and reload data

In [7]:
import json

def save_data(title,data):
    with open (title,'w',encoding='utf8') as f:
        json.dump(data,f,ensure_ascii=False,indent=2)

def load_data(title):
    with open(title,encoding='utf-8') as f:
        return json.load(f)

save_data("disney_data.json",movie_info_list)

### Task 3 : Clean our data

In [8]:
print([movie.get('Running time (int)','N/A') for movie in movie_info_list])

['N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'

##### Subtasks:
* Clean up references[1]
* Convert running time into an interger
* Convert dates into datetime object
* Split up the  long strings
* Convert Budget & Box office to numbers

##### Convert running time into an interger

In [9]:
def minutes_to_integer(running_time):
    if running_time=="N/A":
        return None
    if isinstance(running_time,list):
        return int(running_time[0].split(" ")[0])
    else:
        return int(running_time.split(" ")[0])

for movie in movie_info_list:
    movie['Running time (int)']=minutes_to_integer(movie.get('Running time','N/A'))        


In [10]:
print([movie.get('Budget','N/A')for movie in movie_info_list])

['N/A', '$1.49 million', '$2.6 million', '$2.28 million', '$600,000', '$950,000', '$858,000', 'N/A', '$788,000', 'N/A', '$1.35 million', '$2.125 million', 'N/A', '$1.5 million', '$1.5 million', 'N/A', '$2.2 million', '$1,800,000', '$3 million', 'N/A', '$4 million', '$2 million', '$300,000', '$1.8 million', 'N/A', '$5 million', 'N/A', '$4 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$700,000', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$6 million', 'under $1 million or $1,250,000', 'N/A', '$2 million', 'N/A', 'N/A', '$2.5 million', 'N/A', 'N/A', '$4 million', '$3.6 million', 'N/A', 'N/A', 'N/A', 'N/A', '$3 million', 'N/A', '$3 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$3 million', 'N/A', 'N/A', 'N/A', 'N/A', '$4.4–6 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$4 million', 'N/A', '$5 million', 'N/A', 'N/A', 'N/A', 'N/A', '$5 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$4 million', 'N/A', 'N/A', 'N/A', '

##### Convert Budget & Box office to numbers

In [11]:
import re
from sys import flags
amounts=r"thousand|million|billion"
number=r"\d+(,\d{3})*\.*\d*"
word_re=fr"\${number}(-|\sto\s|–)?({number})?\s({amounts})"
value_re=fr"\${number}"

def word_to_value(word):
    value_dict={"thousand":1000,"million":1000000,"billion":1000000000}
    return value_dict[word]


def parse_word_syntax(string):
    value_string=re.search(number,string).group()
    value=float(value_string.replace(",",""))
    word=re.search(amounts,string,flags=re.I).group().lower()
    word_value=word_to_value(word)
    return value*word_value


def parse_value_syntax(string):
    value_string=re.search(number,string).group()
    value=float(value_string.replace(",",""))
    return value


def money_conversion(money):
    if money=="N/A":
        return None
    if isinstance(money,list):
        money=money[0]
    
    word_syntax=re.search(word_re,money,flags=re.I)
    value_syntax=re.search(value_re,money)
    if word_syntax:
        return parse_word_syntax(word_syntax.group())
    elif value_syntax:
        return parse_value_syntax(value_syntax.group())
    else:
        return None      


In [12]:
for movie in movie_info_list:
    movie['Budget (float)']=money_conversion(movie.get('Budget','N/A'))  
    movie['Box Office (float)']=money_conversion(movie.get('Box office','N/A'))  

In [13]:
print([movie.get('Budget (float)','N/A')for movie in movie_info_list])

[None, 1490000.0, 2600000.0, 2280000.0, 600000.0, 950000.0, 858000.0, None, 788000.0, None, 1350000.0, 2125000.0, None, 1500000.0, 1500000.0, None, 2200000.0, 1800000.0, 3000000.0, None, 4000000.0, 2000000.0, 300000.0, 1800000.0, None, 5000000.0, None, 4000000.0, None, None, None, None, None, None, 700000.0, None, None, None, None, None, 6000000.0, 1000000.0, None, 2000000.0, None, None, 2500000.0, None, None, 4000000.0, 3600000.0, None, None, None, None, 3000000.0, None, 3000000.0, None, None, None, None, None, None, None, None, None, 3000000.0, None, None, None, None, 4400000.0, None, None, None, None, None, None, None, None, None, None, None, 4000000.0, None, 5000000.0, None, None, None, None, 5000000.0, None, None, None, None, None, None, 4000000.0, None, None, None, 6300000.0, None, None, None, None, None, None, None, None, 5000000.0, None, None, None, None, 8000000.0, None, None, None, None, None, None, 1000000.0, None, None, None, None, 5000000.0, None, None, None, 7500000.0, No

In [14]:
print([movie.get('Box Office (float)','N/A')for movie in movie_info_list])

[45.472, 418000000.0, 164000000.0, 83300000.0, 960000.0, 1300000.0, 267399999.99999997, 1135000.0, 799000.0, 3355000.0, 3275000.0, 65000000.0, 3165000.0, 2560000.0, 3700000.0, 1625000.0, 182000000.0, 4100000.0, 2400000.0, 2100000.0, 87400000.0, 1000000.0, 2600000.0, None, 1750000.0, 28200000.0, 2150000.0, 187000000.0, 2100000.0, 1600000.0, 1700000.0, None, None, 2750000.0, None, 1750000.0, 6250000.0, None, 1800000.0, 2500000.0, 51600000.0, 12300000.0, None, 1700000.0, 3100000.0, None, 3750000.0, 2300000.0, None, 40000000.0, 303000000.0, 25400000.0, 25100000.0, None, None, 4600000.0, 3500000.0, 5000000.0, None, None, None, 21745500.0, 22100000.0, 2550000.0, 3000000.0, 4350000.0, 4200000.0, 22200000.0, 1600000.0, 4000000.0, 2250000.0, 3500000.0, 103100000.0, 3500000.0, 1275000.0, 4000000.0, 28068222.0, 6200000.0, 22565634.0, None, 16207116.0, 3000000.0, 1900000.0, 4000000.0, 378000000.0, None, 5000000.0, 21540050.0, 2250000.0, 4150000.0, 3300000.0, 51300000.0, 1300000.0, None, 5500000.0,

In [15]:
print([movie.get("Release dates","N/A") for movie in movie_info_list])

['N/A', ['December 21, 1937 ( Carthay Circle Theatre )', 'February 4, 1938 (United States)'], ['February 7, 1940 ( Center Theatre )', 'February 23, 1940 (United States)'], 'N/A', 'N/A', ['October 23, 1941 (New York City)', 'October 31, 1941 (U.S.)'], ['August 9, 1942 (World Premiere – London)', 'August 13, 1942 (Premiere – New York City)', 'August 21, 1942 (U.S.)'], ['August 24, 1942 (World Premiere – Rio de Janeiro)', 'February 6, 1943 (U.S. Premiere – Boston)', 'February 19, 1943 (U.S.)'], 'N/A', ['December 21, 1944 (Mexico City)', 'February 3, 1945 (US)'], ['April 20, 1946 (New York City premiere)', 'August 15, 1946 (U.S.)'], ['November 12, 1946 (Premiere: Atlanta, Georgia)', 'November 20, 1946', 'March 30, 1947 (Stanford theater, Palo Alto, California)'], 'N/A', 'N/A', ['November 29, 1948 (Chicago, Illinois)', 'January 19, 1949 (Indianapolis, Indiana)'], 'N/A', ['February 15, 1950 (Boston)', 'March 4, 1950 (United States)'], ['June 22, 1950 (World Premiere-London)', 'July 30, 1950 

##### Convert dates into datetime object

In [16]:
from datetime import datetime

dates=[movie.get("Release dates","N/A") for movie in movie_info_list]

def clean_date(date):
    return date.split("(")[0].strip()

def date_conversion(date):
    if isinstance(date,list):
        date=date[0]
    if date=="N/A":
        return None    
    date_str=clean_date(date)
    fmts=["%B %d, %Y","%d %B %Y"]
    for fmt in fmts:
        try:
            return datetime.strptime(date_str,fmt)
        except:
            pass
    return None        


In [17]:
for movie in movie_info_list:
    movie['Release date (datetime)']=date_conversion(movie.get('Release date','N/A'))

##### Saving Data in pickle file format

In [18]:
import pickle
def save_data_pickel(name,data):
    with open(name, 'wb') as f:
        pickle.dump(data, f)



In [19]:
import pickle
def load_data_pickle(name):
    with open(name, 'rb') as f:
        return pickle.load(f)

In [20]:
save_data_pickel("disney_movie_data_cleaned_more.pickle",movie_info_list)

In [21]:
a=load_data_pickle("disney_movie_data_cleaned_more.pickle")


In [22]:
a==movie_info_list

True

### Task 4 : Attach IMBD rating and Rotten Tomatoes/Meta scores rating scores

In [23]:
movie_info_list=load_data_pickle("disney_movie_data_cleaned_more.pickle")

In [24]:
import requests
import urllib
import os

def get_omdb_info(title):
    base_url="http://www.omdbapi.com/?"
    parameters={"apikey":os.environ['OMDB_api_key'],"t":title}
    params_encode=urllib.parse.urlencode(parameters)
    full_url=base_url+params_encode
    return requests.get(full_url).json()

def get_rotten_tomato_score(omdb_info):
    ratings =omdb_info.get('Ratings',[])
    for rating in ratings:
        if rating['Source']=='Rotten Tomatoes':
            return rating['Value']
    return None        



In [25]:
#import time
for movie in movie_info_list:
    #while True:
    title=movie['title']
    omdb_info=get_omdb_info(title)
    movie['imdb']=omdb_info.get('imdbRating',None)
    movie['metascore']=omdb_info.get('Metascore',None)
    movie['rotten_tomatoes']=get_rotten_tomato_score(omdb_info)
        #time_wait=1.5
        #print(f'waiting for {time_wait} seconds...')
        #time.sleep(time_wait)
        



In [26]:
movie_info_list[10]

{'title': 'Make Mine Music',
 'Directed by': ['Jack Kinney',
  'Clyde Geronimi',
  'Hamilton Luske',
  'Joshua Meador',
  'Robert Cormack'],
 'Story by': ['James Bodrero',
  'Homer Brightman',
  'Erwin Graham',
  'Eric Gurney',
  'T. Hee',
  'Sylvia Holland',
  'Dick Huemer',
  'Dick Kelsey',
  'Dick Kinney',
  'Jesse Marsh',
  'Tom Oreb',
  'Cap Palmer',
  'Erdman Penner',
  'Dick Shaw',
  'Harry Reeves',
  'John Walbridge',
  'Roy Williams'],
 'Based on': ['"',
  'Casey at the Bat',
  '"',
  'by',
  'Ernest Thayer',
  'Peter and the Wolf',
  'by',
  'Sergei Prokofiev'],
 'Produced by': 'Walt Disney',
 'Starring': 'Nelson Eddy',
 'Music by': ['Eliot Daniel',
  'Ken Darby',
  'Charles Wolcott',
  'Oliver Wallace',
  'Edward Plumb'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'RKO Radio Pictures, Inc.',
 'Release dates': ['April 20, 1946 (New York City premiere)',
  'August 15, 1946 (U.S.)'],
 'Running time': '75 minutes',
 'Country': 'United States',
 'Languag

In [27]:
save_data_pickel('disney_movie_data_final.pickle',movie_info_list)

### Task 5 : Save data as JSON and CSV

In [28]:
movie_info_copy=[movie.copy() for movie in movie_info_list]

In [29]:
import datetime
for movie in movie_info_copy:
    current_date=movie['Release date (datetime)']
    if current_date:
        movie['Release date (datetime)']=current_date.strftime("%B %d, %Y")
    else:
        movie['Release date (datetime)']=None    

In [30]:
save_data('disney_data_final.json',movie_info_copy)

In [31]:
import pandas as pd
df=pd.DataFrame(movie_info_list)

In [32]:
df.to_csv('disney_mocie_data_final.csv')