# Disney Dataset Creation (BeautifulSoup)

### Scrape & clean a list of Disney Wikipedia pages to create a dataset for further analysis

Based on a YouTube tutorial from Keith Galli's channel:
https://www.youtube.com/watch?v=Ewgy-G9cmbg

### Import Necessary Libraries

In [None]:
from bs4 import BeautifulSoup as bs
import requests
import re
import json
from datetime import datetime
import pickle
import urllib


# Get list of movies from Wikipedia

In [None]:
# Wikipedia page listing the Walt Disney Pictures films; download it once and
# keep the response in `page` for parsing.
url = 'https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films'
page = requests.get(url)

In [None]:
# Parse the downloaded HTML. No parser is named here, so BeautifulSoup chooses
# one from whatever is installed.
soup = bs(page.content)
# Despite its name, `tabels` holds the <a> tags nested inside <i> tags within
# elements that carry both the `wikitable` and `sortable` classes.
tabels = soup.select('.wikitable.sortable i a')

In [None]:
# Collects one info dict per movie page that was scraped successfully.
movies = []

In [None]:
def get_content_valeu(row_data):
    """Return the text held in an infobox value cell.

    Args:
        row_data: BeautifulSoup tag of the value cell (the row's ``<td>``).

    Returns:
        A list of strings when the cell contains ``<li>`` items or ``<br>``
        separated lines, otherwise a single string. Non-breaking spaces are
        replaced by regular spaces in every case.
    """
    if row_data.find('li'):
        return [
            li.get_text(' ', strip=True).replace('\xa0', ' ')
            for li in row_data.find_all('li')
        ]

    if row_data.find('br'):
        # Normalise non-breaking spaces here as well, exactly like the other
        # two branches, so later parsing that splits on ' ' sees plain spaces.
        return [text.replace('\xa0', ' ') for text in row_data.stripped_strings]

    return row_data.get_text(' ', strip=True).replace('\xa0', ' ')


In [None]:
def tag_remover(soup):
    """Delete every ``<sup>`` and ``<span>`` tag from *soup* in place.

    Args:
        soup: The parsed BeautifulSoup tree (or tag) to clean.

    Returns:
        None. The tree that was passed in is modified directly.
    """
    unwanted = soup.find_all(['sup', 'span'])
    for element in unwanted:
        element.decompose()

In [None]:
def extract_info(movie_url):
    """Scrape the infobox of one Wikipedia movie page into a dict.

    Args:
        movie_url: Full URL of the movie's Wikipedia page.

    Returns:
        A dict whose ``'Titel'`` entry is the header text of the first infobox
        row and whose remaining entries map each row's header text to the
        value produced by ``get_content_valeu``. Returns ``None`` after
        printing the URL and the error when anything goes wrong (for example
        when the page has no ``infobox vevent`` element).
    """
    try:
        page = requests.get(movie_url)
        soup = bs(page.content)
        info_box = soup.find(class_='infobox vevent')
        info_row = info_box.find_all('tr')
        # Strip <sup>/<span> tags from the tree; the rows collected above are
        # part of that same tree, so they are cleaned too.
        tag_remover(soup)
        movie_dic = {}

        for index, row in enumerate(info_row):
            header = row.find('th')
            if index == 0:
                movie_dic['Titel'] = header.get_text(' ', strip=True)
                continue

            value_cell = row.find('td')
            # Skip rows lacking a header or a value cell instead of letting a
            # None cell raise and discard the whole movie.
            if header is None or value_cell is None:
                continue

            content_key = header.get_text(' ', strip=True)
            movie_dic[content_key] = get_content_valeu(value_cell)
        return movie_dic
    except Exception as e:
        # Best-effort scraping: report the failing page and carry on.
        print(movie_url)
        print(e)
        return None


In [None]:
# Scrape every linked movie page. Each page is fetched exactly once and the
# result is reused, rather than calling extract_info twice per movie.
for movie in tabels:
    movie_url = 'https://en.wikipedia.org' + movie['href']
    movie_info = extract_info(movie_url)
    # extract_info returns None on failure; keep only non-empty results.
    if movie_info:
        movies.append(movie_info)
    


## Save/Reload Movie Data

In [None]:
def save_data(titel, movie_list):
    """Write *movie_list* as JSON (4-space indent) to the file named *titel*.

    Args:
        titel: Path of the file to create or overwrite.
        movie_list: JSON-serialisable data to store.
    """
    with open(titel, 'w') as out_file:
        json.dump(movie_list, fp=out_file, indent=4)
        
def load_data(titel):
    """Read the JSON file named *titel* and return the decoded data.

    Args:
        titel: Path of the JSON file to read.

    Returns:
        The Python object decoded from the file's JSON content.
    """
    with open(titel, 'r') as in_file:
        return json.load(in_file)

In [None]:
# Write the scraped records to disk as JSON.
save_data('disney_data.json', movies)

In [None]:
# Replace `movies` with the records read back from the JSON file.
movies = load_data('disney_data.json')

In [None]:
# Scrape a single page; as a bare expression its result is shown as the cell
# output in a notebook.
extract_info('https://en.wikipedia.org/wiki/Toy_Story_4')

In [None]:
# Number of movie records currently held.
len(movies)

In [None]:
# Display one record (index 43) for a manual look at its contents.
movies[43]

# Clean data and add more info

### Convert running time into an integer

In [None]:
#[movie.get('Running time', 'N/A') for movie in movies]
def min_to_int(running_time):
    """Convert a scraped running time to a whole number of minutes.

    Args:
        running_time: A string such as ``'85 minutes'``, a list of such
            strings (only the first entry is used), or the sentinel ``'N/A'``.

    Returns:
        The first integer found in the text, or ``None`` when the value is
        ``'N/A'``, an empty list, or contains no digits.
    """
    if running_time == 'N/A':
        return None
    if isinstance(running_time, list):
        if not running_time:
            return None
        running_time = running_time[0]

    # Take the first run of digits rather than int() on the first
    # space-separated token, which raised on values such as '85–90 minutes'.
    match = re.search(r'\d+', str(running_time))
    return int(match.group()) if match else None

# Add an integer running-time field to every record; records without a
# 'Running time' entry pass the 'N/A' sentinel to min_to_int.
for movie in movies:
    raw_running_time = movie.get('Running time', 'N/A')
    movie['Running time(int)'] = min_to_int(raw_running_time)


### Convert Budget & Box office to numbers

In [None]:
#[movie.get('Budget', 'N/A') for movie in movies]


In [None]:
# One amount: optional '$', digits, then an optional run of digits/commas/dots.
money_r = r'[\$]?(\d+([,\.\d]+)?)'
magnitud = r"thousand|million|billion"
# An amount, an optional range separator, one whitespace character and a
# magnitude word. Capture groups: 1 = number, 4 = magnitude word.
money_mag = rf'{money_r}(-|\sto\s|–)?\s({magnitud})'

# Multiplier for each magnitude word accepted by `magnitud`.
_MAGNITUDES = {'thousand': 1000, 'million': 1000000, 'billion': 1000000000}


def add_magnitud(string):
    """Return the multiplier for a magnitude word.

    Args:
        string: ``'thousand'``, ``'million'`` or ``'billion'``.

    Raises:
        KeyError: If *string* is not one of the known words.
    """
    return _MAGNITUDES[string]


def parse_money(string):
    """Return the first plain amount in *string* as a float.

    Thousands separators are dropped, e.g. ``'$1,234,567'`` -> ``1234567.0``.

    Raises:
        AttributeError: If *string* contains no amount.
        ValueError: If the matched text is not a valid number.
    """
    valeu = re.search(money_r, string).group(1)
    return float(valeu.replace(',', ''))


def parse_mag(string):
    """Return the first "<number> <magnitude>" amount in *string* as a float.

    For a range such as ``'$12–15 million'`` the match lands on the number
    adjacent to the magnitude word (15 here).

    Raises:
        AttributeError: If *string* contains no such amount.
        ValueError: If the matched number is not a valid number.
    """
    match = re.search(money_mag, string)
    # Read the pieces from the capture groups instead of splitting on ' ':
    # the pattern accepts any whitespace (tab, non-breaking space, ...)
    # between number and word, which a literal-space split cannot handle.
    number = match.group(1).replace(',', '')
    return float(number) * add_magnitud(match.group(4))


def money_convert(money):
    """Convert a scraped money value to a float.

    Args:
        money: A string such as ``'$1.5 million'`` or ``'$790,000'``, a list
            of such strings (only the first entry is used), or ``'N/A'``.

    Returns:
        The amount as a float, or ``None`` when the value is ``'N/A'``, empty,
        not a string, or holds no parsable amount.
    """
    if isinstance(money, list):
        money = money[0] if money else None
    if not isinstance(money, str) or money == 'N/A':
        return None

    try:
        # Prefer the form with a magnitude word; fall back to a plain amount.
        if re.search(money_mag, money):
            return parse_mag(money)
        if re.search(money_r, money):
            return parse_money(money)
    except ValueError:
        # Matched text such as '1.2.3' is not a number; treat it as missing.
        return None
    return None


In [None]:
# Add numeric 'Box office (int)' and 'Budget (int)' fields to every record;
# a missing source field is passed to money_convert as the 'N/A' sentinel.
for movie in movies:
    for field in ('Box office', 'Budget'):
        movie[f'{field} (int)'] = money_convert(movie.get(field, 'N/A'))

In [None]:
# Display the record 100 places from the end to check the fields added so far.
movies[-100]

### Convert dates into datetime objects

In [None]:
#[movie.get('Release date', 'N/A') for movie in movies]

In [None]:
# strptime formats tried in order: 'June 28, 1950', '28 June, 1950' and the
# comma-less day-first form '28 June 1950'.
date_frt = ['%B %d, %Y', '%d %B, %Y', '%d %B %Y']


def date_cleaner(date_string):
    """Return *date_string* cut at the first ``'('`` and stripped of whitespace."""
    return date_string.split('(')[0].strip()


def datetime_convertor(datetime_string):
    """Convert a scraped release date to a ``datetime``.

    Args:
        datetime_string: A date string, a list of date strings (only the first
            entry is used), or the sentinel ``'N/A'``.

    Returns:
        The parsed ``datetime``, or ``None`` when the value is ``'N/A'``,
        empty, not a string, or matches none of the formats in ``date_frt``.
    """
    if isinstance(datetime_string, list):
        datetime_string = datetime_string[0] if datetime_string else None
    if not isinstance(datetime_string, str) or datetime_string == 'N/A':
        return None

    cleaned = date_cleaner(datetime_string)
    for frt in date_frt:
        try:
            return datetime.strptime(cleaned, frt)
        except ValueError:
            # Not this format; try the next one.
            continue
    return None


In [None]:
# Add a parsed 'Release date (date)' field to every record; records without a
# 'Release date' entry pass the 'N/A' sentinel to datetime_convertor.
for movie in movies:
    raw_release_date = movie.get('Release date', 'N/A')
    movie['Release date (date)'] = datetime_convertor(raw_release_date)

In [None]:
# Display the record 100 places from the end to check the parsed date field.
movies[-100]

### Save/Reload Movie Data

In [None]:
def save_data_pickle(name, data):
    """Pickle *data* into the file named *name*, overwriting it.

    Args:
        name: Path of the file to write (opened in binary mode).
        data: Any picklable object.
    """
    with open(name, 'wb') as sink:
        pickle.Pickler(sink).dump(data)
def load_data_pickle(name):
    """Return the object unpickled from the file named *name*.

    Args:
        name: Path of the pickle file to read (opened in binary mode).

    Returns:
        The object stored in the file.
    """
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(name, 'rb') as source:
        return pickle.Unpickler(source).load()

In [None]:
# Pickle the cleaned records; they now hold datetime objects, which the JSON
# writer used earlier cannot serialise as-is.
save_data_pickle("disney_movie_data_cleaned_more.pickle", movies)

In [None]:
# Replace `movies` with the records read back from the pickle file.
movies = load_data_pickle("disney_movie_data_cleaned_more.pickle")

# Attach IMDB/Rotten Tomatoes/Metascore scores

In [None]:
# OMDb endpoint; the trailing '?' lets the encoded query string be appended
# directly.
# NOTE(review): requests go over plain HTTP, API key included — consider
# https if the service supports it.
data_url = 'http://www.omdbapi.com/?'


def get_info_omdb(title):
    """Query OMDb for *title* and return the decoded JSON response.

    Args:
        title: Movie title sent as the ``t`` query parameter.

    Returns:
        Whatever ``requests``' ``.json()`` decodes from the response body.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` has been loaded.
    from urllib.parse import urlencode

    # NOTE(review): 0000000 looks like a placeholder API key — supply a real one.
    parameters = {'apikey': 0000000, 't': title}
    full_path = data_url + urlencode(parameters)

    return requests.get(full_path).json()

def get_rotten_tomato_score(omdb_info):
    """Return the Rotten Tomatoes rating value from an OMDb response dict.

    Args:
        omdb_info: Dict that may contain a ``'Ratings'`` list of dicts, each
            with ``'Source'`` and ``'Value'`` keys.

    Returns:
        The ``'Value'`` of the first rating whose ``'Source'`` is
        ``'Rotten Tomatoes'``, or ``None`` when there is none.
    """
    matching_values = (
        entry['Value']
        for entry in omdb_info.get('Ratings', [])
        if entry['Source'] == 'Rotten Tomatoes'
    )
    return next(matching_values, None)


In [None]:
# Attach the OMDb scores to every record.
for movie in movies:
    # The scraper stores the title under 'Titel' (capital T); the lowercase
    # key never exists, so it made every lookup query OMDb with None.
    omdb = get_info_omdb(movie.get('Titel'))
    movie['imdbRating'] = omdb.get('imdbRating')
    movie['Metascore'] = omdb.get('Metascore')
    movie['rotten_tomatoes'] = get_rotten_tomato_score(omdb)


In [None]:
# Display the record 50 places from the end to check the attached scores.
movies[-50]

In [None]:
# Pickle the final records, including the attached scores.
save_data_pickle('disney_movie_data_final.pickle', movies)