# ***Disney Movies Dataset Creation***

## **Creating the dataset**

### import necessary libraries

In [11]:
from bs4 import BeautifulSoup as bs
import requests
import re

### loading the HTML and converting it into a BeautifulSoup object

In [12]:
# Fetch one sample film page to explore the infobox markup.
website = 'https://en.wikipedia.org/wiki/Toy_Story_3'
headers = {
    "User-Agent": "Mozilla"
}
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as the article.
r = requests.get(website, headers=headers, timeout=30)
r.raise_for_status()
soup = bs(r.content, 'lxml')

In [13]:
soup.title

<title>Toy Story 3 - Wikipedia</title>

### functions for scraping content out of the info box 

In [14]:
def get_content_value(row_data):
    """Extract the value of one infobox row from its ``<td>`` cell.

    Args:
        row_data: the ``<td>`` element of the row (anything exposing
            ``find_all``, ``find``, ``stripped_strings`` and ``get_text``).

    Returns:
        A list of strings when the cell contains ``<li>`` items or ``<br>``
        separated pieces, otherwise a single string. Non-breaking spaces are
        replaced by plain spaces in every case.
    """
    def _clean(text):
        # Wikipedia uses non-breaking spaces (e.g. between "103" and "minutes");
        # normalise them so later split(' ') / regex parsing behaves.
        return text.replace("\xa0", " ")

    lists = row_data.find_all("li")
    if lists:  # bullet list: one entry per <li>
        return [_clean(li.get_text(" ", strip=True)) for li in lists]
    elif row_data.find('br'):  # <br>-separated values: one entry per text fragment
        return [_clean(s) for s in row_data.stripped_strings]
    else:  # single value
        return _clean(row_data.get_text(" ", strip=True))

def clean_tags(soup):
    """Strip every ``<sup>`` and ``<span>`` element from ``soup`` in place.

    This is how footnote markers such as [1] and [2] are removed before the
    infobox text is read. Each matched element is passed to ``decompose()``,
    so the document is modified rather than copied; nothing is returned.
    """
    unwanted = soup.find_all(['sup', 'span'])
    for element in unwanted:
        element.decompose()

def get_info_box(url, timeout=30):
    """Scrape the infobox of a Wikipedia film page into a dictionary.

    Args:
        url: full URL of the film's Wikipedia page.
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        dict with a ``'title'`` key (text of the first infobox row's header)
        plus one key per infobox row, named after the row's ``<th>`` text and
        holding whatever ``get_content_value`` returns for the row's ``<td>``.

    Raises:
        requests.HTTPError: the page answered with an error status.
        ValueError: the page has no ``infobox vevent`` table, or its first
            row has no header cell to use as the title.
    """
    headers = {
        "User-Agent": "Mozilla"
    }
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()  # do not parse an error page as if it were the article
    soup = bs(r.content, 'lxml')
    clean_tags(soup)

    table = soup.find("table", attrs={"class": "infobox vevent"})
    if table is None:
        raise ValueError(f"no 'infobox vevent' table found at {url}")

    rows = table.find_all('tr')
    if not rows or rows[0].th is None:
        raise ValueError(f"infobox at {url} has no title row")

    movie = {'title': rows[0].th.get_text()}  # first row holds the film title

    for row in rows[1:]:
        # Rows without both a header and a value cell (the poster row, section
        # dividers, ...) carry no key/value pair; skip them instead of relying
        # on the poster always being row 1 or failing the whole movie.
        if row.th is None or row.td is None:
            continue
        header = row.th.get_text(" ", strip=True)
        movie[header] = get_content_value(row.td)

    return movie

In [15]:
get_info_box('https://en.wikipedia.org/wiki/Toy_Story_3')

{'title': 'Toy Story 3',
 'Directed by': 'Lee Unkrich',
 'Screenplay by': 'Michael Arndt',
 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'],
 'Produced by': 'Darla K. Anderson',
 'Starring': ['Tom Hanks',
  'Tim Allen',
  'Joan Cusack',
  'Don Rickles',
  'Wallace Shawn',
  'John Ratzenberger',
  'Estelle Harris',
  'Ned Beatty',
  'Michael Keaton',
  'Jodi Benson',
  'John Morris'],
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Edited by': 'Ken Schretzmann',
 'Music by': 'Randy Newman',
 'Production company': 'Pixar Animation Studios',
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release dates': ['June 12, 2010 ( Taormina Film Fest )',
  'June 18, 2010 (United States)'],
 'Running time': '103 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$200 million',
 'Box office': '$1.067 billion'}

### scraping the table of Disney movies

In [16]:
# Fetch the page that lists every Walt Disney Pictures film.
website = 'https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films'
headers = {
    "User-Agent": "Mozilla"
}
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as the film list.
r = requests.get(website, headers=headers, timeout=30)
r.raise_for_status()
soup = bs(r.content, 'lxml')

In [17]:
# Film titles in the list tables are italicised links, i.e. <i><a href=...>.
tables = soup.find_all("table", attrs={"class":"wikitable sortable"})
links_to_movies = [
    anchor['href'].strip()  # relative link such as /wiki/Dumbo
    for film_table in tables
    for anchor in film_table.select("i>a")
]
links_to_movies

['/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)',
 '/wiki/Pinocchio_(1940_film)',
 '/wiki/Fantasia_(1940_film)',
 '/wiki/The_Reluctant_Dragon_(1941_film)',
 '/wiki/Dumbo',
 '/wiki/Bambi',
 '/wiki/Saludos_Amigos',
 '/wiki/Victory_Through_Air_Power_(film)',
 '/wiki/The_Three_Caballeros',
 '/wiki/Make_Mine_Music',
 '/wiki/Song_of_the_South',
 '/wiki/Fun_and_Fancy_Free',
 '/wiki/Melody_Time',
 '/wiki/So_Dear_to_My_Heart',
 '/wiki/The_Adventures_of_Ichabod_and_Mr._Toad',
 '/wiki/Cinderella_(1950_film)',
 '/wiki/Treasure_Island_(1950_film)',
 '/wiki/Alice_in_Wonderland_(1951_film)',
 '/wiki/The_Story_of_Robin_Hood_(film)',
 '/wiki/Peter_Pan_(1953_film)',
 '/wiki/The_Sword_and_the_Rose',
 '/wiki/The_Living_Desert',
 '/wiki/Rob_Roy:_The_Highland_Rogue',
 '/wiki/The_Vanishing_Prairie',
 '/wiki/20,000_Leagues_Under_the_Sea_(1954_film)',
 '/wiki/Davy_Crockett:_King_of_the_Wild_Frontier_(film)',
 '/wiki/Lady_and_the_Tramp',
 '/wiki/The_African_Lion',
 '/wiki/The_Littlest_Outlaw',
 '/wiki/The_G

In [18]:
len(links_to_movies)

562

### scraping the info box for each movie in the table

In [None]:
# The collected links are relative, so each one is prefixed with the site root, e.g.
# 'https://en.wikipedia.org/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)'
prefix = 'https://en.wikipedia.org'
movies = []
for link in links_to_movies:
    try:
        url = prefix + link
        info_box = get_info_box(url)
    except Exception as e:
        # Best effort: report the page that failed and keep going with the rest.
        print(e)
        print(link)
    else:
        movies.append(info_box)

In [29]:
# The links printed by the scraping loop above could not be scraped because of
# language issues, a missing infobox, or an outdated infobox layout, so the
# number of movies is smaller than the number of links.
len(movies)

535

In [25]:
movies

[{'title': 'Snow White and the Seven Dwarfs',
  'Directed by': ['Perce Pearce',
   'William Cottrell',
   'Larry Morey',
   'Wilfred Jackson',
   'Ben Sharpsteen'],
  'Story by': ['Ted Sears',
   'Richard Creedon',
   'Otto Englander',
   'Dick Rickard',
   'Earl Hurd',
   'Merrill De Maris',
   'Dorothy Ann Blank',
   'Webb Smith'],
  'Based on': ['"', 'Snow White', '"', 'by the', 'Brothers Grimm'],
  'Produced by': 'Walt Disney',
  'Music by': ['Frank Churchill', 'Leigh Harline', 'Paul Smith'],
  'Production company': 'Walt Disney Productions',
  'Distributed by': 'RKO Radio Pictures',
  'Release dates': ['December 21, 1937 ( Carthay Circle Theatre )',
   'February 4, 1938 (United States)'],
  'Running time': '83 minutes',
  'Country': 'United States',
  'Language': 'English',
  'Budget': '$1.5 million',
  'Box office': '$418 million'},
 {'title': 'Pinocchio',
  'Directed by': ['Ben Sharpsteen',
   'Hamilton Luske',
   'Bill Roberts',
   'Norman Ferguson',
   'Jack Kinney',
   'Wilfr

### dumping the list of dicts into a JSON file

In [26]:
import json

In [47]:
def save_data(file_name, data):
    """Write ``data`` to ``file_name`` as JSON indented by four spaces.

    The file is opened in text write mode, so an existing file is overwritten.
    """
    with open(file_name, "w") as out_handle:
        json.dump(data, out_handle, indent=4)

def load_data(file_name):
    """Read the JSON document stored in ``file_name`` and return the parsed object."""
    with open(file_name) as in_handle:
        parsed = json.load(in_handle)
    return parsed

In [None]:
save_data('disney_dataset_cleaned.json', movies)

In [65]:
movies = load_data('disney_dataset_cleaned.json')

## **Cleaning the data**

### Subtasks :
 - ~~removing [1] [2] etc~~
 - ~~long string handling~~
 - converting release date into datetime object
 - ~~converting running time into integer~~
 - ~~inspecting movies that could not be scraped~~
 - ~~fixing 'Based on': ['"', 'Snow White', '"', 'by the', 'Brothers Grimm']~~
 - convert budget and box office into numbers

In [66]:
movies

[{'title': 'Snow White and the Seven Dwarfs',
  'Directed by': ['Perce Pearce',
   'William Cottrell',
   'Larry Morey',
   'Wilfred Jackson',
   'Ben Sharpsteen'],
  'Story by': ['Ted Sears',
   'Richard Creedon',
   'Otto Englander',
   'Dick Rickard',
   'Earl Hurd',
   'Merrill De Maris',
   'Dorothy Ann Blank',
   'Webb Smith'],
  'Based on': ['"', 'Snow White', '"', 'by the', 'Brothers Grimm'],
  'Produced by': 'Walt Disney',
  'Music by': ['Frank Churchill', 'Leigh Harline', 'Paul Smith'],
  'Production company': 'Walt Disney Productions',
  'Distributed by': 'RKO Radio Pictures',
  'Release dates': ['December 21, 1937 ( Carthay Circle Theatre )',
   'February 4, 1938 (United States)'],
  'Running time': '83 minutes',
  'Country': 'United States',
  'Language': 'English',
  'Budget': '$1.5 million',
  'Box office': '$418 million'},
 {'title': 'Pinocchio',
  'Directed by': ['Ben Sharpsteen',
   'Hamilton Luske',
   'Bill Roberts',
   'Norman Ferguson',
   'Jack Kinney',
   'Wilfr

In [67]:
def convert_running_time_into_int(time):
    """Return the running time in minutes as an int, or None if unavailable.

    Args:
        time: the scraped 'Running time' value: a string such as
            "83 minutes" or "68\\nminutes", a list of such strings (the first
            entry is used), or the placeholder 'N/A'.

    Returns:
        The first whole number found in the value, or None when the value is
        missing, empty, or contains no digits (e.g. 'N/A').
    """
    if isinstance(time, list):
        # Several running times listed: keep the first one.
        time = time[0] if time else None
    if not isinstance(time, str):
        return None
    # Take the first run of digits so ranges ("70–75 minutes") and any
    # separator (space, newline, non-breaking space) are handled uniformly.
    first_number = re.search(r"\d+", time)
    return int(first_number.group()) if first_number else None

In [68]:
print(convert_running_time_into_int('68\nminutes'))

68


In [69]:
# converting running time into int
# Movies without a 'Running time' entry fall back to the 'N/A' placeholder,
# which the converter maps to None.
for movie in movies:
    movie['Running time (int)'] = convert_running_time_into_int(
        movie.get('Running time', 'N/A')
    )

In [70]:
for movie in movies:
    print(movie.get("Based on", 'N/A'))

['"', 'Snow White', '"', 'by the', 'Brothers Grimm']
['The Adventures of Pinocchio', 'by', 'Carlo Collodi']
N/A
N/A
['Dumbo, the Flying Elephant', 'by', 'Helen Aberson', 'Harold Pearl']
['Bambi, a Life in the Woods', 'by', 'Felix Salten']
['Walt Disney', 'Dick Lundy']
['Victory Through Air Power', 'by', 'Maj. Alexander P. Seversky']
['Walt Disney', 'Dick Lundy']
['"', 'Casey at the Bat', '"', 'by', 'Ernest Thayer', 'Peter and the Wolf', 'by', 'Sergei Prokofiev']
['"', 'Uncle Remus', '"', 'by', 'Joel Chandler Harris']
['Sinclair Lewis ( Bongo )', '" Jack and the Beanstalk " ( Mickey and the Beanstalk )']
N/A
['The Wind in the Willows by Kenneth Grahame', '" The Legend of Sleepy Hollow " by Washington Irving']
['"', 'Cinderella', '"', 'by', 'Charles Perrault']
['Treasure Island', 'by', 'Robert Louis Stevenson']
["Alice's Adventures in Wonderland", 'and', 'Through the Looking-Glass', 'by', 'Lewis Carroll']
N/A
['Peter and Wendy', 'by', 'J. M. Barrie']
['When Knighthood Was in Flower', '(n

In [71]:
# fixing based on strings
# 'Based on' was scraped as a list of text fragments (e.g. ['"', 'Snow White',
# '"', 'by the', 'Brothers Grimm']); collapse it into one readable string.
# Movies where it is already a string, or missing altogether, are left alone.
for movie in movies:
    based_on = movie.get("Based on")
    if isinstance(based_on, list):
        movie['Based on'] = " ".join(based_on)

In [74]:
for movie in movies:
    print(movie.get("Based on", 'N/A'))

" Snow White " by the Brothers Grimm
The Adventures of Pinocchio by Carlo Collodi
N/A
N/A
Dumbo, the Flying Elephant by Helen Aberson Harold Pearl
Bambi, a Life in the Woods by Felix Salten
Walt Disney Dick Lundy
Victory Through Air Power by Maj. Alexander P. Seversky
Walt Disney Dick Lundy
" Casey at the Bat " by Ernest Thayer Peter and the Wolf by Sergei Prokofiev
" Uncle Remus " by Joel Chandler Harris
Sinclair Lewis ( Bongo ) " Jack and the Beanstalk " ( Mickey and the Beanstalk )
N/A
The Wind in the Willows by Kenneth Grahame " The Legend of Sleepy Hollow " by Washington Irving
" Cinderella " by Charles Perrault
Treasure Island by Robert Louis Stevenson
Alice's Adventures in Wonderland and Through the Looking-Glass by Lewis Carroll
N/A
Peter and Wendy by J. M. Barrie
When Knighthood Was in Flower (novel) by Charles Major When Knighthood Was in Flower (play) by James B. Fagan
N/A
N/A
N/A
Twenty Thousand Leagues Under the Seas by Jules Verne
N/A
"Happy Dan, the Cynical Dog" by Ward 

In [88]:
import re
# An integer or decimal with optional thousands separators, e.g. "23,777.930".
# The fractional part is "one dot followed by digits", so malformed input such
# as "1..5" is no longer swallowed whole.
number = r"\d+(?:,\d{3})*(?:\.\d+)?"
print(re.search(number, "23,777.930").group())

23,777.930


In [90]:
print([movie.get('Budget', 'N/A') for movie in movies])

['$1.5 million', '$2.6 million', '$2.28 million', '$600,000', '$950,000', '$2 million', 'N/A', '$788,000', 'N/A', '$1.35 million', '$2.125 million', 'N/A', '$1.5 million', 'N/A', '$2.2 million', '$1.8 million', '$3 million', 'N/A', '$4 million', '$2 million', '$300,000', '$1.8 million', 'N/A', '$5 million', '$7 million', '$4 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$700,000', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$6 million', 'under $1 million or $1,250,000', 'N/A', '$2 million', 'N/A', 'N/A', '$2.5 million', 'N/A', 'N/A', '$5 million', '$3.6–4 million', 'N/A', 'N/A', 'N/A', 'N/A', '$3 million', 'N/A', '$3 million', 'N/A', 'N/A', 'N/A', 'N/A', '$2 million', 'N/A', 'N/A', 'N/A', 'N/A', '$3 million', 'N/A', 'N/A', 'N/A', 'N/A', '$4.4–6 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$4 million', 'N/A', '$5 million', 'N/A', 'N/A', 'N/A', 'N/A', '$5 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$4 million', 'N/A', 'N/A', 'N/A', '$6.3

In [91]:
import re

# Magnitude words that may follow a dollar figure.
amounts = r"thousand|million|billion"
# An integer or decimal with optional thousands separators ("1,250,000", "3.6").
# The fraction must be a single dot followed by digits, so "1..5" is rejected
# instead of producing a string float() cannot parse.
number = r"\d+(?:,\d{3})*(?:\.\d+)?"

# "Word syntax": "$12.2 million", optionally a range such as "$3.6–4 million"
# or "$3 to 4 million".
word_re = rf"\${number}(-|\sto\s|–)?({number})?\s({amounts})"
# "Value syntax": a plain dollar figure such as "$790,000".
value_re = rf"\${number}"

def word_to_value(word):
    """Return the multiplier for a magnitude word.

    Accepts "thousand", "million" or "billion" (lower case); any other key
    raises KeyError.
    """
    return {
        "thousand": 10 ** 3,
        "million": 10 ** 6,
        "billion": 10 ** 9,
    }[word]

def parse_word_syntax(string):
    """Convert a "word syntax" amount such as "$12.2 million" to a float.

    The first number found in ``string`` is multiplied by the value of the
    first magnitude word found (matched case-insensitively). For a range
    like "$3.6–4 million" this means the lower bound is used.
    """
    magnitude = re.search(amounts, string, flags=re.I).group().lower()
    digits = re.search(number, string).group().replace(",", "")
    return float(digits) * word_to_value(magnitude)

def parse_value_syntax(string):
    """Convert a "value syntax" amount such as "$790,000" to a float.

    The first number found in ``string`` is taken, its thousands separators
    are dropped, and the result is returned as a float.
    """
    matched = re.search(number, string)
    return float(matched.group().replace(",", ""))

def money_conversion(money):
    """Convert a scraped 'Budget' / 'Box office' value to a float dollar amount.

    Examples:
        money_conversion("$12.2 million") --> 12200000.0   ## Word syntax
        money_conversion("$790,000")      --> 790000.0     ## Value syntax

    Args:
        money: a string, a list of strings (only the first entry is used),
            or the placeholder "N/A".

    Returns:
        The amount as a float, or None when the value is "N/A", empty, not a
        string, or contains no recognisable dollar figure.
    """
    if isinstance(money, list):
        if not money:  # nothing scraped for this row
            return None
        money = money[0]

    if not isinstance(money, str) or money == "N/A":
        return None

    # Word syntax is tried first: "$1 million" also contains a plain "$1",
    # which would otherwise be read as one dollar.
    word_syntax = re.search(word_re, money, flags=re.I)
    if word_syntax:
        return parse_word_syntax(word_syntax.group())

    value_syntax = re.search(value_re, money)
    if value_syntax:
        return parse_value_syntax(value_syntax.group())

    return None

In [92]:
# Add a numeric twin for each money field; movies lacking the field get None
# via the "N/A" placeholder.
for movie in movies:
    for field in ('Budget', 'Box office'):
        movie[f'{field} (float)'] = money_conversion(movie.get(field, "N/A"))

In [93]:
movies[-20]

{'title': "Disney's Snow White",
 'Directed by': 'Marc Webb',
 'Screenplay by': 'Erin Cressida Wilson',
 'Based on': 'Snow White and the Seven Dwarfs by Disney " Snow White " by the Brothers Grimm',
 'Produced by': ['Marc Platt', 'Jared LeBoff'],
 'Starring': ['Rachel Zegler', 'Andrew Burnap', 'Gal Gadot'],
 'Cinematography': 'Mandy Walker',
 'Edited by': ['Mark Sanger', 'Sarah Broshar'],
 'Music by': ['Jeff Morrow (score)',
  'Larry Morey and Frank Churchill (songs)',
  'Pasek and Paul (new songs)'],
 'Production companies': ['Walt Disney Pictures', 'Marc Platt Productions'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release dates': ['March 12, 2025 ( Alcázar of Segovia )',
  'March 21, 2025 (United States)'],
 'Running time': '109 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$240–270 million',
 'Box office': '$205.7 million',
 'Running time (int)': 109,
 'Budget (float)': 240000000.0,
 'Box office (float)': 205700000.0}

In [96]:
from datetime import datetime

def clean_date(date):
    """Return the part of ``date`` before the first "(", with whitespace trimmed.

    Example: 'June 12, 2010 ( Taormina Film Fest )' -> 'June 12, 2010'.
    """
    before_paren, _, _ = date.partition("(")
    return before_paren.strip()

def date_conversion(date):
    """Convert a scraped release date to a ``datetime``.

    Args:
        date: a date string, a list of date strings (only the first entry is
            used), or the placeholder "N/A".

    Returns:
        A ``datetime`` when the cleaned text matches "Month D, YYYY" or
        "D Month YYYY"; otherwise None (including for "N/A", an empty list,
        or a non-string value).
    """
    if isinstance(date, list):
        if not date:  # nothing scraped for this row
            return None
        date = date[0]

    if not isinstance(date, str) or date == "N/A":
        return None

    date_str = clean_date(date)

    for fmt in ("%B %d, %Y", "%d %B %Y"):
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            # Only a format mismatch means "try the next format"; anything
            # else is a real error and should surface.
            continue
    return None

In [98]:
# Infoboxes label this row either 'Release date' or 'Release dates' (plural,
# when several dates are listed). Check both so films with the plural label
# do not silently end up with None.
for movie in movies:
    release = movie.get('Release date', movie.get('Release dates', 'N/A'))
    movie['Release date (datetime)'] = date_conversion(release)
    

In [100]:
movies[300]

{'title': 'Tarzan II',
 'Directed by': 'Brian Smith',
 'Screenplay by': ['Jim Kammerud', 'Brian Smith', 'Bob Tzudiker Noni White'],
 'Based on': 'Tarzan of the Apes by Disney’s Tarzan by Walt Disney Animation Studios Disney’s The Legend of Tarzan by Walt Disney Television',
 'Produced by': ['Carolyn Bates', 'Jim Kammerud', 'Leslie Hough'],
 'Starring': ['Harrison Chad',
  'George Carlin',
  'Brad Garrett',
  'Ron Perlman',
  'Estelle Harris',
  'Glenn Close',
  'Lance Henriksen',
  'Brenda Grate',
  'Harrison Fahn'],
 'Edited by': ['Ron Price', 'John Royer'],
 'Music by': ['Mark Mancina (score)',
  'Dave Metzger (score)',
  'Phil Collins (songs)'],
 'Production company': 'Disneytoon Studios',
 'Distributed by': 'Buena Vista Home Entertainment',
 'Release date': ['June 14, 2005'],
 'Running time': '72 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Running time (int)': 72,
 'Budget (float)': None,
 'Box office (float)': None,
 'Release date (datetime)': datetime.datetim

In [101]:
import pickle

def save_data_pickle(name, data):
    """Pickle ``data`` into the file ``name``, overwriting any existing file.

    Pickle is used for this stage because the movie dicts now hold
    ``datetime`` values, which ``json.dump`` cannot serialise.
    """
    with open(name, 'wb') as sink:
        pickle.Pickler(sink).dump(data)
        
def load_data_pickle(name):
    """Return the object stored in the pickle file ``name``.

    NOTE: unpickling can execute arbitrary code, so only load files this
    notebook wrote itself.
    """
    with open(name, 'rb') as source:
        restored = pickle.Unpickler(source).load()
    return restored

In [102]:
save_data_pickle("disney_movie_data_cleaned_more.pickle", movies)

In [103]:
a = load_data_pickle("disney_movie_data_cleaned_more.pickle")
a == movies

True

## **Attaching IMDB/Rotten Tomatoes/Metascore scores**

In [116]:
movie_info_list = load_data_pickle('disney_movie_data_cleaned_more.pickle')

In [117]:
movie_info_list[-60]

{'title': 'Raya and the Last Dragon',
 'Directed by': ['Don Hall', 'Carlos López Estrada'],
 'Screenplay by': ['Qui Nguyen', 'Adele Lim'],
 'Story by': ['Paul Briggs',
  'Don Hall',
  'Adele Lim',
  'Carlos López Estrada',
  'Kiel Murray',
  'Qui Nguyen',
  'John Ripa',
  'Dean Wellins'],
 'Based on': 'Story ideas by Bradley Raymond and Helen Kalafatic',
 'Produced by': ['Osnat Shurer', 'Peter Del Vecho'],
 'Starring': ['Kelly Marie Tran',
  'Awkwafina',
  'Izaac Wang',
  'Gemma Chan',
  'Daniel Dae Kim',
  'Benedict Wong',
  'Sandra Oh',
  'Thalia Tran',
  'Lucille Soong',
  'Alan Tudyk'],
 'Cinematography': ['Rob Dressel (layout)', 'Adolph Lusinsky (lighting)'],
 'Edited by': ['Fabienne Rawley', 'Shannon Stein'],
 'Music by': 'James Newton Howard',
 'Production company': 'Walt Disney Animation Studios',
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release date': ['March 5, 2021'],
 'Running time': '107 minutes',
 'Country': 'United States',
 'Language': 'English',

In [119]:
import requests
import urllib
import os

def get_omdb_info(title, timeout=30):
    """Look up a film by title on the OMDb API and return the decoded JSON.

    Args:
        title: film title, sent as OMDb's ``t`` parameter.
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        The JSON body of the response as a dict.
    """
    base_url = "https://www.omdbapi.com/"
    # The key is read from the OMDB_API_KEY environment variable when set.
    # NOTE(review): the literal fallback is kept only so existing runs keep
    # working. A key committed in a notebook should be treated as leaked:
    # rotate it and remove the fallback.
    api_key = os.environ.get("OMDB_API_KEY", "f694132b")
    parameters = {"apikey": api_key, 't': title}
    # requests URL-encodes `params` itself, so no manual urlencode is needed.
    response = requests.get(base_url, params=parameters, timeout=timeout)
    return response.json()

def get_rotten_tomato_score(omdb_info):
    """Return the Rotten Tomatoes rating value from an OMDb response.

    Scans the response's 'Ratings' list (treated as empty when absent) and
    returns the 'Value' of the first entry whose 'Source' is
    'Rotten Tomatoes', or None if there is no such entry.
    """
    return next(
        (
            entry['Value']
            for entry in omdb_info.get('Ratings', [])
            if entry['Source'] == 'Rotten Tomatoes'
        ),
        None,
    )

get_omdb_info("into the woods")

{'Title': 'Into the Woods',
 'Year': '2014',
 'Rated': 'PG',
 'Released': '25 Dec 2014',
 'Runtime': '125 min',
 'Genre': 'Adventure, Comedy, Drama',
 'Director': 'Rob Marshall',
 'Writer': 'James Lapine, Stephen Sondheim',
 'Actors': 'Anna Kendrick, Meryl Streep, Chris Pine',
 'Plot': 'A witch tasks a childless baker and his wife with procuring magical items from classic fairy tales to reverse the curse put on their family tree.',
 'Language': 'English',
 'Country': 'United States, Canada, Italy, United Kingdom',
 'Awards': 'Nominated for 3 Oscars. 11 wins & 75 nominations total',
 'Poster': 'https://m.media-amazon.com/images/M/MV5BMTY4MzQ4OTY3NF5BMl5BanBnXkFtZTgwNjM5MDI3MjE@._V1_SX300.jpg',
 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '5.9/10'},
  {'Source': 'Rotten Tomatoes', 'Value': '70%'},
  {'Source': 'Metacritic', 'Value': '69/100'}],
 'Metascore': '69',
 'imdbRating': '5.9',
 'imdbVotes': '152,185',
 'imdbID': 'tt2180411',
 'Type': 'movie',
 'DVD': 'N/A',
 'BoxO

In [121]:
# One OMDb lookup per movie, keyed on the scraped title. Scores OMDb does not
# report are stored as None.
for movie in movie_info_list:
    omdb_info = get_omdb_info(movie['title'])
    movie['imdb'] = omdb_info.get('imdbRating')
    movie['metascore'] = omdb_info.get('Metascore')
    movie['rotten_tomatoes'] = get_rotten_tomato_score(omdb_info)

In [122]:
movie_info_list[-4]

{'title': 'The Hunchback of Notre Dame',
 'Directed by': ['Gary Trousdale', 'Kirk Wise'],
 'Screenplay by': ['Tab Murphy',
  'Irene Mecchi',
  'Bob Tzudiker',
  'Noni White',
  'Jonathan Roberts'],
 'Story by': 'Tab Murphy',
 'Based on': 'The Hunchback of Notre-Dame by Victor Hugo',
 'Produced by': 'Don Hahn',
 'Starring': ['Tom Hulce',
  'Demi Moore',
  'Tony Jay',
  'Kevin Kline',
  'Paul Kandel',
  'Jason Alexander',
  'Charles Kimbrough',
  'Mary Wickes',
  'David Ogden Stiers'],
 'Edited by': 'Ellen Keneshea',
 'Music by': 'Alan Menken',
 'Production company': 'Walt Disney Feature Animation',
 'Distributed by': 'Buena Vista Pictures Distribution',
 'Release dates': ['June 19, 1996 ( Louisiana Superdome )',
  'June 21, 1996 (United States)'],
 'Running time': '91 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$70 million',
 'Box office': '$325.3 million',
 'Running time (int)': 91,
 'Budget (float)': 70000000.0,
 'Box office (float)': 325300000.0,
 'Rele

In [123]:
save_data_pickle("disney_movie_data_added_ratings_more.pickle", movie_info_list)

In [146]:
# Load the ratings-enriched data saved in the previous cell, so the imdb /
# metascore / rotten_tomatoes fields reach the DataFrame and the Excel export.
# (The 'cleaned_more' pickle predates the OMDb lookup and has no ratings.)
movie_info_list = load_data_pickle('disney_movie_data_added_ratings_more.pickle')

In [153]:
import pandas as pd

In [None]:
# One row per movie, with a column for every key seen in any movie dict.
df = pd.DataFrame(movie_info_list)

In [154]:
df.head()

Unnamed: 0,title,Directed by,Story by,Based on,Produced by,Music by,Production company,Distributed by,Release dates,Running time,...,Release date,Written by,Edited by,Languages,Narrated by,Screenplay by,Countries,Color process,Production companies,Layouts by
0,Snow White and the Seven Dwarfs,"[Perce Pearce, William Cottrell, Larry Morey, ...","[Ted Sears, Richard Creedon, Otto Englander, D...",""" Snow White "" by the Brothers Grimm",Walt Disney,"[Frank Churchill, Leigh Harline, Paul Smith]",Walt Disney Productions,RKO Radio Pictures,"[December 21, 1937 ( Carthay Circle Theatre ),...",83 minutes,...,,,,,,,,,,
1,Pinocchio,"[Ben Sharpsteen, Hamilton Luske, Bill Roberts,...","[Ted Sears, Otto Englander, Webb Smith, Willia...",The Adventures of Pinocchio by Carlo Collodi,Walt Disney,"[Leigh Harline, Paul J. Smith]",Walt Disney Productions,RKO Radio Pictures,"[February 7, 1940 ( Center Theatre ), February...",88 minutes,...,,,,,,,,,,
2,Fantasia,"[Samuel Armstrong, James Algar, Bill Roberts, ...","[Joe Grant, Dick Huemer]",,"[Walt Disney, Ben Sharpsteen]",See plot,Walt Disney Productions,RKO Radio Pictures,,126 minutes,...,"[November 13, 1940]",,,,,,,,,
3,The Reluctant Dragon,"[Alfred Werker, (live action), Hamilton Luske,...",,,Walt Disney,"[Frank Churchill, Larry Morey]",Walt Disney Productions,RKO Radio Pictures,,74 minutes,...,"[June 27, 1941]","[Live-action:, Ted Sears, Al Perkins, Larry Cl...",Paul Weatherwax,,,,,,,
4,Dumbo,"[Ben Sharpsteen, Norman Ferguson, Wilfred Jack...","[Joe Grant, Dick Huemer]","Dumbo, the Flying Elephant by Helen Aberson Ha...",Walt Disney,"[Frank Churchill, Oliver Wallace]",Walt Disney Productions,RKO Radio Pictures,"[October 23, 1941 (New York City), October 31,...",64 minutes,...,,,,,,,,,,


In [156]:
df.to_excel("movie_excel.xlsx", index=False)