In [1]:
#This file uses BeautifulSoup to collect a list of paired films -- one remake and one original -- from Wikipedia.
#It creates a CSV file with one entry per film (after collecting more data about them, they will be paired up again later).

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests

%matplotlib inline

from bs4 import BeautifulSoup

In [2]:
#Request Movies & Remakes A to M
#Downloads the Wikipedia "List of film remakes (A-M)" page; `response` holds the HTTP result used by the next cells
url = 'https://en.wikipedia.org/wiki/List_of_film_remakes_(A%E2%80%93M)'
response = requests.get(url)

In [3]:
#Display the HTTP status code of the request above (200 means the page was fetched successfully)
response.status_code

200

In [4]:
#Save that file
#The page text is encoded to UTF-8 bytes, so the file is opened in binary mode:
#the bytes are written unchanged (no newline translation) and the write also works on Python 3
with open('data_files/movies_atom.html', 'wb') as f:
    f.write(response.text.encode('utf-8'))

In [5]:
#Request Movies & Remakes N to Z
#Downloads the Wikipedia "List of film remakes (N-Z)" page
#Note: this rebinds `url` and `response`, replacing the values from the A to M request
url = 'https://en.wikipedia.org/wiki/List_of_film_remakes_(N%E2%80%93Z)'
response = requests.get(url)

In [6]:
#Display the HTTP status code of the N to Z request (200 means the page was fetched successfully)
response.status_code

200

In [7]:
#Save that file
#The page text is encoded to UTF-8 bytes, so the file is opened in binary mode:
#the bytes are written unchanged (no newline translation) and the write also works on Python 3
with open('data_files/movies_ntoz.html', 'wb') as f:
    f.write(response.text.encode('utf-8'))

In [8]:
#Go through html tables to capture the information in the tables

def get_pairs(file):
    """Collect the text of each table row's cells from a parsed HTML document.

    `file` is the parsed document (a BeautifulSoup object, or anything that
    exposes findAll the same way). Every <tr> becomes a dictionary in which
    the first <td> is stored under 'remake' and the second under 'original';
    each value is whatever findAll(text = True) returns for that cell.
    Rows without any <td> (e.g. header rows) give empty dictionaries and are
    left out. A row with more than two <td> cells raises IndexError.

    Returns the list of non-empty row dictionaries in document order.
    """
    labels = ['remake', 'original']

    #One dictionary per table row, keyed by the cell's column position
    parsed_rows = [
        {labels[position]: td.findAll(text = True)
         for position, td in enumerate(tr.findAll('td'))}
        for tr in file.findAll('tr')
    ]

    #Eliminate empty dictionaries (from table headers/other empty table rows)
    return [entry for entry in parsed_rows if entry]

In [9]:
#Call get_pairs on the 2 html files
#Each saved page is read inside a with-block so its file handle is closed as soon as the contents are parsed
with open('data_files/movies_atom.html') as html_file:
    atom = BeautifulSoup(html_file.read(), 'lxml')
pairs_atom = get_pairs(atom)

with open('data_files/movies_ntoz.html') as html_file:
    ntoz = BeautifulSoup(html_file.read(), 'lxml')
pairs_ntoz = get_pairs(ntoz)

In [10]:
#Combine into one list
#The A to M pairs come first, followed by the N to Z pairs
combined_pairs = pairs_atom + pairs_ntoz

#Drop last row, which doesn't actually contain movie information
movie_pairs = combined_pairs[:-1]

In [1]:
#Unfortunately, Wikipedia is not perfect: in a number of cases the director or year information was missing or
# not in the proper spot. I fixed these by hand. The code for that is not included here, but feel free to contact me for details.

In [13]:
#Create & populate lists to hold movie names, years, directors
#Each cell's text list is read by position: 0 = name, 1 = year, 2 = director
original_name = [pair['original'][0] for pair in movie_pairs]
original_year = [pair['original'][1] for pair in movie_pairs]
original_director = [pair['original'][2] for pair in movie_pairs]
remake_name = [pair['remake'][0] for pair in movie_pairs]
remake_year = [pair['remake'][1] for pair in movie_pairs]
remake_director = [pair['remake'][2] for pair in movie_pairs]

#Verify all looks correct & of the same length
for column in (original_name, original_year, original_director,
               remake_name, remake_year, remake_director):
    print(len(column))

534
534
534
534
534
534


In [14]:
#Use regular expressions to eliminate parentheses around years & capture 4 digits only

import re

#Compile the pattern once; search() returns the first run of four digits found in each raw year string
four_digits = re.compile(r'\d\d\d\d')

original_year_regex = [four_digits.search(raw).group(0) for raw in original_year]
remake_year_regex = [four_digits.search(raw).group(0) for raw in remake_year]

#Verify this worked
original_year = original_year_regex
print(len(original_year))
print(original_year[0:20])

remake_year = remake_year_regex
print(len(remake_year))
print(remake_year[0:20])

534
[u'1957', u'2005', u'1963', u'1943', u'1961', u'1964', u'1957', u'1940', u'1986', u'1931', u'1939', u'1947', u'1966', u'1937', u'1943', u'1979', u'1956', u'1970', u'1951', u'1946']
534
[u'2007', u'2010', u'2011', u'1951', u'1996', u'2005', u'2007', u'1951', u'2014', u'1933', u'1957', u'1984', u'2004', u'1938', u'1989', u'2005', u'1988', '2010', u'1994', u'1980']


In [15]:
#Need to delete Dumbo (index 124) because it's on the list even though the remake hasn't come out yet
dumbo_index = 124
film_columns = [original_name, original_year, original_director,
                remake_name, remake_year, remake_director]

#Show the entry currently stored at that index
for column in film_columns:
    print(column[dumbo_index])

#Only remove the row while it still holds Dumbo: popping by a fixed index is not idempotent,
#so without this check every re-run of the cell would delete a different film
if (original_name[dumbo_index].strip() == 'Dumbo'
        and remake_name[dumbo_index].strip() == 'Dumbo'):
    for column in film_columns:
        column.pop(dumbo_index)
else:
    print('Index %d does not hold Dumbo; nothing removed' % dumbo_index)

Dumbo
1941
Ben Sharpsteen
Dumbo
2017
Tim Burton


u'Tim Burton'

In [16]:
#Create a dataframe with one row for each movie to facilitate further data collection
#All remakes are listed first, followed by the originals in the same order
all_movies = pd.DataFrame(
    {
        'title': remake_name + original_name,
        'year': remake_year + original_year,
        'director': remake_director + original_director,
    },
    columns=['title', 'year', 'director'],
)

#info() prints its summary itself and returns None, so it is called directly instead of being printed
all_movies.info()
all_movies.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1066 entries, 0 to 1065
Data columns (total 3 columns):
title       1066 non-null object
year        1066 non-null object
director    1066 non-null object
dtypes: object(3)
memory usage: 25.1+ KB
None


Unnamed: 0,title,year,director
0,12,2007,Nikita Mikhalkov
1,13,2010,Géla Babluani
2,13 Assassins,2011,Takashi Miike
3,The 13th Letter,1951,Otto Preminger
4,101 Dalmatians,1996,Stephen Herek
5,2001 Maniacs,2005,Tim Sullivan
6,3:10 to Yuma,2007,James Mangold
7,Abbott and Costello Meet the Invisible Man,1951,Charles Lamont
8,About Last Night,2014,Steve Pink
9,Adorable,1933,William Dieterle


In [17]:
#Save the combined dataframe (one row per film) to a single CSV file
#The row index is written as the first column, since index=False is not passed
all_movies.to_csv('data_files/all_movies.csv', encoding='utf-8')