# Get Film Data From Wikipedia

In [1]:
%pip install wikipedia

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Installing collected packages: wikipedia
  Running setup.py install for wikipedia: started
  Running setup.py install for wikipedia: finished with status 'done'
Successfully installed wikipedia-1.4.0
Note: you may need to restart the kernel to use updated packages.


  DEPRECATION: wikipedia is being installed using the legacy 'setup.py install' method, because it does not have a 'pyproject.toml' and the 'wheel' package is not installed. pip 23.1 will enforce this behaviour change. A possible replacement is to enable the '--use-pep517' option. Discussion can be found at https://github.com/pypa/pip/issues/8559

[notice] A new release of pip is available: 23.0.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from bs4 import BeautifulSoup
import wikipedia as wiki
import pandas as pd
import re

def get_wiki_page(title):
    """
    Get the wikipedia page given a title.

    The title is looked up as given. When Wikipedia reports it as ambiguous,
    the first disambiguation option is looked up instead.

    NOTE: From https://github.com/openai/openai-cookbook/blob/main/examples/fine-tuned_qa/olympics-1-collect-data.ipynb

    Parameters
    ----------
    title : str
        Title of the page to fetch.

    Returns
    -------
    wiki.WikipediaPage or None
        The resolved page, or None when neither the title nor its first
        disambiguation option leads to a page.
    """
    try:
        return wiki.WikipediaPage(title)
    except wiki.exceptions.DisambiguationError as ambiguous:
        options = ambiguous.options
    except wiki.exceptions.PageError:
        return None

    # The fallback lookup can fail as well (no options, a missing page, or
    # another ambiguous title); report that as "no page" like a direct miss.
    if not options:
        return None
    try:
        return wiki.WikipediaPage(options[0])
    except (wiki.exceptions.DisambiguationError, wiki.exceptions.PageError):
        return None

In [3]:
# Titles of the two Wikipedia list pages that get scraped
remake_list_titles = ["List_of_film_remakes_(A–M)", "List_of_film_remakes_(N-Z)"]

# Stage 1: fetch each list page from Wikipedia
wiki_pages = [get_wiki_page(list_title) for list_title in remake_list_titles]
# Stage 2: parse the rendered HTML of each page
page_soups = [BeautifulSoup(wiki_page.html(), "html.parser") for wiki_page in wiki_pages]
# Stage 3: keep every h2 heading of a page except its last two
pages = [page_soup.find_all('h2')[:-2] for page_soup in page_soups]

In [4]:
def get_rows(header):
    """
    Collect the original/remake pairs from the table that follows a heading.

    Parameters
    ----------
    header : bs4.element.Tag
        Section heading whose next sibling element holds the table rows.

    Returns
    -------
    list of dict
        One dict per usable table row with the keys ``header`` (the heading
        text with ``[edit]`` removed), ``original`` and ``remake`` (stripped
        text of the row's first and second ``td`` cells). Rows with fewer than
        two ``td`` cells are skipped; an empty list is returned when the
        heading has no next sibling.
    """
    table = header.find_next_sibling()
    if table is None:
        return []

    # The heading text is the same for every row, so compute it once.
    section = header.text.replace('[edit]', '')

    rows = []
    for row in table.find_all('tr')[1:]: # 1:end is to skip header
        cells = [cell.text.strip() for cell in row.find_all('td')[:2]]
        # A row without both cells cannot be unpacked into an
        # original/remake pair; skip it instead of raising ValueError.
        if len(cells) < 2:
            continue
        og, remake = cells
        rows.append({'header': section, 'original': og, 'remake': remake})
    return rows

In [5]:
# Flatten the rows of every section table on every page into one list
all_rows = [
    table_row
    for section_headers in pages
    for section_header in section_headers
    for table_row in get_rows(section_header)
]

# One record per table row: header, original, remake
og_remake_ref_df = pd.DataFrame(all_rows)

## Create All Movies DataFrame

In [6]:
"""
NOTE: Code written with assistance from ChatGPT

Prompt: Write me the python code to split a series of strings into a list using regex with the following pattern: open parenthesis, 
any number of digits, closed parenthesis. I want to split by this value but keep the content within the parenthesis
"""
def split_remakes(remake):
    """
    Split a remake cell into alternating title and year strings.

    The text is cut at every parenthesised run of digits; the digits are kept
    as their own items. Whatever follows the last such group is discarded, and
    every remaining item is stripped of surrounding whitespace.
    """
    pieces = re.split(r'(?:\((\d+)\))', remake)
    # re.split always returns at least one item; the last one is the text
    # after the final "(digits)" group and is not part of any pair.
    pieces.pop()
    return [piece.strip() for piece in pieces]

In [7]:
# create dataframe for all movies
all_movies = []
for remake_row in og_remake_ref_df.remake.apply(split_remakes).tolist():
    for i in range(len(remake_row) // 2):
        all_movies.append({
            'movie': remake_row[i*2].strip(),
            'year': remake_row[(i*2)+1].strip(),
            'status': 'remake'
        })

for og_row in og_remake_ref_df.original.str.split('('):
    all_movies.append({
        'movie': og_row[0].strip(),
        'year': og_row[-1].replace(')', '').strip(),
        'status': 'original'
    })

all_movies_df = pd.DataFrame(all_movies)

# Get Data From TMDB

In [8]:
import os
import time

import requests

In [9]:
def get_movie_info(title, year, api_key=None):
    """
    Search TMDB for a movie and return selected fields of the first result.

    Parameters
    ----------
    title : str
        Movie title used as the search query.
    year : str or int
        Year passed to the search to narrow the results.
    api_key : str, optional
        TMDB API key. When omitted it is read from the ``TMDB_API_KEY``
        environment variable, so the key is never stored in the notebook.

    Returns
    -------
    dict
        ``original_language``, ``original_title``, ``overview``,
        ``popularity``, ``release_date``, ``vote_average`` and ``vote_count``
        of the first search result.

    Raises
    ------
    KeyError
        If no ``api_key`` is passed and ``TMDB_API_KEY`` is not set, or if the
        response lacks an expected field.
    requests.HTTPError
        If the API answers with an error status.
    IndexError
        If the search returns no results.
    """
    if api_key is None:
        api_key = os.environ['TMDB_API_KEY']

    # Passing the values through `params` lets requests URL-encode them, so
    # titles containing "&", "#", "+" or spaces reach the API intact.
    params = {
        'query': title,
        'include_adult': 'true',
        'page': 1,
        'year': year,
        'api_key': api_key,
    }
    # A timeout keeps one stalled connection from hanging the whole run.
    res = requests.get("https://api.themoviedb.org/3/search/movie", params=params, timeout=10)
    res.raise_for_status()

    results = res.json()['results']
    if not results:
        raise IndexError(f"no TMDB results for {title!r} ({year})")

    # hope the first value is the movie we are looking for
    first_hit = results[0]
    wanted_fields = (
        'original_language',
        'original_title',
        'overview',
        'popularity',
        'release_date',
        'vote_average',
        'vote_count',
    )
    return {field: first_hit[field] for field in wanted_fields}

In [15]:
# Fields returned by get_movie_info; a failed lookup gets None for each of
# them so every record ends up with the same columns.
movie_info_fields = [
    'original_language',
    'original_title',
    'overview',
    'popularity',
    'release_date',
    'vote_average',
    'vote_count',
]

extended_info = []
for _, movie in all_movies_df.iterrows():
    # pause between requests to space out the API calls
    time.sleep(0.4)
    title, year, status = movie['movie'], movie['year'], movie['status']
    try:
        movie_info = get_movie_info(title, year)
    except Exception:
        # `except Exception` (not a bare `except`) so that interrupting the
        # kernel during a request still stops the loop.
        print('Error occured')
        print(movie)
        movie_info = dict.fromkeys(movie_info_fields)
    # carry the identifying columns along with the looked-up fields
    movie_info['movie'] = title
    movie_info['year'] = year
    movie_info['status'] = status
    extended_info.append(movie_info)

Error occured
movie      Aaina
year        2013
status    remake
Name: 11, dtype: object
Error occured
movie     Nee Bareda Kadambari
year                      1985
status                  remake
Name: 13, dtype: object
Error occured
movie     The Cat and the Canary
year                        1979
status                    remake
Name: 103, dtype: object
Error occured
movie     Memoirs of a Murderer aka 22-nenme no kokuhaku...
year                                                   2017
status                                               remake
Name: 122, dtype: object
Error occured
movie     Yugandhar
year           1979
status       remake
Name: 157, dtype: object
Error occured
movie     Shobhraj
year          1986
status      remake
Name: 159, dtype: object
Error occured
movie     Doraemon: Nobita and the Birth of Japan 2016
year                                              2016
status                                          remake
Name: 164, dtype: object
Error occured
movie     

In [19]:
# One record per movie: the looked-up fields plus movie, year and status
extended_info_df = pd.DataFrame(data=extended_info)

In [21]:
# Write both tables to CSV files without the index column
csv_exports = {
    'og_remake_ref.csv': og_remake_ref_df,
    'extended_info.csv': extended_info_df,
}
for csv_path, frame in csv_exports.items():
    frame.to_csv(csv_path, index=False)