# Data Gathering from Wikipedia 

Using Wikipedia API to create a list of movie franchises and the list of movies associated with them. 

In [None]:
pip install wikipedia

In [1]:
import requests
import json
from pathlib import Path
import pandas as pd
import wikipedia
from bs4 import BeautifulSoup
import re
import string

## Wikipedia API 

In [2]:
wiki = "https://en.wikipedia.org/wiki/"
page = wikipedia.page("Lists_of_feature_film_series")
raw_link_list = list(page.links)

In [3]:
link_list = []
for link in raw_link_list:
    if "List of feature" in link:
        link = link.replace(" ","_")
        link_list.append(link)
        
link_list

['List_of_feature_film_series_with_11_to_20_entries',
 'List_of_feature_film_series_with_eight_entries',
 'List_of_feature_film_series_with_five_entries',
 'List_of_feature_film_series_with_four_entries',
 'List_of_feature_film_series_with_more_than_twenty_entries',
 'List_of_feature_film_series_with_nine_entries',
 'List_of_feature_film_series_with_seven_entries',
 'List_of_feature_film_series_with_six_entries',
 'List_of_feature_film_series_with_ten_entries',
 'List_of_feature_film_series_with_three_entries']

This function extracts the links from the "List of Feature Films" wiki page api.
These links are then used to find each franchise list on the corresponding pages using beautiful soup.

*Note: Unfortunately at this point we must transfer to using requests instead of the API as beautiful soup only works with html data structures, which the API does not provide.*

## Beautiful Soup

In [4]:
pages = []
for link in link_list:
    url = wiki + link
    html = requests.get(url).content
    soup = BeautifulSoup(html, "html.parser")

    with open("output.html", "w", encoding = 'utf-8') as file:
    
        # prettify the soup object and convert it into a string  
        file.write(str(soup.prettify()))
        
    pages.append(soup)

In [5]:
print(url)

https://en.wikipedia.org/wiki/List_of_feature_film_series_with_three_entries


This initial use of beautiful soup extracts all the movie information from the pages in the list above.\
The data is parsed into a list of its sections based on the html format given.

In [6]:
# Parse html block using beautiful soup commands
def parse_bs(tag, div, opt):
    if opt:
        parent = tag.findAll(div, attrs = opt)
    else: 
        parent = tag.findAll(div)
    
    
    parsed = []
    for div in parent:
        parsed.append(div)
    
    return parsed

In [7]:
divisions = []
for page in pages:
    division = parse_bs(page, "div", {"class":"div-col"})
    divisions.append(division)

Breaks down html page into divisions.\
Each division contains all franchises of a particular count.\
ie division[0] contains all franchises of length 11.

In [8]:
sections = []
for div in divisions:
    for d in div:
        sections.append(parse_bs(d, "ul", None))

print(len(sections))
#for s in sections:
#    print(s, "\n")
print(sections[-1])

154
[<ul><li><i><a href="/wiki/DC_Extended_Universe#Zack_Snyder's_Justice_League_(2021)" title="DC Extended Universe">Zack Snyder's Justice League Trilogy</a></i>
<ol><li><i><a href="/wiki/Man_of_Steel_(film)" title="Man of Steel (film)">Man of Steel</a></i> (2013)</li>
<li><i><a href="/wiki/Batman_v_Superman:_Dawn_of_Justice" title="Batman v Superman: Dawn of Justice">Batman v Superman: Dawn of Justice – Ultimate Edition</a></i> (2016)</li>
<li><i><a href="/wiki/Zack_Snyder%27s_Justice_League" title="Zack Snyder's Justice League">Zack Snyder's Justice League</a></i> (2021)</li></ol></li>
<li><i><a href="/wiki/Zeitgeist_(film_series)" title="Zeitgeist (film series)">Zeitgeist</a></i>
<ol><li><i>Zeitgeist: The Movie</i> (2007) (V)</li>
<li><i>Zeitgeist: Addendum</i> (2008) (V)</li>
<li><i>Zeitgeist: Moving Forward</i> (2011) (V)</li></ol></li>
<li><i>Zenon</i>
<ol><li><i><a href="/wiki/Zenon:_Girl_of_the_21st_Century_(film)" title="Zenon: Girl of the 21st Century (film)">Zenon: Girl of 

Breaks down division page into into sections.\
Each section contains all franchises that start with a particular letter,\
    ie. section[0] contains all franchises that begin with the letter "A".

In [9]:
def extract_title_line(franchises, list_len, movie_count):
    title_list = []
    # Extract title from each franchise and store in title_list
    step = 0
    for i in range(list_len):
        title_list.append(str(franchises).split("\n")[step])
        step+=movie_count
        
    return title_list

In [10]:
movie_list = []
franchise_list = []
for section in sections:
    for s in section:
        letter = parse_bs(s, "ol", None)
        if letter:
            movie_count = len(parse_bs(letter[0], "li", None))
        
            for f in letter:
                movie_list.append(f)
    
            titles = extract_title_line(s, len(letter), (movie_count+1))
            for t in titles:
                franchise_list.append(t)
    
print(len(movie_list))  
print(len(franchise_list))  
    #for i in range(17):
    #    
    #    print(franchise_list[i], "\n")
    #    print(movie_list[i], "\n\n")

1290
1290


In [11]:
movie_franchises = pd.DataFrame()
movie_franchises["franchise_name"] = franchise_list
movie_franchises["movie_name"] = movie_list
movie_franchises

Unnamed: 0,franchise_name,movie_name
0,"<ul><li><i><a href=""/wiki/The_Aldrich_Family#F...","[[[<a href=""/wiki/What_a_Life_(film)"" title=""W..."
1,"<ul><li><i><a href=""/wiki/Coffin_Joe"" title=""C...","[[[<a href=""/wiki/At_Midnight_I%27ll_Take_Your..."
2,"<li><i><a href=""/wiki/The_Crime_Club#Film"" tit...","[[[<a href=""/wiki/The_Westland_Case"" title=""Th..."
3,"<ul><li><i><a href=""/wiki/Fast_%26_Furious"" ti...","[[[<a href=""/wiki/The_Fast_and_the_Furious_(20..."
4,<ul><li><i>Gingerdead Man vs. Evil Bong</i>,"[[[<a href=""/wiki/The_Gingerdead_Man"" title=""T..."
...,...,...
1285,"<ul><li><i><a href=""/wiki/DC_Extended_Universe...","[[[<a href=""/wiki/Man_of_Steel_(film)"" title=""..."
1286,"<li><i><a href=""/wiki/Zeitgeist_(film_series)""...","[[[Zeitgeist: The Movie], (2007) (V)], \n, [[..."
1287,<li><i>Zenon</i>,"[[[<a href=""/wiki/Zenon:_Girl_of_the_21st_Cent..."
1288,"<li><i><a href=""/wiki/Zorro"" title=""Zorro"">Zor...","[[[<a class=""extiw"" href=""https://fr.wikipedia..."


In [12]:
# break down movie_name into list of movies using .apply()
# extract franchise_name (convert to string?)
# explode() movie_name
# extract date from movie_name

# extract movie_name (str)

In [13]:
movie_franchises["movie_name"] = movie_franchises["movie_name"].apply(lambda x : parse_bs(x, "li", None))

This cell above breaks the section down to access each franchise movie list

In [14]:
def extract_title(line):
    regex = re.compile(r'<[^>]+>')
    def remove_html(string):
        return regex.sub('', string)
    title=remove_html(line)
    for char in "*":
        title = title.replace(char, '')
    return title

In [15]:
def clean_title(title):
    pattern = r'\[.*?\]'
    re.sub(pattern, '', title)
    return title

In [16]:
movie_franchises["franchise_name"] = movie_franchises["franchise_name"].apply(lambda x : extract_title(x))
movie_franchises

Unnamed: 0,franchise_name,movie_name
0,The Aldrich Family,"[[[<a href=""/wiki/What_a_Life_(film)"" title=""W..."
1,Coffin Joe,"[[[<a href=""/wiki/At_Midnight_I%27ll_Take_Your..."
2,The Crime Club,"[[[<a href=""/wiki/The_Westland_Case"" title=""Th..."
3,Fast &amp; Furious,"[[[<a href=""/wiki/The_Fast_and_the_Furious_(20..."
4,Gingerdead Man vs. Evil Bong,"[[[<a href=""/wiki/The_Gingerdead_Man"" title=""T..."
...,...,...
1285,Zack Snyder's Justice League Trilogy,"[[[<a href=""/wiki/Man_of_Steel_(film)"" title=""..."
1286,Zeitgeist,"[[[Zeitgeist: The Movie], (2007) (V)], [[Zeit..."
1287,Zenon,"[[[<a href=""/wiki/Zenon:_Girl_of_the_21st_Cent..."
1288,Zorro (1958-1962) [e],"[[[<a class=""extiw"" href=""https://fr.wikipedia..."


In [17]:
movie_franchises = movie_franchises.explode("movie_name")
movie_franchises.reset_index(inplace = True, drop = True)
movie_franchises

Unnamed: 0,franchise_name,movie_name
0,The Aldrich Family,"[[[What a Life]], (1949)]"
1,The Aldrich Family,"[[[Life with Henry]], (1952)]"
2,The Aldrich Family,"[[[Henry Aldrich for President]], (1952)]"
3,The Aldrich Family,"[[[Henry Aldrich, Editor]], (1952)]"
4,The Aldrich Family,"[[[Henry and Dizzy]], (1952)]"
...,...,...
6400,Zorro (1958-1962) [e],[[[El zorro escarlata en diligencia fantasma]]...
6401,Zorro (1958-1962) [e],"[[[El Zorro Vengador]], (1962)]"
6402,Zorro (1969-1971)[f],"[[[Zorro's Latest Adventure]], (1969)]"
6403,Zorro (1969-1971)[f],"[[[Zorro, Rider of Vengeance]], (1971)]"


In [18]:
def extract_release_date(movie):
    
    date = ""
    extract = re.findall(r'\((.*?)\)',str(movie))
    for e in extract:
        if e.isdigit():
            date = e
    return date

In [19]:
# Find all years for movies in movie list
movie_franchises["release_year"] = movie_franchises["movie_name"].apply(lambda x : extract_release_date(x))
movie_franchises

Unnamed: 0,franchise_name,movie_name,release_year
0,The Aldrich Family,"[[[What a Life]], (1949)]",1949
1,The Aldrich Family,"[[[Life with Henry]], (1952)]",1952
2,The Aldrich Family,"[[[Henry Aldrich for President]], (1952)]",1952
3,The Aldrich Family,"[[[Henry Aldrich, Editor]], (1952)]",1952
4,The Aldrich Family,"[[[Henry and Dizzy]], (1952)]",1952
...,...,...,...
6400,Zorro (1958-1962) [e],[[[El zorro escarlata en diligencia fantasma]]...,1959
6401,Zorro (1958-1962) [e],"[[[El Zorro Vengador]], (1962)]",1962
6402,Zorro (1969-1971)[f],"[[[Zorro's Latest Adventure]], (1969)]",1969
6403,Zorro (1969-1971)[f],"[[[Zorro, Rider of Vengeance]], (1971)]",1971


In [20]:
for d in movie_franchises["release_year"]:
    if len(d) > 4:
        print(d)

In [21]:
def clean_movie(movie):
    movie = re.sub(r'\(.*?\)', '', movie)
    return movie

In [22]:
movie_franchises["movie_name"] = movie_franchises["movie_name"].apply(lambda x : extract_title(str(x))).apply(lambda x : clean_movie(x))
movie_franchises

Unnamed: 0,franchise_name,movie_name,release_year
0,The Aldrich Family,What a Life,1949
1,The Aldrich Family,Life with Henry,1952
2,The Aldrich Family,Henry Aldrich for President,1952
3,The Aldrich Family,"Henry Aldrich, Editor",1952
4,The Aldrich Family,Henry and Dizzy,1952
...,...,...,...
6400,Zorro (1958-1962) [e],El zorro escarlata en diligencia fantasma,1959
6401,Zorro (1958-1962) [e],El Zorro Vengador,1962
6402,Zorro (1969-1971)[f],Zorro's Latest Adventure,1969
6403,Zorro (1969-1971)[f],"Zorro, Rider of Vengeance",1971
