# Data Gathering from Wikipedia 

Using Wikipedia API to create a list of movie franchises and the list of movies associated with them. 

In [1]:
import requests
import json
from pathlib import Path
import pandas as pd
import wikipedia
from bs4 import BeautifulSoup
import re
import string

## Wikipedia API 

In [2]:
# Base URL that the list-page titles are appended to when downloading.
wiki = "https://en.wikipedia.org/wiki/"
# Load the index page through the `wikipedia` package and collect its links.
page = wikipedia.page("Lists_of_feature_film_series")
raw_link_list = list(page.links)

In [3]:
# Keep only the "List of feature ..." pages and turn their titles into URL
# form (spaces become underscores).
link_list = [
    title.replace(" ", "_")
    for title in raw_link_list
    if "List of feature" in title
]

link_list

['List_of_feature_film_series_with_11_to_20_entries',
 'List_of_feature_film_series_with_eight_entries',
 'List_of_feature_film_series_with_five_entries',
 'List_of_feature_film_series_with_four_entries',
 'List_of_feature_film_series_with_more_than_twenty_entries',
 'List_of_feature_film_series_with_nine_entries',
 'List_of_feature_film_series_with_seven_entries',
 'List_of_feature_film_series_with_six_entries',
 'List_of_feature_film_series_with_ten_entries',
 'List_of_feature_film_series_with_three_entries']

The cell above extracts the links from the "Lists of feature film series" page using the Wikipedia API.
These links are then used to find each franchise list on the corresponding pages using Beautiful Soup.

*Note: Unfortunately at this point we must transfer to using requests instead of the API as beautiful soup only works with html data structures, which the API does not provide.*

## Beautiful Soup

In [4]:
# Download every list page, save a prettified copy next to the notebook and
# collect the parsed documents in `pages` (same order as `link_list`).
pages = []
for link in link_list:
    print(link)
    url = wiki + link
    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() stops an HTTP error page from being parsed and saved
    # as if it were the article.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    html = response.content
    soup = BeautifulSoup(html, "html.parser")
    filename = link + ".html"
    with open(filename, "w", encoding='utf-8') as file:
        # prettify() already returns a str, so no extra conversion is needed
        file.write(soup.prettify())

    pages.append(soup)

List_of_feature_film_series_with_11_to_20_entries
List_of_feature_film_series_with_eight_entries
List_of_feature_film_series_with_five_entries
List_of_feature_film_series_with_four_entries
List_of_feature_film_series_with_more_than_twenty_entries
List_of_feature_film_series_with_nine_entries
List_of_feature_film_series_with_seven_entries
List_of_feature_film_series_with_six_entries
List_of_feature_film_series_with_ten_entries
List_of_feature_film_series_with_three_entries


In [5]:
# `url` is left over from the download loop, so this shows the last page requested.
print(url)

https://en.wikipedia.org/wiki/List_of_feature_film_series_with_three_entries


This initial use of beautiful soup extracts all the movie information from the pages in the list above.\
The data is parsed into a list of its sections based on the html format given.

In [6]:
# Parse html block using beautiful soup commands
def parse_bs(tag, div, opt):
    """Return every `div` element found under `tag` as a plain list.

    tag: object exposing a BeautifulSoup-style ``findAll`` method.
    div: name of the element to look for (e.g. "div", "ul", "li").
    opt: attribute filter forwarded as ``attrs``; when falsy (None or an
         empty dict) the search uses the element name alone.
    """
    matches = tag.findAll(div, attrs=opt) if opt else tag.findAll(div)
    return list(matches)

In [7]:
# One entry per downloaded page: the list of its "div-col" containers.
divisions = [parse_bs(page, "div", {"class": "div-col"}) for page in pages]

In [8]:
# pages[4] is parsed separately: take every <ul> under its first
# "mw-parser-output" div and keep the ones at positions 7..36.
# NOTE(review): the page index 4 and the 7-37 window are hard-coded against
# the page layout at download time — confirm they still select the franchise lists.
outlier = parse_bs(pages[4], "div", {"class": "mw-parser-output"})
outlier_divs = parse_bs(outlier[0], "ul", None)
outlier_sect = [outlier_divs[i] for i in range(7, 37)]


In [9]:
# Show the first selected <ul> for inspection.
outlier_sect[0]

<ul><li><i><a href="/wiki/Mil_M%C3%A1scaras" title="Mil Máscaras">Mil Máscaras</a></i>
<ol><li><i>Mil Máscaras</i> (1966)</li>
<li><i>Los Canallas</i> (<i>The Scoundrels</i>) (1966)</li>
<li><i>Las Vampiras</i> (<i>The Vampire Girls</i>) (1968)</li>
<li><i>Enigma de Muerte</i> (<i>Enigma of Death</i>) (1968)</li>
<li><i>Los Campeones Justicieros</i> (<i>The Champions of Justice</i>) (1970)</li>
<li><i>Las Momias de Guanajuato</i> (<i>The Mummies of Guanajuato</i>) (1970)</li>
<li><i>El Robo de las Momias de Guanajuato</i> (<i>The Theft of the Mummies of Guanajuato</i>) (1972)</li>
<li><i>Vuelven los Campeones Justicieros</i> (<i>The Champions of Justice Return</i>) (1972)</li>
<li><i>Una Rosa Sobre el Ring</i> (<i>A Rose In The Ring</i>) (1972)</li>
<li><i>Leyendas Macabras de la Colonia</i> (<i>Macabre Legends of The Colony</i>) (1973)</li>
<li><i>Los Vampiros de Coyoacán</i> (<i>The Vampires of Coyoacán</i>) (1973)</li>
<li><i>Las Momias de San Ángel</i> (<i>The Mummies of San Ángel<

### Breaks down html page into divisions.
Each entry of `divisions` holds the franchise blocks of one list page, i.e. of one entry-count range.\
e.g. `divisions[0]` comes from the first page in `link_list` (franchises with 11 to 20 entries).

In [10]:
# Flatten the per-page containers: one entry per "div-col" block, each entry
# being the list of <ul> elements found inside that block.
sections = [parse_bs(d, "ul", None) for div in divisions for d in div]
print(len(sections))
print(sections[-1])

154
[<ul><li><i><a href="/wiki/DC_Extended_Universe#Zack_Snyder's_Justice_League_(2021)" title="DC Extended Universe">Zack Snyder's Justice League Trilogy</a></i>
<ol><li><i><a href="/wiki/Man_of_Steel_(film)" title="Man of Steel (film)">Man of Steel</a></i> (2013)</li>
<li><i><a href="/wiki/Batman_v_Superman:_Dawn_of_Justice" title="Batman v Superman: Dawn of Justice">Batman v Superman: Dawn of Justice – Ultimate Edition</a></i> (2016)</li>
<li><i><a href="/wiki/Zack_Snyder%27s_Justice_League" title="Zack Snyder's Justice League">Zack Snyder's Justice League</a></i> (2021)</li></ol></li>
<li><i><a href="/wiki/Zeitgeist_(film_series)" title="Zeitgeist (film series)">Zeitgeist</a></i>
<ol><li><i>Zeitgeist: The Movie</i> (2007) (V)</li>
<li><i>Zeitgeist: Addendum</i> (2008) (V)</li>
<li><i>Zeitgeist: Moving Forward</i> (2011) (V)</li></ol></li>
<li><i>Zenon</i>
<ol><li><i><a href="/wiki/Zenon:_Girl_of_the_21st_Century_(film)" title="Zenon: Girl of the 21st Century (film)">Zenon: Girl of 

Breaks down each division into sections.\
Each section contains all franchises that start with a particular letter,\
    e.g. `sections[0]` contains all franchises that begin with the letter "A".

In [11]:
def extract_title_line(franchises, list_len, movie_count):
    """Return the text lines that hold the franchise titles.

    The string form of `franchises` is split on newlines, and every
    `movie_count`-th line is taken, starting with the first line.

    franchises:  object whose str() is the newline-separated markup.
    list_len:    number of title lines to extract.
    movie_count: distance in lines between two consecutive titles.

    Raises IndexError when a requested position lies past the last line.
    """
    # Render and split the markup once instead of once per extracted title.
    lines = str(franchises).split("\n")
    return [lines[i * movie_count] for i in range(list_len)]

In [12]:
# Print every section, each followed by a blank line, for visual inspection.
for section in sections:
    print(section, "\n")

[<ul><li><i><a href="/wiki/The_Aldrich_Family#Films" title="The Aldrich Family">The Aldrich Family</a></i> *
<ol><li><i><a href="/wiki/What_a_Life_(film)" title="What a Life (film)">What a Life</a></i> (1939)</li>
<li><i><a href="/wiki/Life_with_Henry" title="Life with Henry">Life with Henry</a></i> (1940)</li>
<li><i><a href="/wiki/Henry_Aldrich_for_President" title="Henry Aldrich for President">Henry Aldrich for President</a></i> (1941)</li>
<li><i><a href="/wiki/Henry_Aldrich,_Editor" title="Henry Aldrich, Editor">Henry Aldrich, Editor</a></i> (1942)</li>
<li><i><a href="/wiki/Henry_and_Dizzy" title="Henry and Dizzy">Henry and Dizzy</a></i> (1942)</li>
<li><i><a href="/wiki/Henry_Aldrich_Swings_It" title="Henry Aldrich Swings It">Henry Aldrich Swings It</a></i> (1943)</li>
<li><i><a href="/wiki/Henry_Aldrich_Gets_Glamour" title="Henry Aldrich Gets Glamour">Henry Aldrich Gets Glamour</a></i> (1943)</li>
<li><i><a href="/wiki/Henry_Aldrich_Haunts_a_House" title="Henry Aldrich Haunts a

<li><i><a href="/wiki/The_Cutting_Edge:_Fire_and_Ice" title="The Cutting Edge: Fire and Ice">The Cutting Edge: Fire and Ice</a></i> (2010) (TV)</li></ol></li></ul>] 

[<ul><li><i><a class="mw-redirect" href="/wiki/Dad_and_Dave" title="Dad and Dave">Dad and Dave</a></i> (1932 series) *
<ol><li><i><a href="/wiki/On_Our_Selection_(1932_film)" title="On Our Selection (1932 film)">On Our Selection</a></i> (1932)</li>
<li><i><a href="/wiki/Grandad_Rudd" title="Grandad Rudd">Grandad Rudd</a></i> (1935)</li>
<li><i><a href="/wiki/Dad_and_Dave_Come_to_Town" title="Dad and Dave Come to Town">Dad and Dave Come to Town</a></i> (1938)</li>
<li><i><a href="/wiki/Dad_Rudd,_M.P." title="Dad Rudd, M.P.">Dad Rudd, M.P.</a></i> (1940)</li></ol></li>
<li><i><a href="/wiki/Danny_Phantom" title="Danny Phantom">Danny Phantom</a></i> *
<ol><li><i><a class="mw-redirect" href="/wiki/Reign_Storm_(Danny_Phantom)" title="Reign Storm (Danny Phantom)">Danny Phantom - Reign Storm</a></i> (2005) (TV)</li>
<li><i><a cl

<li><i><a class="new" href="/w/index.php?title=Lyubit_po-russki_3:_Gubernator&amp;action=edit&amp;redlink=1" title="Lyubit po-russki 3: Gubernator (page does not exist)">Lyubit po-russki 3: Gubernator</a></i> (1999)</li></ol></li></ul>] 

[<ul><li><i>Major League</i>
<ol><li><i><a href="/wiki/Major_League_(film)" title="Major League (film)">Major League</a></i> (1989)</li>
<li><i><a href="/wiki/Major_League_II" title="Major League II">Major League II</a></i> (1994)</li>
<li><i><a href="/wiki/Major_League:_Back_to_the_Minors" title="Major League: Back to the Minors">Major League: Back to the Minors</a></i> (1998)</li></ol></li>
<li><i>Malevolence</i>
<ol><li><i><a href="/wiki/Malevolence_(film)" title="Malevolence (film)">Malevolence</a></i> (2003)</li>
<li><i><a href="/wiki/Bereavement_(film)" title="Bereavement (film)">Bereavement</a></i> (2010) (prequel)</li>
<li><i><a class="mw-redirect" href="/wiki/Killer:_Malevolence_3" title="Killer: Malevolence 3">Killer: Malevolence 3</a></i> (

<li><i><a href="/wiki/The_East_Is_Red_(1993_film)" title="The East Is Red (1993 film)">The East Is Red</a></i> (1994) (aka <i>Swordsman III</i>)</li></ol></li></ul>] 

[<ul><li><i>Tad, the Lost Explorer</i> (A)
<ol><li><i><a href="/wiki/Tad,_The_Lost_Explorer" title="Tad, The Lost Explorer">Tad, The Lost Explorer</a></i> (2013)</li>
<li><i><a href="/wiki/Tad_the_Lost_Explorer_and_the_Secret_of_King_Midas" title="Tad the Lost Explorer and the Secret of King Midas">Tad the Lost Explorer and the Secret of King Midas</a></i> (2017)</li>
<li><i>Tad the Lost Explorer and the Curse of the Mummy</i> (2022)</li></ol></li>
<li><i><a href="/wiki/Taken_(franchise)" title="Taken (franchise)">Taken</a></i>
<ol><li><i><a href="/wiki/Taken_(film)" title="Taken (film)">Taken</a></i> (2008)</li>
<li><i><a href="/wiki/Taken_2" title="Taken 2">Taken 2</a></i> (2012)</li>
<li><i><a href="/wiki/Taken_3" title="Taken 3">Taken 3</a></i> (2014)</li></ol></li>
<li><i>Tales from the Crypt</i> <a href="/wiki/Tale

In [13]:
# Walk every <ul> of every section and collect, in parallel, the <ol> film
# list of each franchise and the markup line that carries its title.
movie_list = []
franchise_list = []
for section in sections:
    for ul_block in section:
        franchise_ols = parse_bs(ul_block, "ol", None)
        if not franchise_ols:
            continue
        # Title spacing is derived from the first franchise only: its <li>
        # count plus one line for the title itself.
        # NOTE(review): assumes every franchise in this <ul> has the same
        # number of entries — confirm.
        lines_per_franchise = len(parse_bs(franchise_ols[0], "li", None)) + 1

        movie_list.extend(franchise_ols)
        franchise_list.extend(
            extract_title_line(ul_block, len(franchise_ols), lines_per_franchise)
        )

print(len(movie_list))
print(len(franchise_list))

1291
1291


In [14]:
# Same collection step for the separately parsed page: append each
# franchise's <ol> and its title line to the running lists.
for ul_block in outlier_sect:
    franchise_ols = parse_bs(ul_block, "ol", None)
    if not franchise_ols:
        continue
    # Title spacing comes from the first franchise's <li> count plus one
    # line for the title itself.
    lines_per_franchise = len(parse_bs(franchise_ols[0], "li", None)) + 1

    movie_list.extend(franchise_ols)
    franchise_list.extend(
        extract_title_line(ul_block, len(franchise_ols), lines_per_franchise)
    )
            
    

In [15]:
# The two lists become columns of the same DataFrame next, so their lengths should match.
print(len(movie_list))  
print(len(franchise_list))  

1344
1344


In [16]:
# One row per franchise: its raw title markup next to the <ol> element that
# lists its films. Both columns are cleaned in the cells that follow.
movie_franchises = pd.DataFrame()
movie_franchises["franchise_name"] = franchise_list
movie_franchises["movie_name"] = movie_list
movie_franchises

Unnamed: 0,franchise_name,movie_name
0,"<ul><li><i><a href=""/wiki/The_Aldrich_Family#F...","[[[<a href=""/wiki/What_a_Life_(film)"" title=""W..."
1,"<ul><li><i><a href=""/wiki/Coffin_Joe"" title=""C...","[[[<a href=""/wiki/At_Midnight_I%27ll_Take_Your..."
2,"<li><i><a href=""/wiki/The_Crime_Club#Film"" tit...","[[[<a href=""/wiki/The_Westland_Case"" title=""Th..."
3,"<ul><li><i><a href=""/wiki/Fast_%26_Furious"" ti...","[[[<a href=""/wiki/The_Fast_and_the_Furious_(20..."
4,<ul><li><i>Gingerdead Man vs. Evil Bong</i>,"[[[<a href=""/wiki/The_Gingerdead_Man"" title=""T..."
...,...,...
1339,"<ul><li><i><a href=""/wiki/The_Three_Mesquiteer...","[[[<a href=""/wiki/The_Three_Mesquiteers_(film)..."
1340,"<ul><li><i><a href=""/wiki/El_Santo"" title=""El ...","[[[<a href=""/wiki/Santo_contra_el_cerebro_del_..."
1341,"<ul><li><i><a href=""/wiki/Charles_Starrett#The...","[[[<a href=""/wiki/The_Durango_Kid_(film)"" titl..."
1342,"<ul><li><i><a class=""mw-redirect"" href=""/wiki/...","[[[<a href=""/wiki/Hop-Along_Cassidy"" title=""Ho..."


In [17]:
# break down movie_name into list of movies using .apply()
# extract franchise_name (convert to string?)
# explode() movie_name
# extract date from movie_name

# extract movie_name (str)

In [18]:
# Replace each franchise's <ol> element with the list of its <li> entries.
movie_franchises["movie_name"] = movie_franchises["movie_name"].apply(lambda x : parse_bs(x, "li", None))

The cell above breaks each franchise's section down into the list of its movie entries.

In [19]:
def extract_title(line):
    """Strip HTML tags and asterisks from `line` and return what is left.

    Surrounding whitespace and HTML entities (e.g. "&amp;") are kept as-is.
    """
    without_tags = re.sub(r'<[^>]+>', '', line)
    return without_tags.replace('*', '')

In [20]:
def clean_title(title):
    """Return `title` with every square-bracketed span (e.g. "[1]") removed."""
    # re.sub returns a new string, so its result must be returned rather
    # than discarded.
    return re.sub(r'\[.*?\]', '', title)

In [21]:
# Reduce the raw title markup to plain text.
movie_franchises["franchise_name"] = movie_franchises["franchise_name"].apply(extract_title)
movie_franchises

Unnamed: 0,franchise_name,movie_name
0,The Aldrich Family,"[[[<a href=""/wiki/What_a_Life_(film)"" title=""W..."
1,Coffin Joe,"[[[<a href=""/wiki/At_Midnight_I%27ll_Take_Your..."
2,The Crime Club,"[[[<a href=""/wiki/The_Westland_Case"" title=""Th..."
3,Fast &amp; Furious,"[[[<a href=""/wiki/The_Fast_and_the_Furious_(20..."
4,Gingerdead Man vs. Evil Bong,"[[[<a href=""/wiki/The_Gingerdead_Man"" title=""T..."
...,...,...
1339,The Three Mesquiteers,"[[[<a href=""/wiki/The_Three_Mesquiteers_(film)..."
1340,Santo,"[[[<a href=""/wiki/Santo_contra_el_cerebro_del_..."
1341,The Durango Kid,"[[[<a href=""/wiki/The_Durango_Kid_(film)"" titl..."
1342,Hopalong Cassidy (American-Western),"[[[<a href=""/wiki/Hop-Along_Cassidy"" title=""Ho..."


In [22]:
# Give every franchise an identifier "f<row position>".
# The column is assigned in one step: writing through the chained form
# `df[col][i] = ...` raises SettingWithCopyWarning and does not update the
# frame at all when pandas Copy-on-Write is active.
movie_franchises["franchise_id"] = ["f" + str(i) for i in range(len(movie_franchises))]

# Put the identifier first.
movie_franchises = movie_franchises[['franchise_id', 'franchise_name', 'movie_name']]
movie_franchises

Unnamed: 0,franchise_id,franchise_name,movie_name
0,f0,The Aldrich Family,"[[[<a href=""/wiki/What_a_Life_(film)"" title=""W..."
1,f1,Coffin Joe,"[[[<a href=""/wiki/At_Midnight_I%27ll_Take_Your..."
2,f2,The Crime Club,"[[[<a href=""/wiki/The_Westland_Case"" title=""Th..."
3,f3,Fast &amp; Furious,"[[[<a href=""/wiki/The_Fast_and_the_Furious_(20..."
4,f4,Gingerdead Man vs. Evil Bong,"[[[<a href=""/wiki/The_Gingerdead_Man"" title=""T..."
...,...,...,...
1339,f1339,The Three Mesquiteers,"[[[<a href=""/wiki/The_Three_Mesquiteers_(film)..."
1340,f1340,Santo,"[[[<a href=""/wiki/Santo_contra_el_cerebro_del_..."
1341,f1341,The Durango Kid,"[[[<a href=""/wiki/The_Durango_Kid_(film)"" titl..."
1342,f1342,Hopalong Cassidy (American-Western),"[[[<a href=""/wiki/Hop-Along_Cassidy"" title=""Ho..."


In [23]:
# Decode the HTML-escaped ampersand left over from the markup.
movie_franchises["franchise_name"] = movie_franchises["franchise_name"].str.replace("&amp;", "&", regex=False)
movie_franchises.tail(10)

Unnamed: 0,franchise_id,franchise_name,movie_name
1334,f1334,Charlie Chan,"[[[<a href=""/wiki/The_House_Without_a_Key_(ser..."
1335,f1335,The Bowery Boys,"[[[<a href=""/wiki/Live_Wires_(1946_film)"" titl..."
1336,f1336,Edgar Wallace Mysteries,"[[[<a href=""/wiki/Clue_of_the_Twisted_Candle"" ..."
1337,f1337,Scooby-Doo (a),"[[[<a href=""/wiki/Scooby_Goes_Hollywood"" title..."
1338,f1338,Otoko wa Tsurai yo,"[[[<a href=""/wiki/It%27s_Tough_Being_a_Man"" ti..."
1339,f1339,The Three Mesquiteers,"[[[<a href=""/wiki/The_Three_Mesquiteers_(film)..."
1340,f1340,Santo,"[[[<a href=""/wiki/Santo_contra_el_cerebro_del_..."
1341,f1341,The Durango Kid,"[[[<a href=""/wiki/The_Durango_Kid_(film)"" titl..."
1342,f1342,Hopalong Cassidy (American-Western),"[[[<a href=""/wiki/Hop-Along_Cassidy"" title=""Ho..."
1343,f1343,Super Sentai,"[[[<a href=""/wiki/Himitsu_Sentai_Gorenger"" tit..."


In [24]:
# Remove single-character markers such as "(a)" or "[1]": a pair of
# parentheses or square brackets enclosing exactly one character. Longer
# parenthesised text, e.g. "(American-Western)", is left in place.
movie_franchises["franchise_name"] = movie_franchises["franchise_name"].apply(lambda x : re.sub(r'\([^()]{1}\)|\[[^][]{1}\]','', x))
movie_franchises

Unnamed: 0,franchise_id,franchise_name,movie_name
0,f0,The Aldrich Family,"[[[<a href=""/wiki/What_a_Life_(film)"" title=""W..."
1,f1,Coffin Joe,"[[[<a href=""/wiki/At_Midnight_I%27ll_Take_Your..."
2,f2,The Crime Club,"[[[<a href=""/wiki/The_Westland_Case"" title=""Th..."
3,f3,Fast & Furious,"[[[<a href=""/wiki/The_Fast_and_the_Furious_(20..."
4,f4,Gingerdead Man vs. Evil Bong,"[[[<a href=""/wiki/The_Gingerdead_Man"" title=""T..."
...,...,...,...
1339,f1339,The Three Mesquiteers,"[[[<a href=""/wiki/The_Three_Mesquiteers_(film)..."
1340,f1340,Santo,"[[[<a href=""/wiki/Santo_contra_el_cerebro_del_..."
1341,f1341,The Durango Kid,"[[[<a href=""/wiki/The_Durango_Kid_(film)"" titl..."
1342,f1342,Hopalong Cassidy (American-Western),"[[[<a href=""/wiki/Hop-Along_Cassidy"" title=""Ho..."


In [25]:
# Trim leading and trailing whitespace from the franchise names.
movie_franchises["franchise_name"] = movie_franchises["franchise_name"].str.strip()

In [26]:
# Franchise names that appear on more than one row.
movie_franchises["franchise_name"].value_counts().loc[lambda x : x>1]

V/H/S                    2
Star Wars                2
The Stranger             2
Death Note               2
Demons                   2
Mr. Wong                 2
Scooby-Doo               2
Mobile Suit Gundam       2
Shark Attack             2
The Flintstones          2
The Fairly OddParents    2
Cars                     2
Planet of the Apes       2
Frosty the Snowman       2
Garfield                 2
Name: franchise_name, dtype: int64

In [27]:
# Spot check: rows whose franchise name contains "marvel", ignoring case.
test = movie_franchises[movie_franchises["franchise_name"].str.contains("marvel", na=False, flags=re.IGNORECASE, regex=True)]
test

Unnamed: 0,franchise_id,franchise_name,movie_name
116,f116,Marvel Animated Features,"[[[Ultimate Avengers: The Movie], (2006) (V)]..."
1329,f1329,Marvel Cinematic Universe (various),"[[[<a href=""/wiki/Iron_Man_(2008_film)"" title=..."


In [28]:
# One row per film: expand each list of entries and renumber the rows.
movie_franchises = (
    movie_franchises
    .explode("movie_name")
    .reset_index(drop=True)
)
movie_franchises

Unnamed: 0,franchise_id,franchise_name,movie_name
0,f0,The Aldrich Family,"[[[What a Life]], (1939)]"
1,f0,The Aldrich Family,"[[[Life with Henry]], (1940)]"
2,f0,The Aldrich Family,"[[[Henry Aldrich for President]], (1941)]"
3,f0,The Aldrich Family,"[[[Henry Aldrich, Editor]], (1942)]"
4,f0,The Aldrich Family,"[[[Henry and Dizzy]], (1942)]"
...,...,...,...
8180,f1343,Super Sentai,[[[Kikai Sentai Zenkaiger vs. Kiramager vs. Se...
8181,f1343,Super Sentai,[[[Avataro Sentai Donbrothers The Movie: New F...
8182,f1343,Super Sentai,[[[Ninpu Sentai Hurricaneger Degozaru! Shushuu...
8183,f1343,Super Sentai,"[[[Avataro Sentai Donbrothers vs. Zenkaiger], ..."


In [29]:
def extract_release_date(movie):
    """Return the last all-digit parenthesised value in str(movie), or "".

    Only groups whose whole content is digits count, so "(2008) (V)" gives
    "2008" while "(Serial, 1917)" and "(TBA)" give the empty string.
    """
    numeric_groups = [
        group
        for group in re.findall(r'\((.*?)\)', str(movie))
        if group.isdigit()
    ]
    return numeric_groups[-1] if numeric_groups else ""

In [30]:
# Find all years for movies in movie list
movie_franchises["release_year"] = movie_franchises["movie_name"].apply(lambda x : extract_release_date(x))
movie_franchises

Unnamed: 0,franchise_id,franchise_name,movie_name,release_year
0,f0,The Aldrich Family,"[[[What a Life]], (1939)]",1939
1,f0,The Aldrich Family,"[[[Life with Henry]], (1940)]",1940
2,f0,The Aldrich Family,"[[[Henry Aldrich for President]], (1941)]",1941
3,f0,The Aldrich Family,"[[[Henry Aldrich, Editor]], (1942)]",1942
4,f0,The Aldrich Family,"[[[Henry and Dizzy]], (1942)]",1942
...,...,...,...,...
8180,f1343,Super Sentai,[[[Kikai Sentai Zenkaiger vs. Kiramager vs. Se...,2022
8181,f1343,Super Sentai,[[[Avataro Sentai Donbrothers The Movie: New F...,2022
8182,f1343,Super Sentai,[[[Ninpu Sentai Hurricaneger Degozaru! Shushuu...,2023
8183,f1343,Super Sentai,"[[[Avataro Sentai Donbrothers vs. Zenkaiger], ...",2023


In [31]:
# Raise the row display limit so the year distribution is not truncated.
pd.set_option('display.max_rows', 500)
movie_franchises["release_year"].value_counts()

1994    182
2006    165
2005    158
2008    157
2007    147
2002    147
2016    143
2009    142
2004    142
2017    136
1998    135
2015    135
2018    127
1995    126
2003    125
2012    124
1996    122
1993    121
2000    121
1992    120
1990    119
2011    119
2014    116
2013    116
2001    112
1987    110
1989    109
1991    109
1999    108
2010    107
1988    107
1997    105
2019    102
1941     99
1973     96
1943     95
1948     94
1940     93
1947     92
1985     90
1986     89
1946     88
1964     87
1972     85
1945     83
1939     83
1949     83
1974     82
1942     81
1979     79
1944     76
1977     75
1950     74
1975     74
1984     73
1971     73
1968     73
1965     71
1982     71
1966     68
1963     66
1970     66
1980     65
1978     65
1976     65
1951     65
1981     64
1937     62
1967     61
1969     60
1952     60
1938     58
1962     55
2021     52
1983     52
2022     49
1961     48
1957     45
1953     43
1955     40
1958     40
2020     39
1936     37
1959

In [32]:
# Rows for which no release year was extracted.
movie_franchises[movie_franchises["release_year"]==""]

Unnamed: 0,franchise_id,franchise_name,movie_name,release_year
1040,f76,Za La Mort,"[[Il Triangolo Giallo], (Serial, 1917)]",
1042,f76,Za La Mort,"[[[I Topi Grigi], [ [, <a class=""extiw"" href=""...",
1044,f76,Za La Mort,"[[[Dollari e fracks], [ [, <a class=""extiw"" hr...",
1065,f77,The Amityville Horror,[[[The Amityville Legacy]]],
1990,f196,John Wick,"[[[Ballerina]], (TBA)]",
2314,f262,Airport,"[[[The Concorde ... Airport '79]], (1979]",
4161,f635,Pedro Penduko,"[[Penduko], (pre-production)]",
4470,f680,Foolish Years,"[[Lude godine], , [(Wacky Years)], , 1977)]",
4471,f680,Foolish Years,"[[Došlo doba da se ljubav proba], (, [The Tim...",
4472,f680,Foolish Years,"[[Ljubi, ljubi, al' glavu ne gubi], (, [Kiss,...",


In [33]:
# Print any extracted value longer than four characters, i.e. not a plain four-digit year.
for d in movie_franchises["release_year"]:
    if len(d) > 4:
        print(d)

In [34]:
def clean_movie(movie):
    """Return `movie` with every parenthesised span removed.

    Whitespace around the removed spans is left untouched.
    """
    return re.sub(r'\(.*?\)', '', movie)

In [35]:
# Turn each raw entry into plain text, then drop its parenthesised parts
# (year, format markers, alternative titles).
movie_franchises["movie_name"] = movie_franchises["movie_name"].apply(
    lambda entry: clean_movie(extract_title(str(entry)))
)
movie_franchises

Unnamed: 0,franchise_id,franchise_name,movie_name,release_year
0,f0,The Aldrich Family,What a Life,1939
1,f0,The Aldrich Family,Life with Henry,1940
2,f0,The Aldrich Family,Henry Aldrich for President,1941
3,f0,The Aldrich Family,"Henry Aldrich, Editor",1942
4,f0,The Aldrich Family,Henry and Dizzy,1942
...,...,...,...,...
8180,f1343,Super Sentai,Kikai Sentai Zenkaiger vs. Kiramager vs. Senpa...,2022
8181,f1343,Super Sentai,Avataro Sentai Donbrothers The Movie: New Firs...,2022
8182,f1343,Super Sentai,Ninpu Sentai Hurricaneger Degozaru! Shushuuto ...,2023
8183,f1343,Super Sentai,Avataro Sentai Donbrothers vs. Zenkaiger,2023


In [36]:
# Decode the escaped ampersand and trim surrounding whitespace in one pass.
movie_franchises["movie_name"] = (
    movie_franchises["movie_name"]
    .str.replace("&amp;", "&", regex=False)
    .str.strip()
)
print(movie_franchises.loc[33:43])

   franchise_id  franchise_name                             movie_name  \
33           f3  Fast & Furious               The Fast and the Furious   
34           f3  Fast & Furious                       2 Fast 2 Furious   
35           f3  Fast & Furious  The Fast and the Furious: Tokyo Drift   
36           f3  Fast & Furious                         Fast & Furious   
37           f3  Fast & Furious                              Fast Five   
38           f3  Fast & Furious                       Fast & Furious 6   
39           f3  Fast & Furious                              Furious 7   
40           f3  Fast & Furious                The Fate of the Furious   
41           f3  Fast & Furious  Fast & Furious Presents: Hobbs & Shaw   
42           f3  Fast & Furious                                     F9   
43           f3  Fast & Furious                                 Fast X   

   release_year  
33         2001  
34         2003  
35         2006  
36         2009  
37         2011  
38 

In [37]:
# List every distinct franchise name, one per line, for a manual check.
# Note: this rebinds `test`, which earlier held a filtered DataFrame.
test = movie_franchises["franchise_name"].unique()

for t in test:
    print(t)

The Aldrich Family
Coffin Joe
The Crime Club
Fast & Furious
Gingerdead Man vs. Evil Bong
Halloween
Lash LaRue
Love Comes Softly
Madame Aema
Mickey Mouse
Mystery Woman
Naruto the Movie
Squadra antiscippo
Star Wars
Wizarding World
Yogi Bear
Young and Dangerous
Aurora Teagarden Mystery
Bomba, the Jungle Boy
The Flintstones
Gamera
The Hombre Lobo Series (a.k.a. The Waldemar Daninsky Series)
Joe Palooka
L.E.T.H.A.L. Ladies (a.k.a. Triple-B, Bullets, Bombs and Babes)
Michael Shayne
The Muppets
Nemuri Kyoshirō (Matsukata Hiroki series)
Signed, Sealed, Delivered
AVP Universe
American Girl
Bratz
Madea Simmons
Mothra
Pekka and Pätkä
Schulmädchen-Report
Star Trek
Wangan Middonaito
Wild Bill Hickok
Air Bud
American Film Theatre
Boston Blackie, portrayed by Chester Morris
Darna
FIFA World Cup
The Land Before Time
Olsen-banden (Danish series)
Olsenbanden (Norwegian series)
One Piece
Sherlock Holmes (1939 film series)
Spider-Man
X-Men
DC Extended Universe
Godzilla  Shōwa era (1954–1975)
Philo Vance
S

In [38]:
# Save the cleaned table next to the notebook.
# NOTE(review): with the default arguments the row index is written as an
# extra unnamed first column — pass index=False if that is not wanted.
movie_franchises.to_csv("wikipedia_movie_franchises.csv")

In [39]:
# Number of film rows per franchise name.
movie_franchises["franchise_name"].value_counts()

Wong Fei Hung (Chinese films)          80
Super Sentai                           71
Hopalong Cassidy (American-Western)    66
The Durango Kid                        64
Scooby-Doo                             53
                                       ..
A Fixer Upper Mystery                   3
Flåklypa Grand Prix                     3
Flash Gordon (serials)                  3
The Flesh Trilogy                       3
Bab                                     3
Name: franchise_name, Length: 1328, dtype: int64