## I wrote a blog post talking through how to scrape Wikipedia. Check it out here:
https://medium.com/@Alexander_H/scraping-wikipedia-with-python-8000fc9c9e6c

In [3]:
import wikipedia
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from uuid import uuid4 as uuid
import numpy as np
import urllib
from urllib.request import urlopen, Request
import numpy as np

# first pull the HTML from the page that links to all of the pages with the links.
# in this case, this page gives the links list pages of sci-fi films by decade.
# just go to https://en.wikipedia.org/wiki/Lists_of_science_fiction_films
# to see what I'm pulling from.
html = requests.get('https://en.wikipedia.org/wiki/Lists_of_science_fiction_films')

#turn the HTML into a beautiful soup text object
b = BeautifulSoup(html.text, 'lxml')
# create an mpty list where those links will go.
links = []

# in this case, all of the links we're in a '<li>' brackets.
for i in b.find_all(name = 'li'):
    # pull the actual link for each one
    for link in i.find_all('a', href=True):
        links.append(link['href'])
# the above code ends up pulling more links than I want,
# so I just use the ones I want
links = links[1:11]
# each link only returns something like 'wiki/List_of_science_fiction_films_of_the_1920s'
# so I add the other part of the URL to each.
decade_links = ['https://en.wikipedia.org' + i for i in links]

# create two new lists, one for the title of the page, 
# and one for the link to the page
film_titles = []
film_links = []
decades = []
#film_countries = []
# for loop to pull from each decade page with list of films.
# look at https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1920s
# to follow along as an exampe
for decade in decade_links:
    print(f'Collecting films from {decade}')
    html = requests.get(decade)
    b = BeautifulSoup(html.text, 'lxml')
    # get to the table on the page
    for i in b.find_all(name='table', class_='wikitable'):
        # get to the row of each film
        for j in i.find_all(name='tr'):
            #get just the title cell for each row.
            # contains the title and the URL
            for k in j.find_all(name='i'):
                # get within that cell to just get the words
                for link in k.find_all('a', href=True):
                    # get the title and add to the list
                    film_titles.append(link['title'])
                    # get the link and add to that list
                    film_links.append(link['href'])
                    # add the decade to keep track for later
                    decades.append(decade[-5:-1])
            #for k in j.find_all(class_='flagicon'):
                #for link in k.find_all('a', href=True):
                    #film_countries = film_countries.append(link['title'])
                    
    #be a conscientious scraper and pause between scrapes
    time.sleep(1)
print(f'Number of Film Links Collected: {len(film_links)}')
print(f'Number of Film Titles Collected: {len(film_titles)}')
# remove film links that don't have a description page on Wikipedia
new_film_links = [i for i in film_links if 'redlink' not in i]
# same goes for titles
new_film_titles = [i for i in film_titles if '(page does not exist)' not in i]
print(f'Number of Film Links with Wikipedia Pages: {len(new_film_links)}')
print(f'Number of Film Titles with Wikipedia Pages: {len(new_film_titles)}')
#use this list to fetch from the API
title_links_decades = list(zip(new_film_titles, new_film_links, decades))

Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1920s
Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1930s
Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1940s
Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1950s
Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1960s
Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1970s
Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1980s
Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1990s
Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_2000s
Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_2010s
Number of Film Links Collected: 1817
Number of Fil

In [None]:
# name columns for dataframe you want
col = ['title','link','summary','year','decade','country','synopsis']
# create the dataframe
df = pd.DataFrame(columns=col)
# create a list of all the names you think/know the section might be called
possibles = ['Plot','Synopsis','Plot synopsis','Plot summary', 'Story','Plotline','The Beginning','Summary',
            'Content','Premise']
# sometimes those names have 'Edit' latched onto the end due to 
# user error on Wikipedia. In that case, it will be 'PlotEdit'
# so it's easiest just to make another list that acccounts for that
possibles_edit = [i + 'Edit' for i in possibles]
#then merge those two lists together
all_possibles = possibles + possibles_edit
# make a count to track progress
count = 0
print('Initializing Scrape...')
# load the page once and save it as a variable, otherwise it will request
# the page every time.
# always do a try, except when pulling from the API, in case it gets confused
# by the title.
for i in title_links_decades:
    try:
        wik = wikipedia.WikipediaPage(i[0])
    except:
        wik = np.NaN
    try:
        html = wik.html()
    except:
        html = np.NaN
    
    # year
    try:
        year = html.split("bday dtstart published updated\">")[1].split('<')[0]
    except:
        try:
            #weird .split. the -4 gets the four digit year number
            year = html.split("Release date")[1].split("<li>")[1].split('<')[0].rstrip(' ')[-4:]
            if len(year) != 4:
                try:
                    year = html.split("Release date")[1].split(">\n")[1].split('<')[0].rstrip(' ')[-4:]
                    if len(year) == 4:
                        year = year
                    else:
                        year = np.NaN
                except:
                    try:
                        year = html.split("Release date")[1].split(">\n")[1].split('<')[0].rstrip(' ')[-4:]
                        if len(year) == 4:
                            year = year
                        else:
                            year = np.NaN
                    except:
                        year = np.NaN
        except:
            year = np.NaN
            
    link = 'https://en.wikipedia.org' + i[1]
    #country
    try:
        country = html.split("Country")[1].split('\n')[1].split('<')[0].lstrip(' ')
        if len(country)>2:
            country = country
        else:
            country = html.split("Country")[1].split('<li>')[1].split('<')[0].lstrip(' ')
    except:
        country = np.NaN
    #print(country)
    title = i[0]
    #print(title)
    #summary
    try:
        summary = wik.summary.replace('\n','').replace("\'","")
    except:
        summary = np.NaN
    #print(summary)

    #synopsis
    try:
         # for all possible titles in all_possibles list
        for j in all_possibles:
            # if that section does exist, i.e. it doesn't return 'None'
            if wik.section(j) != None:
                  #then that's what the plot is! Otherwise try the next one!
                synopsis = wik.section(j).replace('\n','').replace("\'","")
    # if none of those work, or if the page didn't load from above, then plot
    # equals np.NaN            
    except:
        synopsis = np.NaN
    try:
        decade = i[2]
    except:
        decade = np.NaN

    #print('PLOT\n')
    #print(plot_)
    df_add = pd.DataFrame([[title,link,summary,year,decade,country,synopsis]], columns = col)
    df = df.append(df_add)
    if count % 25 == 0:
            print(f'Appended {count} movies to dataframe')
    if count % 25 == 0:
            df.to_csv(f'/Users/Alexander/GA/data_capstone/scrapings/wiki_11_{count}.csv')
            print(f'Exported CSV wiki_11_{count}.csv')
            df.to_pickle(f'/Users/Alexander/GA/data_capstone/scrapings/wiki_11_{count}.pkl')
            print(f'Exported CSV wiki_11_{count}.pkl')
    #print(count)
    count += 1
df.to_csv(f'/Users/Alexander/GA/data_capstone/scrapings/wiki_11_{count-1}.csv')
print(f'Exported CSV wiki_11_{count - 1}.csv')
df.to_pickle(f'/Users/Alexander/GA/data_capstone/scrapings/wiki_11_{count}.pkl')
print(f'Exported CSV wiki_11_{count - 1}.pkl')
print(f'{count - 1} pages scraped')
print("Scraping Complete")

In [None]:
# if you want to be able to read everything in the dataframe you would run the following code
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', None)

In [6]:
df.head()

Unnamed: 0,title,link,summary,year,decade,country,synopsis
0,Algol (film),https://en.wikipedia.org/wiki/Algol_(film),Algol: Tragedy of Power (German: Algol. Tragöd...,1920-09-03,1920,Weimar Republic,"The story follows the life of Robert Herne, wh..."
0,Dr. Jekyll and Mr. Hyde (1920 Haydon film),https://en.wikipedia.org/wiki/Dr._Jekyll_and_M...,Dr. Jekyll and Mr. Hyde is a 40-minute horror ...,1920-04,1920,United States,"As the first transformation into Hyde begins, ..."
0,Dr. Jekyll and Mr. Hyde (1920 film),https://en.wikipedia.org/wiki/Dr._Jekyll_and_M...,Dr. Jekyll and Mr. Hyde is a 1920 horror silen...,1920-03-18,1920,United States,Henry Jekyll (John Barrymore) is a doctor of m...
0,Figures of the Night,https://en.wikipedia.org/wiki/Figures_of_the_N...,Figures of the Night (German:Nachtgestalten) i...,1920,1920,Germany,Henry Jekyll (John Barrymore) is a doctor of m...
0,The Invisible Ray (1920 serial),https://en.wikipedia.org/wiki/The_Invisible_Ra...,The Invisible Ray is a 1920 American science f...,1920-07-01,1920,United States,After a mineralogist discovers a ray with extr...
