This scraper is going to be used to find African American Film Directors from combining information from Wikipedia and IMDB


In [1]:
from bs4 import BeautifulSoup
from imdb import Cinemagoer
import requests
import pandas as pd
import re
import time

## This creates the Director's list from both wikipedia pages
Its going through each hyperlink on the wikipedia website and adds them to a list. It is broken up into two variables director_links
and director_links2 to make sure to get all the directors from A-Z.

In [2]:
url = "https://en.wikipedia.org/wiki/Category:African-American_film_directors"
url2 = "https://en.wikipedia.org/w/index.php?title=Category:African-American_film_directors&from=P"
movie_list = set()
director_films = set()
testing_list = []
names = []

# this will take out names that are organizations and take out the added identifiers at the end of names
name_exclude = ['Black women filmmakers', 'Pioneers of African-American Cinema']
name_check = ['film director', 'filmmaker', 'director', 'writer', 'actor', 'musician']

# parsing out the <a> tags 
def director_page(url):
    try:
        result = requests.get(url)
        result.raise_for_status()
        doc = BeautifulSoup(result.text, "html.parser")
        
    except Exception as e:
        print(e)
        
    return doc.find(class_="mw-category mw-category-columns").find_all('a')


director_links = director_page(url)
director_links2 = director_page(url2)

In [3]:
print(director_links)

[<a href="/wiki/Black_women_filmmakers" title="Black women filmmakers">Black women filmmakers</a>, <a href="/wiki/Abdisalam_Aato" title="Abdisalam Aato">Abdisalam Aato</a>, <a href="/wiki/Gay_Abel-Bey" title="Gay Abel-Bey">Gay Abel-Bey</a>, <a href="/wiki/Fathia_Absie" title="Fathia Absie">Fathia Absie</a>, <a href="/wiki/Anita_W._Addison" title="Anita W. Addison">Anita W. Addison</a>, <a href="/wiki/Omowale_Akintunde" title="Omowale Akintunde">Omowale Akintunde</a>, <a href="/wiki/Queen_Muhammad_Ali" title="Queen Muhammad Ali">Queen Muhammad Ali</a>, <a href="/wiki/Khalik_Allah" title="Khalik Allah">Khalik Allah</a>, <a href="/wiki/Madeline_Anderson" title="Madeline Anderson">Madeline Anderson</a>, <a href="/wiki/M._K._Asante" title="M. K. Asante">M. K. Asante</a>, <a href="/wiki/Philip_Atwell" title="Philip Atwell">Philip Atwell</a>, <a href="/wiki/Sam_Bailey_(director)" title="Sam Bailey (director)">Sam Bailey (director)</a>, <a href="/wiki/Mya_Baker" title="Mya Baker">Mya Baker</a>

### Creates a list of (Director Names, Wikipedia Link)


In [4]:
def Directors(names, object):    
    for director in object:
        if any(y in director.string for y in name_exclude):
            director.string = ''
        elif any(x in director.string for x in name_check):
            director.string  = director.string.rsplit('(', 1)[0]

        if director.string or director.string in names: 
            #         LIST:    NAME                       WIKI_LINK
            names.append([str(director.string.rstrip()), director['href']])        
    return names 

names = Directors(names, director_links)
names = Directors(names, director_links2)

### Creates the Dataframe with Name and Wiki_Link Columns


In [5]:
df = pd.DataFrame(names, columns=['Name', 'wiki_link'])
df

Unnamed: 0,Name,wiki_link
0,Abdisalam Aato,/wiki/Abdisalam_Aato
1,Gay Abel-Bey,/wiki/Gay_Abel-Bey
2,Fathia Absie,/wiki/Fathia_Absie
3,Anita W. Addison,/wiki/Anita_W._Addison
4,Omowale Akintunde,/wiki/Omowale_Akintunde
...,...,...
313,Tricia Woodgett,/wiki/Tricia_Woodgett
314,Bille Woodruff,/wiki/Bille_Woodruff
315,Fronza Woods,/wiki/Fronza_Woods
316,Tanya Wright,/wiki/Tanya_Wright


### Saves the Dataframe as a csv file


In [50]:
df.to_csv("director_names.csv")
print(len(names))

318


### This is set up for getting Filmography from Director's Wikipedia page
* It first opens up each wikipedia page of the director and checks to see what type of Filmography section or if it has one at all.
* Then depending on what type it finds. It will get the list of movies and the year it was released.
* In the end, it will create a list with the "Movie, Year, Wikipedia" which will be used to identify a director on IMDB

In [8]:
director_films = set()
test = []
mxm = []

#The range is performed in set of 50 to avoid timeout issues and to test if the data is pulled
#the total range is 311 which is the len(names)
for i in range(0, len(names)-1): 
#for i in range(0, 300): 

    #add a wait time of 5 seconds so there isn't a timeout
    if i % 50 == 0:
        time.sleep(5)

    wiki_url = "https://en.wikipedia.org" + str(df.loc[i].at['wiki_link'])      
    table_check = False

    try:
        result = requests.get(wiki_url)
        result.raise_for_status()
        doc = BeautifulSoup(result.text, "html.parser")
    except Exception as e:
        print(e)

    if doc.find(id="Filmography") and doc.find(class_="wikitable"):
        if doc.find(id="Filmography").parent.find_next("th").find_next("th").text.rstrip() == "Title" or \
           doc.find(id="Filmography").parent.find_next("th").find_next("th").text.rstrip() == "Film": 
            film_list = doc.find(id="Filmography").parent.find_next("th").find_all_next("tr", limit=6)        
            table_check = True                  
        else:
            film_list = doc.find(id="Filmography").parent.find_all_next("li", limit=6)
    elif doc.find(id="Films") and doc.find(class_="wikitable"):
        if doc.find(id="Films").parent.find_next("th").find_next("th").text.rstrip() == "Film": 
            film_list = doc.find(id="Films").parent.find_next("th").find_all_next("tr", limit=6)        
            table_check = True
        else:
            film_list = doc.find(id="Films").parent.find_all_next("li", limit=6)
    elif doc.find(id="Filmography"):
        film_list = doc.find(id="Filmography").parent.find_all_next("li", limit=6)
    elif doc.find(id="Films_2"):
        film_list = doc.find(id="Films_2").parent.find_all_next("li", limit=6)
    elif (doc.find(id="Films") or doc.find(id="Film")) and doc.find(class_="mw-headline"):
        if doc.find(id="Films"):
            film_list = doc.find(id="Films").parent.find_all_next("li", limit=6)            
        else:
            film_list = doc.find(id="Film").parent.find_all_next("li", limit=6)
    elif (doc.find(id="Selected_filmography") or doc.find(id="Select_Filmography")) and doc.find(class_="mw-headline"):
        if doc.find(id="Selected_filmography"):
            film_list = doc.find(id="Selected_filmography").parent.find_all_next("li", limit=6)
        else:
            film_list = doc.find(id="Select_Filmography").parent.find_all_next("li", limit=6)
    elif doc.find(id="Filmography_as_director") and doc.find(class_="mw-headline"):
        film_list = doc.find(id="Filmography_as_director").parent.find_all_next("li", limit=6)
    else:
        print("Couldn't find one for ", wiki_url)
        continue

    for film in film_list:
        
        #Filmography Section has a Film Table: Year Movie and table check is used since it changes the format
        if table_check == True:            
            if re.search('\d\d\d\d', film.text):
                s = film.text.rsplit('\n\n')
                if len(s) >= 2:
                    t = [s[1].strip(), s[0].lstrip(), wiki_url]
                else:
                    s[0] = s[0].strip()
                    s[0] = re.sub('\[\d+\]',"", s[0])
                    s = s[0].split('\n')                         
                    t = [s[1].strip(), s[0].strip(), wiki_url]                         
                if re.search('\n', t[1]):
                    t = t[1].split('\n')
                    t = [t[1], t[0], wiki_url]                
                director_films.add(tuple(t))                     
        #Filmography Section has Number Movie (year) format
        elif film.text[:1].isdigit() and re.search('\(\d\d\d\d\)', film.text):
            film = film.text.rsplit(r')', 1)[0]
            film = film.replace('(', ' ', 1)
            director_films.add((film[:-6].strip(), film[-4:].strip(), wiki_url))            
        #Filmography Section has Movie (year) format
        elif  ')' in film:
            film = film.text.rsplit(r' ', 1)[0]
            film = film.replace('(', ' ', 1)
            director_films.add((film[:-6].strip(), film[-4:].strip(), wiki_url))            
        #Filmography Section has Year Movie format
        elif film.text[:1].isdigit():
            s = film.text.rsplit('(', 1)[0]
            s = s.split(":")            
            if len(s) >= 2:           
                director_films.add((s[1].strip(), s[0].strip(), wiki_url))
            else:
                continue #print( "WILL TRY TO FIX LATER OR IGNORE: ", film.text)
        #Filmography Section has Movie year format
        else:
            bird = film.text.split(' (', 1)
            tweet = re.search('\d\d\d\d', film.text)            
            try:
                tweet = str(tweet.group())
            except:
                tweet = "0000"
            mxm = bird[0].strip(), tweet.strip(), wiki_url            
            director_films.add(tuple(mxm))

#print(director_films)

Couldn't find one for  https://en.wikipedia.org/wiki/Queen_Muhammad_Ali
Couldn't find one for  https://en.wikipedia.org/wiki/Sam_Bailey_(director)
Couldn't find one for  https://en.wikipedia.org/wiki/Qasim_Basir
Couldn't find one for  https://en.wikipedia.org/wiki/Mark_Beachum
Couldn't find one for  https://en.wikipedia.org/wiki/Keith_Beauchamp_(filmmaker)
Couldn't find one for  https://en.wikipedia.org/wiki/S._Torriano_Berry
Couldn't find one for  https://en.wikipedia.org/wiki/Donari_Braxton
Couldn't find one for  https://en.wikipedia.org/wiki/Calmatic
Couldn't find one for  https://en.wikipedia.org/wiki/Roy_Campanella_II
Couldn't find one for  https://en.wikipedia.org/wiki/Clay_Cane
Couldn't find one for  https://en.wikipedia.org/wiki/Don_Cheadle
Couldn't find one for  https://en.wikipedia.org/wiki/Christopher_Scott_Cherot
Couldn't find one for  https://en.wikipedia.org/wiki/Tina_Gordon_Chism
Couldn't find one for  https://en.wikipedia.org/wiki/Max_Cole
Couldn't find one for  https:/

#### This searches through people and tries to match person to Director via Movies  
#### It then adds all the movies of a director to a Movie List

In [9]:
dfMovies = pd.DataFrame(columns=['Movie_ID', 'Title', 'Person_ID', 'Name'])

In [10]:
testdf = pd.DataFrame(director_films)
testdf.to_csv('testdf.csv')

In [45]:
for i in range(0,8):
    ia = Cinemagoer()
    print(names[i][0])
    people = ia.search_person(names[i][0])
    print(people)
    works = ia.get_person(people[0].personID)
    print("Works = ",works)
    print ("Works Filmography = ",works['filmography'].keys())
    if works.has_key('director'):
                for job in works['filmography'].keys():
                    if job == 'director': 
                        print('Person ID:', people[0].personID, '\tDirector:', people[0]['name'], '\t# Job:', job, )
                        for movie in works['filmography'][job]:
                            test = ia.search_movie(str(movie))
                            #print(test) #year = test[0]['year']
                            print(movie, " ", movie.get('year')) #test[0].get('year')) #movie['year'][0])  

Abdisalam Aato
[<Person id:1333000[http] name:_Abdisalam Ali_>, <Person id:0019452[http] name:_Mohamed Abdisalam Ali_>, <Person id:13049150[http] name:_Abdisalam Abdillahi_>, <Person id:2529151[http] name:_Abdulle Abdisalam Aden_>]
Works =  Abdisalam Ali
Works Filmography =  dict_keys(['actor'])
Gay Abel-Bey
[<Person id:0008498[http] name:_Gay Abel-Bey_>]
Works =  Gay Abel-Bey
Works Filmography =  dict_keys(['producer', 'actress', 'casting department', 'additional crew', 'director', 'writer', 'editor', 'sound department', 'thanks', 'self'])
Person ID: 0008498 	Director: Gay Abel-Bey 	# Job: director
Fragrance   1991
Fathia Absie
[<Person id:7280958[http] name:_Fathia Absie_>, <Person id:6726528[http] name:_Nur Fathia_>, <Person id:7777210[http] name:_Absinthia Absolut_>, <Person id:14399263[http] name:_Fathia Abbas_>]
Works =  Fathia Absie
Works Filmography =  dict_keys(['actress', 'director', 'writer', 'producer'])
Person ID: 7280958 	Director: Fathia Absie 	# Job: director
Pursued   

In [51]:
ia = Cinemagoer()
person_id = []
found = False

#range is from 0 - 311: Only sections of the search at a time to avoid timeout issues
for i in range(0, len(names)): 
    
    #add a wait time of 5 seconds so there isn't a timeout
    if i % 50 == 0 and i > 0:
        print(i)
        time.sleep(5)

    people = ia.search_person(names[i][0])
    found = False
    for person in people:
        if found == False:
            #print(person['name'])        
            works = ia.get_person(people[0].personID)
            #print (works['filmography'].keys())
            if works.has_key('director'):
                for job in works['filmography'].keys():
                    if job == 'director': 
                        #print('Person ID:', person.personID, '\tDirector:', person['name'], '\t# Job:', job, )
                        for movie in works['filmography'][job]:
                            #print(movie, " ", movie.get('year'))                        
                            if next((i for i, v in enumerate(director_films) if v[0] == movie['title'] and v[1] == str(movie.get('year'))), None) and found == False: 
                            #if next((i for i, v in enumerate(director_films) if v[0] == movie['title'] and v[1] == str(movie['year'])), None): # (re.search(my_regex, v[0]) and v[i] == str(movie['year'])))):
                                #print("test 1:", '\tID:%s Title:%s Year: %s' % (movie.movieID, movie['title'], movie['year']))  
                                found = True                                                           
                                for m in works['filmography'][job]:
                                    #print(m['title'],'\t', m['year'])
                                    dfMovies.loc[len(dfMovies.index)] = [m.movieID, m['title'], person.personID, person['name']] 
                            elif next((i for i, v in enumerate(director_films) if v[0] == movie['title']), None) and found == False:
                                #print ("test 2:", '\tID:%s Title:%s Year: %s' % (movie.movieID, movie['title'], movie['year']) )
                                found = True                                  
                                for m in works['filmography'][job]:
                                    #print(m['title'],'\t', m['year'])
                                    dfMovies.loc[len(dfMovies.index)] = [m.movieID, m['title'], person.personID, person['name']]                            
            else:
                continue #print('Person ID:', person.personID, '\tName:    ', person['name'], '\t# Job:', job, "(Doesn't have a filmography)")

50
100
150
200
250
300


In [47]:
dfMovies

Unnamed: 0,Movie_ID,Title,Person_ID,Name
0,0191146,Fragrance,0008498,Gay Abel-Bey
1,0191146,Fragrance,0008498,Gay Abel-Bey
2,0191146,Fragrance,0008498,Gay Abel-Bey
3,13814666,Pursued,7280958,Fathia Absie
4,4629690,The Lobby,7280958,Fathia Absie
...,...,...,...,...
61,12235674,Snoop Dogg Feat. Dr. Dre & Jewell: Just Dippin,0041161,Philip G. Atwell
62,0237973,Making the Video,0041161,Philip G. Atwell
63,6793916,Eminem Feat. Dr. Dre: Guilty Conscience,0041161,Philip G. Atwell
64,6744232,Eminem: Role Model,0041161,Philip G. Atwell


In [None]:
dfMovies.to_csv("Movies.csv")