-
Notifications
You must be signed in to change notification settings - Fork 0
/
wikiMovies.py
99 lines (69 loc) · 3.02 KB
/
wikiMovies.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import re
import sys
from pprint import pprint

import pandas as pd
import requests
from bs4 import BeautifulSoup
#==============================================================================
#not run
def scrape_wiki_movies(urlConst='https://en.wikipedia.org/wiki/List_of_American_films_of_', yr_lb=1970, yr_ub=2012):
    """Download one wiki page per year and return them as parsed soup objects.

    For every year in ``yr_lb..yr_ub`` (both inclusive) the URL
    ``urlConst + str(year)`` is fetched with ``requests.get`` and the
    response text is parsed with ``BeautifulSoup(..., 'lxml')``.

    Args:
        urlConst: URL prefix to which the year is appended.
        yr_lb: First year to fetch (inclusive).
        yr_ub: Last year to fetch (inclusive).

    Returns:
        A dict mapping ``"<year>"`` (a string) to the BeautifulSoup object
        of the page fetched for that year. Empty when ``yr_lb > yr_ub``.

    Raises:
        SystemExit: with status 1, after printing the error, if any request
            raises ``requests.exceptions.RequestException``.
    """
    soupObjects = {}
    for year in range(yr_lb, yr_ub + 1):
        url = urlConst + str(year)
        try:
            resp = requests.get(url)
        except requests.exceptions.RequestException as e:
            print(e)
            sys.exit(1)
        # Key by the loop's year rather than by re-parsing the URL's trailing
        # digits: that parse yields the wrong key if urlConst ends in a digit.
        soupObjects[str(year)] = BeautifulSoup(resp.text, 'lxml')
    # soupObjects is a dictionary of "<year>": BeautifulSoup object
    # (of the wiki page that has the movie list for that year)
    return soupObjects
#==============================================================================
#not run
def get_wiki_movieList(soupObjects, yr_lb=1970, ur_ub=2012, yr_ub=None):
    """Parse per-year soup pages into a single pandas DataFrame of movies.

    For each year in ``yr_lb..yr_ub`` (inclusive) every element with class
    ``wikitable`` in ``soupObjects[str(year)]`` is scanned. Each ``tr`` row
    that contains at least one ``td`` cell becomes one output row: the text
    of the first six cells fills ``title``, ``director``, ``cast``,
    ``genre``, ``studios`` and ``releaseDate`` in that order, and any cell
    the row does not have is filled with the string ``'NA'``. Cell text is
    taken verbatim from ``.text`` (no stripping).

    Args:
        soupObjects: Mapping of ``"<year>"`` strings to parsed pages
            exposing ``find_all`` (e.g. BeautifulSoup objects).
        yr_lb: First year to parse (inclusive).
        ur_ub: Last year to parse (inclusive). The name is a historical
            misspelling kept so existing positional/keyword callers work.
        yr_ub: Preferred spelling of the upper bound; when given it takes
            precedence over ``ur_ub``.

    Returns:
        A DataFrame with columns ``title, director, cast, genre, studios,
        releaseDate, releaseYear`` (``releaseYear`` is the year as a
        string), indexed by the ``title`` column.

    Raises:
        KeyError: if a year in the range is missing from ``soupObjects``.
    """
    if yr_ub is None:
        yr_ub = ur_ub

    columns = ['title', 'director', 'cast', 'genre', 'studios', 'releaseDate', 'releaseYear']
    n_cells = len(columns) - 1  # every column except releaseYear comes from a <td>

    # Collect plain row lists and build the frame once at the end; growing
    # a Series row by row is quadratic and Series.append no longer exists
    # in pandas >= 2.0.
    records = []
    for year in (str(y) for y in range(yr_lb, yr_ub + 1)):
        for table in soupObjects[year].find_all(class_='wikitable'):
            for row in table.find_all('tr'):
                cells = row.find_all('td')
                if not cells:
                    # Header rows (only <th>) and empty rows carry no movie.
                    continue
                texts = [cell.text for cell in cells[:n_cells]]
                # Pad short rows so every record has the same width.
                texts += ['NA'] * (n_cells - len(texts))
                records.append(texts + [year])

    dat = pd.DataFrame(records, columns=columns)
    dat.index = dat['title']
    return dat