Skip to content

Commit

Permalink
add godcasts and podcastpedia
Browse files Browse the repository at this point in the history
  • Loading branch information
btelle committed Nov 9, 2016
1 parent be57ba3 commit b82f681
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 2 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,5 @@ Scripts to compile a large dataset of podcasts and episodes for analysis.
* http://www.publicradiofan.com/podcasts.html
* http://newtimeradio.com/
* https://rss.itunes.apple.com/us/?urlDesc=%2Fgenerator
* https://www.podcastpedia.org/categories
* http://www.godcast1000.com/
16 changes: 14 additions & 2 deletions build_lists/clean_data_sources.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python
from __future__ import print_function
import re
import re, requests

def all_podcasts():
with open('sources/allpodcasts_com_directory.html', 'r') as fh:
Expand Down Expand Up @@ -30,6 +30,18 @@ def publicradiofan():
if match and len(match) > 0:
fh.write(match.strip()+"\n")

def godcasts(timeout=30):
    """Collect RSS feed URLs from godcast1000.com into ``../data/godcasts.txt``.

    Downloads the paginated index at ``start`` offsets 1, 51, ..., 1051 and
    writes every link that follows a ``Get RSS:`` label, one URL per line.
    The output file is truncated at the start of each run.

    Args:
        timeout: Seconds to wait on the server for each page request.

    Raises:
        requests.RequestException: if a page request fails or times out.
    """
    base_url = 'http://www.godcast1000.com/index.php?cat=&start={0}'
    # Compiled once so the page loop does not repeat the pattern lookup.
    rss_link = re.compile(r'Get RSS: <a href="([^"]+)">')

    with open('../data/godcasts.txt', 'w') as fh:
        for i in range(1, 1101, 50):
            url = base_url.format(i)
            # Without a timeout a stalled connection would hang the run forever.
            contents = requests.get(url, timeout=timeout).text

            for match in rss_link.findall(contents):
                feed_url = match.strip()
                # Test the stripped value so a whitespace-only href does not
                # produce a blank line in the output.
                if feed_url:
                    fh.write(feed_url + "\n")

# Top-level calls: each list-building function is invoked in turn whenever
# this file is executed.
all_podcasts()
newtimeradio()
publicradiofan()
# NOTE(review): publicradiofan() is listed twice here; this looks like the
# diff view showing the removed and re-added last line rather than a real
# double call — confirm against the actual file.
publicradiofan()
godcasts()
51 changes: 51 additions & 0 deletions build_lists/podcastpedia_extract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import re, requests

def get_feed_from_page(url, timeout=30):
    """Return the feed URL advertised on a podcast page, or ``None``.

    Args:
        url: Address of the podcast page to download.
        timeout: Seconds to wait on the server for the request.

    Returns:
        The ``href`` of the first anchor carrying the
        ``icon-feed-producer producer-social`` class and ``title="Feed"``,
        or ``None`` when the page contains no such anchor.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    # Without a timeout a single stalled page would hang the whole crawl.
    html = requests.get(url, timeout=timeout).text
    match = re.search(
        r'<a href="([^"]+)" target="_blank" class="icon-feed-producer producer-social" title="Feed">',
        html,
    )

    if match:
        return match.group(1)
    # Explicit so callers can see that "no feed link" is an expected outcome.
    return None

def parse_search_page(url, timeout=30):
    """Return absolute podcast page URLs linked from a search-results page.

    Args:
        url: Address of the search-results page to download.
        timeout: Seconds to wait on the server for the request.

    Returns:
        A list with one ``https://www.podcastpedia.org``-prefixed URL per
        ``item_title`` link found, in page order. Empty when nothing matches.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    # Without a timeout a stalled search request would hang the whole crawl.
    html = requests.get(url, timeout=timeout).text
    results = []

    for match in re.findall(r'<a class="item_title" href="([^"]+)">', html):
        path = match.strip()
        # Test the stripped value so a whitespace-only href does not yield
        # the bare site prefix as a bogus result.
        if path:
            results.append("https://www.podcastpedia.org" + path)
    return results

# Advanced-search URL template; {0} is filled with a category id. It asks for
# up to 500 results per page and only ever requests the first page.
base_url = (
    'https://www.podcastpedia.org/search/advanced_search/results'
    '?numberResultsPerPage=500&searchTarget=podcasts&categId={0}'
    '&searchMode=natural&currentPage=1'
)

# Values substituted into the categId query parameter of base_url, in crawl order.
categories = [
    45, 39, 24, 48, 27, 43, 38, 46,
    22, 44, 42, 28, 35, 37, 49, 21,
    29, 41, 25, 31, 33, 47,
]

# Crawl every category's search page, visit each podcast page it links to,
# and record the feed URL of every page that advertises one (one per line).
with open('../data/podcastpedia_list.txt', 'w') as out_file:
    for category_id in categories:
        podcast_pages = parse_search_page(base_url.format(category_id))
        for page_url in podcast_pages:
            feed = get_feed_from_page(page_url)
            if not feed:
                # Pages without a feed link are left out of the list.
                continue
            out_file.write(feed + "\n")

0 comments on commit b82f681

Please sign in to comment.