add godcasts and podcastpedia

btelle · Nov 9, 2016 · b82f681 · b82f681
1 parent be57ba3
commit b82f681
Show file tree

Hide file tree

Showing 3 changed files with 67 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -6,3 +6,5 @@ Scripts to compile a large dataset of podcasts and episodes for analysis.
 * http://www.publicradiofan.com/podcasts.html
 * http://newtimeradio.com/
 * https://rss.itunes.apple.com/us/?urlDesc=%2Fgenerator
+* https://www.podcastpedia.org/categories
+* http://www.godcast1000.com/
diff --git a/build_lists/clean_data_sources.py b/build_lists/clean_data_sources.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 from __future__ import print_function
-import re
+import re, requests
 
 def all_podcasts():
 	with open('sources/allpodcasts_com_directory.html', 'r') as fh:
@@ -30,6 +30,18 @@ def publicradiofan():
 			if match and len(match) > 0:
 				fh.write(match.strip()+"\n")
 
+def godcasts():
+	base_url = 'http://www.godcast1000.com/index.php?cat=&start={0}'
+	with open('../data/godcasts.txt', 'w') as fh:
+		for i in range(1, 1101, 50):
+			url = base_url.format(i)
+			contents = requests.get(url).text
+
+			for match in re.findall('Get RSS: <a href="([^"]+)">', contents):
+				if match and len(match) > 0:
+					fh.write(match.strip()+"\n")
+
 all_podcasts()
 newtimeradio()
-publicradiofan()
+publicradiofan()
+godcasts()
diff --git a/build_lists/podcastpedia_extract.py b/build_lists/podcastpedia_extract.py
@@ -0,0 +1,51 @@
+import re, requests
+
+def get_feed_from_page(url):
+	html = requests.get(url).text
+	match = re.search('<a href="([^"]+)" target="_blank"  class="icon-feed-producer  producer-social" title="Feed">', html)
+
+	if match:
+		return match.group(1)
+
+def parse_search_page(url):
+	html = requests.get(url).text
+	results = []
+
+	for match in re.findall('<a class="item_title" href="([^"]+)">', html):
+		if match and len(match) > 0:
+			results.append("https://www.podcastpedia.org"+match.strip())
+	return results
+
+base_url = 'https://www.podcastpedia.org/search/advanced_search/results?numberResultsPerPage=500&searchTarget=podcasts&categId={0}&searchMode=natural&currentPage=1'
+categories = [
+	45,
+	39,
+	24,
+	48,
+	27,
+	43,
+	38,
+	46,
+	22,
+	44,
+	42,
+	28,
+	35,
+	37,
+	49,
+	21,
+	29,
+	41,
+	25,
+	31,
+	33,
+	47
+]
+
+with open('../data/podcastpedia_list.txt', 'w') as fh:
+	for cat in categories:
+		search_url = base_url.format(cat)
+		for url in parse_search_page(search_url):
+			feed_url = get_feed_from_page(url)
+			if feed_url:
+				fh.write(feed_url+"\n")