In [240]:
from bs4 import BeautifulSoup
import requests as req
from tqdm import tqdm 
import time
import pandas as pd
import random
import matplotlib.pyplot as plt
import re

In [241]:
#what about the itunes search api  
IURL = "https://itunes.apple.com/search?" 
response = req.get(IURL + "term=Joe+Rogan&media=podcast")

In [282]:
def getRSSLinks(query): 
    IURL = "https://itunes.apple.com/search?" 
    
    qString = f'{IURL}term={query.replace(" ", "+")}&media=podcast'
    
    response = req.get(qString)
    resCode = response.status_code 
    
    try: 
        resDict = response.json()["results"]
    except JSONDecodeError: 
        print("JSONDecodeError")
        return resCode, None, None, None
    except: 
        print("Unknown Error")
        return resCode, None, None, None
    
    if len(resDict) > 0: 
        resDict = resDict[0]
        resName = resDict["collectionName"]
        creatorName = resDict["artistName"]
        
        if "feedUrl" in resDict: 
            feedUrl = resDict["feedUrl"]
            return resCode, resName, creatorName, feedUrl 
        else: 
            print("no feed URL")
            return resCode, resName, creatorName, None
    else: 
        print(f"no results for {query}")
        return resCode, None, None, None
    

In [243]:
def getMP3s(RSSLink): 
    response = req.get(RSSLink)
    print(response)
    
    

In [244]:
#load in a large dataframe with names of podcasts to download 
IN_PATH = "/shared/3/projects/benlitterer/podcastData/podNames/spotifySnowball50k.csv"
df = pd.read_csv(IN_PATH, index_col="Unnamed: 0")
df = df.sort_values("total_episodes", ascending=False)

In [260]:
def getMP3Links(rssLink): 
    #get the xml for the rss feed  
    response = req.get(rssLink)
    resCode = response.status_code
    rssText = response.text

    #pare the xml 
    rssSoup = BeautifulSoup(rssText, features="xml")
    
    epLinks = []
    items = rssSoup.find_all("item")
    
    for item in items: 
        title = item.find("title").get_text()
        podUrl = item.find("enclosure").get("url")
        desc = item.find("description").get_text()
        dur = item.find("itunes:duration").get_text()
        date = item.find("pubDate").get_text()
        transcripts = item.find_all(re.compile("transcript"))
        epLinks.append([title, podUrl, desc, dur, date, transcripts])
        
    return epLinks 

In [248]:
midDf = df[10000:10100]

In [283]:
rssList = []
nameList = list(midDf.head(50)["name"])

#just focus on getting the RSS feeds here 
for name in tqdm(nameList): 
    resCode, resName, creatorName, feedUrl = getRSSLinks(name)
    
    while resCode == 404: 
        print("404 response")
        time.sleep(10)
        resCode, resName, creatorName, feedUrl = getRSSLinks(name)
    
    time.sleep(2 + random.random())
    
    rssList.append([name, resName, creatorName, feedUrl])
    


 38%|███▊      | 19/50 [00:48<01:19,  2.55s/it]

no results for Das Mädchen im roten Mantel


 52%|█████▏    | 26/50 [01:05<00:57,  2.39s/it]

no results for Ava Butler Box Set


 80%|████████  | 40/50 [01:41<00:26,  2.65s/it]

no results for The Nightblade Epic Volume One: A Book of Underrealm


100%|██████████| 50/50 [02:10<00:00,  2.61s/it]


In [284]:
rssDf = pd.DataFrame(rssList, columns=["searchName", "resName", "resCreator", "resRSS"])

In [287]:
#start here with rssDf 
#TODO: next we want to use rss links to get our mp3s
rssDf.head(3)

Unnamed: 0,searchName,resName,resCreator,resRSS
0,Southern Gothic,Southern Gothic,Southern Gothic Media,https://feeds.megaphone.fm/ARML5182692989
1,Data Privacy Detective,Data Privacy Detective,Joe Dehner - Global Data Privacy Lawyer,https://feeds.soundcloud.com/users/soundcloud:...
2,The Carpool with Kelly and Lizz,The Carpool with Kelly and Lizz,The Car Mom LLC / tentwentytwo Projects,https://feeds.megaphone.fm/ADV8086685731


In [289]:
epList = []
rssList = rssDf["resRSS"]

for feedUrl in rssList: 
    #only need to do something if there is a feed to follow 
    #could have empty result from previous step of finding RSS feeds 
    if len(feedUrl) > 0: 
        epLinks = getMP3Links(feedUrl)

        #add the pod level and episode level information 
        currList = [[resName, creatorName, feedUrl] + epLink for epLink in epLinks]
        epList += currList 
        time.sleep(random.random() + 3)
    

AttributeError: 'NoneType' object has no attribute 'get_text'

In [237]:
podDf = pd.DataFrame(epList, columns=["name", "creator", "feedUrl", "title", "podUrl", "description", "duration", "date", "transcriptUrl"])

In [238]:
podDf["transLen"] = podDf["transcriptUrl"].apply(len)

In [239]:
podDf

Unnamed: 0,name,creator,feedUrl,title,podUrl,description,duration,date,transcriptUrl,transLen
0,Southern Gothic,Southern Gothic Media,https://feeds.megaphone.fm/ARML5182692989,The Unsolved Murder of Ethel Allen,https://www.podtrac.com/pts/redirect.mp3/chrt....,Ethel Allen was last seen alive at Jack’s Tave...,1592,"Mon, 14 Aug 2023 05:03:00 -0000",[],0
1,Southern Gothic,Southern Gothic Media,https://feeds.megaphone.fm/ARML5182692989,Tuberculosis in Mammoth Cave | Minisode,https://www.podtrac.com/pts/redirect.mp3/chrt....,Last week we returned to the Mammoth Cave Nati...,614,"Thu, 10 Aug 2023 05:03:00 -0000",[],0
2,Southern Gothic,Southern Gothic Media,https://feeds.megaphone.fm/ARML5182692989,Tragedy in Sand Cave,https://www.podtrac.com/pts/redirect.mp3/chrt....,"In 1925, cave explorer Floyd Collins discovere...",2288,"Mon, 31 Jul 2023 05:03:00 -0000",[],0
3,Southern Gothic,Southern Gothic Media,https://feeds.megaphone.fm/ARML5182692989,Kathryn Tucker Windham Museum | Interview,https://www.podtrac.com/pts/redirect.mp3/chrt....,Kathryn Tucker Windham was an American storyte...,2297,"Mon, 24 Jul 2023 05:03:00 -0000",[],0
4,Southern Gothic,Southern Gothic Media,https://feeds.megaphone.fm/ARML5182692989,The Boyington Oak Tree,https://www.podtrac.com/pts/redirect.mp3/chrt....,A beautiful oak tree stands by the Church Stre...,1673,"Mon, 17 Jul 2023 05:03:00 -0000",[],0
...,...,...,...,...,...,...,...,...,...,...
2345,Shawn1113 Podcast Show,Shawn1113,https://anchor.fm/s/239d4d4c/podcast/rss,Shawn1113 Sports Show Episode 3,https://anchor.fm/s/239d4d4c/podcast/play/1516...,<p>In the midst of uncertain times in 2020 the...,00:12:49,"Sun, 14 Jun 2020 13:03:27 GMT",[],0
2346,Shawn1113 Podcast Show,Shawn1113,https://anchor.fm/s/239d4d4c/podcast/rss,Shawn1113 Sports Show Episode 3,https://anchor.fm/s/239d4d4c/podcast/play/1516...,<p>In the midst of uncertain times in 2020 the...,00:12:39,"Sun, 14 Jun 2020 12:57:59 GMT",[],0
2347,Shawn1113 Podcast Show,Shawn1113,https://anchor.fm/s/239d4d4c/podcast/rss,Shawn1113 Sports Show(Episode 2),https://anchor.fm/s/239d4d4c/podcast/play/1469...,<p>Talking briefly about the slow but sure tri...,00:05:04,"Wed, 03 Jun 2020 16:07:54 GMT",[],0
2348,Shawn1113 Podcast Show,Shawn1113,https://anchor.fm/s/239d4d4c/podcast/rss,Shawn1113 Sports Show,https://anchor.fm/s/239d4d4c/podcast/play/1438...,Sports In General\n\n--- \n\nSend in a voice m...,00:07:38,"Wed, 27 May 2020 22:25:59 GMT",[],0


In [163]:
dups = podDf[podDf.duplicated(subset=["podUrl"])].reset_index()

In [178]:
df.head()

Unnamed: 0,searchQuery,name,type,languages,description,is_externally_hosted,total_episodes
13428,The Ryback Show,THE HUGE SHOW,show,['en-US'],"Live across Michigan every weekday, The Huge S...",False,10944
49349,The Steve and Kyle Podcast,Best of The Steve Harvey Morning Show,show,['en-US'],Highlights from The Steve Harvey Morning Show,False,8360
27733,The Ben Shapiro Show,The Ben Maller Show,show,['en-US'],"When the moon comes out, Ben Maller emerges wi...",False,6238
58082,Based on a True Story,JAM Nation with Jonesy & Amanda,show,['en'],Based On A True Story. JAM Nation captures the...,False,5973
30490,2 B's in a Pod,Brooke and Jeffrey,show,['en-US'],Brooke & Jeffrey In The Morning official podca...,False,5898
