In [42]:
import pandas as pd
import requests as req
from bs4 import BeautifulSoup
import re
import time
from tqdm import tqdm
from datetime import datetime
import os
import sys
import atexit 

In [19]:
# https://github.com/Podcast-Standards-Project/PSP-1-Podcast-RSS-Specification

def getMP3Links(rssLink): 
    
    try:
        #get the xml for the rss feed   
        response = req.get(rssLink)
        resCode = response.status_code
        rssText = response.text
    
    #if we have an exception related to our request, return an empty list
    except req.exceptions.RequestException as e:  
        print("request error")
        return []

    try: 
        #parse the xml 
        rssSoup = BeautifulSoup(rssText, features="xml")
    except: 
        print("xml error")
        return []
    
    items = rssSoup.find_all("item")
    
    forText = ["title", "description", "itunes:duration", "pubDate", "copyright", "itunes:type", "itunes:complete", "guid", "itunes:explicit"]
    forUrl = ["enclosure","itunes:image"]
    
    outList = []
    for item in items: 
        currList = []
        for tag in forText: 
            tagHit = item.find(tag)
            if tagHit != None: 
                currList.append(tagHit.get_text())
            else: 
                currList.append("")
                
        for tag in forUrl:  
            tagHit = item.find(tag)
            if tagHit != None: 
                currList.append(tagHit.get("url"))
                if tag == "podcast:transcript": 
            else: 
                currList.append("")
        
        #this might get a little whack when we have to read in the file later
        #but our data isn't that big so should hopefully work out 
        transDict = {}
        for transcript in item.find_all("podcast:transcript"): 
            transType = transcript.get("type")
            transType = transType if transType != None else ""
            
            transLang = transcript.get("language")
            transLang = transLang if transLang != None else ""
            
            transUrl = transcript.get("url")
            
            if transUrl != None: 
                transDict[transUrl] = [transType, transLang]
        
        currList.append(transDict)
        outList.append(currList)
    
    return outList

In [20]:
#load in our RSS data
cols = ["queryName", "title", "url", "originalUrl", "description", "author", "language", \
          "categories", "explicit", "episodeCount"]

#testing 
#IN_FILE = "/shared/3/projects/benlitterer/podcastData/podRss/TESTRSSFeeds.csv"
IN_FILE = sys.argv[1]

rssDf = pd.read_csv(IN_FILE, names=cols)

In [None]:
# our searches are unique 
rssDf.shape

In [23]:
len(rssDf["queryName"].unique())

199613

In [28]:
# and we only have about 3,000 duplicate search results  
len(rssDf["title"].unique())

196391

In [32]:
#we can use the rss url as a unique identifier 
len(rssDf["url"].unique())

196423

In [22]:
#check the redundancy of hosting sites
rssDf["hosts"] = rssDf["url"].apply(lambda x: x.strip("https://").split("/")[0])
rssDf["hosts"].value_counts()

hosts
anchor.fm                                71739
feeds.buzzsprout.com                     21817
www.spreaker.com                          9440
feed.podbean.com                          6324
feeds.megaphone.fm                        6063
                                         ...  
design-spricht-aesthetik.podcaster.de        1
www.authormedia.com                          1
eaveninyourhome.libsyn.com                   1
HuskerCuzCast.podbean.com                    1
forensicpsychologypodcast.libsyn.com         1
Name: count, Length: 26832, dtype: int64

In [41]:
def exit_handler():
    if os.path.exists(OUT_FILE): 
        outHandle.close()

atexit.register(exit_handler)

NameError: name 'atexit' is not defined

In [None]:
#we want to write to a file that way we can keep our results as we call the api 

#testing: 
#OUT_FILE = "/shared/3/projects/benlitterer/podcastData/mp3s/RssMp3s.csv"
OUT_FILE = sys.argv[2]

"""
podLevel = ["podName", "rssUrl"]
forText = ["epTitle", "description", "itunes:duration", "pubDate"]
forUrl = ["enclosure","podcast:transcript"] 
OUT_COLS = podLevel + forText + forUrl
"""

OUT_COLS = ["rssUrl", "title", "description", "itunes:duration", "pubDate", "copyright", \
            "itunes:type", "itunes:complete", "guid", "itunes:explicit", "enclosure","itunes:image", "transDict"]


#a set of url's to query for rss feeds 
toProcess = set(rssDf["url"])

#we want to either write a new file, or append 
#to a file we've already written 
if os.path.exists(OUT_FILE): 
    outHandle = open(OUT_FILE, "a+")
    
    #IF the file exists, we want to decide where to start searching and appending 
    outDf = pd.read_csv(OUT_FILE, names=OUT_COLS)
    
    #get what we've written to output file
    #rssUrl will be our unique key for each feed 
    #randomize ordering 
    processed = set(outDf["rssUrl"].sample(len(outDf)))
    
    #update what needs to be processed according to what we've already done 
    toProcess = toProcess - processed  
else: 
    outHandle = open(OUT_FILE, "w+")
    
print(f"processing {len(toProcess)} rss feeds")

In [43]:
IN_FILE = "/shared/3/projects/benlitterer/podcastData/podRss/feeds1.csv"
OUT_FILE = "/shared/3/projects/benlitterer/podcastData/mp3s/mp3Links1.csv"

rssDf = pd.read_csv(IN_FILE, names=cols)



In [59]:
OUT_COLS = ["rssUrl", "title", "description", "itunes:duration", "pubDate", "copyright", \
            "itunes:type", "itunes:complete", "guid", "itunes:explicit", "enclosure","itunes:image", "transDict"]

#IF the file exists, we want to decide where to start searching and appending 
outDf = pd.read_csv(OUT_FILE, names=OUT_COLS)


In [62]:
processed = set(outDf["rssUrl"].sample(len(outDf)))

In [67]:
#just out of curiousity, do we get transcripts? 
[item for item in outDf["transDict"] if item != "{}"]

["{'https://feeds.buzzsprout.com/1959925/13513533/transcript': ['text/html', ''], 'https://feeds.buzzsprout.com/1959925/13513533/transcript.json': ['application/json', ''], 'https://feeds.buzzsprout.com/1959925/13513533/transcript.srt': ['application/srt', '']}",
 "{'https://feeds.buzzsprout.com/1959925/13465775/transcript': ['text/html', ''], 'https://feeds.buzzsprout.com/1959925/13465775/transcript.json': ['application/json', ''], 'https://feeds.buzzsprout.com/1959925/13465775/transcript.srt': ['application/srt', '']}",
 "{'https://feeds.buzzsprout.com/1959925/10697437/transcript': ['text/html', ''], 'https://feeds.buzzsprout.com/1959925/10697437/transcript.json': ['application/json', '']}",
 "{'https://feeds.buzzsprout.com/1959925/10534744/transcript': ['text/html', ''], 'https://feeds.buzzsprout.com/1959925/10534744/transcript.json': ['application/json', '']}",
 "{'https://feeds.buzzsprout.com/1959925/10358510/transcript': ['text/html', ''], 'https://feeds.buzzsprout.com/1959925/10

In [17]:
TOTAL_WAIT = sys.argv[3]

hostDict = {}
podList = []
for url in tqdm(toProcess): 
    
    #get the host 
    host = url.strip("https://").split("/")[0]
    
    #find out how long it's been since we last scraped this host
    #and update the time at which we are scraping this host 
    if host in hostDict: 
        
        #time since we last called this host 
        timeLag = datetime.now() - hostDict[host]
        timeLag = timeLag.seconds 
        
        #current time now 
        hostDict[host] =  datetime.now()
    
    else: 
        #set when we called this host
        hostDict[host] = datetime.now()
        timeLag = TOTAL_WAIT 
        
    #our remaining wait is the total wait - how much we've already waited 
    rWait = TOTAL_WAIT - timeLag
    
    if rWait > 0: 
        time.sleep(rWait)
    
    #get the mp3 links for this page 
    podInfs = getMP3Links(url)
    
    for podInf in podInfs: 
        outList = [name, url] + podInf
        outStr = '","'.join([str(item).replace('"', '\'').replace("\n", "") for item in outList])
        outStr = f'"{outStr}"'
        outHandle.write(outStr + "\n")

  0%|          | 0/3360 [00:00<?, ?it/s]

request error


 25%|██▍       | 824/3360 [33:53<2:55:18,  4.15s/it] 

request error


 36%|███▌      | 1207/3360 [1:00:31<51:49,  1.44s/it]  

request error


 46%|████▌     | 1539/3360 [1:15:15<1:23:21,  2.75s/it] 

request error


 46%|████▋     | 1561/3360 [1:15:52<47:25,  1.58s/it]  

request error


 59%|█████▉    | 1977/3360 [1:31:09<1:03:46,  2.77s/it]


KeyboardInterrupt: 

In [18]:
outHandle.close()

In [19]:
prevDf = pd.read_csv(OUT_FILE, names=OUT_COLS)

In [23]:
prevDf.head()

Unnamed: 0,podName,rssUrl,epTitle,description,itunes:duration,pubDate,enclosure,podcast:transcript
0,Employment Rights Online: The Podcast,https://feeds.buzzsprout.com/1790667.rss,The New Tribunal Claim Form Has Changed. Use T...,<p>Hi there </p><p>In this edition of the podc...,683,"Thu, 17 Aug 2023 07:00:00 +0100",https://www.buzzsprout.com/1790667/13416841-th...,
1,Employment Rights Online: The Podcast,https://feeds.buzzsprout.com/1790667.rss,I’m on a Zero Hours Contract. What Are My Empl...,<p>Hi There!</p><p>The Employment Rights Onlin...,918,"Sat, 12 Aug 2023 06:00:00 +0100",https://www.buzzsprout.com/1790667/13392410-i-...,
2,Employment Rights Online: The Podcast,https://feeds.buzzsprout.com/1790667.rss,Everything You Wanted to Know About Zero Hours...,<p>Hi There!<br/><br/>The Employment Rights On...,699,"Thu, 03 Aug 2023 07:00:00 +0100",https://www.buzzsprout.com/1790667/13342102-ev...,
3,Employment Rights Online: The Podcast,https://feeds.buzzsprout.com/1790667.rss,Thinking of Taking on a Second Job to Help Wit...,<p>Hi there</p><p>The Employment Rights Online...,867,"Thu, 27 Jul 2023 07:00:00 +0100",https://www.buzzsprout.com/1790667/13298721-th...,
4,Employment Rights Online: The Podcast,https://feeds.buzzsprout.com/1790667.rss,Your Company Can Not Simply Dismiss You Becaus...,<p>Hi there</p><p>Your employment rights are t...,797,"Thu, 20 Jul 2023 07:00:00 +0100",https://www.buzzsprout.com/1790667/13256100-yo...,


In [36]:
temp = prevDf.reset_index(drop=True)

In [40]:
list(temp.head()["enclosure"])

['https://www.buzzsprout.com/1790667/13416841-the-new-tribunal-claim-form-has-changed-use-the-wrong-form-and-your-case-could-be-thrown-out-here-s-what-you-need-to-know.mp3',
 'https://www.buzzsprout.com/1790667/13392410-i-m-on-a-zero-hours-contract-what-are-my-employment-rights.mp3',
 'https://www.buzzsprout.com/1790667/13342102-everything-you-wanted-to-know-about-zero-hours-contracts-but-didn-t-know-where-to-ask.mp3',
 'https://www.buzzsprout.com/1790667/13298721-thinking-of-taking-on-a-second-job-to-help-with-your-finances-here-are-the-top-5-things-you-need-to-know.mp3',
 'https://www.buzzsprout.com/1790667/13256100-your-company-can-not-simply-dismiss-you-because-you-disagree-with-company-vision-and-values-here-s-what-you-need-to-know.mp3']