In [4]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import random
import seaborn as sns
import numpy as np

In [5]:
random.seed(120)

In [6]:
df = pd.read_json("/shared/3/projects/benlitterer/podcastData/processed/floydMonth/floydMonthDataClean.jsonl", orient="records", lines=True)

In [None]:
#metaDf = pd.read_json("/shared/3/projects/benlitterer/podcastData/processed/mayJune/mayJuneMetadata.jsonl", orient="record", lines=True)

In [None]:
#if we don't already have entities for the transcripts and the descriptions, we need to add them... 

In [7]:
#just in case 
inDf = df

In [8]:
df = df.drop_duplicates(subset=["potentialOutPath"])

In [9]:
#do we want to work with podcasts where there's no description? or no transcript? let's say no for now
df = df.dropna(subset=["rssUrl", "transcript", "enclosure", "potentialOutPath", "epDescription", "podDescription",  "category1", "transEnts", "descEnts", "epDescEnts"])

In [None]:
#odd that we only have 366,013 left after removing na's. Seems to be driven mostly by ents
#BUT: remember that many descriptions won't have entities  

In [10]:
#if either description or transcript has person entity and isn't none/na 
#then keep it
def hasPerson(inRow):
    #descTypes = inRow["DescType"]
    transTypes = inRow["transTypes"]

    #descHas = False
    transHas = False

    """
    if descTypes == descTypes and descTypes != None: 
        descHas = "PERSON" in descTypes
    """
        
    if transTypes == transTypes and transTypes != None: 
        transHas = "PERSON" in transTypes

    return transHas


#keep only those rows where we have a person in the description or the transcript
#TODO: modify this
df["hasPerson"] = df.apply(hasPerson, axis=1)
#df["hasPerson"] = df["transEnts"].apply(lambda x: x != None and x == x and len(x) > 0)

In [11]:
df.shape

(214244, 77)

In [12]:
df = df[df["hasPerson"] == True]

In [13]:
N_CATS=20
print(f'Unique Categories: {len(df["category1"].unique())}')
topCats = pd.DataFrame(df["category1"].value_counts()).reset_index().head(N_CATS)["category1"]
print(topCats)
topCats = list(topCats)

Unique Categories: 26
0       religion
1       business
2        society
3      education
4         sports
5           news
6         health
7           arts
8         comedy
9             tv
10       leisure
11         music
12    technology
13          kids
14       science
15       fiction
16       history
17    government
18    true crime
19         games
Name: category1, dtype: object


In [74]:
sampDf = df[df["category1"].apply(lambda x: x in topCats)]

In [75]:
SAMP_NUM = 100
SEED = 24
sampDf = sampDf.groupby(by="category1").apply(lambda x: x.sample(SAMP_NUM, random_state=SEED)).droplevel("category1")

  sampDf = sampDf.groupby(by="category1").apply(lambda x: x.sample(SAMP_NUM, random_state=SEED)).droplevel("category1")


In [76]:
def wrap(inStr): 
    i = 0 
    STEP = 20
    splitStr = inStr.split()
    outStr = ""
    while i < len(splitStr): 
        if i + STEP < len(splitStr):
            outStr += " ".join(splitStr[i:i+STEP]) + "\n"
        else: 
            outStr +=  " ".join(splitStr[i:]) + "\n"
        i += STEP
    print(outStr)

In [77]:
#df.head()["transcript"]
#did some checks on data quality and things look better after 4-gram cleaning
#print(wrap(df.sample()["transcript"].item()))

In [78]:
def cleaner(inStr, LEN): 
    #clean out all tags in brackets or parenthesis
    inStr = re.sub(r'\[[^()]*\]\s*', '', inStr)
    inStr = re.sub(r'\([^()]*\)\s*', '', inStr)
    
    #replace paragraph breaks with new lines  
    inStr = re.sub("</p>",". ", inStr)
    inStr = re.sub("</br>", ". ", inStr)

    #replace all other tags with space
    inStr = re.sub("<.+?>", " ", inStr)

    #replace all amounts of space with a single space 
    inStr = re.sub(r"[ \t]+", " ", inStr)
    
    inStr = BeautifulSoup(inStr, "html.parser").text
    if len(inStr.split()) > LEN: 
        inStr = " ".join(inStr.split()[:LEN]) + "..."
    
    return inStr

#TODO: clean out html tags and stuff like that...
#TODO: just replace tags with a space and make spaces and then make one space between every word  

#f"<strong>Description:</strong><br>{description}<br><br><strong>Transcript:</strong><br>{transcript}"

In [79]:
sampDf["descClean"] = sampDf["podDescription"].apply(cleaner, args=[150])
sampDf["epDescClean"] =  sampDf["epDescription"].apply(cleaner, args=[150])

  inStr = BeautifulSoup(inStr, "html.parser").text
  inStr = BeautifulSoup(inStr, "html.parser").text


In [80]:
#subset the columns we're working with 
sampDf = sampDf[["potentialOutPath", "descClean", "epDescClean", "transcript", "transEnts", "transTypes",  "rssUrl", "enclosure", "cleanDatesLoc", "epTitle", "title"]]

In [81]:
sampDf = sampDf.explode(["transTypes", "transEnts"])
sampDf = sampDf[sampDf["transTypes"] == "PERSON"]
sampDf = sampDf.dropna(subset=["transEnts"])
sampDf = sampDf[sampDf["transEnts"].apply(lambda x: len(x.split())) == 2]

In [82]:
#only get entities where the first word starts with a capital letter 
sampDf = sampDf[sampDf["transEnts"].apply(lambda x: x[0].isupper())]

In [83]:
#remove any honorifics - entities with [Mr., Mrs., Dr., Professor, Prof., ]
#CHECKING: sampDf[sampDf["transEnts"].str.contains("\\.")].sample(10)
def hasHonorific(inStr): 
    honorifics = ["Mr ", "Mrs ", "Dr ", "Doctor ", "Professor", "Prof."]
    for honorific in honorifics: 
        if honorific in inStr: 
            return True
    return False 

sampDf = sampDf[sampDf["transEnts"].apply(hasHonorific) == False]
sampDf.shape

(29081, 11)

In [84]:
#TODO: add podcast title here 
#crop the transcript so we can ensure we get 
def getCroppedTranscript(inRow, BUFFER=150): 
    ent = inRow["transEnts"]
    trans = inRow["transcript"]

    #we can assume single space between every token now? 
    trans = " ".join(trans.split())

    #trans = re.sub(r"[^a-zA-Z ]", "", trans)
    #ent = re.sub(r"[^a-zA-Z '.\-]", "", ent)
    ent = re.sub(r"\\", "", ent)
    ent = " ".join(ent.split())
    #print(ent)
    
    #ent = re.sub(r"[^a-zA-Z ]", "", ent)
    #find where the entity occurs
    match = re.search(ent, trans, re.IGNORECASE)

    if match == None or len(ent) == 0: 
        return "NO_MATCH"

    start = match.start() 
    end = match.end() 

    #get 200 words in either direction of our match 
    transList = trans.split()

    
    #get index for where our entity starts  
    charIndex = 0 
    entIndex = len(transList)
    tokIndex = 0
    while tokIndex < len(transList):
        tok = transList[tokIndex]
        #the moment our char index passes the char index 
        #of the beginning of our token, we nominate the current token index as
        #the first one of our entity 
        if charIndex >= start:  
            entIndex = tokIndex 
            break 

        #move char index ahead accounting for space inbetween words
        charIndex += len(tok)
        charIndex += 1         
        tokIndex += 1

   
    #we only want entities towards the beginning of the transcript 
    if entIndex > 350: 
        return "TOO_FAR"


    #ensure we won't start before or after the end of the transcript list 
    start = max(entIndex - BUFFER, 0)
    end = min(entIndex + len(ent.split()) + 1 + BUFFER, len(transList))

    transcriptChunk = " ".join(transList[start:end])

    return transcriptChunk

#I confirmed that we are actually finding the entity, just not early in the transcript!
sampDf["transChunk"] = sampDf.apply(getCroppedTranscript, axis=1)



In [85]:
deviantRows = len(sampDf[(sampDf["transChunk"] == "NO_MATCH") | (sampDf["transChunk"] == "NOT_IN_LIST")])
print(f"deviant entity matches: {deviantRows}")
print(f"prop. deviant entity matches: {deviantRows/len(sampDf)}")
tooFar = len(sampDf[sampDf["transChunk"] == "TOO_FAR"])
print(f"entities too far along in transcript:{tooFar}")
print(f"prop. entities too far along in transcript:{tooFar/len(sampDf)}")

deviant entity matches: 2
prop. deviant entity matches: 6.87734259482136e-05
entities too far along in transcript:22854
prop. entities too far along in transcript:0.7858739383102369


In [86]:
sampDf = sampDf[(sampDf["transChunk"] != "TOO_FAR") & (sampDf["transChunk"] != "NO_MATCH") & (sampDf["transChunk"] != "NOT_IN_LIST")]

In [87]:
sampDf.head()["epTitle"]

72260    PK 203: Understanding Artistic Identity with Z...
72260    PK 203: Understanding Artistic Identity with Z...
47612                       Episode 155: Excellent Bullets
47612                       Episode 155: Excellent Bullets
79027    From Human-Centered Design to Relationship-Cen...
Name: epTitle, dtype: object

In [88]:

#fix tiny misspellings 
def getDisplayText(inRow): 
    ent = inRow["transEnts"]

    podTitle = inRow["title"]
    epTitle = inRow["epTitle"]
    desc = inRow["descClean"] #if inRow["descClean"] == inRow["descClean"] and inRow["descClean"] != None else "" 
    epDesc = inRow["epDescClean"] #if inRow["epDescClean"] == inRow["epDescClean"] and inRow["epDescClean"] != None else "" 
    trans = inRow["transChunk"] #if inRow["transcript"] == inRow["transcript"] and inRow["transcript"] != None else "" 
    fullTrans = inRow["transcript"]

    #add ellipses if we've modified the transcript in any way 
    fullTrans = " ".join(fullTrans.split())
    if trans[:3] != fullTrans[:3]: 
        trans = "..." + trans
    
    allText = "<strong>Podcast Title: </strong>" + podTitle + "<br><strong>Podcast Description:</strong><br>" + \
    desc + "<br><br><strong>Podcast Episode Title: </strong>" + epTitle + "<br><strong>Podcast Episode Description:</strong><br>" + epDesc \
    + "<br><br><strong>Podcast Transcript Excerpt:</strong><br>" + trans

    #remove html tags, as they mess up the formatting
    #also remove any extraneous spaces 
    allText = re.sub(r"http[^\s<]*", "[hyperlink]", allText)
    allText = " ".join(allText.split())
    #allText = allText.lower()

    #clean up the entity a bit here 
    #single space in-between should align with our cleaning in text above 
    ent = re.sub(r"\\", "", ent)
    ent = ent.replace(".", "\.").replace("*", "\*").replace("+", "\+").replace("?", "\?")
    ent = " ".join(ent.split())

    #print(ent)
    #bold full occurences of the entity 
    #note that we match irrespective of case 
    outText = re.sub(ent, f'<span style="background-color:#00FF00">{ent}</span>', allText, flags=re.IGNORECASE)
    outText = outText.replace("<a href=", "")
    
    #if we have a multi-word entity, highlight the first word where it hasn't already been bolded 
    firstEnt = ent.split()[0]
    if len(ent.split()) > 1: 
        outText = re.sub(r"(?<!>)" + firstEnt, f'<span style="background-color:#00FF00">{firstEnt}</span>', outText, flags=re.IGNORECASE)

    outText = f"<span style='background-color:#00FF00'>Target Entity: {ent}</span><br><br>{outText}"
    return outText
    #return [[item.start(), item.end(), ent] for item in re.finditer(ent, allText)]

sampDf["displayText"] = list(sampDf.apply(getDisplayText, axis=1))
#sampDf.head().apply(getEntIx, axis=1)

In [89]:
sampDf.head() 

Unnamed: 0,potentialOutPath,descClean,epDescClean,transcript,transEnts,transTypes,rssUrl,enclosure,cleanDatesLoc,epTitle,title,transChunk,displayText
72260,/traffic.libsyn.com/ni/httpstraffic.libsyn.com...,The Pencil Kings Podcast interviews today’s to...,"Zhiwan Cheung is a Fulbright Fellow, artist, v...","[MUSIC] Welcome to the Pencil King show, where...",Mitch Voller,PERSON,https://pencilkings.libsyn.com/rss,https://traffic.libsyn.com/secure/pencilkings/...,2020-05-20 06:00:00+00:00,PK 203: Understanding Artistic Identity with Z...,Pencil Kings | Inspiring Artist Interviews wit...,"[MUSIC] Welcome to the Pencil King show, where...",<span style='background-color:#00FF00'>Target ...
72260,/traffic.libsyn.com/ni/httpstraffic.libsyn.com...,The Pencil Kings Podcast interviews today’s to...,"Zhiwan Cheung is a Fulbright Fellow, artist, v...","[MUSIC] Welcome to the Pencil King show, where...",Zewon Chung,PERSON,https://pencilkings.libsyn.com/rss,https://traffic.libsyn.com/secure/pencilkings/...,2020-05-20 06:00:00+00:00,PK 203: Understanding Artistic Identity with Z...,Pencil Kings | Inspiring Artist Interviews wit...,take you there and beyond for fraction of the ...,<span style='background-color:#00FF00'>Target ...
47612,/sphinx.acast.com/es/httpssphinx.acast.comseri...,A weekly comic book and manga review podcast. ...,Levins has gone JUNJI ITO CRAZY and Siobhan ha...,"Hey Dave, since we founded Bombas, we've alway...",Andrew Levens,PERSON,https://rss.acast.com/serious-issues,https://sphinx.acast.com/serious-issues/episod...,2020-05-25 03:47:20+00:00,Episode 155: Excellent Bullets,Serious Issues: A Comic Book Podcast with Andr...,"one purchase equals one donated. Wow, did we j...",<span style='background-color:#00FF00'>Target ...
47612,/sphinx.acast.com/es/httpssphinx.acast.comseri...,A weekly comic book and manga review podcast. ...,Levins has gone JUNJI ITO CRAZY and Siobhan ha...,"Hey Dave, since we founded Bombas, we've alway...",Chabon Coons,PERSON,https://rss.acast.com/serious-issues,https://sphinx.acast.com/serious-issues/episod...,2020-05-25 03:47:20+00:00,Episode 155: Excellent Bullets,Serious Issues: A Comic Book Podcast with Andr...,"Wow, did we just write an ad? Yes. Bombas. Big...",<span style='background-color:#00FF00'>Target ...
79027,/mcdn.podbean.com/en/httpsmcdn.podbean.commfwe...,Design is Everywhere features stories of peopl...,"Many of us are aware of human-centered design,...","[MUSIC PLAYING] Hello, and welcome to Design i...",Sam Aquilano,PERSON,https://feed.podbean.com/designmuseum/feed.xml,https://mcdn.podbean.com/mf/web/7pqn9m/POD-007...,2020-06-04 10:00:00+00:00,From Human-Centered Design to Relationship-Cen...,Design is Everywhere,"[MUSIC PLAYING] Hello, and welcome to Design i...",<span style='background-color:#00FF00'>Target ...


In [90]:
#sampDf = sampDf.drop(columns=["text"])

In [91]:
sampDf["cleanEnt"] = sampDf["transEnts"].apply(lambda x: re.sub(r"[^a-zA-Z ]" , "", x.lower()))

In [92]:
#shuffle rows, remove duplicates on the cleaned version of our entities
#then get one row per show 
sampDf = sampDf.sample(len(sampDf), random_state=SEED).drop_duplicates("cleanEnt").drop_duplicates("rssUrl")

In [93]:
print(f"shape after dropping duplicates on entity and podcast: {sampDf.shape}")

shape after dropping duplicates on entity and podcast: (1344, 14)


In [94]:
wrap(sampDf["displayText"].iloc[900])

<span style='background-color:#00FF00'>Target Entity: David Politis</span><br><br><strong>Podcast Title: </strong>Four Corners Crime Cast<br><strong>Podcast Description:</strong><br>Four Corners Crime Cast brings you true crime from the
four corner states: Arizona, Colorado, New Mexico, and Utah. Hosted by Katie Renner and cohosted by Jake Sanders and Rory
Allard, we deep dive into killer's crimes and minds, with a little comedic relief in between.<br><br><strong>Podcast Episode Title: </strong>Episode 46:
Missing Persons of Rocky Mountain National Park<br><strong>Podcast Episode Description:</strong><br> This week we are taking a little break from horrific awfulness,
and covering missing persons of Rocky Mountain National Park. Join us as Katie takes us through some of the known
cases, and the guys try, not very well, to help shed light on the multiple possibilities.. <br><br><strong>Podcast Transcript Excerpt:</strong><br>...favorite book
I've ever read., "Death in Rocky Mountain Nat

In [95]:
sampDf = sampDf.rename(columns={"displayText":"text", "enclosure":"id"})

In [96]:
#WHERE WERE GOING TO OUTPUT FOR NOW
sampDf.to_json("/shared/3/projects/benlitterer/podcastData/annotation/label1000/2kpodsClassification.jsonl", orient="records", lines=True)