In [228]:
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
import json
import numpy as np

In [25]:
IN_PATH = "/shared/3/projects/benlitterer/podcastData/processed/floydMonth/floydMonthData.jsonl"
#with open(IN_PATH) as f:
#    df = pd.DataFrame(json.loads(line) for line in f)
    
df = pd.read_json("/shared/3/projects/benlitterer/podcastData/processed/floydMonth/floydMonthData.jsonl", orient="records", lines=True, nrows=10000)


In [26]:
df.columns

Index(['potentialOutPath', 'transcript', 'rssUrl', 'epTitle', 'epDescription',
       'duration', 'pubDate', 'copyright', 'itunes:type', 'itunes:complete',
       'guid', 'itunes:explicit', 'enclosure', 'itunes:image', 'transDict',
       'id', 'title', 'lastUpdate', 'link', 'lastHttpStatus', 'dead',
       'contentType', 'itunesId', 'originalUrl', 'itunesAuthor',
       'itunesOwnerName', 'explicit', 'imageUrl', 'itunesType', 'generator',
       'newestItemPubdate', 'language', 'oldestItemPubdate', 'episodeCount',
       'popularityScore', 'priority', 'createdOn', 'updateFrequency', 'chash',
       'host', 'newestEnclosureUrl', 'podcastGuid', 'podDescription',
       'category1', 'category2', 'category3', 'category4', 'category5',
       'category6', 'category7', 'category8', 'category9', 'category10',
       'newestEnclosureDuration', 'oldestItemDatetime', 'cleanDates',
       'cleanDatesLoc', '500ent', '500start', '500end', '500type', 'DescEnt',
       'DescStart', 'DescEnd', 'DescT

In [139]:
df = df[["potentialOutPath", "rssUrl", "transcript", "epTitle", "epDescription", "itunesAuthor", "title", "popularityScore", "episodeCount", "podDescription", "cleanDatesLoc", '500ent', '500start', '500end', '500type', 'DescEnt',
       'DescStart', 'DescEnd', 'DescType']]


In [185]:
#we want to use the frequency of entities in the transcripts and the descriptions to make a prediction 
#so first check out the frequency of entities in the transcript beginnings? 

transcriptEnts = df[["rssUrl", "500ent","500type"]]
transcriptEnts = transcriptEnts.explode(["500ent", "500type"]) 
transcriptEnts = transcriptEnts[transcriptEnts["500type"] == "PERSON"]

#get podMentions, the # of mentions of a given ent across episodes of a podcast 
transcriptEnts = transcriptEnts.groupby(by=["rssUrl", "500ent"]).agg(len).reset_index().rename(columns={"500type":"podMentions"}) 
transcriptEnts = transcriptEnts.sort_values("podMentions", ascending=False)


#get totalPodMentions, the total entity mentions in this podcast across all episodes 
transcriptEnts = transcriptEnts.groupby(by=["rssUrl"]).agg(list) 
transcriptEnts["totalPodMentions"] = transcriptEnts["podMentions"].apply(sum) 
transcriptEnts = transcriptEnts.explode(["500ent", "podMentions"]) 

#get totalEntMentions, the total number of mentions of this entity across all podcasts 
transcriptEnts = transcriptEnts.reset_index().groupby(by=["500ent"]).agg(list)
transcriptEnts["totalEntMentions"] = transcriptEnts["totalPodMentions"].apply(sum)
transcriptEnts = transcriptEnts.explode(["rssUrl", "podMentions", "totalPodMentions"])
transcriptEnts = transcriptEnts.reset_index()

In [186]:
#now look at unique mentions within transcripts 
transcriptUnique = df[["rssUrl", "potentialOutPath", "500ent","500type"]]
transcriptUnique = transcriptUnique.explode(["500ent", "500type"])
transcriptUnique = transcriptUnique[transcriptUnique["500type"] == "PERSON"]

#we only want one mention of each entity per episode 
transcriptUnique = transcriptUnique.drop_duplicates(subset=["potentialOutPath", "500ent"])  
transcriptUnique = transcriptUnique.groupby(by=["rssUrl", "500ent"]).agg(list).reset_index()

#transcriptUnique["uniquePodMentions"] = transcriptUnique["rssUrl"]
transcriptUnique["uniquePodMentions"] = transcriptUnique["potentialOutPath"].apply(len)
transcriptUnique = transcriptUnique.drop(columns=["500type", "potentialOutPath"])

In [187]:
#get total number of episodes for podcasts 
#after removing same podcast duplicates on unique podcast key
podEpisodes = pd.DataFrame(df.drop_duplicates(subset=["rssUrl", "potentialOutPath"])["rssUrl"].value_counts()).rename(columns={"count":"podEpisodes"})
podEpisodes = podEpisodes.reset_index() 

In [188]:
entDf = pd.merge(transcriptEnts, transcriptUnique, on=["rssUrl", "500ent"], how="inner")
entDf = pd.merge(entDf, podEpisodes, on = "rssUrl", how="inner") 
entDf = entDf.rename(columns={"500ent":"ent"}) 

In [169]:
"""
500ent: the entity from the transcript data
podMentions: the # of mentions of a given ent across episodes of a podcast
totalPodMentions: the total entity mentions in this podcast across all episodes
totalEntMentions: the total number of mentions of this entity across all podcasts 
uniquePodMentions: the number of podcast episodes from this podcast mentioning this entity 
podEpisodes: the number of episodes we have for this podcast 
"""

Unnamed: 0,ent,rssUrl,podMentions,totalPodMentions,totalEntMentions,uniquePodMentions,podEpisodes
13498,Anne Landers,https://feeds.buzzsprout.com/909061.rss,1,25,25,1,4
19433,Peechie Williams,https://feeds.buzzsprout.com/929599.rss,1,10,10,1,2
21276,Dene,https://anchor.fm/s/1a1c5574/podcast/rss,4,14,14,1,3
19459,Carol Dweck,https://empoweredathlete.podbean.com/feed.xml,2,5,5,1,2
4957,Steve,https://anchor.fm/s/1f999408/podcast/rss,2,154,2580,1,20
9922,Jack Jack Attack,https://anchor.fm/s/869a46c/podcast/rss,1,103,103,1,14
12307,Kelly,https://anchor.fm/s/651f62b4/podcast/rss,1,40,811,1,7
9890,Mia,https://www.omnycontent.com/d/playlist/3ea926b...,15,98,107,5,9
17484,Lee,https://www.spreaker.com/show/4237852/episodes...,3,17,1570,3,4
22315,Linda McCullough,https://mixed.libsyn.com/rss,1,9,9,1,1


In [189]:
#just get one podcast description per podcast 
descDf = df[["rssUrl", "DescEnt", "DescType"]].drop_duplicates(subset=["rssUrl"]).explode(["DescEnt", "DescType"])
descDf = descDf[descDf["DescType"] == "PERSON"].groupby(by=["rssUrl", "DescEnt"]).agg(len).reset_index().rename(columns={"DescEnt":"ent", "DescType":"DescMentions"}) 
descDf.head()

Unnamed: 0,rssUrl,ent,DescMentions
0,http://churchontherock2.cloversites.com/podcas...,Michael Jacobs,1
1,http://faithheightschurch.cloversites.com/podc...,John Cappetto,1
2,http://fbcchurch.sermon.net/rss/main/audio,Lance Walker,1
3,http://feeds.ancientfaith.com/HealthyMindsHeal...,Fr,2
4,http://feeds.ancientfaith.com/HealthyMindsHeal...,Nicholas,2


In [190]:
#merge whether the podcast description mentions this entity into our entity information df 
entDf = pd.merge(entDf, descDf, on=["rssUrl", "ent"], how="left")
entDf["DescMentions"] = entDf["DescMentions"].fillna(0) 

In [207]:
def getSnippet(inRow): 
    BUFFER = 80
    left = inRow["500start"] - BUFFER
    left = left if left > 0 else 0
    
    right = inRow["500end"] + BUFFER
    right = right if right < len(inRow["transcript"]) else len(inRow["transcript"]) 

    return inRow["transcript"][left:right]

In [247]:
snippetDf = df[["potentialOutPath", "rssUrl", "transcript", "500start", "500end", "500ent", "500type",  "itunesAuthor", "podDescription"]].explode(["500start", "500end", "500ent", "500type"]).dropna()   
snippetDf = snippetDf[snippetDf["500type"] == "PERSON"]
snippetDf["entContext"] = snippetDf.apply(getSnippet, axis=1)
snippetDf = snippetDf.drop(columns=["transcript"]).rename(columns={"500start":"entStart", "500end":"entEnd", "500ent":"ent", "500type":"type"})
snippetDf.head() 

Unnamed: 0,potentialOutPath,rssUrl,entStart,entEnd,ent,500type,itunesAuthor,podDescription,entContext
0,/shared/3/projects/benlitterer/podcastData/pro...,https://feeds.buzzsprout.com/783020.rss,12,25,Simon Shapiro,PERSON,Simon Shapiro,Award winning Australian singer-songwriter Sim...,I'm Simon Shapiro and this is Sing Out Speak O...
0,/shared/3/projects/benlitterer/podcastData/pro...,https://feeds.buzzsprout.com/783020.rss,1446,1461,Mike Haveername,PERSON,Simon Shapiro,Award winning Australian singer-songwriter Sim...,o the Kiss Tone album Way to Nowhere which I m...
0,/shared/3/projects/benlitterer/podcastData/pro...,https://feeds.buzzsprout.com/783020.rss,1466,1476,Lee Walker,PERSON,Simon Shapiro,Award winning Australian singer-songwriter Sim...,m Way to Nowhere which I made with my great fr...
0,/shared/3/projects/benlitterer/podcastData/pro...,https://feeds.buzzsprout.com/783020.rss,2010,2024,Lindsey Rhimes,PERSON,Simon Shapiro,Award winning Australian singer-songwriter Sim...,favourite bass parts that I've ever recorded a...
1,/shared/3/projects/benlitterer/podcastData/pro...,https://feeds.buzzsprout.com/783020.rss,12,25,Simon Shapiro,PERSON,Simon Shapiro,Award winning Australian singer-songwriter Sim...,I'm Simon Shapiro and this is Sing Out Speak O...


In [248]:
entDf = pd.merge(snippetDf, entDf, on=["rssUrl", "ent"])  

In [226]:
def cleanAuthors(inStr): 
    #clean up spacing discrepancies
    inStr = " ".join(inStr.split())
    return inStr

def getAuthorMatches(inRow): 
    authors = inRow["itunesAuthor"].split("&")
    authors = [cleanAuthors(author).lower() for author in authors]
    entGuess = cleanAuthors(inRow["ent"]).lower()
    return entGuess in authors 
        

In [232]:
#figure out when we have an exact match between an entity in NER and the author field 
#helpful for quickly finding ground truth 
entDf["itunesMatch"] = entDf.apply(getAuthorMatches, axis=1) 

In [238]:
entDf.columns

Index(['potentialOutPath', 'rssUrl', 'entStart', 'entEnd', 'ent',
       'itunesAuthor', 'podDescription', 'entContext', 'podMentions',
       'totalPodMentions', 'totalEntMentions', 'uniquePodMentions',
       'podEpisodes', 'DescMentions', 'itunesMatch'],
      dtype='object')

In [242]:
#now add some extra features which will be useful for our logistic regression 
entDf["fracOfPodEntities"] = entDf["podMentions"] / entDf["totalPodMentions"]
entDf["fracOfAllMentions"] = entDf["podMentions"] / entDf["totalEntMentions"]
entDf["propOfEpisodeMentions"] = entDf["uniquePodMentions"] / entDf["podEpisodes"] 

In [244]:
#entDf[entDf["itunesMatch"] == True].sample(10)

In [245]:
entDf.to_json("/shared/3/projects/benlitterer/podcastData/hostIdentification/logisticRegression/allEntityFeatures.jsonl", orient="records", lines=True) 