In [2]:
import pandas as pd
from tqdm import tqdm 

In [4]:
# Named entities (one row per mention) read from the NER output file;
# only mentions tagged as PERSON are kept.
entDf = pd.read_csv(
    "/shared/3/projects/benlitterer/podcastData/NER/podDescriptions/floydMonthNEs.tsv",
    sep="\t",
    names=["potentialOutPath", "ent", "start", "end", "type"],
).loc[lambda ents: ents["type"] == "PERSON"]

In [9]:
# Prepend the prosodyMerged root so the entity paths use the same form as
# the paths built for the metadata frame (this is the merge key later on).
entDf = entDf.assign(
    potentialOutPath=lambda ents: "/shared/3/projects/benlitterer/podcastData/prosodyMerged/floydMonth"
    + ents["potentialOutPath"]
)

In [5]:
# Episode metadata (includes the itunesAuthor field). The path column gets
# the same prosodyMerged root prefix that is applied to the entity frame.
metaDf = pd.read_csv(
    "/shared/3/projects/benlitterer/podcastData/processed/floydMonth/floydMonthEn.csv"
).assign(
    potentialOutPath=lambda meta: "/shared/3/projects/benlitterer/podcastData/prosodyMerged/floydMonth"
    + meta["potentialOutPath"]
)

  metaDf = pd.read_csv("/shared/3/projects/benlitterer/podcastData/processed/floydMonth/floydMonthEn.csv")


In [18]:
# Join every PERSON entity with the description/author metadata of its
# episode. The inner join keeps only paths present in both frames, and
# dropna() then discards any row with a missing value in any column.
df = (
    entDf
    .merge(
        metaDf[["podDescription", "itunesAuthor", "rssUrl", "potentialOutPath"]],
        how="inner",
        on="potentialOutPath",
    )
    .dropna()
)

In [21]:
# Order entities by start offset within each episode (potentialOutPath),
# then collapse to one row per episode where every column holds a list
# aligned with that ordering. The snippet boundaries are derived from
# these per-episode lists.
df = (
    df
    .sort_values(by=["potentialOutPath", "start"])
    .groupby("potentialOutPath")
    .agg(list)
)

In [22]:
# Number of space-separated tokens of context kept on each side of an entity.
BEFORE_BUFF = 10 
AFTER_BUFF = 10


def _bufferBounds(text, entStart, entEnd):
    """Return (buffStart, buffEnd) character positions around an entity.

    The positions lie up to BEFORE_BUFF / AFTER_BUFF space-separated tokens
    before text[entStart:entEnd] and after it. buffStart can be -1 and
    buffEnd can exceed len(text); the caller clamps both.
    """
    beforeWords = text[:entStart].split(" ")
    # [-n:] returns the whole list when fewer than n tokens are available
    buffStart = entStart - len(" ".join(beforeWords[-BEFORE_BUFF:])) - 1

    afterWords = text[entEnd:].split(" ")
    buffEnd = entEnd + len(" ".join(afterWords[:AFTER_BUFF])) + 1
    return buffStart, buffEnd


# Build one context snippet per entity. entSnippets ends up with one list per
# row of df (per episode), aligned with that row's "ent"/"start"/"end" lists.
# "start"/"end" are used as character offsets into the description text.
entSnippets = []
for _, row in tqdm(df.iterrows(), total=len(df)): 
    starts = row["start"]
    ends = row["end"]
    prevEntEnd = 0 
    currEntSnippets = []
    for j in range(len(starts)): 
        snippet = row["podDescription"][j]
        entStart = starts[j]
        entEnd = ends[j]

        buffStart, buffEnd = _bufferBounds(snippet, entStart, entEnd)

        # Do not reach back into the previous entity. prevEntEnd >= 0 also
        # absorbs a buffStart of -1. The min() keeps the window from starting
        # after the entity when the previous span overlaps or duplicates it.
        snippetLeft = min(max(prevEntEnd, buffStart), entStart)

        # If a following entity starts inside the after-buffer, stop there.
        if j + 1 < len(starts):
            snippetRight = min(starts[j + 1], buffEnd)
        else: 
            snippetRight = buffEnd
        # Never cut before the end of the current entity: when the next span
        # coincides with or overlaps this one, the unclamped window omitted
        # the entity or was empty.
        snippetRight = max(snippetRight, entEnd)

        currEntSnippets.append(snippet[snippetLeft:snippetRight])
        prevEntEnd = entEnd 
    entSnippets.append(currEntSnippets) 

353245it [01:02, 5654.86it/s]


In [23]:
# Attach the per-episode snippet lists (one list per row of df, same order).
df = df.assign(entSnippets=entSnippets)

In [24]:
# Every column holds equally long lists per row, so exploding all columns
# at once returns to one row per entity; the index value is repeated for
# each entity of the same episode.
df = df.explode(df.columns.tolist())

In [76]:
# Spot check: draw 20 random rows and print the entity and its snippet for
# the row at position ix within that sample.
pd.set_option("display.max_colwidth", 25)
currSample = df.sample(20)
ix = 19
print(currSample["ent"].iloc[ix])
print(currSample["entSnippets"].iloc[ix])

Laura
 out on the latest episodes. You can also buy Laura a Chai Latte to express your thanks at https://www.buymeacoffee.com/soulcoach 


In [77]:
#now we want to determine whether an entity is a ground truth label or not 
#we can do so by checking if it is contained in the itunes author field 
def entIsAuthor(inRow):
    """Return True when the row's entity text occurs in its itunesAuthor text.

    The comparison is a case-insensitive substring test.

    Parameters
    ----------
    inRow : mapping-like (e.g. a DataFrame row)
        Must provide the keys "ent" and "itunesAuthor".

    Returns
    -------
    bool
        False when either value is not a string (NaN, None, numbers, ...),
        otherwise whether ent.lower() is contained in itunesAuthor.lower().
    """
    ent = inRow["ent"]
    auth = inRow["itunesAuthor"]

    # Missing values arrive as float NaN or None; neither has .lower(), so
    # anything that is not a string counts as "no label" instead of raising.
    if not isinstance(ent, str) or not isinstance(auth, str):
        return False

    return ent.lower() in auth.lower()

# Label each entity row via entIsAuthor rather than an inline lambda that
# repeats the substring test without the missing-value guard.
df["entInAuthor"] = df.apply(entIsAuthor, axis=1)

In [81]:
# (rows, columns) of df at this point; rows are individual entity mentions.
df.shape

(994159, 9)

In [83]:
# Number of distinct index values in df. The index was set by the groupby
# on potentialOutPath, so this counts distinct episode paths.
len(df.index.unique())

353245

In [86]:
# Rows whose entity appears in the itunesAuthor field, followed by the
# number of distinct index values that have at least one such row.
posClass = df.loc[df["entInAuthor"].eq(True)]
len(posClass.index.unique())

148423

In [88]:
#we can get decent ground truths for a bit under half of the podcasts for which we have entities 
#the counts are computed from the frames so they cannot drift from the data
TOTAL_PODCASTS = 750000  # presumably the size of the full metadata set -- TODO confirm
nWithAuthorEnt = len(posClass.index.unique())
nWithAnyEnt = len(df.index.unique())
print(f"frac. of podcasts for which entity exists {nWithAuthorEnt / nWithAnyEnt}") 
print(f"frac. of podcasts for which we have metadata {nWithAuthorEnt / TOTAL_PODCASTS}")  

frac. of podcasts for which entity exists 0.4201701368738411
frac. of podcasts for which we have metadata 0.19789733333333334


In [90]:
# The full description text is no longer needed once the snippets exist.
df = df.drop("podDescription", axis="columns")

In [96]:
# Length of every snippet in whitespace-separated tokens.
df["snippetLen"] = [len(snippetText.split()) for snippetText in df["entSnippets"]]

In [97]:
# (rows, columns) after dropping podDescription and adding snippetLen.
df.shape

(994159, 9)

In [100]:
# Keep only rows whose snippet contains at least one token.
# TODO: find out why empty snippets are produced at all. Presumably entity
# spans that coincide or overlap collapse the snippet window -- confirm.
df = df.loc[df["snippetLen"].gt(0)]

In [101]:
# Balance the classes: keep every positive row and draw an equally sized
# random subset of the negatives.
SAMPLE_SEED = 42  # fixed seed so the saved training set is reproducible
posClass = df[df["entInAuthor"] == True] 
negPool = df[df["entInAuthor"] == False]
# sample() raises when asked for more rows than exist, so cap at the pool size
negClass = negPool.sample(n=min(len(posClass), len(negPool)), random_state=SAMPLE_SEED) 

In [102]:
# Stack positives on top of the sampled negatives (row-wise concatenation).
trainData = pd.concat((posClass, negClass))

In [103]:
# Persist the balanced set as TSV; the index is written as the first column.
trainData.to_csv(
    path_or_buf="/shared/3/projects/benlitterer/podcastData/hostIdentification/itunesGTsubsetDescriptions.tsv",
    sep="\t",
)