In [1]:
import pandas as pd
from collections import deque

In [3]:
inPath = "/shared/3/projects/benlitterer/podcastData/diarization/mayJune/mcdn.podbean.com/0e/httpsmcdn.podbean.commfweb0e8donliveshow_202005311600.mp3.rttm"
cols = ["dummy1", "dummy2", "dummy3", "start", "duration", "dummy4", "dummy5", "speakerNum", "dummy6", "dummy7"]
diarizeDf = pd.read_csv(inPath, sep=" ", names=cols)

mergedPath = "/shared/3/projects/benlitterer/podcastData/prosodyMerged/floydMonth/mcdn.podbean.com/0e/httpsmcdn.podbean.commfweb0e8donliveshow_202005311600.mp3MERGED"
mergedDf = pd.read_csv(mergedPath)

In [4]:
#whether the diarization chunk has passed and we need to no longer
#keep it in the queue
def hasPassed(tStart, tEnd, dStart, dEnd): 
    if tStart > dEnd: 
        return True
    return False 

def isOverlapping(tStart, tEnd, dStart, dEnd): 
    if tStart < dEnd and tEnd > dStart: 
        return True
    return False 

In [5]:
#get speaker segments as lists of tuples
diarizeDf["end"] = diarizeDf["start"] + diarizeDf["duration"]
dTuples = [(row["speakerNum"], row["start"], row["end"]) for i, row in diarizeDf.iterrows()]

In [6]:
#we want to merge these!
#NOTE: this works because the diarization is in order of start time
#once we hit a non-overlapping diarization segment, we know the rest of the segments 
#in the queue don't overlap
q = deque(dTuples) 
overlappingSegs = []
for i, row in mergedDf.iterrows(): 
    tStart = row["start"]
    tEnd = row["end"]

    #go through queue and keep removing while the speaker segment is no longer overlapping
    passed = True 

    #if there is more queue left and we have non-overlapping segments in it
    while len(q) > 0 and passed == True: 
        #get first item in queue, unpack tuple into variables 
        sNum, dStart, dEnd = q[0]
            
        passed = hasPassed(tStart, tEnd, dStart, dEnd)

        #if the current queue item no longer overlaps with this diarization segment, remove it 
        if passed == True: 
            q.popleft()

    #go through elements in queue that overlap with current word
    overlapping = True
    tupIter = 0 
    currentOverlap = []
    while tupIter < len(q) and overlapping == True: 
        #get speaker segment
        sNum, dStart, dEnd = q[tupIter]

        #check whether we are overlapping
        overlapping = isOverlapping(tStart, tEnd, dStart, dEnd)

        if overlapping: 
            currentOverlap.append(sNum)
        tupIter += 1
    
    #add the overlapping segments for the current word
    overlappingSegs.append(currentOverlap)


In [7]:
mergedDf["speakers"] = overlappingSegs

In [8]:
mergedDf.head() 

Unnamed: 0.1,Unnamed: 0,index,start,end,content,mfcc1_sma3,mfcc2_sma3,mfcc3_sma3,mfcc4_sma3,F0semitoneFrom27.5Hz_sma3nz,F1frequency_sma3nz,mfcc1_sma3Slope,mfcc2_sma3Slope,mfcc3_sma3Slope,mfcc4_sma3Slope,F0semitoneFrom27.5Hz_sma3nzSlope,F1frequency_sma3nzSlope,speakers
0,0,0,0.0,0.0,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,[]
1,1,1,0.0,0.21,[,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[]
2,2,2,0.21,0.42,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,[]
3,3,3,0.42,1.04,US,0.220951,0.077264,-0.000688,0.158025,0.0,28.934045,2.071412,0.72435,-0.006454,1.48148,0.0,271.256673,[]
4,4,4,1.04,1.26,IC,33.3587,1.734861,16.575285,6.717922,15.148576,777.832618,40.809847,14.434736,10.976565,10.033106,-0.510043,918.843818,[]


In [9]:
#from colorama import Fore
#foreColors = [Fore.RED, Fore.GREEN, Fore.YELLOW, Fore.BLUE, Fore.MAGENTA, Fore.CYAN]
colList = ['#e41a1c','#377eb8','#4daf4a','#984ea3','#ff7f00','#ffff33','#a65628','#f781bf']

In [11]:
uSpeakers = set([speakerList[0] for speakerList in mergedDf["speakers"] if len(speakerList) > 0])
numSpeakers = len(uSpeakers) 
colDict = dict(zip(uSpeakers, colList[:numSpeakers]))
colDict["NONE"] = "#000000"
colDict["MULT"] = '#999999'

In [12]:
mergedDf.columns

Index(['Unnamed: 0', 'index', 'start', 'end', 'content', 'mfcc1_sma3',
       'mfcc2_sma3', 'mfcc3_sma3', 'mfcc4_sma3', 'F0semitoneFrom27.5Hz_sma3nz',
       'F1frequency_sma3nz', 'mfcc1_sma3Slope', 'mfcc2_sma3Slope',
       'mfcc3_sma3Slope', 'mfcc4_sma3Slope',
       'F0semitoneFrom27.5Hz_sma3nzSlope', 'F1frequency_sma3nzSlope',
       'speakers'],
      dtype='object')

In [13]:
#highlight transcript colors 
pastSpeakList = "NONE"
currText = ""
allText = ""

for i, row in mergedDf.iterrows(): 
    word = row["content"]
    speakList = row["speakers"]

    """ 
    if len(speakers) > 0: 
        speakList = speakers[-1]
    else: 
    """
    if len(speakList) == 0: 
        speakList = ["NONE"]

    if speakList != pastSpeakList: 
        if len(pastSpeakList) > 1: 
            allText += f'<font color = "{colDict["MULT"]}">{currText}</font>'
        else:  
            allText += f'<font color = "{colDict[pastSpeakList[0]]}">{currText}</font>'

        currText = ""

    if word == word: 
        currText += word
    pastSpeakList = speakList

    


In [14]:
allText

'<font color = "#999999"></font><font color = "#000000"> [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUS</font><font color = "#984ea3">IC PLAYING] Ladies and gentlemen,</font><font color = "#999999"> get</font><font color = "#984ea3"> those dollars ready</font><font color = "#000000">.</font><font color = "#984ea3"> Comin

In [134]:
def pPrint(inStr): 
    outStr = ""
    for i, subStr in enumerate(inStr.split()): 
        if i % 20 == 0: 
            outStr += subStr + "\n"
        else: 
            outStr += subStr + " "
        
    print(outStr)

In [136]:
pPrint(allText)

[30m
[MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING]
[MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING]
[MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING]
[MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING]
[MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUSIC PLAYING] [MUS[35mIC PLAYING] Ladies and gentlemen, get those dollars ready[30m.[35m Coming
up and next to the stage on the other ground podcast, it's[30m your[35m host, passive J and Ryan. [MUS[30mIC PLAYING]
Good[34m afternoon and welcome to a very special edition of Other G