In [6]:
import pandas as pd
import numpy as np
from collections import Counter

In [4]:
# Input predictions are stored as JSON Lines: one record per line.
inPath = "/shared/3/projects/benlitterer/podcastData/hostIdentification/hostGuestPredictions/10000LongPredictions.json"
df = pd.read_json(path_or_buf=inPath, lines=True, orient="records")

In [7]:
def getMode(inList, neitherLabel=2):
    """Return the majority-vote label for a list of predictions.

    Parameters
    ----------
    inList : sequence
        Predicted labels collected for one entity.
    neitherLabel : hashable, default 2
        Label returned when there is no unique winner (the "neither"
        class), i.e. on a split decision or when ``inList`` is empty.

    Returns
    -------
    The single most frequent label, or ``neitherLabel`` when two or more
    labels share the highest count.
    """
    #no votes at all: nothing to pick, so fall back to neither
    if len(inList) == 0:
        return neitherLabel

    #a single prediction is trivially the mode
    if len(inList) == 1:
        return inList[0]

    #only the top two counts are needed to detect a tie for first place
    ranked = Counter(inList).most_common(2)
    modeVal, modeCount = ranked[0]

    #we default to neither if we have a split decision
    #(this also covers ties such as [0, 0, 1, 1], not just all-distinct votes)
    if len(ranked) > 1 and ranked[1][1] == modeCount:
        return neitherLabel

    return modeVal

#mean-pool the probability rows column by column, then
#predict the class whose pooled probability is largest
def getConfidenceAggregation(inList):
    """Return the column index with the highest mean across the rows of inList."""
    pooled = np.array(inList).mean(axis=0)
    return np.argmax(pooled)

#input is a 2d array of shape n x 3 (one probability row per prediction)
#output is the class of the single largest probability found in any row
def getMostConfident(inList):
    """Return the column index of the largest value anywhere in inList.

    Falls back to 2 when no value is strictly greater than 0 (including
    when inList is empty). On exact ties the first value encountered in
    row-major order wins.
    """
    #the column number doubles as the prediction (0, 1, or 2)
    bestLabel, bestProb = 2, 0

    #walk every cell in row-major order, remembering which column it came from
    cells = ((label, prob) for probs in inList for label, prob in enumerate(probs))
    for label, prob in cells:
        #strict comparison keeps the earliest occurrence of the maximum
        if prob > bestProb:
            bestLabel, bestProb = label, prob

    return bestLabel

#collect every prediction / probability row for each (potentialOutPath, ent)
#pair into a list, then derive one label per aggregation strategy
aggDf = (
    df[["potentialOutPath", "ent", "pred", "prob"]]
    .groupby(["potentialOutPath", "ent"])
    .agg(list)
    .assign(
        modalPred=lambda grouped: grouped["pred"].apply(getMode),
        confPred=lambda grouped: grouped["prob"].apply(getMostConfident),
        meanAggPred=lambda grouped: grouped["prob"].apply(getConfidenceAggregation),
    )
)


In [8]:
#transpose so that each row of aggArr holds one strategy's predictions
aggArr = aggDf[["modalPred", "confPred", "meanAggPred"]].transpose().values.tolist()

In [9]:
#each row of aggArr is treated as one variable (modalPred, confPred, meanAggPred)
np.corrcoef(aggArr, rowvar=True)

array([[1.        , 0.96811818, 0.9708293 ],
       [0.96811818, 1.        , 0.99684888],
       [0.9708293 , 0.99684888, 1.        ]])

In [13]:
#aggDf["numPreds"] = aggDf["pred"].apply(len)

In [14]:
#roughly 1/6th of entities have 2 or more predictions to aggregate
#aggDf["numPreds"].value_counts()

numPreds
1    6917
2    1013
3     182
4      47
5      18
6       7
7       4
9       1
8       1
Name: count, dtype: int64

In [11]:
outPath = "/shared/3/projects/benlitterer/podcastData/hostIdentification/hostGuestPredictions/10000AggPredictions.json"
#orient="records" does not write the index, and after the groupby the keys
#(potentialOutPath, ent) live in the index; move them back into columns so
#every output record identifies the entity its predictions belong to
aggDf.reset_index().to_json(outPath, orient="records", lines=True)