## Data Preparation

In [333]:
import json
import pandas as pd
import nltk
from collections import Counter
import operator
import os
from os import listdir
from os.path import isfile, join
from tqdm import tqdm

In [3]:
# load podcast metadata
metadata_df = pd.read_csv("podcast_data_no_audio/metadata/metadata.tsv",sep='\t')
metadata_df.head(1)

Unnamed: 0,show_uri,show_name,show_description,publisher,language,rss_link,episode_uri,episode_name,episode_description,duration,show_filename_prefix,episode_filename_prefix
0,spotify:show:2NYtxEZyYelR6RMKmjfPLB,Kream in your Koffee,A 20-something blunt female takes on the world...,Katie Houle,['en'],https://anchor.fm/s/11b84b68/podcast/rss,spotify:episode:000A9sRBYdVh66csG2qEdj,1: It’s Christmas Time!,On the first ever episode of Kream in your Kof...,12.700133,show_2NYtxEZyYelR6RMKmjfPLB,000A9sRBYdVh66csG2qEdj


In [291]:
# get list of all paths to the json-files of english episodes given subset number (bart: 0 , juno: 1, joris: 2)

def get_paths_for_en_episodes(subset_number):
    """
    Function returns list of all paths to the json-files of english 
    episodes given subset number (bart: 0 , juno: 1, joris: 2)
    
    """

    path1 = 'podcast_data_no_audio/podcasts-transcripts/' + str(subset_number)

    folders = listdir(path1)

    if '.DS_Store' in folders:
        folders.remove('.DS_Store')

    podcast_episodes_paths = []

    for letter_or_number in tqdm(folders):    
        path2 = path1 + '/' + letter_or_number


        for show_uri in listdir(path2):
            path3 = path2 + '/' + show_uri

            # select english shows only
            show_metadata = metadata_df.loc[metadata_df['show_filename_prefix'] == show_uri]

            if len(show_metadata['language'].unique()) > 0:
                if 'en' in show_metadata['language'].unique()[0]:
                    for episode_uri in listdir(path3):
                        path4 = path3 + '/' + episode_uri

                        if '.json' in path4:
                            podcast_episodes_paths.append(path4)

                
        
    return len(podcast_episodes_paths), podcast_episodes_paths

In [292]:
get_paths_for_en_episodes(0)

100%|██████████| 36/36 [00:19<00:00,  2.01it/s]


(13169,
 ['podcast_data_no_audio/podcasts-transcripts/0/R/show_0rEzHBbqtuqgP4zEmKzqIH/1vQEPaGpic4rvaVGf3B9P6.json',
  'podcast_data_no_audio/podcasts-transcripts/0/R/show_0RQpGNWVQtBBwKP0TEPPyM/1RvpxteWceRd94M0jgY6kv.json',
  'podcast_data_no_audio/podcasts-transcripts/0/R/show_0RQpGNWVQtBBwKP0TEPPyM/34qeqrulTtcZTkECBAdIPy.json',
  'podcast_data_no_audio/podcasts-transcripts/0/R/show_0R6EeMT7ViHV1rCqNw9FNg/6Y077pNlcA4bIt311QV198.json',
  'podcast_data_no_audio/podcasts-transcripts/0/R/show_0RFnIxovd7WWgpLXzzXrKF/4lgBQcROehM4nmmsOcRcOV.json',
  'podcast_data_no_audio/podcasts-transcripts/0/R/show_0RFnIxovd7WWgpLXzzXrKF/7wSNsD65ducgtyMcblEOuP.json',
  'podcast_data_no_audio/podcasts-transcripts/0/R/show_0RFnIxovd7WWgpLXzzXrKF/6fSvpbHfRKD8GaimKhlqb8.json',
  'podcast_data_no_audio/podcasts-transcripts/0/R/show_0RFnIxovd7WWgpLXzzXrKF/0qSVjYk4YRh9nl61KRjLKj.json',
  'podcast_data_no_audio/podcasts-transcripts/0/R/show_0RFnIxovd7WWgpLXzzXrKF/1hnWMZXjCPLBA0gaAI1AZU.json',
  'podcast_data_no_a

In [293]:
## Belangrijke functie voor representatie van een podcast episode ##

def dialogue_json_to_pandas(json_path):
    """
    This function converts a podcast .json transcript into a 
    pandas dataframe with speaker tags, utterance text and open labels
    
    """
    
    with open(json_path) as f:
        data = json.load(f)

    # get transcript parts from json file, remove empty parts
    transcript_parts = []
    for utt in data['results']:
        try:
            trans = utt['alternatives'][0]['transcript']
        except KeyError:
            trans = 0

        if trans != 0:
            transcript_parts.append(utt)
    

    # create list of sentences from dialogue
    sentences = []
    for index, utterance in enumerate(transcript_parts):

        # get text of utterance
        utterance_text = utterance['alternatives'][0]['transcript']
        
        # get sentences from text to split based on speakerTag
        utterance_sentences = nltk.sent_tokenize(utterance_text)
        for sent in utterance_sentences:
            sent = sent.split(" ")
            if '' in sent:
                sent.remove('')
            sentences.append(sent)
                
    
    # get words with tags from transcript file
    words_with_tags = data['results'][-1]['alternatives'][0]['words']
    
    
    # assign speakerTag to each sentence
    # also fix mistakes when speakerTag switches to other speaker
    # in the middle of a sentence
    sentences_with_tags = []
    
    word_idx = 0
    for index, sentence in enumerate(sentences):
        sent_with_tags = []
        for word in sentence:
            sent_with_tags.append((word, words_with_tags[word_idx]['speakerTag']))
            word_idx += 1
        
        c = Counter(elem[1] for elem in sent_with_tags)
        
        sent_speakerTag = max(c.items(), key=operator.itemgetter(1))[0]
        
        
        sentences_with_tags.append((' '.join(sentence), sent_speakerTag))
        
        
    # merge sentences with same consecutive tags
    utterances_texts = []
    utterances_tags = []
    merged_sents = []
    for index, tagged_sent in enumerate(sentences_with_tags):

        
        # set initial value for tagged_sent
        if index == 0:
            curr_tag = tagged_sent[1]
        
        # speaker switch
        if curr_tag != tagged_sent[1] and index > 0:
             
            utterance_tag = merged_sents[0][1]
            utterance_text = ' '.join([sent[0] for sent in merged_sents])

            utterances_texts.append(utterance_text)
            utterances_tags.append(utterance_tag)
            merged_sents = []

        curr_tag = tagged_sent[1]
        merged_sents.append(tagged_sent)
        
        if index == len(sentences_with_tags)-1:
            utterance_tag = merged_sents[0][1]
            utterance_text = ' '.join([sent[0] for sent in merged_sents])

            utterances_texts.append(utterance_text)
            utterances_tags.append(utterance_tag)
            
            
   
    # make utterances and tags are the same shape
    if len(utterances_texts) == len(utterances_tags):
        
        # create pandas dataframe
        dialogue_df = pd.DataFrame(columns=['speaker_tag', 'text', 'sentiment_score'])

        # fill dataframe
        for i, text in enumerate(utterances_texts):
                        
            dialogue_df.loc[i] = [utterances_tags[i]] + [text] + [222]

    
    return dialogue_df

In [287]:
dialogue_json_to_pandas(zero[2])

Unnamed: 0,speaker_tag,text,sentiment_score
0,3,Hello and welcome to the sixth episode of the ...,
1,1,It's going very good David. How are you?,
2,3,I'm brilliant. Thank you. So you must be trave...,
3,1,"No at the moment. I'm in Budapest, but you're ...",
4,3,How's that you're meeting with your clients? Y...,
5,1,"Yes, we have meetings with the client. We have...",
6,3,"You work in a very modern technology company, ...",
7,1,Yeah. It's not like the usual path as in terms...,
8,3,So this is how I found this opportunity and I ...,
9,1,Yes.,


In [None]:
# sample 10 random podcast dialogues
import random

# eps_to_be_labeled = random.sample(podcast_episodes_paths, 25)

# print(eps_to_be_labeled)

In [343]:
# hardcoded sampled results from previous cell
zero = ['podcast_data_no_audio/podcasts-transcripts/2/H/show_2hg2vvqunks00U9f4qUOC3/2qk9W8qwBuk0LCKqRFpH1r.json', 'podcast_data_no_audio/podcasts-transcripts/2/J/show_2jHZGbtwbEEFc7qE55mhUK/1SRGP3FESnnZai2udszkfn.json', 'podcast_data_no_audio/podcasts-transcripts/2/S/show_2SdLm6vyD0QNxkY4hI6s2I/2CzjVyFj7XFglvmRa1iucZ.json', 'podcast_data_no_audio/podcasts-transcripts/2/E/show_2e6uMlVjMtBhL8i0x64JKM/40tos1pWuH7DPkwWvCfjJJ.json', 'podcast_data_no_audio/podcasts-transcripts/2/H/show_2hZEAiWSgzlSNiq51esZ01/40l5bpwt3dG2zIP8V5RlFa.json', 'podcast_data_no_audio/podcasts-transcripts/2/O/show_2OfLkbr9RZOYqOopSgvvKw/6Xz0Ux4TXEONvSC1DJJWcK.json', 'podcast_data_no_audio/podcasts-transcripts/2/T/show_2tgYzl2R49gtGAwBZB6uBY/6NPj3lOojprCcgLCwYrbAW.json', 'podcast_data_no_audio/podcasts-transcripts/2/D/show_2dY2l2v95zz9HTlYvDSAdA/5M2OBQ28epJzCK7jrp6wXh.json', 'podcast_data_no_audio/podcasts-transcripts/2/Q/show_2qGq4769JdhDJjjm2OxgdK/4LlBO1sknL3Dhxbe2TdETZ.json', 'podcast_data_no_audio/podcasts-transcripts/2/O/show_2OVM4aQOEgo8uvuUcqJSei/3W8hTv3AZEqNyfKmXeO9TB.json', 'podcast_data_no_audio/podcasts-transcripts/2/1/show_21UuVYwAChI6cmIHsjSvEI/6mBjhK9KtMuUG0s9gPKfF0.json', 'podcast_data_no_audio/podcasts-transcripts/2/D/show_2Dg4vcTJNeKC9XaoZP41eo/57t1O5EO9Yj4YybNVcF8JK.json', 'podcast_data_no_audio/podcasts-transcripts/2/1/show_21ASCcEXgUlbFSmoqjroZm/271HzioAyAVVnw8Dvg7YLZ.json', 'podcast_data_no_audio/podcasts-transcripts/2/F/show_2ffrIgrl1RHH31bCPjUoHY/4u81O6LMUepisJ60XT0MyN.json', 'podcast_data_no_audio/podcasts-transcripts/2/O/show_2ojPEbgnYzPwY3VnCzsJ3O/5xnLfGruUoLNThFMGt6sO6.json', 'podcast_data_no_audio/podcasts-transcripts/2/X/show_2xfvMWMHveZLDSXAzW1XLA/26TEr59yoiYVlYHmviSbRe.json', 'podcast_data_no_audio/podcasts-transcripts/2/P/show_2pLmUXCLpZKNPGWdeWYljO/434gssKeXoc0UtdULnS8IR.json', 'podcast_data_no_audio/podcasts-transcripts/2/O/show_2OIRaDoDPKx77wz1945ihy/4vjwlEtMWZ1OSgoB2ZLxAm.json', 'podcast_data_no_audio/podcasts-transcripts/2/4/show_24uMTkmK7t6muj1DVGh1TY/3O76RjHDtLsp3h079OpAII.json', 'podcast_data_no_audio/podcasts-transcripts/2/3/show_23avhq1Xtlxb0Cs2cHGfbo/7zaMIq5zfSKMYrqNXEQaqy.json', 'podcast_data_no_audio/podcasts-transcripts/2/V/show_2VAf90gyHoLUWjgdiKVtm9/36oNQGvH8ofMxc9KKfwqAc.json', 'podcast_data_no_audio/podcasts-transcripts/2/J/show_2jnh0eW8ZMDdfFehKH5Wh0/3ClkdaFYGJlqkW1rCUlIBm.json', 'podcast_data_no_audio/podcasts-transcripts/2/Q/show_2qrukKr6EMxUJJLSLNADS0/0gIBv5NQtIdgJic4NzU1jr.json', 'podcast_data_no_audio/podcasts-transcripts/2/8/show_287pHl4IAzcUQbjKYjaVJc/2NNlWt3OjmS8pl26nJWTHm.json', 'podcast_data_no_audio/podcasts-transcripts/2/W/show_2wXJp6I575NASrepx3PtLr/20zGd9l0bGv155EUN8Y0Bg.json']
one = ['podcast_data_no_audio/podcasts-transcripts/2/6/show_26VrdARIwcUfOROjvjx7yS/4DS5eUnvwZPREo5xqfz2vT.json', 'podcast_data_no_audio/podcasts-transcripts/2/U/show_2uLBfGb9Q8Rfp2ZFZeACQh/0vCc53XFpC2sdSDi66aGaf.json', 'podcast_data_no_audio/podcasts-transcripts/2/A/show_2arnknbdyFPwKDWYNsqdZh/3KI3O0wtxzMRZHBdXkXQws.json', 'podcast_data_no_audio/podcasts-transcripts/2/T/show_2ty8gvAnvYP31X8TUrFwoj/0L3wIQ6bK5c6ncC28k8CUc.json', 'podcast_data_no_audio/podcasts-transcripts/2/M/show_2M81RFGI2Smj7rXb2CrEOi/1P1mkiPg1hbh8ZgNMjc4vm.json', 'podcast_data_no_audio/podcasts-transcripts/2/2/show_226e6FuXejgLy4RKvy8LPV/3HsmPJu2y27NzHvKp8eme2.json', 'podcast_data_no_audio/podcasts-transcripts/2/0/show_20EHdsnb0hsiyKttM8IphJ/2j51Fuehs28Tkw35UW6gQN.json', 'podcast_data_no_audio/podcasts-transcripts/2/H/show_2hXGSpQc5I2jBHWAthRiLk/4ythGnUFhq4p8e7qYBczuE.json', 'podcast_data_no_audio/podcasts-transcripts/2/K/show_2keDw7cUZ1diyWvKSVKRG7/3l7jSYdxXPEsqVUPwCfLX1.json', 'podcast_data_no_audio/podcasts-transcripts/2/G/show_2gYqd8FU6rvfKDwKveS75n/1KqvErtjh8JBYzAsirNnmJ.json', 'podcast_data_no_audio/podcasts-transcripts/2/4/show_24sk7Itw6gygKQBxxul0gN/5RYRkqoVpUSABiHpmPgS3t.json', 'podcast_data_no_audio/podcasts-transcripts/2/E/show_2EKLcRbxTurq4uEQbvvFUl/1nyWp3I8gRbJWY7YzFw4AK.json', 'podcast_data_no_audio/podcasts-transcripts/2/G/show_2gTyfeofJqZ5AD91QtZ6Zl/7pIXjTYHZ6388g2TO7moLh.json', 'podcast_data_no_audio/podcasts-transcripts/2/1/show_21ASCcEXgUlbFSmoqjroZm/4DFCzpDjakmh6mCe9WbeJc.json', 'podcast_data_no_audio/podcasts-transcripts/2/K/show_2KqaP5uKhR6oTYsRKIX7gS/620T9kUq72aOIpp7mmNrsY.json', 'podcast_data_no_audio/podcasts-transcripts/2/F/show_2FwUZk5JRc5ACGBBHhoEmD/0DgqdwCPFZMPdvfv9Zqd5k.json', 'podcast_data_no_audio/podcasts-transcripts/2/G/show_2GdUJ58o7BFc031jaAPCKp/0SwlegOqdcs3KaGHZf7XuO.json', 'podcast_data_no_audio/podcasts-transcripts/2/O/show_2oiqxGCkxiC55ZXxXHP8KM/1cTcQWVrZ609svNJo0tstr.json', 'podcast_data_no_audio/podcasts-transcripts/2/F/show_2FCf4j5jUfGqB4LGMlxtUV/1A1aP40kBs9A7ynRPDuJqr.json', 'podcast_data_no_audio/podcasts-transcripts/2/K/show_2KqaP5uKhR6oTYsRKIX7gS/0QyYgwsDtYGsQCJtb92yHo.json', 'podcast_data_no_audio/podcasts-transcripts/2/L/show_2L9QhUwd9072wPGkQJqpLJ/0pb2vvJcKC67W8y3KdsUnq.json', 'podcast_data_no_audio/podcasts-transcripts/2/J/show_2Jjf9Qa10Xwl6hj1mz5enA/6JKbodBOu8QCqBjCvpFeOw.json']
two = ['podcast_data_no_audio/podcasts-transcripts/2/G/show_2gs5645b3F1d2End3KXBp4/0iSMYT4A1ULCUCY8STewvY.json', 'podcast_data_no_audio/podcasts-transcripts/2/9/show_29Qk08pTL6B9LPRejivbS6/4VvILwzXpbTcWt9YMoyFUZ.json', 'podcast_data_no_audio/podcasts-transcripts/2/I/show_2iHte4DYbwL2mlhgysmb7l/11ybYZlrlm3lmkWX1OXl3p.json', 'podcast_data_no_audio/podcasts-transcripts/2/T/show_2tetA7Ub1xxSLm0oHA1gmV/6VuAZ7nxVzGDcK7Ie8jXyJ.json', 'podcast_data_no_audio/podcasts-transcripts/2/Q/show_2qY4nOu5zaZ9CMTlB6XBj5/6y7bLRK9VKOJiDhZaZM9Sy.json', 'podcast_data_no_audio/podcasts-transcripts/2/I/show_2iaGIA0ODxmgHSmKXYopRX/0GfpjRHpEhQK1bwWkoNbza.json', 'podcast_data_no_audio/podcasts-transcripts/2/1/show_21ASCcEXgUlbFSmoqjroZm/6D59qfOpamnVRDElnuTocj.json', 'podcast_data_no_audio/podcasts-transcripts/2/W/show_2W1Hy8xOcSi0b8ppdQ52qU/21iECAkP64WkYOc9kIusyI.json', 'podcast_data_no_audio/podcasts-transcripts/2/8/show_28KKqFWNBw6kk5aBFu2viN/6WphEO0vKaCcJeVsYa7Kng.json', 'podcast_data_no_audio/podcasts-transcripts/2/K/show_2kl1GlTTgSwgobmxgGEaBX/4RO6YTUkTTryRb3uNXL4l0.json', 'podcast_data_no_audio/podcasts-transcripts/2/0/show_20zWNMtAU8S3rir62g1r0Q/1T3FUP68I7oYVnuOyM44uE.json', 'podcast_data_no_audio/podcasts-transcripts/2/M/show_2M8KJDwxT0zzwXEJhDZYCL/06mLFp9wQFnIQd79Qjg4jq.json', 'podcast_data_no_audio/podcasts-transcripts/2/C/show_2C0AgUOt4eCULjFjb3mynN/4jaXWbdotGutHNxKKjMOWs.json', 'podcast_data_no_audio/podcasts-transcripts/2/H/show_2hq8BvOX4DocQpqkm20XLu/6kz63TvGtLKe5dI9qr8sqm.json', 'podcast_data_no_audio/podcasts-transcripts/2/A/show_2AkW5V4H6xAh8IXJU0jHUm/2PHi40upTwPGW5btJugKuK.json', 'podcast_data_no_audio/podcasts-transcripts/2/H/show_2HIFmqNqJkR2SADAcG4Fpq/6EUGCLZqWARW8TcNzezI0e.json', 'podcast_data_no_audio/podcasts-transcripts/2/E/show_2eXdry3liXk0Z0BfTCVKff/4TssEgbtKfFBhH2NVpkhD2.json', 'podcast_data_no_audio/podcasts-transcripts/2/Q/show_2qy2KehR0K2FGZqsEsU2CY/59S06bau9DJEEx1f1dC4y7.json', 'podcast_data_no_audio/podcasts-transcripts/2/O/show_2OVM4aQOEgo8uvuUcqJSei/7o7HaMZf49jS2yyXYUdSCc.json', 'podcast_data_no_audio/podcasts-transcripts/2/J/show_2jW0aO9MqYFndIcIAhFukV/2Tt47VISLTvm8gTyhFjyb8.json', 'podcast_data_no_audio/podcasts-transcripts/2/4/show_24aqN472kMGKAhdJbIK59L/7sNutOB7XH6jihysx0tfiT.json', 'podcast_data_no_audio/podcasts-transcripts/2/A/show_2AsvwIbKhe8yV3ePi8nGY0/0ULidGb0REL65hEkd7zwLo.json', 'podcast_data_no_audio/podcasts-transcripts/2/1/show_21ASCcEXgUlbFSmoqjroZm/6FSUZ1PWVjCa7aHLifMPGa.json', 'podcast_data_no_audio/podcasts-transcripts/2/1/show_21ASCcEXgUlbFSmoqjroZm/2aZy5qlRmNbPDXIiV1hxQJ.json', 'podcast_data_no_audio/podcasts-transcripts/2/9/show_29DGC8r0eQZWtzPs10jASp/5qHQFfGcYnLWUPCOQt5PfM.json']

unlabeled_json_paths = [zero, one, two]
# Prepare csv dataset (per episode) for labeling

dialogue_number = 0
for index, part in tqdm(enumerate(unlabeled_json_paths)):
    for path in part:
        df = dialogue_json_to_pandas(path)
        
        if index == 0:
            df.to_csv('labeled_data/bart/dialogue_{}.csv'.format(dialogue_number), sep='\t', encoding='utf-8', index=False)
        elif index == 1:
            df.to_csv('labeled_data/juno/dialogue_{}.csv'.format(dialogue_number), sep='\t', encoding='utf-8', index=False)
        elif index == 2:
            df.to_csv('labeled_data/joris/dialogue_{}.csv'.format(dialogue_number), sep='\t', encoding='utf-8', index=False)


        dialogue_number += 1
   
    

3it [00:11,  4.04s/it]
