## Data Preparation

In [4]:
import json
import pandas as pd
import nltk
from collections import Counter
import operator
import os
from os import listdir
from os.path import isfile, join
from tqdm import tqdm

In [5]:
# Read the tab-separated podcast metadata table into a DataFrame;
# the functions below look up show languages in it.
metadata_df = pd.read_csv(
    "podcast_data_no_audio/metadata/metadata.tsv",
    sep='\t',
)

# Show the first row as a quick check of the available columns.
metadata_df.head(1)

Unnamed: 0,show_uri,show_name,show_description,publisher,language,rss_link,episode_uri,episode_name,episode_description,duration,show_filename_prefix,episode_filename_prefix
0,spotify:show:2NYtxEZyYelR6RMKmjfPLB,Kream in your Koffee,A 20-something blunt female takes on the world...,Katie Houle,['en'],https://anchor.fm/s/11b84b68/podcast/rss,spotify:episode:000A9sRBYdVh66csG2qEdj,1: It’s Christmas Time!,On the first ever episode of Kream in your Kof...,12.700133,show_2NYtxEZyYelR6RMKmjfPLB,000A9sRBYdVh66csG2qEdj


In [6]:
# get list of all paths to the json-files of english episodes given subset number (bart: 0 , juno: 1, joris: 2)

def get_paths_for_en_episodes(subset_number):
    """
    Function returns the number of, and the list of all paths to, the
    json-files of english episodes given subset number
    (bart: 0 , juno: 1, joris: 2)

    The folder 'podcast_data_no_audio/podcasts-transcripts/<subset_number>'
    is walked three levels deep (letter/number folder -> show folder ->
    episode file). A show counts as english when the 'language' value of
    its first row in the module-level ``metadata_df`` contains 'en'.

    Parameters
    ----------
    subset_number : int or str
        Name of the subset folder to scan.

    Returns
    -------
    tuple
        ``(number_of_paths, list_of_paths)``; the paths are built with '/'
        as separator, relative to the current working directory.
    """

    subset_path = 'podcast_data_no_audio/podcasts-transcripts/' + str(subset_number)

    # Language of the first metadata row of every show, collected once up
    # front instead of filtering the whole metadata table per show folder.
    first_rows = metadata_df.drop_duplicates(subset='show_filename_prefix')
    show_languages = dict(zip(first_rows['show_filename_prefix'], first_rows['language']))

    # Only real directories are descended into; this skips '.DS_Store' and
    # any other stray file that would make listdir() fail.
    folders = [
        entry for entry in listdir(subset_path)
        if os.path.isdir(subset_path + '/' + entry)
    ]

    podcast_episodes_paths = []

    for letter_or_number in tqdm(folders):
        letter_path = subset_path + '/' + letter_or_number

        for show_uri in listdir(letter_path):
            # select english shows only; shows without metadata (None) or
            # without a language value (NaN) are skipped
            language = show_languages.get(show_uri)
            if not isinstance(language, (str, list, tuple, set)) or 'en' not in language:
                continue

            show_path = letter_path + '/' + show_uri

            for episode_uri in listdir(show_path):
                # match on the file extension, not on '.json' appearing
                # anywhere in the path
                if episode_uri.endswith('.json'):
                    podcast_episodes_paths.append(show_path + '/' + episode_uri)

    return len(podcast_episodes_paths), podcast_episodes_paths

In [8]:
## Important function for the representation of a podcast episode ##

def dialogue_json_to_pandas(json_path):
    """
    This function converts a podcast .json transcript into a
    pandas dataframe with speaker tags, utterance text and open labels

    Steps:
      1. Collect the 'transcript' text of every entry in ``results``;
         entries without transcript text are skipped.
      2. Split each text into sentences (nltk) and each sentence into
         whitespace-separated tokens.
      3. Walk through the 'words' list of the last ``results`` entry in
         parallel with the tokens: the n-th token takes the 'speakerTag'
         of the n-th word. A sentence gets the tag that occurs most often
         among its tokens, which repairs speaker switches in the middle
         of a sentence.
      4. Merge consecutive sentences with the same tag into one utterance.

    Parameters
    ----------
    json_path : str
        Path to the transcript .json file.

    Returns
    -------
    pandas.DataFrame
        One row per utterance with the columns 'speaker_tag', 'text' and
        'sentiment_score'. 'sentiment_score' is filled with empty strings.
        The frame is empty when the file contains no transcript text.

    Raises
    ------
    IndexError
        If the sentences contain more tokens than the tagged word list
        has entries.
    """
    columns = ['speaker_tag', 'text', 'sentiment_score']

    # JSON transcripts are UTF-8; do not depend on the platform default.
    with open(json_path, encoding='utf-8') as f:
        data = json.load(f)

    results = data['results']

    # create list of sentences (as token lists) from the dialogue
    sentences = []
    for result in results:
        # entries without alternatives or without transcript are skipped
        alternatives = result.get('alternatives') or [{}]
        utterance_text = alternatives[0].get('transcript')
        if not utterance_text:
            continue

        for sent in nltk.sent_tokenize(utterance_text):
            # split() drops every empty token, so repeated spaces cannot
            # shift the token/word alignment used below
            tokens = sent.split()
            if tokens:
                sentences.append(tokens)

    if not sentences:
        return pd.DataFrame(columns=columns)

    # get words with tags from the last entry of the transcript file
    words_with_tags = results[-1]['alternatives'][0]['words']

    # assign speakerTag to each sentence by majority vote over its tokens
    sentences_with_tags = []
    word_idx = 0
    for tokens in sentences:
        tag_counts = Counter(
            words_with_tags[word_idx + offset]['speakerTag']
            for offset in range(len(tokens))
        )
        word_idx += len(tokens)

        # on a tie, max() keeps the tag that was seen first in the sentence
        sent_speaker_tag = max(tag_counts.items(), key=operator.itemgetter(1))[0]
        sentences_with_tags.append((' '.join(tokens), sent_speaker_tag))

    # merge sentences with same consecutive tags:
    # each item is [speaker_tag, [sentence, sentence, ...]]
    utterances = []
    for sentence_text, tag in sentences_with_tags:
        if utterances and utterances[-1][0] == tag:
            utterances[-1][1].append(sentence_text)
        else:
            utterances.append([tag, [sentence_text]])

    # build the dataframe in one go instead of growing it row by row
    return pd.DataFrame(
        {
            'speaker_tag': [tag for tag, _ in utterances],
            'text': [' '.join(parts) for _, parts in utterances],
            'sentiment_score': [''] * len(utterances),
        },
        columns=columns,
    )

In [16]:
# # sample 25 random podcast dialogues from a subset and count their utterances
# import random

# eps_to_be_labeled = random.sample(get_paths_for_en_episodes(1)[1], 25)

# total_length = 0
# for path in eps_to_be_labeled:
#     df_ = dialogue_json_to_pandas(path)
#     total_length += len(df_)
    
# print(total_length)
# print(eps_to_be_labeled)

In [19]:
# Hardcoded output of the sampling cell above: the sampled episode paths for subsets 0, 1 and 2
zero = ['podcast_data_no_audio/podcasts-transcripts/0/E/show_0e2tMqHNabAf1lJUF2Nakg/1Y6InCAx7VPhMB67HHEpVE.json', 'podcast_data_no_audio/podcasts-transcripts/0/F/show_0f2P0fH4EwuEtXKpXIt7Ui/5Km5wY535jnqy9glmwjuV8.json', 'podcast_data_no_audio/podcasts-transcripts/0/6/show_06DN2th96dYmRtHEQFNKTo/0lmQa6w0BR8e5TyzF5wjdN.json', 'podcast_data_no_audio/podcasts-transcripts/0/L/show_0L5kSg5frpqKFnQFasMcGG/3oYP9Ukre8PhDHG4TBRKn8.json', 'podcast_data_no_audio/podcasts-transcripts/0/C/show_0CSrUveOqf2QM7fgrdIkVy/0sP7Z8pgO0QP1cWeavTCnB.json', 'podcast_data_no_audio/podcasts-transcripts/0/T/show_0tN8aZWZ5GKbkc0gzUUFDP/7j7G7kaPgxH8kspdXW3HiA.json', 'podcast_data_no_audio/podcasts-transcripts/0/V/show_0vG9O03AYlTclAHNxfFlDI/4vmVc5gXNBdMkUydhi7WcR.json', 'podcast_data_no_audio/podcasts-transcripts/0/N/show_0NpVLVfKd8mtKlBjmEj0vu/16bv16BRw8oV8AUH5JaicY.json', 'podcast_data_no_audio/podcasts-transcripts/0/G/show_0gflUCrpF0H9uXuWtMQWLx/1Zk1xrDmSpwuCtdWHF8dMn.json', 'podcast_data_no_audio/podcasts-transcripts/0/L/show_0LaYlRViq9hVwwGykQLHd3/4q9et9b1xeAJcN3FnUq1vN.json', 'podcast_data_no_audio/podcasts-transcripts/0/2/show_02qeRiltSbNgNczt4wjP6q/72paNmckHAMP0B8dj7qgAy.json', 'podcast_data_no_audio/podcasts-transcripts/0/M/show_0m4KhpeNnmuvPlCJj5l0oV/61BB4ODbjq0RQM2mQOrZPO.json', 'podcast_data_no_audio/podcasts-transcripts/0/I/show_0I22C9iyvVT3M6DEILuD9F/1O7Q39FeikjQofDo2QpzT4.json', 'podcast_data_no_audio/podcasts-transcripts/0/L/show_0L04op9D76TOfmzm7yOf9T/1zWtKaYeyBd6k7MkOb67V4.json', 'podcast_data_no_audio/podcasts-transcripts/0/N/show_0NGeePSmXrnyU4k6EX4wNv/2i84Vw9tZDWrD10bt2uvR2.json', 'podcast_data_no_audio/podcasts-transcripts/0/I/show_0I22C9iyvVT3M6DEILuD9F/1pef0MZpUV05KlJGqEPjTG.json', 'podcast_data_no_audio/podcasts-transcripts/0/0/show_00kBfZGbf0p8LKaPxpvkbi/0oVZ7OHBw3R6op5FioXQrQ.json', 'podcast_data_no_audio/podcasts-transcripts/0/9/show_09CgCaGCGxbeaeMVviFKxw/00u97YwLndEB0KSNoaIA22.json', 
'podcast_data_no_audio/podcasts-transcripts/0/M/show_0MTCY7tw7AKad94BlV25Lh/6XnXwwno1h7vHIgzy9Pwyg.json', 'podcast_data_no_audio/podcasts-transcripts/0/R/show_0RCLNMBkrHcVMruEONenxD/25V7rh0ypzAwdKD1UWBTse.json', 'podcast_data_no_audio/podcasts-transcripts/0/A/show_0AQnbBbrcnOEVvpyEt2hDg/3o4nDbWg5WImMkjo6eWANS.json', 'podcast_data_no_audio/podcasts-transcripts/0/O/show_0Ow12LWRnZAjrC9CdzcVXf/0DpZyyqz4j6PDJbAKbcE0L.json', 'podcast_data_no_audio/podcasts-transcripts/0/N/show_0nooPN4bqy4mXjxzcw6Q8x/5tqBWNeJFqluMscMPNx3Jl.json', 'podcast_data_no_audio/podcasts-transcripts/0/E/show_0E2L8zPYhApYkmWWFef7aK/7KuzIPpOo3ce9JoapOyx7x.json', 'podcast_data_no_audio/podcasts-transcripts/0/4/show_04oioSRpSb6NwO8L6SOODX/2BqGPeR8EtXRQAtOk1FnZu.json']
one = ['podcast_data_no_audio/podcasts-transcripts/1/W/show_1W8XRepty6Lw4UtSKQypoW/5UXTpnjCqRoXQrakHUIixK.json', 'podcast_data_no_audio/podcasts-transcripts/1/F/show_1F8eCztQpRKBXGfXn60hfA/0sjKTXOdiq21FWfURkNhK9.json', 'podcast_data_no_audio/podcasts-transcripts/1/N/show_1NuPhncm1111kQjUQEgVjB/5aW8y7qZYy7fVMuOZTstFO.json', 'podcast_data_no_audio/podcasts-transcripts/1/K/show_1K2Fzlro0Lmp9pGS7Ak00D/4kwQvhhqChdY7wmWr0IEC0.json', 'podcast_data_no_audio/podcasts-transcripts/1/Z/show_1ZKHflAANeLU5C4XZ1Aa2O/78mQuY226OGypfZTE0eXEF.json', 'podcast_data_no_audio/podcasts-transcripts/1/M/show_1MOX9JsJxj4qMVkECiO4dF/1j7r7kFe2lw0OqaaUbufa6.json', 'podcast_data_no_audio/podcasts-transcripts/1/I/show_1IJslH3oyMzNDjlGyb1D15/2rpuMACJeBr3mU5QAspY0i.json', 'podcast_data_no_audio/podcasts-transcripts/1/F/show_1FcrPHfZW2YPygOev9mp6G/03NWT0unEHJf0qwSEcPF8U.json', 'podcast_data_no_audio/podcasts-transcripts/1/W/show_1wkrwgQJo7I6wEC4RwU8x2/38m9SUCzPKWurLqoqKp1x7.json', 'podcast_data_no_audio/podcasts-transcripts/1/T/show_1T69Xe0EJ4n0gOO4RD9qv0/6wtwVAYvYUpWDMGk4apvyC.json', 'podcast_data_no_audio/podcasts-transcripts/1/H/show_1hAEAWtQFOaIYTMRZlMcdg/6zYGnMcEPqVyuryRUr4MvF.json', 'podcast_data_no_audio/podcasts-transcripts/1/U/show_1UO3SVMDFdE9hUkvi9G6QS/6YaAfVdDepKf7iBflevWMV.json', 'podcast_data_no_audio/podcasts-transcripts/1/W/show_1w5HzKD00MO1PpzEJJo3Vn/0HPK9fGvi4FrMEf2vzqyo0.json', 'podcast_data_no_audio/podcasts-transcripts/1/C/show_1cTKtWwQ2BXvK5Q0wfPiQ5/67vPsEwE5Vp4r218jLX3rl.json', 'podcast_data_no_audio/podcasts-transcripts/1/Y/show_1yBJMbywkEEhIGKkvwoIVg/24mGzaqBlP6taAtMwiECSB.json', 'podcast_data_no_audio/podcasts-transcripts/1/F/show_1FQS4jnubbMxz5OvVlsxwX/6o4ojoy8AL1CLW1IHoqOTJ.json', 'podcast_data_no_audio/podcasts-transcripts/1/S/show_1SXD1U55jqbK9HHoPvdbsw/2EN1wmxv3M259CwSkVi2Rv.json', 'podcast_data_no_audio/podcasts-transcripts/1/Z/show_1z7SoufmYZoz2Gqvel1IzO/29RGXrRgPCAcUcRlR0R31Y.json', 
'podcast_data_no_audio/podcasts-transcripts/1/L/show_1lbzRaT4n1Rxx04QatZo9Y/1Jikd0xspLVluTaYJWftMX.json', 'podcast_data_no_audio/podcasts-transcripts/1/Q/show_1qZ5TK5ghLJnNeWP0NvSKp/1H2uvhjwMtPCoxFL6KFBOE.json', 'podcast_data_no_audio/podcasts-transcripts/1/I/show_1iHjo84YtAIWa2GaYwowiq/31jD07gEHkjZ4kEttHP2bl.json', 'podcast_data_no_audio/podcasts-transcripts/1/Z/show_1Z9gEuGf562fInSsMoLqyu/5ifXdhAl7qPXDQpUrzgpaJ.json', 'podcast_data_no_audio/podcasts-transcripts/1/H/show_1hygb4nGhNhlLn4pBnN00j/5XXWZGEAANXkIIQ7X2U2R7.json', 'podcast_data_no_audio/podcasts-transcripts/1/Y/show_1y0ZxlG9I4t9YImbff00I2/3UHX2Ax6B6VHwwhKXUHbkw.json', 'podcast_data_no_audio/podcasts-transcripts/1/J/show_1J0yZFJ0lUUvpMb8ZJRbXy/3qnjfTYDjoV5lRCDusjzSI.json']
two = ['podcast_data_no_audio/podcasts-transcripts/2/G/show_2gs5645b3F1d2End3KXBp4/0iSMYT4A1ULCUCY8STewvY.json', 'podcast_data_no_audio/podcasts-transcripts/2/9/show_29Qk08pTL6B9LPRejivbS6/4VvILwzXpbTcWt9YMoyFUZ.json', 'podcast_data_no_audio/podcasts-transcripts/2/I/show_2iHte4DYbwL2mlhgysmb7l/11ybYZlrlm3lmkWX1OXl3p.json', 'podcast_data_no_audio/podcasts-transcripts/2/T/show_2tetA7Ub1xxSLm0oHA1gmV/6VuAZ7nxVzGDcK7Ie8jXyJ.json', 'podcast_data_no_audio/podcasts-transcripts/2/Q/show_2qY4nOu5zaZ9CMTlB6XBj5/6y7bLRK9VKOJiDhZaZM9Sy.json', 'podcast_data_no_audio/podcasts-transcripts/2/I/show_2iaGIA0ODxmgHSmKXYopRX/0GfpjRHpEhQK1bwWkoNbza.json', 'podcast_data_no_audio/podcasts-transcripts/2/1/show_21ASCcEXgUlbFSmoqjroZm/6D59qfOpamnVRDElnuTocj.json', 'podcast_data_no_audio/podcasts-transcripts/2/W/show_2W1Hy8xOcSi0b8ppdQ52qU/21iECAkP64WkYOc9kIusyI.json', 'podcast_data_no_audio/podcasts-transcripts/2/8/show_28KKqFWNBw6kk5aBFu2viN/6WphEO0vKaCcJeVsYa7Kng.json', 'podcast_data_no_audio/podcasts-transcripts/2/K/show_2kl1GlTTgSwgobmxgGEaBX/4RO6YTUkTTryRb3uNXL4l0.json', 'podcast_data_no_audio/podcasts-transcripts/2/0/show_20zWNMtAU8S3rir62g1r0Q/1T3FUP68I7oYVnuOyM44uE.json', 'podcast_data_no_audio/podcasts-transcripts/2/M/show_2M8KJDwxT0zzwXEJhDZYCL/06mLFp9wQFnIQd79Qjg4jq.json', 'podcast_data_no_audio/podcasts-transcripts/2/C/show_2C0AgUOt4eCULjFjb3mynN/4jaXWbdotGutHNxKKjMOWs.json', 'podcast_data_no_audio/podcasts-transcripts/2/H/show_2hq8BvOX4DocQpqkm20XLu/6kz63TvGtLKe5dI9qr8sqm.json', 'podcast_data_no_audio/podcasts-transcripts/2/A/show_2AkW5V4H6xAh8IXJU0jHUm/2PHi40upTwPGW5btJugKuK.json', 'podcast_data_no_audio/podcasts-transcripts/2/H/show_2HIFmqNqJkR2SADAcG4Fpq/6EUGCLZqWARW8TcNzezI0e.json', 'podcast_data_no_audio/podcasts-transcripts/2/E/show_2eXdry3liXk0Z0BfTCVKff/4TssEgbtKfFBhH2NVpkhD2.json', 'podcast_data_no_audio/podcasts-transcripts/2/Q/show_2qy2KehR0K2FGZqsEsU2CY/59S06bau9DJEEx1f1dC4y7.json', 
'podcast_data_no_audio/podcasts-transcripts/2/O/show_2OVM4aQOEgo8uvuUcqJSei/7o7HaMZf49jS2yyXYUdSCc.json', 'podcast_data_no_audio/podcasts-transcripts/2/J/show_2jW0aO9MqYFndIcIAhFukV/2Tt47VISLTvm8gTyhFjyb8.json', 'podcast_data_no_audio/podcasts-transcripts/2/4/show_24aqN472kMGKAhdJbIK59L/7sNutOB7XH6jihysx0tfiT.json', 'podcast_data_no_audio/podcasts-transcripts/2/A/show_2AsvwIbKhe8yV3ePi8nGY0/0ULidGb0REL65hEkd7zwLo.json', 'podcast_data_no_audio/podcasts-transcripts/2/1/show_21ASCcEXgUlbFSmoqjroZm/6FSUZ1PWVjCa7aHLifMPGa.json', 'podcast_data_no_audio/podcasts-transcripts/2/1/show_21ASCcEXgUlbFSmoqjroZm/2aZy5qlRmNbPDXIiV1hxQJ.json', 'podcast_data_no_audio/podcasts-transcripts/2/9/show_29DGC8r0eQZWtzPs10jASp/5qHQFfGcYnLWUPCOQt5PfM.json']

# Episode path lists that still have to be exported; `two` is excluded.
unlabeled_json_paths = [zero, one]

# Prepare csv dataset (per episode) for labeling.
# Output file template, keyed by the position of a list in
# `unlabeled_json_paths`; positions without a template are not written.
csv_templates = {
    0: 'labeled_data/bart/dialogue_{}.csv',
    1: 'labeled_data/juno/dialogue_{}.csv',
    # 2: 'labeled_data/joris/dialogue_{}.csv',
}

# Numbering starts at 80 and keeps counting across all lists.
dialogue_number = 80
for index, part in tqdm(enumerate(unlabeled_json_paths)):
    for path in part:
        df = dialogue_json_to_pandas(path)

        template = csv_templates.get(index)
        if template is not None:
            df.to_csv(template.format(dialogue_number), sep='\t', encoding='utf-8', index=False)

        dialogue_number += 1
   
    

2it [00:09,  4.81s/it]
