## Niet runnen!

Deze notebook is bedoeld voor tests en voor dingen die maar één keer gerund hoeven te worden; sommige cellen duren meer dan een uur. Het uiteindelijke werk komt allemaal in semantic_orientation.ipynb.

In [10]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from collections import Counter
import os
import pathlib
import time
import operator
from os import listdir
from os.path import isfile, join
from tqdm import tqdm
import csv
from sklearn.metrics import classification_report
import zipfile
import gdown

In [None]:
# Load the tab-separated show/episode metadata table.
metadata_df = pd.read_table("podcast_data_no_audio/metadata/metadata.tsv")

In [None]:
def get_paths_for_en_episodes(subset_number):
    """
    Collect the paths of all English episode transcripts in one subset.

    Walks ``podcast_data_no_audio/podcasts-transcripts/<subset_number>/
    <folder>/<show>/<episode>.json`` and keeps the episodes of every show
    whose first ``language`` entry in the module-level ``metadata_df``
    contains ``'en'``.

    Parameters
    ----------
    subset_number : int or str
        Name of the subset directory (bart: 0, juno: 1, joris: 2).

    Returns
    -------
    tuple
        ``(number_of_paths, list_of_paths)``.
    """
    subset_dir = 'podcast_data_no_audio/podcasts-transcripts/' + str(subset_number)

    # Build the show -> language lookup once instead of filtering the whole
    # metadata frame for every show; setdefault keeps the first row per show.
    show_language = {}
    for prefix, language in zip(metadata_df['show_filename_prefix'],
                                metadata_df['language']):
        show_language.setdefault(prefix, language)

    # Only real directories are walked, so stray files such as .DS_Store
    # are ignored at every level rather than only at the top.
    folders = [name for name in listdir(subset_dir)
               if os.path.isdir(subset_dir + '/' + name)]

    podcast_episodes_paths = []

    for letter_or_number in tqdm(folders):
        folder_dir = subset_dir + '/' + letter_or_number

        for show_uri in listdir(folder_dir):
            show_dir = folder_dir + '/' + show_uri
            if not os.path.isdir(show_dir):
                continue

            # select english shows only; a missing or non-string (NaN)
            # language is treated as "not English"
            language = show_language.get(show_uri)
            if not (isinstance(language, str) and 'en' in language):
                continue

            for episode_uri in listdir(show_dir):
                # match on the file extension, not on a substring of the path
                if episode_uri.endswith('.json'):
                    podcast_episodes_paths.append(show_dir + '/' + episode_uri)

    return len(podcast_episodes_paths), podcast_episodes_paths

In [None]:
# Keep only the list of paths (second element of the returned tuple) for subset 1.
paths = get_paths_for_en_episodes(subset_number=1)[1]

In [None]:
def dialogue_json_to_pandas(json_path):
    """
    Convert a podcast .json transcript into a pandas dataframe with one row
    per speaker turn.

    The transcript texts are split into sentences, every sentence gets the
    speakerTag that occurs most often among its words, and consecutive
    sentences with the same tag are merged into one utterance.

    Parameters
    ----------
    json_path : str
        Path to the transcript file. Its ``results`` list holds the
        transcript parts; the last entry holds the word list with a
        ``speakerTag`` per word.

    Returns
    -------
    pandas.DataFrame
        Columns ``speaker_tag``, ``text`` and ``sentiment_score`` (the
        latter filled with empty strings as open labels). Empty when the
        transcript contains no sentences.
    """
    columns = ['speaker_tag', 'text', 'sentiment_score']

    with open(json_path, encoding='utf-8') as f:
        data = json.load(f)

    # create list of tokenised sentences, skipping parts without a transcript
    sentences = []
    for part in data['results']:
        try:
            utterance_text = part['alternatives'][0]['transcript']
        except (KeyError, IndexError):
            continue

        for sent in nltk.sent_tokenize(utterance_text):
            # drop *all* empty tokens (from repeated spaces) so that the
            # tokens stay aligned with the word list below
            tokens = [token for token in sent.split(" ") if token]
            if tokens:
                sentences.append(tokens)

    if not sentences:
        return pd.DataFrame(columns=columns)

    # get words with tags from transcript file
    words_with_tags = data['results'][-1]['alternatives'][0]['words']

    # assign speakerTag to each sentence by majority vote; this also fixes
    # mistakes where the speakerTag switches in the middle of a sentence
    sentences_with_tags = []
    word_idx = 0
    for tokens in sentences:
        tag_counts = Counter(
            words_with_tags[word_idx + offset]['speakerTag']
            for offset in range(len(tokens))
        )
        word_idx += len(tokens)
        # on a tie the tag that was seen first wins
        sent_speakerTag = max(tag_counts, key=tag_counts.get)
        sentences_with_tags.append((' '.join(tokens), sent_speakerTag))

    # merge sentences with same consecutive tags
    utterances = []  # (speaker_tag, [sentence, ...]) per turn
    for text, tag in sentences_with_tags:
        if utterances and utterances[-1][0] == tag:
            utterances[-1][1].append(text)
        else:
            utterances.append((tag, [text]))

    # build the dataframe in one go instead of enlarging it row by row
    dialogue_df = pd.DataFrame(
        {
            'speaker_tag': [tag for tag, _ in utterances],
            'text': [' '.join(parts) for _, parts in utterances],
            'sentiment_score': [''] * len(utterances),
        },
        columns=columns,
    )

    return dialogue_df

In [None]:
# Fetch the archive with the labeled .csv-files from Google Drive (used for
# validation) and unpack it relative to the current directory.
archive_name = 'separate_csv_files.zip'

gdown.download('https://drive.google.com/uc?id=1aqE8yS7Lf8GfljmFEuW5pd3i5S2raW1B', archive_name, quiet=False)

with zipfile.ZipFile(archive_name, 'r') as archive:
    archive.extractall('')

In [41]:
# Rewrite every labeled file with binary sentiment values:
# a score of 0 becomes 1 and a score of -1 becomes 0; other values are kept.
# NOTE(review): the files are overwritten in place, so running this cell a
# second time would turn the zeros written by the first run into ones.
labeled_dir = 'labeled_datasets/separate_csv_files'
labeled_files = [name for name in listdir(labeled_dir)
                 if isfile(join(labeled_dir, name))]

for filename in labeled_files:
    path = labeled_dir + '/' + filename
    df = pd.read_csv(path, sep='\t')
    is_neutral = df['sentiment_score'] == 0.0
    is_negative = df['sentiment_score'] == -1.0
    df.loc[is_neutral, 'sentiment_score'] = 1.0
    df.loc[is_negative, 'sentiment_score'] = 0.0
    df.to_csv(path, index=False, sep='\t')

In [4]:
# Load the tab-separated metadata table; get_paths_for_en_episodes reads it
# to select the English shows of a subset (bart: 0 , juno: 1, joris: 2).
metadata_df = pd.read_table("podcast_data_no_audio/metadata/metadata.tsv")


def get_paths_for_en_episodes(subset_number):
    """
    Collect the paths of all English episode transcripts in one subset.

    Walks ``podcast_data_no_audio/podcasts-transcripts/<subset_number>/
    <folder>/<show>/<episode>.json`` and keeps the episodes of every show
    whose first ``language`` entry in the module-level ``metadata_df``
    contains ``'en'``.

    Parameters
    ----------
    subset_number : int or str
        Name of the subset directory (bart: 0, juno: 1, joris: 2).

    Returns
    -------
    tuple
        ``(number_of_paths, list_of_paths)``.
    """
    subset_dir = 'podcast_data_no_audio/podcasts-transcripts/' + str(subset_number)

    # Build the show -> language lookup once instead of filtering the whole
    # metadata frame for every show; setdefault keeps the first row per show.
    show_language = {}
    for prefix, language in zip(metadata_df['show_filename_prefix'],
                                metadata_df['language']):
        show_language.setdefault(prefix, language)

    # Only real directories are walked, so stray files such as .DS_Store
    # are ignored at every level rather than only at the top.
    folders = [name for name in listdir(subset_dir)
               if os.path.isdir(subset_dir + '/' + name)]

    podcast_episodes_paths = []

    for letter_or_number in tqdm(folders):
        folder_dir = subset_dir + '/' + letter_or_number

        for show_uri in listdir(folder_dir):
            show_dir = folder_dir + '/' + show_uri
            if not os.path.isdir(show_dir):
                continue

            # select english shows only; a missing or non-string (NaN)
            # language is treated as "not English"
            language = show_language.get(show_uri)
            if not (isinstance(language, str) and 'en' in language):
                continue

            for episode_uri in listdir(show_dir):
                # match on the file extension, not on a substring of the path
                if episode_uri.endswith('.json'):
                    podcast_episodes_paths.append(show_dir + '/' + episode_uri)

    return len(podcast_episodes_paths), podcast_episodes_paths

In [5]:
paths = get_paths_for_en_episodes(1)[1]

# Build a dict with file names as keys and the full transcripts as values.

folder_path = 'podcast_data_no_audio/podcasts-transcripts/1'
# folder_path = 'podcast_data_no_audio/podcasts-transcripts/1/0/show_10AlBXJul8JZ5bREZUXBep'
transcripts_dict = dict()


for subdir, dirs, files in os.walk(folder_path):
    for file in files:
        if not file.endswith('.json'):
            continue

        filepath = os.path.join(subdir, file)
        # the context manager closes each file instead of leaking the handle
        with open(filepath, encoding='utf-8') as f:
            data = json.load(f)

        # results without a 'transcript' key are skipped
        parts = []
        for result in data['results']:
            utterance_dict = result['alternatives'][0]
            if 'transcript' in utterance_dict:
                parts.append(utterance_dict['transcript'].strip())

        # every part is prefixed with a space, so a non-empty transcript
        # starts with one (same text as the former incremental +=)
        utterances = ''.join(' ' + part for part in parts)

        transcripts_dict[file] = utterances

In [None]:
## Important function: representation of a podcast episode ##

def dialogue_json_to_pandas(json_path):
    """
    Convert a podcast .json transcript into a pandas dataframe with one row
    per speaker turn.

    The transcript texts are split into sentences, every sentence gets the
    speakerTag that occurs most often among its words, and consecutive
    sentences with the same tag are merged into one utterance.

    Parameters
    ----------
    json_path : str
        Path to the transcript file. Its ``results`` list holds the
        transcript parts; the last entry holds the word list with a
        ``speakerTag`` per word.

    Returns
    -------
    pandas.DataFrame
        Columns ``speaker_tag``, ``text`` and ``sentiment_score`` (the
        latter filled with empty strings as open labels). Empty when the
        transcript contains no sentences.
    """
    columns = ['speaker_tag', 'text', 'sentiment_score']

    with open(json_path, encoding='utf-8') as f:
        data = json.load(f)

    # create list of tokenised sentences, skipping parts without a transcript
    sentences = []
    for part in data['results']:
        try:
            utterance_text = part['alternatives'][0]['transcript']
        except (KeyError, IndexError):
            continue

        for sent in nltk.sent_tokenize(utterance_text):
            # drop *all* empty tokens (from repeated spaces) so that the
            # tokens stay aligned with the word list below
            tokens = [token for token in sent.split(" ") if token]
            if tokens:
                sentences.append(tokens)

    if not sentences:
        return pd.DataFrame(columns=columns)

    # get words with tags from transcript file
    words_with_tags = data['results'][-1]['alternatives'][0]['words']

    # assign speakerTag to each sentence by majority vote; this also fixes
    # mistakes where the speakerTag switches in the middle of a sentence
    sentences_with_tags = []
    word_idx = 0
    for tokens in sentences:
        tag_counts = Counter(
            words_with_tags[word_idx + offset]['speakerTag']
            for offset in range(len(tokens))
        )
        word_idx += len(tokens)
        # on a tie the tag that was seen first wins
        sent_speakerTag = max(tag_counts, key=tag_counts.get)
        sentences_with_tags.append((' '.join(tokens), sent_speakerTag))

    # merge sentences with same consecutive tags
    utterances = []  # (speaker_tag, [sentence, ...]) per turn
    for text, tag in sentences_with_tags:
        if utterances and utterances[-1][0] == tag:
            utterances[-1][1].append(text)
        else:
            utterances.append((tag, [text]))

    # build the dataframe in one go instead of enlarging it row by row
    dialogue_df = pd.DataFrame(
        {
            'speaker_tag': [tag for tag, _ in utterances],
            'text': [' '.join(parts) for _, parts in utterances],
            'sentiment_score': [''] * len(utterances),
        },
        columns=columns,
    )

    return dialogue_df

In [45]:
def add_scores(df, near=1000):
    """
    Fill ``df.sentiment_score`` with a normalised semantic-orientation score
    per utterance.

    All utterances in ``df.text`` are tokenised into one word sequence
    (punctuation tokens removed). For every two-word phrase accepted by
    ``is_phrase`` the seed words within ``near`` words before and after the
    phrase are counted, and the phrase is scored as
    ``log2((pos_near * neg_total) / (neg_near * pos_total))``; each count has
    0.01 added, and phrases are only scored when both neighbourhood counts
    exceed 2. An utterance's score is the mean over the phrases that start
    in it (0 when it has none). Finally all scores are divided by the
    largest absolute score.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column; modified in place.
    near : int, optional
        Neighbourhood size in words on each side of a phrase.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with ``sentiment_score`` overwritten.
    """
    positive_words = ['good', 'great', 'nice', 'happy', 'easy', 'yes', 'yeah', 'love']
    negative_words = ['hard', 'bad', 'wrong', 'tough', 'negative', 'no', 'shit', 'hate']
    punctuation = {'.', ',', '?', '!', '\''}

    # Tokenise every utterance once and remember which row owns each word,
    # so a phrase can be attributed directly to the utterance it starts in.
    all_words = []
    word_owner = []
    for row_pos, utt in enumerate(df.text):
        tokens = [w for w in nltk.word_tokenize(utt) if w not in punctuation]
        all_words.extend(tokens)
        word_owner.extend([row_pos] * len(tokens))

    word_counter = Counter(w.lower() for w in all_words)
    positive_hits = sum(word_counter[p] for p in positive_words) + 0.01
    negative_hits = sum(word_counter[n] for n in negative_words) + 0.01

    phrase_scores = [[] for _ in range(len(df))]

    for i in range(len(all_words) - 1):
        if not is_phrase(i, all_words):
            continue

        # slices clamp at the sequence ends, so no bounds handling is needed
        neighbourhood = all_words[max(0, i - near):i] + all_words[i + 2:i + 2 + near]

        neighbourhood_counter = Counter(w.lower() for w in neighbourhood)
        pos_neigh_hits = sum(neighbourhood_counter[p] for p in positive_words) + 0.01
        neg_neigh_hits = sum(neighbourhood_counter[n] for n in negative_words) + 0.01

        if pos_neigh_hits > 2 and neg_neigh_hits > 2:
            score = np.log2((pos_neigh_hits * negative_hits) / (neg_neigh_hits * positive_hits))
            phrase_scores[word_owner[i]].append(score)

    scores = [float(np.mean(s)) if s else 0.0 for s in phrase_scores]

    # normalise by the largest magnitude; skipped when every score is 0
    # (or the frame is empty) to avoid dividing by zero
    scale = max((abs(s) for s in scores), default=0.0)
    if scale > 0:
        scores = [s / scale for s in scores]

    # assign the whole column at once instead of chained item assignment
    df['sentiment_score'] = scores
    return df

#     if orientations:
#         plt.plot(np.arange(0, 1, 1/len(orientations)), orientations)
# plt.show()

# Score one example episode and show its per-utterance sentiment column.
df = dialogue_json_to_pandas(json_path=paths[10])
add_scores(df)['sentiment_score']

NameError: name 'dialogue_json_to_pandas' is not defined

In [None]:
# OLD
# Determines the semantic orientation of phrases following 'Thumbs Up or
# Thumbs Down? Semantic Orientation Applied to Unsupervised Classification
# of Reviews', and plots the orientations of each long transcript.

NEAR = 500
positive_words = ['good', 'great', 'nice', 'happy', 'easy']
negative_words = ['hard', 'bad', 'wrong', 'tough', 'negative']


for t in list(transcripts_dict.values())[:50]:
    words = [w for w in nltk.word_tokenize(t) if w not in ['.', ',', '?', '!', '\'']]

    # only transcripts with more than 10000 words are analysed
    if len(words) <= 10000:
        continue

    orientations = []
    word_counter = Counter(w.lower() for w in words)
    positive_hits = sum(word_counter[p] for p in positive_words) + 0.01
    negative_hits = sum(word_counter[n] for n in negative_words) + 0.01

    for i in range(len(words) - 1):
        if not is_phrase(i, words):
            continue

        # up to NEAR words on each side of the two-word phrase; slices
        # clamp at the ends of the list, so no bounds handling is needed
        neighbourhood = words[max(0, i - NEAR):i] + words[i + 2:i + 2 + NEAR]

        neighbourhood_counter = Counter(w.lower() for w in neighbourhood)
        pos_neigh_hits = sum(neighbourhood_counter[p] for p in positive_words) + 0.01
        neg_neigh_hits = sum(neighbourhood_counter[n] for n in negative_words) + 0.01

        if pos_neigh_hits > 2 and neg_neigh_hits > 2:
            o = np.log((pos_neigh_hits * negative_hits) / (neg_neigh_hits * positive_hits))
            orientations.append(o)

    if orientations:
        # linspace returns exactly len(orientations) x-values; arange with a
        # fractional step can return one more and break plt.plot
        x_positions = np.linspace(0, 1, len(orientations), endpoint=False)
        plt.plot(x_positions, orientations)


plt.show()

In [None]:
def is_phrase(word_index, words):
    """
    Tell whether the two words starting at ``word_index`` form a phrase.

    Every word is POS-tagged on its own with ``nltk.pos_tag``. The pair is
    accepted for these tag patterns:

    * JJ followed by NN/NNS
    * JJ followed by JJ, when the third word is not NN/NNS
    * RB/RBR/RBS followed by JJ, when the third word is not NN/NNS
    * RB/RBR/RBS followed by VB/VBD/VBN/VBG
    * NN/NNS followed by JJ, when the third word is not NN/NNS

    A missing third word (pair at the end of ``words``) counts as
    "not NN/NNS".

    Parameters
    ----------
    word_index : int
        Index of the first word of the pair; ``word_index + 1`` must be a
        valid index, otherwise an IndexError is raised.
    words : list of str
        The token sequence.

    Returns
    -------
    bool
    """
    nouns = ('NN', 'NNS')
    adverbs = ('RB', 'RBR', 'RBS')
    verbs = ('VB', 'VBD', 'VBN', 'VBG')

    def tag_of(position):
        return nltk.pos_tag([words[position]])[0][1]

    def third_is_not_noun():
        # explicit bounds check instead of relying on an unbound local
        if word_index + 2 >= len(words):
            return True
        return tag_of(word_index + 2) not in nouns

    first_tag = tag_of(word_index)
    second_tag = tag_of(word_index + 1)

    if first_tag == 'JJ':
        if second_tag in nouns:
            return True
        return second_tag == 'JJ' and third_is_not_noun()
    if first_tag in adverbs:
        if second_tag == 'JJ':
            return third_is_not_noun()
        return second_tag in verbs
    if first_tag in nouns:
        return second_tag == 'JJ' and third_is_not_noun()
    return False

In [44]:
# # Om de meest voorkomende alternatieven voor good/bad en excellent/poor te vinden

# adjectives = []
# c = 0


# for t in transcripts_dict.values():
#     c += 1
#     if c % 100 == 0:
#         print (str(c) + ' / ' + str(len(transcripts_dict)))
#     words = [w for w in nltk.word_tokenize(t) if w not in['.', ',']]
#     for i in range(2, len(words)-2):
# #         if nltk.pos_tag([words[i]])[0][1] == 'NN':
#         if any(w in ['not', 'n\'t'] for w in [words[i-1].lower(), words[i-2].lower()]):
#             adjectives.append('not ' + words[i].lower())
#         else:
#             adjectives.append(words[i].lower())

# from nltk.corpus import stopwords
# nltk.download('stopwords')

# # # adj_counter = Counter(adjectives).most_common(10000)
# for w in [x[0] for x in adj_counter]:
#     if nltk.pos_tag([w])[0][1] == 'NN' and nltk.pos_tag([w])[0][0] not in stopwords.words('english'):
#         print(w)

# count = Counter(adjectives)
# NOTE(review): `count` is only created by the commented-out line above, so
# this lookup relies on a `count` that already exists in the session.
count['na']

# # positive_words = ['good', 'great', 'nice', 'happy', 'easy', 'special', ]
# # negative_words = ['hard', 'bad', 'wrong', 'tough', 'not good']

61217

# All words
[('i', 2920664),
 ('the', 2707330),
 ('and', 2499693),
 ('you', 2267175),
 ('to', 2107211),
 ('that', 1910118),
 ('a', 1710804),
 ('it', 1694550),
 ("'s", 1628554),
 ('of', 1420169),
 ('like', 1362499),
 ('in', 939313),
 ('is', 936290),
 ('so', 885230),
 ('was', 806712),
 ('we', 765953),
 ("n't", 730109),
 ('do', 708999),
 ('this', 645240),
 ('know', 638151),
 ('but', 637519),
 ('just', 598106),
 ('for', 594334),
 ('they', 571416),
 ('yeah', 567540),
 ('on', 565937),
 ('he', 555150),
 ('what', 482788),
 ('have', 482146),
 ("'re", 475482),
 ('be', 456634),
 ('with', 444109),
 ('not', 435110),
 ('my', 423325),
 ("'m", 422080),
 ('there', 387415),
 ('are', 364117),
 ('your', 358892),
 ('?', 356301),
 ('because', 353328),
 ('all', 347258),
 ('think', 342071),
 ('about', 336843),
 ('if', 334364),
 ('me', 324107),
 ('or', 319309),
 ('at', 319235),
 ('going', 306029),
 ('as', 295102),
 ('can', 291830),
 ('one', 290678),
 ('out', 275277),
 ('really', 264046),
 ('right', 258486),
 ('get', 257908),
 ('up', 254788),
 ('when', 249582),
 ('then', 242441),
 ('people', 236501),
 ('did', 225425),
 ('got', 214973),
 ('would', 207706),
 ('she', 207256),
 ('go', 206082),
 ('from', 205767),
 ('had', 195906),
 ('now', 194102),
 ("'ve", 190121),
 ('how', 189243),
 ('no', 185158),
 ('some', 182741),
 ('time', 179229),
 ('kind', 178656),
 ('want', 170841),
 ('them', 169121),
 ('well', 168485),
 ('were', 166182),
 ('more', 164005),
 ('good', 158472),
 ('an', 152452),
 ('who', 148357),
 ('mean', 147539),
 ('see', 145603),
 ('his', 144276),
 ('lot', 142270),
 ('okay', 141085),
 ('back', 140301),
 ('our', 138981),
 ('say', 138289),
 ('been', 136266),
 ('will', 131598),
 ('where', 131512),
 ("'ll", 131387),
 ('here', 130981),
 ('thing', 130444),
 ('things', 129757),
 ('very', 126542),
 ('something', 126291),
 ('way', 126063),
 ('oh', 124976),
 ('could', 124007),
 ('has', 123704),
 ('him', 123602),
 ('their', 120746),
 ('into', 120670),
 ('little', 120398),
 ('not know', 118985),
 ('by', 118860),
 ('her', 116549),
 ('said', 115169),
 ('which', 111120),
 ('us', 109909),
 ('other', 109900),
 ('not to', 108871),
 ('first', 107927),
 ('those', 103962),
 ('these', 102243),
 ('actually', 102148),
 ('let', 101977),
 ('also', 101563),
 ('make', 101505),
 ('does', 99408),
 ('much', 97717),
 ('guys', 97609),
 ('feel', 91631),
 ('doing', 91099),
 ('love', 90736),
 ('even', 86842),
 ('why', 85685),
 ('over', 85057),
 ('look', 83951),
 ('two', 83632),
 ('podcast', 82093),
 ('day', 81897),
 ('down', 80843),
 ('being', 80782),
 ('need', 78262),
 ('come', 77296),
 ('maybe', 77295),
 ('still', 77215),
 ('life', 75402),
 ('take', 75160),
 ('year', 74405),
 ('great', 73846),
 ('off', 73804),
 ('always', 72459),
 ('game', 72296),
 ('bit', 72041),
 ('yes', 71814),
 ('through', 69633),
 ('ca', 68326),
 ('work', 67987),
 ('never', 67841),
 ('every', 67656),
 ('than', 66602),
 ('should', 66465),
 ('last', 66305),
 ('new', 66107),
 ('only', 65046),
 ('probably', 64930),
 ('not i', 64858),
 ('talk', 64837),
 ('before', 64115),
 ('put', 64056),
 ('stuff', 64024),
 ('again', 63951),
 ('god', 63632),
 ('man', 63504),
 ('not it', 62572),
 ('years', 62566),
 ('different', 62114),
 ('not a', 61942),
 ('week', 61817),
 ('not that', 61812),
 ('big', 61660),
 ('na', 61217),
 ('not the', 60229),
 ('around', 59737),
 ('gon', 59294),
 ('any', 58806),
 ('after', 58148),
 ('sure', 58073),
 ('not have', 57811),
 ('next', 57120),
 ('three', 56817),
 ('same', 55701),
 ('everything', 54664),
 ('guy', 54639),
 ('show', 54430),
 ('pretty', 53787),
 ('most', 53365),
 ('point', 53299),
 ('start', 53214),
 ('not like', 53004),
 ('give', 52572),
 ('went', 52496),
 ('thought', 52393),
 ('many', 51991),
 ('getting', 51980),
 ('talking', 51481)]

# Adjectives:
[('good', 158472),
 ('little', 120398),
 ('other', 109900),
 ('much', 97717),
 ('great', 70420),
 ('last', 66305),
 ('big', 61647),
 ('different', 61615),
 ('next', 57120),
 ('same', 54776),
 ('many', 51991),
 ('new', 49781),
 ('whole', 44875),
 ('own', 41809),
 ('real', 37783),
 ('able', 34866),
 ('hard', 33712),
 ('bad', 33376),
 ('old', 29014),
 ('free', 28277),
 ('few', 28218),
 ('second', 27157),
 ('nice', 25714),
 ('live', 25662),
 ('high', 24937),
 ('important', 24218),
 ('such', 24067),
 ('true', 20846),
 ('wrong', 20042),
 ('happy', 20006),
 ('open', 18210),
 ('young', 16942),
 ('huge', 16414),
 ('full', 15458),
 ('black', 15396),
 ('certain', 15295),
 ('easy', 14913),
 ('single', 14601),
 ('social', 14550),
 ('ready', 14527),
 ('small', 13403),
 ('third', 12978),
 ('white', 11577),
 ('entire', 11112),
 ('hot', 10869),
 ('special', 10398),
 ('personal', 10243),
 ('possible', 9822),
 ('short', 9416),
 ('green', 9309),
 ('tough', 8912),
 ('not good', 8795),
 ('strong', 8636),
 ('red', 8500),
 ('positive', 8450),
 ('dead', 8212),
 ('general', 8078),
 ('similar', 7973),
 ('main', 7760),
 ('clear', 7567),
 ('low', 7546),
 ('difficult', 7533),
 ('actual', 7273),
 ('incredible', 7148),
 ('interested', 7138),
 ('final', 7055),
 ('negative', 7052),
 ('specific', 6891),
 ('healthy', 6831),
 ('extra', 6813),
 ('daily', 6616),
 ('particular', 6503),
 ('normal', 6476),
 ('serious', 6460),
 ('major', 6264),
 ('terrible', 6150),
 ('married', 6139),
 ('physical', 6125),
 ('not much', 5833),
 ('fantastic', 5722),
 ('american', 5561),
 ('local', 5394),
 ('likely', 5386),
 ('successful', 5301),
 ('available', 5296),
 ('stupid', 5242),
 ('safe', 5223),
 ('wide', 5103),
 ('overall', 5089),
 ('common', 5084),
 ('regular', 5050),
 ('comfortable', 5034),
 ('emotional', 4988),
 ('potential', 4966),
 ('massive', 4862),
 ('powerful', 4828),
 ('offensive', 4785),
 ('original', 4724),
 ('large', 4695),
 ('sudden', 4647),
 ('several', 4577),
 ('fourth', 4552),
 ('english', 4545),
 ('willing', 4453),
 ('professional', 4452),
 ('busy', 4449),
 ('natural', 4444),
 ('aware', 4332),
 ('bible', 4327),
 ('total', 4268),
 ('current', 4183),
 ('average', 4169),
 ('complete', 4138),
 ('lucky', 4126),
 ('christian', 4113),
 ('spiritual', 4106),
 ('creative', 4083),
 ('solid', 4023),
 ('heavy', 3986),
 ('not bad', 3982),
 ('scary', 3939),
 ('not able', 3913),
 ('popular', 3887),
 ('poor', 3823),
 ('curious', 3806),
 ('alive', 3802),
 ('famous', 3799),
 ('worried', 3753),
 ('sure', 3749),
 ('individual', 3698),
 ('surprised', 3676),
 ('rich', 3643),
 ('horrible', 3482),
 ('rid', 3470),
 ('previous', 3463),
 ('private', 3392),
 ('angry', 3342),
 ('due', 3332),
 ('expensive', 3331),
 ('tiny', 3330),
 ('fresh', 3300),
 ('ridiculous', 3299),
 ('nervous', 3237),
 ('basic', 3209),
 ('active', 3185),
 ('quiet', 3125),
 ('not great', 3092),
 ('financial', 3031),
 ('classic', 2993),
 ('medical', 2969),
 ('experienced', 2946),
 ('separate', 2939),
 ('modern', 2918),
 ('related', 2907),
 ('obvious', 2877),
 ('senior', 2849),
 ('recent', 2797),
 ('limited', 2771),
 ('sexual', 2739),
 ('significant', 2667),
 ('direct', 2623),
 ('french', 2584),
 ('eric', 2565),
 ('uncomfortable', 2528),
 ('familiar', 2421),
 ('various', 2364),
 ('not big', 2346),
 ('dangerous', 2337),
 ('central', 2312),
 ('flat', 2270),
 ('concerned', 2230),
 ('military', 2212),
 ('valuable', 2210),
 ('legal', 2210),
 ('former', 2184),
 ('political', 2177),
 ('british', 2067),
 ('prepared', 2063),
 ('long-term', 2050),
 ('weekly', 2046),
 ('soft', 2040),
 ('impossible', 2011),
 ('national', 2005),
 ('injured', 1991),
 ('effective', 1982),
 ('hilarious', 1981),
 ('competitive', 1948),
 ('necessary', 1946),
 ('impressive', 1933),
 ('technical', 1926),
 ('traditional', 1917),
 ('critical', 1905),
 ('not true', 1893),
 ('complex', 1860),
 ('not easy', 1847),
 ('optimal', 1836),
 ('global', 1829),
 ('not many', 1827),
 ('initial', 1821),
 ('grand', 1818)]

In [None]:
# OLD
# Extracts all phrases (two-word combinations) from a single transcript.
subset_number = '1'
folder_number = '0'
show_uri = '10AlBXJul8JZ5bREZUXBep'
episode_uri = '1am2bPIgTuCcAfqOY3rQZ1'
path = 'podcast_data_no_audio/podcasts-transcripts/{}/{}/show_{}/{}.json'.format(subset_number, folder_number, show_uri, episode_uri)


with open(path) as f:
    data = json.load(f)


# concatenate all transcript parts, each prefixed with a space; results
# without a 'transcript' key are skipped
utterances = ''.join(
    ' ' + result['alternatives'][0]['transcript'].strip()
    for result in data['results']
    if 'transcript' in result['alternatives'][0]
)


sentences = nltk.sent_tokenize(utterances)
phrases = []

proper_nouns = ('NNP', 'NNPS')
nouns = ('NN', 'NNS')
adverbs = ('RB', 'RBR', 'RBS')
verbs = ('VB', 'VBD', 'VBN', 'VBG')


for s in sentences:
    words = nltk.word_tokenize(s)
    pos_tags = nltk.pos_tag(words)
    for i in range(len(pos_tags) - 1):
        first_word, first_tag = pos_tags[i]
        second_word, second_tag = pos_tags[i + 1]

        # pairs containing a proper noun are never phrases
        if first_tag in proper_nouns or second_tag in proper_nouns:
            continue

        # A pair at the end of the sentence has no third token; that counts
        # as "not followed by a noun". Indexing a list past its end raises
        # IndexError, not KeyError, so it is bounds-checked here instead.
        if i + 2 < len(pos_tags):
            third_not_noun = pos_tags[i + 2][1] not in nouns
        else:
            third_not_noun = True

        if first_tag == 'JJ':
            keep = second_tag in nouns or (second_tag == 'JJ' and third_not_noun)
        elif first_tag in adverbs:
            keep = (second_tag == 'JJ' and third_not_noun) or second_tag in verbs
        elif first_tag in nouns:
            keep = second_tag == 'JJ' and third_not_noun
        else:
            keep = False

        if keep:
            phrases.append(first_word + ' ' + second_word)
print(phrases)