In [13]:
import requests
import json

import random

from collections import Counter

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /cfs/home/u021320/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
seasons = []

for i in range(1,11):

    # load season i
    if i<10:
        json_file = 'https://raw.githubusercontent.com/emorynlp/character-mining/master/json/friends_season_0'+str(i)+'.json'
    else:
        json_file = 'https://raw.githubusercontent.com/emorynlp/character-mining/master/json/friends_season_10.json'
    
    r = requests.get(json_file)
    
    seasons.append(json.loads(r.text))


In [5]:
#for statistical purposes

utterances = []
episodes = []
scenes = []
speakers = []

utterances_lines = []


for season in seasons:
    for episode in season['episodes']:
        episodes.append(episode['episode_id'])
        for scene in episode['scenes']:
            scenes.append(scene['scene_id'])
            for utterance in scene['utterances']:
                #dont want non-speech
                if utterance['transcript'] != "":
                    utterances.append(utterance['utterance_id'])
                    utterances_lines.append(utterance['transcript'])
                    for speaker in utterance['speakers']:
                        if speaker not in speakers:
                            speakers.append(utterance['speakers'][0])

print("Number of episodes: ", len(episodes))
print("Number of scenes: ", len(scenes))
print("Number of utterances: ", len(utterances))
print("Number of speakers: ", len(speakers))
print("Average number of tokens per utterance: ", sum([len(utterance.split()) for utterance in utterances_lines])/len(utterances_lines))
print("Average number of utterances per scene: ", len(utterances)/len(scenes))


Number of episodes:  236
Number of scenes:  3107
Number of utterances:  61310
Number of speakers:  701
Average number of tokens per utterance:  10.159745555374327
Average number of utterances per scene:  19.732861280978437


Vocalurary statistics by speaker

In [23]:
import string


characters = ['Monica Geller', 'Joey Tribbiani', 'Chandler Bing', 'Phoebe Buffay', 'Ross Geller', 'Rachel Green']

for speaker in characters:
    utterances = []
    for season in seasons:
        for episode in season['episodes']:
            for scene in episode['scenes']:
                for utterance in scene['utterances']:
                    if speaker in utterance['speakers'] and utterance['transcript']!="":
                        for word in utterance['transcript'].translate(str.maketrans('', '', string.punctuation)).lower().split():
                            utterances.append(word)
    print(speaker)
    print("vocabulary size: ", len(Counter(utterances)))

print("stop words", stopwords.words('english'))

for speaker in characters:
    utterances = []
    for season in seasons:
        for episode in season['episodes']:
            for scene in episode['scenes']:
                for utterance in scene['utterances']:
                    if speaker in utterance['speakers'] and utterance['transcript']!="":
                        for word in utterance['transcript'].translate(str.maketrans('', '', string.punctuation)).lower().split():
                            # we do not want stopwords for 10 words
                            if word not in stopwords.words('english'):
                                utterances.append(word)
    print(speaker)
    print("Top 10 words without stop-words: ", Counter(utterances).most_common(10))


Monica Geller
vocabulary size:  5616
Joey Tribbiani
vocabulary size:  5949
Chandler Bing
vocabulary size:  6445
Phoebe Buffay
vocabulary size:  5989
Ross Geller
vocabulary size:  6550
Rachel Green
vocabulary size:  5837
stop words ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over'

Monica Geller
Top 10 words without stop-words:  [('oh', 949), ('im', 708), ('okay', 677), ('dont', 577), ('know', 552), ('well', 475), ('right', 448), ('gonna', 447), ('youre', 436), ('hey', 423)]
Joey Tribbiani
Top 10 words without stop-words:  [('hey', 999), ('yeah', 936), ('oh', 809), ('im', 805), ('know', 670), ('right', 665), ('dont', 595), ('okay', 555), ('well', 537), ('like', 529)]
Chandler Bing
Top 10 words without stop-words:  [('im', 801), ('oh', 766), ('okay', 708), ('well', 655), ('know', 613), ('dont', 604), ('yeah', 559), ('hey', 512), ('right', 485), ('get', 437)]
Phoebe Buffay
Top 10 words without stop-words:  [('oh', 1442), ('okay', 816), ('know', 802), ('yeah', 799), ('im', 742), ('well', 575), ('dont', 562), ('like', 525), ('hey', 465), ('right', 406)]
Ross Geller
Top 10 words without stop-words:  [('im', 941), ('oh', 898), ('okay', 836), ('yeah', 828), ('know', 746), ('hey', 741), ('uh', 678), ('dont', 661), ('well', 633), ('right', 516)]
Rachel Green
Top 10 words 

In [34]:
#test set 1: season 8
#train set 2: first 7 seasons

test_set = []

for episode in seasons[7]['episodes']:
    for scene in episode['scenes']:
        for utterance in scene['utterances']:
            if utterance['speakers'] != []:
                test_set.append(utterance)


train_set = []

for season in seasons:
    if season['season_id'] in ['s01', 's02', 's03', 's04', 's05', 's06', 's07']:
        for episode in season['episodes']:
            for scene in episode['scenes']:
                for utterance in scene['utterances']:
                    if utterance['speakers'] != [] and utterance not in test_set:
                        train_set.append(utterance)

In [35]:
utterances = []
episodes = []
scenes = []
speakers = []

utterances_lines = []

set1 = test_set + train_set

for utterance in set1:
    if utterance['transcript'] != "":
        utterances.append(utterance['utterance_id'])
        utterances_lines.append(utterance['transcript'])

        #regex to remove episode from utterance id
        episode = utterance['utterance_id'][0:7]
        scene = utterance['utterance_id'][0:11]

        if episode not in episodes:
            episodes.append(episode)

        if scene not in scenes:
            scenes.append(scene)

        for speaker in utterance['speakers']:
            if speaker not in speakers:
                speakers.append(utterance['speakers'][0])

print("Number of episodes: ", len(episodes))
print("Number of scenes: ", len(scenes))
print("Number of utterances: ", len(utterances))
print("Number of speakers: ", len(speakers))
print("Average number of tokens per utterance: ", sum([len(utterance.split()) for utterance in utterances_lines])/len(utterances_lines))
print("Average number of utterances per scene: ", len(utterances)/len(scenes))


#lets count utterances per speaker
monica = []
chandler = []
ross = []
joey = []
phoebe = []
rachel = []
others = []



for utterance in set1:
    if utterance['speakers'] == ['Monica Geller']:
        monica.append(utterance)
    elif utterance['speakers'] == ['Chandler Bing']:
        chandler.append(utterance)
    elif utterance['speakers'] == ['Ross Geller']:
        ross.append(utterance)
    elif utterance['speakers'] == ['Joey Tribbiani']:
        joey.append(utterance)
    elif utterance['speakers'] == ['Phoebe Buffay']:
        phoebe.append(utterance)
    elif utterance['speakers'] == ['Rachel Green']:
        rachel.append(utterance)
    else:
        others.append(utterance)

#lets get the percentages of each speaker
print("Monica: ", len(monica)/len(set1))
print("Chandler: ", len(chandler)/len(set1))
print("Ross: ", len(ross)/len(set1))
print("Joey: ", len(joey)/len(set1))
print("Phoebe: ", len(phoebe)/len(set1))
print("Rachel: ", len(rachel)/len(set1))
print("Others: ", len(others)/len(set1))


Number of episodes:  194
Number of scenes:  2579
Number of utterances:  49740
Number of speakers:  596
Average number of tokens per utterance:  10.07227583433856
Average number of utterances per scene:  19.2865451725475
Monica:  0.13772106109324758
Chandler:  0.13818327974276529
Ross:  0.14887459807073955
Joey:  0.132516077170418
Phoebe:  0.12067926045016077
Rachel:  0.1534967845659164
Others:  0.1685289389067524


In [4]:
with open("sets/test_set1.json", "w") as outfile:
    json.dump(test_set, outfile)


with open("sets/train_set1.json", "w") as outfile:
    json.dump(train_set, outfile)

In [30]:
#test set 2: 10% of utterances from each season

all_seasons = []
for season in seasons:
    utterances = []
    selection = []
    for episode in season['episodes']:
        for scene in episode['scenes']:
            for utterance in scene['utterances']:
                if utterance['speakers'] != []:
                    utterances.append(utterance)

    num_elements = len(utterances)
    num_elements_to_extract = int(num_elements * 0.1)  # Calculate 10% of the total elements
    selection = random.sample(utterances, num_elements_to_extract)

    all_seasons.append(selection)


test_set = []
for season in all_seasons:
    for utterance in season:
        test_set.append(utterance)

train_set = []
for season in seasons:
    for episode in season['episodes']:
        for scene in episode['scenes']:
            for utterance in scene['utterances']:
                if utterance['speakers'] != [] and utterance not in test_set:
                    train_set.append(utterance)



In [31]:
utterances = []
episodes = []
scenes = []
speakers = []

utterances_lines = []

set2 = test_set + train_set

for utterance in set2:
    if utterance['transcript'] != "":
        utterances.append(utterance['utterance_id'])
        utterances_lines.append(utterance['transcript'])

        #regex to remove episode from utterance id
        episode = utterance['utterance_id'][0:7]
        scene = utterance['utterance_id'][0:11]

        if episode not in episodes:
            episodes.append(episode)

        if scene not in scenes:
            scenes.append(scene)

        for speaker in utterance['speakers']:
            if speaker not in speakers:
                speakers.append(utterance['speakers'][0])

print("Number of episodes: ", len(episodes))
print("Number of scenes: ", len(scenes))
print("Number of utterances: ", len(utterances))
print("Number of speakers: ", len(speakers))
print("Average number of tokens per utterance: ", sum([len(utterance.split()) for utterance in utterances_lines])/len(utterances_lines))
print("Average number of utterances per scene: ", len(utterances)/len(scenes))


monica = []
chandler = []
ross = []
joey = []
phoebe = []
rachel = []
others = []

for utterance in set2:
    if utterance['speakers'] == ['Monica Geller']:
        monica.append(utterance)
    elif utterance['speakers'] == ['Chandler Bing']:
        chandler.append(utterance)
    elif utterance['speakers'] == ['Ross Geller']:
        ross.append(utterance)
    elif utterance['speakers'] == ['Joey Tribbiani']:
        joey.append(utterance)
    elif utterance['speakers'] == ['Phoebe Buffay']:
        phoebe.append(utterance)
    elif utterance['speakers'] == ['Rachel Green']:
        rachel.append(utterance)
    else:
        others.append(utterance)

#lets get the percentages of each speaker
print("Monica: ", len(monica)/len(set2))
print("Chandler: ", len(chandler)/len(set2))
print("Ross: ", len(ross)/len(set2))
print("Joey: ", len(joey)/len(set2))
print("Phoebe: ", len(phoebe)/len(set2))
print("Rachel: ", len(rachel)/len(set2))
print("Others: ", len(others)/len(set2))

Number of episodes:  236
Number of scenes:  3099
Number of utterances:  61308
Number of speakers:  701
Average number of tokens per utterance:  10.159881255301103
Average number of utterances per scene:  19.783155856727976
Monica:  0.13763083243666244
Chandler:  0.13800580390622452
Ross:  0.1493038573152043
Joey:  0.13331050898301217
Phoebe:  0.12233851772147772
Rachel:  0.1518634451726499
Others:  0.16754703446476898


In [21]:
with open("sets/test_set2.json", "w") as outfile:
    json.dump(test_set, outfile)


with open("sets/train_set2.json", "w") as outfile:
    json.dump(train_set, outfile)