# Load and Create Dataset

In [1]:
import nltk
import os, sys
import numpy as np
import pandas as pd
from pathlib import Path
from nltk import word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict, Counter
from w266_common import utils, vocabulary, tf_embed_viz, patched_numpy_io

In [2]:
tv_show = "friends"

In [3]:
nltk.download('punkt')
nltk.download('stopwords')

root_path = Path().resolve().joinpath("..")
show_data_path = root_path.joinpath("scrape", "data", tv_show, "parsed")
embeddings_path = root_path.joinpath("embeddings", "newscrawl.300d.W.pos.vectors.gz")
embeddings_url = "https://www.dropbox.com/s/kguufyc2xcdi8yk/lexvec.enwiki%2Bnewscrawl.300d.W.pos.vectors.gz?dl=1"

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
print(embeddings_path)

if not os.path.isfile(embeddings_path):
    print("downloading embeddings...")
    urllib.request.urlretrieve(embeddings_url, embeddings_path)

# !gunzip {str(embeddings_path)} 

/home/jovyan/work/analysis/../embeddings/newscrawl.300d.W.pos.vectors.gz


In [5]:
dialog_datas = []

for filename in os.listdir(show_data_path):
    dialog_data = pd.read_csv(show_data_path.joinpath(filename), header=None, names=("speaker", "utterance"))
    dialog_data["episode"] = filename.split(".")[0]
    dialog_datas.append(dialog_data)
    
all_dialog_data = pd.concat(dialog_datas).dropna().reset_index(drop=True)

In [6]:
print(all_dialog_data.count())
all_dialog_data.head()

speaker      55025
utterance    55025
episode      55025
dtype: int64


Unnamed: 0,speaker,utterance,episode
0,Phoebe,"Oh, hey Joey.",1012
1,Joey,"Uh, hey.",1012
2,Phoebe,"Listen, I need to ask you something. Ok, you k...",1012
3,Joey,Yeah.,1012
4,Phoebe,"Yeah. Well, uhm... listen he was supposed to g...",1012


In [7]:
character_counts = Counter()
for character in all_dialog_data.speaker:
    character_counts[character] += 1
    
top_characters = character_counts.most_common(6)

char_id_to_word = dict(enumerate([w for w, c in top_characters]))
char_word_to_id = {v:k for k,v in char_id_to_word.items()}

major_dialog_data = all_dialog_data[all_dialog_data.speaker.isin(char_word_to_id.keys())]

utterance_tokenized = [word_tokenize(sentence) for sentence in major_dialog_data.utterance]
vocab = vocabulary.Vocabulary(utils.canonicalize_word(w) for w in utils.flatten(utterance_tokenized))

In [8]:
max_len = 40

utterances_index = [vocab.words_to_ids(words) for words in utterance_tokenized]
speaker_index = np.array([char_word_to_id[speaker] for speaker in major_dialog_data.speaker])
utterances_index_nparray = np.zeros((len(utterances_index), max_len), dtype=np.int32)
utterances_length = np.zeros([len(utterances_index)], dtype=np.int32)

for i, row in enumerate(utterances_index):
    cpy_len = min(len(row), max_len)
    utterances_index_nparray[i,:cpy_len] = row[:cpy_len]
    utterances_length[i] = cpy_len

In [12]:
human_check_df = pd.DataFrame()
human_check_df['utterance'] = major_dialog_data.utterance
human_check_df['utterance_tokenized'] = utterance_tokenized
human_check_df['speaker'] = major_dialog_data.speaker

# Human Test

In [15]:
pd.options.display.max_colwidth = 1000

human_check_df_sample = human_check_df.sample(10)

human_check_df_sample[['utterance', 'utterance_tokenized']]

Unnamed: 0,utterance,utterance_tokenized
19872,I still cannot believe you’re engaged! Just ‘cause its happening so fast; not ‘cause you’re such a loser.,"[I, still, can, not, believe, you, ’, re, engaged, !, Just, ‘, cause, its, happening, so, fast, ;, not, ‘, cause, you, ’, re, such, a, loser, .]"
52827,But you-you-you came to see Lilly?,"[But, you-you-you, came, to, see, Lilly, ?]"
44527,"Come on, show me.","[Come, on, ,, show, me, .]"
23812,You pushed him!,"[You, pushed, him, !]"
29400,Could there be more Kims?,"[Could, there, be, more, Kims, ?]"
16405,"Although, don’t feel like you can’t visit.","[Although, ,, don, ’, t, feel, like, you, can, ’, t, visit, .]"
42587,Oh.,"[Oh, .]"
16720,I know.,"[I, know, .]"
34501,I’ll take a card.,"[I, ’, ll, take, a, card, .]"
50504,That is.,"[That, is, .]"


In [16]:
human_check_df_sample[['utterance', 'utterance_tokenized', 'speaker']]

Unnamed: 0,utterance,utterance_tokenized,speaker
19872,I still cannot believe you’re engaged! Just ‘cause its happening so fast; not ‘cause you’re such a loser.,"[I, still, can, not, believe, you, ’, re, engaged, !, Just, ‘, cause, its, happening, so, fast, ;, not, ‘, cause, you, ’, re, such, a, loser, .]",Phoebe
52827,But you-you-you came to see Lilly?,"[But, you-you-you, came, to, see, Lilly, ?]",Phoebe
44527,"Come on, show me.","[Come, on, ,, show, me, .]",Rachel
23812,You pushed him!,"[You, pushed, him, !]",Joey
29400,Could there be more Kims?,"[Could, there, be, more, Kims, ?]",Chandler
16405,"Although, don’t feel like you can’t visit.","[Although, ,, don, ’, t, feel, like, you, can, ’, t, visit, .]",Phoebe
42587,Oh.,"[Oh, .]",Rachel
16720,I know.,"[I, know, .]",Phoebe
34501,I’ll take a card.,"[I, ’, ll, take, a, card, .]",Chandler
50504,That is.,"[That, is, .]",Phoebe
