In [1]:
import gensim
from collections import defaultdict
from scipy import spatial
from gensim.models import KeyedVectors
import random
import pandas as pd
from collections import Counter
import glob
import pickle

In [2]:
def train_word2vec(documents, dim=300, min_count=2, iters=100, window=5, negative=5):
    """Train a skip-gram word2vec model with negative sampling.

    Parameters
    ----------
    documents : iterable of list of str
        Tokenised sentences; each inner list is one training example.
    dim : int
        Size of the word vectors (forwarded to gensim as ``size``).
    min_count : int
        Forwarded as ``min_count`` (gensim's minimum word-frequency cutoff).
    iters : int
        Number of passes over ``documents`` (forwarded as ``iter``).
    window : int
        Forwarded as ``window`` (context window size).
    negative : int
        Forwarded as ``negative`` (number of negative samples).

    Returns
    -------
    gensim.models.Word2Vec
        The trained model.
    """
    # Handing the corpus to the constructor makes gensim build the vocabulary
    # and run all `iters` epochs right away. A further explicit `model.train()`
    # call would train a second time (2 * iters passes, with the learning-rate
    # schedule restarted), so the model is returned as-is.
    return gensim.models.Word2Vec(
        documents,
        sg=1,              # skip-gram
        size=dim,
        window=window,
        min_count=min_count,
        sample=1e-5,       # sub-sampling threshold
        iter=iters,
        ns_exponent=0.75,
        negative=negative,
        workers=4,
    )


In [3]:
def read_docs(csv_file, column='stem'):
    """Read tokenised utterances from one CHILDES csv file.

    Parameters
    ----------
    csv_file : str
        Path to a csv file with a ``part_of_speech`` column and a ``column``
        column, each cell holding whitespace-separated items for one utterance.
    column : str
        Name of the column to take the tokens from (default ``'stem'``).

    Returns
    -------
    list of list of str
        One lower-cased token list per row. Tokens whose aligned tag is
        ``n:prop`` are replaced by the placeholder ``'$name$'``.
    """
    df = pd.read_csv(csv_file)
    tag_cells = df['part_of_speech'].values
    token_cells = df[column].values  # honour `column` instead of hard-coding 'stem'

    ret_list = []
    for tag_cell, token_cell in zip(tag_cells, token_cells):
        # str() keeps non-string cells from breaking .lower(); note that a
        # missing cell (NaN) is stringified as well and yields the token 'nan'.
        tags = str(tag_cell).lower().split()
        tokens = str(token_cell).lower().split()

        # Replace proper nouns with $name$. (Replacing interjections, tag 'co',
        # with $co$ is intentionally not done.) zip() stops at the shorter of
        # the two lists, so unaligned trailing items are dropped.
        ret_list.append([
            '$name$' if tag == "n:prop" else token
            for tag, token in zip(tags, tokens)
        ])
    return ret_list

In [4]:
# Count the tokens in every CHILDES csv, visiting the files in ascending order
# of the integer that follows the last underscore in the file name.
childes_files = sorted(glob.glob("./data/childes-en/*.csv"))
num_tokens = []
for filename in sorted(childes_files, key=lambda path: int(path.rsplit('_', 1)[-1][:-4])):
    month = int(filename.rsplit('_', 1)[-1][:-4])  # numeric suffix, '.csv' stripped
    lines = read_docs(filename)
    tokens_in_file = 0
    for utterance in lines:
        tokens_in_file += len(utterance)
    num_tokens.append(tokens_in_file)
print(num_tokens)

  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):


[0, 0, 0, 675, 0, 2996, 15663, 17582, 25704, 43371, 2235, 18595, 73200, 93436, 109389, 62921, 80428, 93764, 194213, 153429, 161326, 189616, 222385, 281451, 574601, 499293, 515099, 532281, 505978, 548514, 670957, 565947, 469748, 501356, 464451, 405575, 776889, 313177, 205337, 230318, 154513, 155255, 259921, 169736, 139363, 188878, 154077, 146393, 195701, 127483, 113799, 109585, 124600, 97402, 198899, 129798, 88846, 660399, 76209, 94505, 65465, 46699, 40163, 34145, 37276, 33480, 84709, 67468, 34144, 40616, 27063, 28027, 15985, 17189, 14914, 14057, 9497, 9951, 9696, 16858, 12940, 16609, 6876, 30265, 26949, 38818, 20143, 18162, 19409, 16892, 22755, 29847, 10955, 6206, 9887, 13402, 1879, 2702, 3110, 5285, 2891, 3035, 4418, 2771, 4252, 340, 6896, 6225, 2122, 7346, 4187, 5699, 11342, 6588, 10035, 8916, 12356, 11844, 1925, 4827, 14867, 6036, 2715, 5008, 154, 1968, 2625, 39, 28, 27, 493, 74, 2514, 9475, 19, 6, 23, 151, 3801, 24, 2093, 681, 2, 11, 667, 0, 5070, 0, 0, 0, 0]


In [6]:
# Token counts copied from `1-train_word_embeddings-English`, so that each
# period of the shuffled corpus has a similar size to the corresponding period
# of the original (time-series) corpus.
controlled_windows = [
    2_416_980,
    2_052_651,
    2_255_166,
    2_148_271,
    2_116_968,
    2_017_226,
]

In [8]:
import numpy as np  # not used in this cell; kept in case later cells rely on `np`
childes_files = sorted(glob.glob("./data/childes-en/*.csv"), key=lambda x: int(x.split('_')[-1][:-4]))

# Reading the csv files does not depend on the shuffle index, so do it once
# instead of once per shuffle.
all_docs = []
for filename in childes_files:
    all_docs.extend(read_docs(filename))
if not all_docs:
    raise RuntimeError("No utterances loaded from ./data/childes-en/*.csv")

for shuffle_idx in range(1, 6):

    # period name -> {word: vector} for this shuffle
    year2vecs = {}

    # Fresh copy in file order, so every shuffle starts from the same state
    # (random.shuffle works in place).
    docs = list(all_docs)

    print(f"============SHUFFLE-{shuffle_idx}===================")
    print("Before shuffling:")
    print(docs[0])
    random.shuffle(docs)
    print("After shuffling")
    print(docs[0])

    # Index of the first document that has not been assigned to a period yet.
    start = 0
    for i, window_size in enumerate(controlled_windows):
        num_tokens = 0
        docs_per_period = []

        # Take consecutive documents until the period exceeds its token budget.
        for pos in range(start, len(docs)):
            doc = docs[pos]
            docs_per_period.append(doc)
            num_tokens += len(doc)
            if num_tokens > window_size:
                print(f"Training {i}th period w/ {num_tokens} tokens:")
                model = train_word2vec(docs_per_period, dim=100, min_count=15, iters=50, window=5)
                year2vecs['period'+str(i)] = dict(zip(model.wv.index2word, model.wv.vectors))
                print("Finished training.")

                # The next period begins AFTER the document just consumed;
                # advancing by only the offset would reuse docs[pos] as the
                # first document of the following period.
                start = pos + 1
                break
        else:
            # Documents ran out before the budget was reached: no model is
            # trained for this period, and later periods would be empty too.
            print(f"Ran out of documents before filling period {i} "
                  f"({num_tokens} tokens collected); skipping remaining periods.")
            break

    with open(f'./data/embeddings-over-time/embeddings-English-1M-ep50-f15-shuffle-{shuffle_idx}.pickle', 'wb') as handle:
        pickle.dump(year2vecs, handle, protocol=pickle.HIGHEST_PROTOCOL)

Before shuffling:
['just', 'like', 'your', 'book', 'at', 'home']
After shuffling
['will', 'like', 'to', 'look', 'at', 'these', 'again', 'for', 'a', 'few', 'minute']
Training 0th period w/ 2416984 tokens:
Finished training.
Training 1th period w/ 2052653 tokens:
Finished training.
Training 2th period w/ 2255172 tokens:
Finished training.
Training 3th period w/ 2148272 tokens:
Finished training.
Training 4th period w/ 2116970 tokens:
Finished training.
Training 5th period w/ 2017227 tokens:
Finished training.
Before shuffling:
['just', 'like', 'your', 'book', 'at', 'home']
After shuffling
['look_it', 'happy']
Training 0th period w/ 2416982 tokens:
Finished training.
Training 1th period w/ 2052653 tokens:
Finished training.
Training 2th period w/ 2255167 tokens:
Finished training.
Training 3th period w/ 2148278 tokens:
Finished training.
Training 4th period w/ 2116972 tokens:
Finished training.
Training 5th period w/ 2017227 tokens:
Finished training.
Before shuffling:
['just', 'like', 'y