In [1]:
import gensim
from collections import defaultdict
from scipy import spatial
from gensim.models import KeyedVectors
from igraph import *
import random
import pandas as pd
from collections import Counter
import glob
import pickle

In [2]:
def train_word2vec(documents, dim=300, min_count=2, iters=100, window=5, negative=5):
    """Train a skip-gram word2vec model with negative sampling.

    Parameters
    ----------
    documents : iterable of list of str
        Tokenised utterances handed to gensim as the training corpus.
    dim : int
        Embedding dimensionality (gensim ``size``).
    min_count : int
        Words occurring fewer times than this are dropped from the vocabulary.
    iters : int
        Number of training epochs (gensim ``iter``).
    window : int
        Context window size.
    negative : int
        Number of negative samples per positive example.

    Returns
    -------
    gensim.models.Word2Vec
        The trained model.
    """
    # Passing `documents` to the constructor already builds the vocabulary and
    # runs `iters` epochs of training. The previous extra `model.train(...)`
    # call therefore trained the model a second time (doubling the effective
    # number of epochs), so it has been removed.
    model = gensim.models.Word2Vec(
        documents,
        sg=1,  # skip-gram
        size=dim,
        window=window,
        min_count=min_count,
        # max_final_vocab=3000,
        sample=1e-5,
        iter=iters,
        ns_exponent=0.75,
        negative=negative,
        workers=4)
    return model


In [3]:
def read_docs(csv_file, column='stem'):
    """Read tokenised utterances from a CHILDES csv file.

    For every row, the ``part_of_speech`` cell and the ``column`` cell are
    lower-cased and split on whitespace, then paired position by position.
    Tokens whose tag is ``n:prop`` are replaced by the placeholder ``$name$``;
    all other tokens are kept as they are. (Interjection replacement is not
    performed.)

    Parameters
    ----------
    csv_file : str
        Path to a csv file containing a ``part_of_speech`` column and the
        column named by ``column``.
    column : str
        Name of the column holding the utterance text. Defaults to ``'stem'``.

    Returns
    -------
    list of list of str
        One token list per csv row.
    """
    df = pd.read_csv(csv_file)
    tag_cells = df['part_of_speech'].values
    # Honour the `column` argument; it used to be ignored in favour of a
    # hard-coded 'stem'.
    token_cells = df[column].values

    utterances = []
    for tag_cell, token_cell in zip(tag_cells, token_cells):
        # str() keeps the existing handling of non-string cells
        # (a missing value becomes the literal token 'nan').
        tags = str(tag_cell).lower().split()
        tokens = str(token_cell).lower().split()
        # zip() stops at the shorter sequence, so surplus tags/tokens are dropped.
        utterances.append(
            ['$name$' if tag == 'n:prop' else token
             for tag, token in zip(tags, tokens)])
    return utterances

In [4]:
# Count the tokens in each monthly CHILDES file, visiting the files in numeric
# month order. File names look like `month_<start>_<end>.csv`; the sort key is
# the trailing integer before the `.csv` extension, with the path itself as a
# tie-breaker. Sorting once here keeps `childes_files` in the same order as
# `num_tokens` (a plain lexicographic sort would put month 10 before month 2).
childes_files = sorted(
    glob.glob("./data/childes-en/*.csv"),
    key=lambda x: (int(x.split('_')[-1][:-4]), x))
num_tokens = []
for filename in childes_files:
    lines = read_docs(filename)
    num_tokens.append(sum(len(l) for l in lines))
print(num_tokens)

  if (yield from self.run_code(code, result)):
  if (yield from self.run_code(code, result)):


[0, 0, 0, 675, 0, 2996, 15663, 17582, 25704, 43371, 2235, 18595, 73200, 93436, 109389, 62921, 80428, 93764, 194213, 153429, 161326, 189616, 222385, 281451, 574601, 499293, 515099, 532281, 505978, 548514, 670957, 565947, 469748, 501356, 464451, 405575, 776889, 313177, 205337, 230318, 154513, 155255, 259921, 169736, 139363, 188878, 154077, 146393, 195701, 127483, 113799, 109585, 124600, 97402, 198899, 129798, 88846, 660399, 76209, 94505, 65465, 46699, 40163, 34145, 37276, 33480, 84709, 67468, 34144, 40616, 27063, 28027, 15985, 17189, 14914, 14057, 9497, 9951, 9696, 16858, 12940, 16609, 6876, 30265, 26949, 38818, 20143, 18162, 19409, 16892, 22755, 29847, 10955, 6206, 9887, 13402, 1879, 2702, 3110, 5285, 2891, 3035, 4418, 2771, 4252, 340, 6896, 6225, 2122, 7346, 4187, 5699, 11342, 6588, 10035, 8916, 12356, 11844, 1925, 4827, 14867, 6036, 2715, 5008, 154, 1968, 2625, 39, 28, 27, 493, 74, 2514, 9475, 19, 6, 23, 151, 3801, 24, 2093, 681, 2, 11, 667, 0, 5070, 0, 0, 0, 0]


In [9]:
# Greedily group consecutive months into periods: keep adding months until the
# accumulated token count exceeds 2,000,000, then close the period and start a
# new one. Months left over after the last closed period are not added to
# `periods`.
periods = []
window, months = [], []

for month_idx, count in enumerate(num_tokens):
    window += [count]
    months += [month_idx]
    total = sum(window)
    if total <= 2000000:
        continue
    # Threshold crossed: record the period and report its size, its months
    # and the mean month index.
    periods.append(months)
    print(total, months, sum(months)/len(months))
    window, months = [], []

2416980 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24] 12.0
2052651 [25, 26, 27, 28] 26.5
2255166 [29, 30, 31, 32] 30.5
2148271 [33, 34, 35, 36] 34.5
2116968 [37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47] 42.0
2017226 [48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59] 53.5


In [11]:
import numpy as np

# Train one word2vec model per period and keep its word -> vector mapping.
# `periods` holds lists of month indices, used here to pick the matching
# entries out of the numerically sorted file list.
childes_files = sorted(glob.glob("./data/childes-en/*.csv"), key=lambda path: int(path.split('_')[-1][:-4]))
file_array = np.array(childes_files)
year2vecs = {}
for i, period in enumerate(periods):
    period_key = 'period' + str(i)
    period_files = file_array[period]
    print(period_files)
    print('key', period_key)
    # Concatenate the utterances of every file that belongs to this period.
    docs = []
    for filename in period_files:
        docs += read_docs(filename)
    model = train_word2vec(docs, dim=100, min_count=15, iters=25, window=5)
    year2vecs[period_key] = dict(zip(model.wv.index2word, model.wv.vectors))
print(num_tokens)

['./data/childes-en/month_0_1.csv' './data/childes-en/month_1_2.csv'
 './data/childes-en/month_2_3.csv' './data/childes-en/month_3_4.csv'
 './data/childes-en/month_4_5.csv' './data/childes-en/month_5_6.csv'
 './data/childes-en/month_6_7.csv' './data/childes-en/month_7_8.csv'
 './data/childes-en/month_8_9.csv' './data/childes-en/month_9_10.csv'
 './data/childes-en/month_10_11.csv' './data/childes-en/month_11_12.csv'
 './data/childes-en/month_12_13.csv' './data/childes-en/month_13_14.csv'
 './data/childes-en/month_14_15.csv' './data/childes-en/month_15_16.csv'
 './data/childes-en/month_16_17.csv' './data/childes-en/month_17_18.csv'
 './data/childes-en/month_18_19.csv' './data/childes-en/month_19_20.csv'
 './data/childes-en/month_20_21.csv' './data/childes-en/month_21_22.csv'
 './data/childes-en/month_22_23.csv' './data/childes-en/month_23_24.csv'
 './data/childes-en/month_24_25.csv']
key period0
['./data/childes-en/month_25_26.csv' './data/childes-en/month_26_27.csv'
 './data/childes-en/

In [10]:
# Persist the per-period embeddings (period key -> {word: vector}) to disk.
# NOTE(review): the file name says "1M" while the period threshold used when
# building `periods` is 2,000,000 tokens -- confirm the name is intended.
embeddings_path = './data/embeddings-over-time/embeddings-English-1M-ep25-f15.pickle'
with open(embeddings_path, mode='wb') as out_file:
    pickle.dump(year2vecs, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# with open('filename.pickle', 'rb') as handle:
#     b = pickle.load(handle)