In [1]:
#!/usr/bin/env python
# coding: utf-8

import os, sys, gzip, csv
from glob import glob

import numpy as np
import collections

import matplotlib.pyplot as plt
%matplotlib inline

# for PCA
from sklearn.decomposition import PCA

In [2]:
models = [
    ["1530-1539","../models/eebo-1530-1539-drift.csv.gz"],
    ["1540-1549","../models/eebo-1540-1549-drift.csv.gz"],
    ["1550-1559","../models/eebo-1550-1559-drift.csv.gz"],
    ["1560-1569","../models/eebo-1560-1569-drift.csv.gz"],
    ["1570-1579","../models/eebo-1570-1579-drift.csv.gz"],
    ["1580-1589","../models/eebo-1580-1589-drift.csv.gz"],
    ["1590-1599","../models/eebo-1590-1599-drift.csv.gz"],
    ["1610-1619","../models/eebo-1610-1619-drift.csv.gz"],
    ["1620-1629","../models/eebo-1620-1629-drift.csv.gz"],
    ["1630-1639","../models/eebo-1630-1639-drift.csv.gz"]
]
    
drift_data = list()
for model_name, file in models:
    print("starting {0}".format(model_name))
    input_data = dict()
    row_count = 0
    with gzip.open(file, 'rt') as csvfile:    
        reader = csv.reader(csvfile)
        for row in reader:
            val = float(row[1].replace('[[','').replace(']]',''))
        
            # convert to angular distance for values from 0 - 1
            val = np.arccos(val) / np.pi
            input_data[row[0]] = val
            row_count += 1

    drift_data.append([model_name,input_data])

starting 1530-1539
starting 1540-1549
starting 1550-1559
starting 1560-1569
starting 1570-1579
starting 1580-1589
starting 1590-1599
starting 1610-1619
starting 1620-1629
starting 1630-1639


In [6]:
unique_terms = set().union(*(m[1].keys() for m in drift_data))

In [42]:
# basic stats
words = list()
model_names = [x[0] for x in drift_data]
mc = len(model_names)
rows = list()

for word in unique_terms:
    row = list()
    for model, data in drift_data:
        if word in data.keys():
            row.append(data[word])
    
    if len(row) == mc:
        words.append(word)
        mean = np.mean(row)
        rows.append(mean)

In [43]:
# display words with greatest change over time
for idx in np.argsort(rows)[:-25 - 1:-1]:
    print(words[idx],rows[idx])

succession 0.4896920783751207
rule 0.48887738293293986
ſay 0.48569088269555805
co 0.48557435749826094
geue 0.48473655989181913
becauſe 0.48443314929087655
request 0.4841320475630937
greateſt 0.4839808837242905
masse 0.48371854667289166
re 0.483563005172282
vn 0.4833849534239854
etly 0.48338266224716425
ſure 0.48321444734894303
deede 0.4824988021130242
same 0.48247587477363824
con 0.48207149220754114
churche 0.48171020176391366
render 0.48142320156238283
thoſe 0.4813365895779375
shewe 0.4808324440634141
kinde 0.48077296426383764
sende 0.4805880486004999
full 0.4805399979217301
kind 0.48014674268463126
meane 0.48000992918326724


In [47]:
# display words with least change over time
for idx in np.argsort(rows)[:25]:
    print(words[idx],rows[idx])

inquam 0.3069783818364363
praemia 0.30827270510159815
etia 0.3097732299445368
proinde 0.3104699783121545
eandem 0.31095723805627734
debeat 0.3132597812067088
dignitatem 0.31404712980450444
sese 0.3151235342564968
isti 0.31518495831031595
quarum 0.3153209983773232
fuerint 0.3153936395889373
dicens 0.31680134933591625
hijs 0.31754216710719074
itaque 0.31764862757452417
vlla 0.31900233343592876
porro 0.31910284735118344
tantam 0.3199167421884201
quales 0.32003019065487
teris 0.3212556785060285
saltem 0.3224660055240959
sententiam 0.3225160263427803
videatur 0.3226870328672467
criminis 0.32297048971569076
pariter 0.3233658119076416
propterea 0.324748228357952


In [44]:
def get_mean(word):
    idx = words.index(word)
    return(rows[idx])

In [45]:
get_mean("rule")

0.48887738293293986

In [50]:
words_of_interest = ["religion","elect","election",
                     "church","churche","trinity","jesus","individual",
                     "predestination","chosen","soul","redemption","faith",
                     "fate","self","selfe","sacrement","authority"]

words = list()
model_names = [x[0] for x in drift_data]
rows = list()

for word in words_of_interest:
    words.append(word)
    row = list()
    for model, data in drift_data:
        if word in data.keys():
            row.append(data[word])
        else:
            row.append("N/A")
        
    rows.append(row)

In [51]:
import pandas as pd
change_chart = pd.DataFrame(rows, index=words, columns=model_names)
change_chart['Most Significant Change'] = np.argmax(change_chart.values, axis=1)
change_chart

Unnamed: 0,1530-1539,1540-1549,1550-1559,1560-1569,1570-1579,1580-1589,1590-1599,1610-1619,1620-1629,1630-1639,Most Significant Change
religion,0.493383,0.469319,0.48573,0.518318,0.448997,0.453087,0.419821,0.423166,0.445745,0.47223,3
elect,0.510397,0.465616,0.508061,0.501173,0.449421,0.448795,0.406264,0.435357,0.408305,0.403442,0
election,0.507809,0.462878,0.51712,0.510901,0.465611,0.476783,0.445162,0.486795,0.470399,0.434499,2
church,0.485912,0.424418,0.500723,0.469463,0.463372,0.454793,0.4155,0.44245,0.473776,0.492245,2
churche,0.54709,0.483025,0.491327,0.483575,0.488927,0.440412,0.399464,0.52649,0.460245,0.496547,0
trinity,0.392503,0.531171,0.459675,0.437215,0.435992,0.433391,0.423946,0.48722,0.456454,0.44552,1
jesus,0.419323,0.429077,0.485802,0.480998,0.477789,0.476234,0.475031,0.501463,0.443799,0.441286,7
individual,,,,,,,,,0.413821,,0
predestination,0.426131,0.452143,0.453523,0.483249,0.431998,0.4437,0.46168,0.462335,0.456266,0.396375,3
chosen,0.442028,0.428599,0.483498,0.458485,0.501414,0.424181,0.450696,0.441017,0.473599,0.468746,4
