In [2]:
#!/usr/bin/env python
# coding: utf-8

import os, sys, gzip, csv
from glob import glob

import numpy as np
import collections

import matplotlib.pyplot as plt
%matplotlib inline

# for PCA
from sklearn.decomposition import PCA

In [3]:
# [display name, path to gzipped CSV of per-word drift scores] for each model.
# NOTE(review): the monarch who preceded Mary I was Edward VI -- confirm
# whether the "Edward VII" label is intended before changing it, since the
# label is also used as a column name further down.
models = [
    ["Edward VII","../models/crown-edward-drift.csv.gz"],
    ["Mary I","../models/crown-mary-drift.csv.gz"],
    ["Elizabeth I","../models/crown-elizabeth-drift.csv.gz"],
    ["James I","../models/crown-james-drift.csv.gz"],
    ["Charles I","../models/crown-charles-drift.csv.gz"]
]

# drift_data is a list of [model_name, {word: angular distance}] pairs,
# in the same order as `models`.
drift_data = list()
for model_name, file in models:
    print("starting {0}".format(model_name))
    input_data = dict()
    # newline='' lets the csv module do its own line-ending handling
    with gzip.open(file, 'rt', newline='') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            # csv.reader yields an empty list for blank lines; skip them
            if not row:
                continue

            # column 0 is the word, column 1 the score, which may be wrapped
            # in "[[" ... "]]" (presumably a stringified 1x1 array)
            val = float(row[1].replace('[[','').replace(']]',''))

            # Rounding can leave a stored value a hair outside [-1, 1];
            # arccos would return NaN for it, so clamp to the valid domain.
            val = np.clip(val, -1.0, 1.0)

            # convert to angular distance for values from 0 - 1
            val = np.arccos(val) / np.pi
            input_data[row[0]] = val

    drift_data.append([model_name,input_data])

starting Edward VII
starting Mary I
starting Elizabeth I
starting James I
starting Charles I


In [4]:
# Vocabulary: every word that occurs in at least one model's drift table
unique_terms = {term for entry in drift_data for term in entry[1]}

In [5]:
# basic stats: mean angular distance of every word that occurs in all models
# `words[i]` is the word and `rows[i]` its mean across the models.
model_names = [x[0] for x in drift_data]
mc = len(model_names)
words = list()
rows = list()

# Walk the vocabulary in sorted order. Iterating the raw set depends on string
# hashing, so the order of `words`/`rows` (and the order of ties in a later
# argsort) could otherwise differ from one kernel session to the next.
for word in sorted(unique_terms):
    # one value per model that contains this word
    row = [data[word] for _, data in drift_data if word in data]

    # keep only words present in every model so the means are comparable
    if len(row) == mc:
        words.append(word)
        rows.append(np.mean(row))

In [6]:
# Show the 25 words with the highest mean value in `rows`, largest first
descending = np.argsort(rows)[::-1]
for idx in descending[:25]:
    print(words[idx],rows[idx])

effect 0.4603308884048373
lady 0.45965735406399794
provision 0.45778339857676953
appointed 0.45373979757211336
servants 0.453176974418701
say 0.45298552752532284
man 0.4524220975633577
learned 0.4505101252652066
natural 0.4487170294143509
my 0.4439260740445696
past 0.44372980973241705
lord 0.4428416184067071
sixth 0.4405171146667247
queen 0.4356032428052329
command 0.43256934911184297
men 0.43145201128944083
grace 0.43058898437777415
from 0.430585069007171
france 0.4303534008492906
they 0.4299738474585707
day 0.4280614805838753
fourth 0.42796784272072286
no 0.4274598534976863
godly 0.42703316322556767
done 0.42690169372827824


In [7]:
# Show the 25 words with the lowest mean value in `rows`, smallest first
ascending = np.argsort(rows)
for rank in range(min(25, len(ascending))):
    idx = ascending[rank]
    print(words[idx],rows[idx])

kinds 0.3316840429907139
virtue 0.3419566275759947
form 0.34394860564706703
several 0.34660094981487
ministers 0.3469494659840867
lying 0.3473048261467625
points 0.34959983955992857
parish 0.3541865849328035
contrary 0.3547441300275751
bishops 0.3551629598301379
general 0.3559846232260406
without 0.3564680111503479
example 0.3575358502922891
public 0.3591235443048382
use 0.36031594018537294
whole 0.3603196101504563
preaching 0.3605342074130545
having 0.3606081961394543
duty 0.3607045765550148
act 0.36088702059769584
used 0.3618989134620407
like 0.36281302550528566
make 0.3630760848702141
account 0.36313373404557886
commission 0.36340526600741974


In [8]:
def get_mean(word):
    """Return the value stored in `rows` for `word`.

    Finds the position of `word` in the module-level `words` list and returns
    the entry at the same position in `rows`. A word that is not in `words`
    raises ValueError (from `list.index`).
    """
    position = words.index(word)
    return rows[position]

In [9]:
# number of words kept in `words` (one entry per value in `rows`)
len(words)

284

In [10]:
# spot-check the lookup helper on a single word
get_mean("kinds")

0.3316840429907139

In [14]:
# Hand-picked vocabulary to compare across models (includes period spellings)
words_of_interest = ["religion","elect","election",
                     "church","churche","trinity","jesus","individual",
                     "predestination","chosen","soul","redemption","faith",
                     "fate","self","selfe","sacrement","authority"]

model_names = [entry[0] for entry in drift_data]

# NOTE: `words` and `rows` are rebound here. After this cell `rows` holds one
# list per word (one value per model) rather than one mean per word, so
# get_mean() and the earlier ranking cells no longer see the data they were
# written for unless the cells above are re-run.
words = list(words_of_interest)

# One row per word, one column per model; a word missing from a model gets 0
rows = [
    [data.get(word, 0) for _, data in drift_data]
    for word in words_of_interest
]

In [16]:
import pandas as pd

# Table of per-model values: one row per word, one column per model
change_chart = pd.DataFrame(data=rows, index=words, columns=model_names)

# Column position (0-based, in `model_names` order) of each row's largest
# value; computed before the extra column is attached. Rows with no single
# largest value get the first such position (0 when every value is 0).
peak_positions = change_chart.values.argmax(axis=1)
change_chart['Most Significant Change'] = peak_positions
change_chart

Unnamed: 0,Edward VII,Mary I,Elizabeth I,James I,Charles I,Most Significant Change
religion,0.362904,0.277589,0.397345,0.450557,0.420019,3
elect,0.341152,0.27824,0.0,0.489292,0.374883,3
election,0.556768,0.450281,0.0,0.392039,0.366398,0
church,0.303293,0.356347,0.424829,0.483819,0.438675,3
churche,0.0,0.294091,0.0,0.398928,0.0,3
trinity,0.0,0.3586,0.0,0.396893,0.0,3
jesus,0.371013,0.284087,0.0,0.468375,0.515217,4
individual,0.0,0.0,0.0,0.0,0.0,0
predestination,0.0,0.0,0.0,0.0,0.0,0
chosen,0.291171,0.633001,0.0,0.438243,0.501372,1
