In [1]:
#!/usr/bin/env python
# coding: utf-8

import os, sys
from glob import glob
import gzip
import gc
import csv
import networkx as nx
import re
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Ordered (pattern, modern spelling) rules. They are applied one after another
# to the whole text, so ORDER MATTERS: e.g. "acte" -> "act" turns "enacted"
# into "enactd", which the later "enactd" rule repairs.
# Every pattern is a plain literal matched as a SUBSTRING (not a whole word),
# except the two "...ness?" rules: the optional trailing "s" keeps text that is
# already spelled "witness"/"highness" from growing a third "s".
# "kyndes" and "quenes" need no rules of their own: "kynde" -> "kind" and
# "quene" -> "queen" already produce "kinds" and "queens".
_SPELLING_RULES = tuple(
    (re.compile(pattern), modern)
    for pattern, modern in (
        ("gyftis", "gifts"),
        ("gether", "gather"),
        ("spirituall", "spiritual"),
        ("feythfull", "faith"),
        ("wytness?", "witness"),
        ("almes", "alms"),
        ("desyre", "desire"),
        ("selfe", "self"),
        ("saffely", "safely"),
        ("realme", "realm"),
        ("acte", "act"),
        ("fourme", "form"),
        ("subiectes", "subjects"),
        ("theyr", "their"),
        ("kynde", "kind"),
        ("kynge", "king"),
        ("vpon", "upon"),  # was mapped to "unto", which is a different word
        ("purueyours", "purveyors"),
        ("highness?", "highness"),
        ("euery", "every"),
        ("quene", "queen"),
        ("whiche", "which"),
        ("bloude", "blood"),
        ("soueraine", "sovereign"),
        ("enactd", "enacted"),
        ("vs", "us"),
    )
)

# Raw string: '\d' in a normal string literal is an invalid escape sequence.
_DIGITS = re.compile(r"\d")


def spellProcessor(word):
    """Lower-case a text and modernise a fixed set of archaic spellings.

    Passed to ``CountVectorizer`` as its ``preprocessor``, so ``word`` may be
    a single token or a much longer string.

    Parameters
    ----------
    word : str
        Text to normalise.

    Returns
    -------
    str
        The lower-cased text with every rule in ``_SPELLING_RULES`` applied in
        order and all digit characters removed.

    Notes
    -----
    Rules match substrings, so they also rewrite the inside of longer words
    (for example "together" becomes "togather").
    """
    word = word.lower()
    for pattern, modern in _SPELLING_RULES:
        word = pattern.sub(modern, word)
    return _DIGITS.sub("", word)

# Tokens handed to CountVectorizer as ``stop_words``: punctuation marks,
# common function words and a handful of archaic spellings of them.
# Order and duplicates (the repeated "/") are kept as originally written.
stopWords = [
    ",", "the", "and", "of", "or", "to", "in", "shall", "be", "that", "any", "by", ".",
    "such", "as", "this", "for", "same", "all", "said", "other", "'s", ";",
    "her", "is", "every", "[", "]", "they", "within", "our", "not", "so",
    "made", "no", "then", ":", "do", "from", "if", "it", "which", "at", "with",
    "thereof", "upon", "a", "because", "used", "some", "but", "aforesaid", "also",
    ")", "(", "what", "&", "may", "are", "their", "them", "sayde", "suche", "shalbe", "anye", "sayd",
    "thesaid", "/", "...", "/", "either", "haue", "vnto", "thy", "did", "was", "were", "have", "thee",
    "your", "thou", "unto", "hath", "had", "went", "kl",
]

In [None]:
def extract_five_grams(term, fitted_vectorizer=None, doc_term_matrix=None):
    """Collect the context words of every five-gram centred on ``term``.

    Parameters
    ----------
    term : str
        Word that must be the third (middle) token of a five-gram.
    fitted_vectorizer : object, optional
        Object exposing ``vocabulary_`` (n-gram string -> column index).
        Defaults to the notebook-level ``vectorizer``.
    doc_term_matrix : object, optional
        Count matrix whose ``sum(axis=0)`` result can be indexed as
        ``[0, column]``. Defaults to the notebook-level ``dtm``.

    Returns
    -------
    list of str
        For each matching n-gram, its 1st, 2nd, 4th and 5th tokens, repeated
        once per occurrence of that n-gram.

    Raises
    ------
    NameError
        If an argument is omitted and the corresponding notebook-level
        variable does not exist (the vectorizing cell deletes both after use).
    """
    if fitted_vectorizer is None:
        fitted_vectorizer = vectorizer
    if doc_term_matrix is None:
        doc_term_matrix = dtm

    # Column totals do not depend on the n-gram, so compute them once rather
    # than once per match.
    vocab_sums = doc_term_matrix.sum(axis=0)

    words = list()
    for ngram, idx in fitted_vectorizer.vocabulary_.items():
        tokens = ngram.split()
        # Skip anything too short to have a centre word and four neighbours.
        if len(tokens) < 5 or tokens[2] != term:
            continue
        count = int(vocab_sums[0, idx])
        words.extend([tokens[0], tokens[1], tokens[3], tokens[4]] * count)
    return words

In [3]:
# Find the gzipped per-year archives, then label each one with the third
# "-"-separated piece of its base name (the text before the first ".").
files = glob("../texts/eebo/eebo-year*gz")
labels = [os.path.basename(path).split(".")[0].split("-")[2] for path in files]

In [4]:
# open every archive up front: one gzip file object per path, in the same
# order as `files` (gzip.open is called with its default mode)
fp_list = list(map(gzip.open, files))

In [5]:
# Filled in later by the extraction cell: year label -> list of context words.
five_gram_data = dict()

# Vectorize one year at a time and write its five-gram counts to
# ../texts/eebo/eebo-5gram-<label>.csv.gz as rows of (five-gram, total count).
for label, fp in zip(labels, fp_list):
    # re-establish the vectorizer so each year gets its own vocabulary
    vectorizer = CountVectorizer(
        # 'files' is not a valid option ({'filename', 'file', 'content'}).
        # 'content' iterates the handle line by line, one document per line.
        # NOTE(review): older releases presumably treated the unknown value
        # the same way - confirm this matches the intended document unit.
        input='content',
        lowercase=True,
        ngram_range=(5,5),
        preprocessor=spellProcessor,
        stop_words=stopWords)

    print("starting: {0}".format(label))

    # vectorize model
    print(" vectorizing...")
    # Close each input handle once it has been consumed. Re-running this cell
    # therefore requires rebuilding fp_list first.
    with fp:
        dtm = vectorizer.fit_transform(fp)

    print(" saving five-grams...")
    sums = dtm.sum(axis=0)
    out_path = '../texts/eebo/eebo-5gram-' + label + '.csv.gz'
    # The context manager guarantees the gzip stream is flushed and closed;
    # newline='' is what the csv module expects of the file it writes to.
    with gzip.open(out_path, 'wt', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        # Distinct loop names: the loop no longer rebinds an outer index.
        for ngram, idx in vectorizer.vocabulary_.items():
            writer.writerow([ngram, sums[0, idx]])

    # free memory
    del vectorizer, dtm, sums
    gc.collect()

starting: 1632
 vectorizing...
 saving five-grams...
starting: 1585
 vectorizing...
 saving five-grams...
starting: 1589
 vectorizing...
 saving five-grams...
starting: 1597
 vectorizing...
 saving five-grams...
starting: 1620
 vectorizing...
 saving five-grams...
starting: 1608
 vectorizing...
 saving five-grams...
starting: 1616
 vectorizing...
 saving five-grams...
starting: 1604
 vectorizing...
 saving five-grams...
starting: 1564
 vectorizing...
 saving five-grams...
starting: 1576
 vectorizing...
 saving five-grams...
starting: 1568
 vectorizing...
 saving five-grams...
starting: 1540
 vectorizing...
 saving five-grams...
starting: 1523
 vectorizing...
 saving five-grams...
starting: 1552
 vectorizing...
 saving five-grams...
starting: 1531
 vectorizing...
 saving five-grams...
starting: 1574
 vectorizing...
 saving five-grams...
starting: 1578
 vectorizing...
 saving five-grams...
starting: 1566
 vectorizing...
 saving five-grams...
starting: 1533
 vectorizing...
 saving five-gr

In [None]:
#
# DEFINE term of interest
#
# we'll search for five-grams with this word as the center word:
# [term1,term2,key_term,term4,term5]

key_term = "elect"

# The vectorizer and document-term matrix are deleted right after each year is
# processed, so the context words are rebuilt from the five-gram counts saved
# to ../texts/eebo/eebo-5gram-<label>.csv.gz (rows of: five-gram, count).
for label in labels:
    print(" extracting five-gram terms...")
    terms = list()
    counts_path = '../texts/eebo/eebo-5gram-' + label + '.csv.gz'
    with gzip.open(counts_path, 'rt', newline='') as csvfile:
        for ngram, count in csv.reader(csvfile, delimiter=','):
            tokens = ngram.split()
            # keep only five-grams whose middle word is the key term
            if len(tokens) < 5 or tokens[2] != key_term:
                continue
            # the four neighbours, repeated once per occurrence
            terms.extend([tokens[0], tokens[1], tokens[3], tokens[4]] * int(count))
    five_gram_data[label] = terms

In [None]:
# now collect frequencies
# year label -> list of [word, count] pairs, one pair per distinct context word
year_frequencies = dict()
for year_label, context_words in five_gram_data.items():
    # Single pass per year instead of one list.count() scan per distinct word.
    # A dict keeps first-seen order, so the pair order (and any tie-breaking
    # done by a later stable sort) is the same on every run.
    tally = dict()
    for word in context_words:
        tally[word] = tally.get(word, 0) + 1
    year_frequencies[year_label] = [[word, n] for word, n in tally.items()]

In [None]:
def graph_year(key_term,year):
    """Plot ``key_term`` linked to its 15 most frequent context words for ``year``.

    Reads the notebook-level ``year_frequencies[year]`` (a list of
    ``[word, count]`` pairs), builds a star graph around ``key_term`` and
    draws it with matplotlib. Edge width is proportional to the word's share
    of the plotted counts. A ``year`` that is not a key of
    ``year_frequencies`` raises ``KeyError``.
    """
    # the fifteen highest counts, largest first
    top_terms = sorted(year_frequencies[year], key=lambda pair: pair[1], reverse=True)[:15]

    graph = nx.Graph()
    # ensure that the key term is present even when it has no neighbours
    graph.add_node(key_term)
    node_labels = {key_term: key_term}
    neighbours = []
    for term, freq in top_terms:
        graph.add_node(term)
        neighbours.append(term)
        graph.add_edge(key_term, term, weight=freq)
        node_labels[term] = term

    # pin the key term at a fixed position and lay the other nodes out around it
    anchor = {key_term: (0,-20)}

    plt.figure(figsize=(15,12))
    pos = nx.spring_layout(graph, scale=2, k=10, pos=anchor, fixed=anchor)
    nx.draw_networkx_nodes(graph, pos, node_color='lightblue', node_size=1500)
    nx.draw_networkx_labels(graph, pos, node_labels, font_size=10)

    edge_weights = [attrs['weight'] for _, _, attrs in graph.edges(data=True)]

    # draw the edges one weight class at a time so each class gets its own width
    for weight in list(set(edge_weights)):
        same_weight = [
            (u, v)
            for u, v, attrs in graph.edges(data=True)
            if attrs['weight'] == weight
        ]
        line_width = weight*len(neighbours)*3.0/sum(edge_weights)
        nx.draw_networkx_edges(graph, pos, edgelist=same_weight, width=line_width)
    plt.show()

In [None]:
# Draw the context graph for "elect" from the frequencies stored under "1546".
# NOTE(review): "1546" is not among the years printed in the visible
# (truncated) run log - confirm it is a key of year_frequencies.
graph_year("elect","1546")

In [None]:
# Draw the context graph for "elect" from the frequencies stored under "1600".
# NOTE(review): "1600" is not among the years printed in the visible
# (truncated) run log - confirm it is a key of year_frequencies.
graph_year("elect","1600")

In [None]:
# Draw the context graph for "elect" from the frequencies stored under "1620".
graph_year("elect","1620")

In [None]:
# Draw the context graph for "elect" from the frequencies stored under "1600".
# NOTE(review): same arguments as an earlier cell; the node layout may still
# differ between the two figures because no layout seed is passed.
graph_year("elect","1600")