In [2]:
import os 
import re 
import matplotlib.pyplot as plt
import numpy as np

def scatter(dates,freqs,title,color,x_label,y_label):
    xData = list(map(int, dates))
    yData = list(map(int, freqs))
    
    xyCoords=np.c_[xData,yData]
    
    uniquePoints, counts = np.unique(xyCoords, return_counts=True,axis=0)
    plt.scatter(uniquePoints[:,0],uniquePoints[:,1],s=counts*100,c=color,)

    fig = plt.gcf()
    fig.set_size_inches(15, 10)

    plt.xlabel(x_label, fontsize=20)
    plt.ylabel(y_label, fontsize=20)
    
    plt.title(title,fontsize=25)
    
    plt.show()

def bar(xData,yData,title,xlabel,ylabel):
    if (xData == "Year"):
        xData = list(map(int, xData))
        yData = list(map(int, yData))
    
    plt.figure(figsize = (15, 10))

    plt.bar(xData,yData,color='purple')
    
    plt.title(title, fontsize=20)
    
    plt.xlabel(xlabel, fontsize=15)
    plt.ylabel(ylabel, fontsize=15)
    
    plt.show()

In [4]:
dir = '/srv/data/'
f1 = 'eebo_phase1_IDs_and_dates.txt'
f2 = 'EEBO_Phase2_IDs_and_dates.txt' 

def getDates(folder,f,dRanges,unknown,ourTimes):
    file = os.path.join(folder,f)
    data = open(file,'r')
    data = data.readlines()
    dates = {}
    for d in data:
        datum = d.replace('\n','')
        datum = d.split('\t')
        id = datum[0]
        date = datum[1].replace('\n','')
        
        if re.search('-',date):
            r = date.split('-')
            dRanges[id] = date
            date = r[0] 
            if re.search('u', r[1]) or r[1].find('?') != -1:
                unknown[id] = dRanges[id]
            # continue
      
        if re.search('u', date) or date.find('?') != -1:
            unknown[id] = date
            continue
        
        value = int(date)
        if value in range(1580,1640+1):
            ourTimes[id] = value  
            if value not in dates.keys():
                dates[value] = []
            else:
                dates[value].append(id)
    
    print("There are " + str(len(ourTimes)) + " texts within 1580-1640 (including the first estimated date in a range), " + 
            str(len(dRanges)) + " texts with date ranges, and", 
            str(len(unknown)) + " texts with unknown dates (marked with a 'u' or '?').")
    # print(f"Here are the texts within your range: {ourTimes}")
    # print(f"Here are the texts with a time range: {dRanges}")
    # print(f"These are the texts with unknown dates: {unknown}")
    return dates,unknown,dRanges,ourTimes

def plotFreqs(dList):
    freqs = {}
    for dict in dList: 
        for key in dict.keys():
            if key not in freqs.keys():
                freqs[key] = len(dict[key])
            else: 
                freqs[key] += len(dict[key])
    scatter(freqs.keys(),freqs.values(),"TCP Texts Per Year","purple","Year","Number of TCP Texts")
    bar(freqs.keys(),freqs.values(),"TCP Texts Per Year","Year","Number of TCP Texts")

In [5]:
print('Phase I')
d1 = getDates(dir,f1,{},{},{})
print('Phase II')
d2 = getDates(dir,f2,{},{},{})

Phase I
There are 5309 texts within 1580-1640 (including the first estimated date in a range), 452 texts with date ranges, and 383 texts with unknown dates (marked with a 'u' or '?').
Phase II
There are 5136 texts within 1580-1640 (including the first estimated date in a range), 371 texts with date ranges, and 302 texts with unknown dates (marked with a 'u' or '?').


In [110]:
def plotUnknowns(list):
    freqs = {}
    for d in list: 
        for id in d.keys(): 
            start = d[id].split('-')[0]
            if start.find('?') != -1 or re.search('u',start):
                print(start)
                continue
            else:
                if start not in freqs.keys():
                    freqs[start] = 1
                else:
                    freqs[start] += 1 
    print(sorted (freqs.items(), key=lambda kv:(kv[1], kv[0]), reverse=True))
    # scatter(freqs.keys(),freqs.values(),"TCP Texts w/ Date Ranges","purple","Earliest Estimated Date","Number of TCP Texts")

plotUnknowns([d1[2],d2[2]])

1697?
1uuu
15uu
[('1670', 100), ('1685', 77), ('1688', 65), ('1674', 57), ('1663', 40), ('1658', 39), ('1690', 35), ('1684', 35), ('1680', 34), ('1681', 33), ('1678', 27), ('1682', 23), ('1641', 17), ('1695', 16), ('1660', 16), ('1600', 16), ('1689', 14), ('1628', 13), ('1699', 10), ('1672', 10), ('1650', 10), ('1700', 9), ('1687', 8), ('1686', 7), ('1655', 7), ('1694', 6), ('1698', 5), ('1696', 5), ('1693', 5), ('1692', 5), ('1675', 5), ('1640', 5), ('1676', 4), ('1671', 4), ('1666', 4), ('1697', 3), ('1683', 3), ('1679', 3), ('1647', 3), ('1646', 3), ('1500', 3), ('1661', 2), ('1659', 2), ('1654', 2), ('1653', 2), ('1775', 1), ('1743', 1), ('1704', 1), ('1691', 1), ('1673', 1), ('1668', 1), ('1665', 1), ('1664', 1), ('1662', 1), ('1656', 1), ('1651', 1), ('1645', 1), ('1644', 1), ('1642', 1), ('1626', 1), ('1625', 1), ('1622', 1), ('1617', 1), ('1616', 1), ('1613', 1), ('1609', 1), ('1604', 1), ('1584', 1), ('1582', 1), ('1552', 1), ('1548', 1), ('1508', 1), ('1477', 1)]


In [None]:
def getIDs(list):
    for dict in list:
        names = []
        for key in dict.keys():
            names.append(key)
        return names
print(getIDs([d1[2],d2[2]]))


In [10]:
output = '/srv/data/dateRangeTextNames.txt'
names = getIDs([d1[2],d2[2]])
file = open(output,'a+')
count = 0
for name in names:
    count += 1
    file.write(name + '\n')
print(count)
file.close()

452


In [10]:
d1 = getDates(dir,f1,{},{},{})
d2 = getDates(dir,f2,{},{},{})
info = d1[3]
for k in d2[3].keys():
    info[k] = d2[3][k]

epMissingInfo = open('/srv/data/EPmissing.txt','r')
epMissingInfo = epMissingInfo.readlines()
names = []
for name in epMissingInfo:
    name = name.replace('\n','')
    names.append(name)

relevant = {}
for n in names:
    if n in info.keys(): 
        date = info[n]
        relevant[n] = date
     
print(len(relevant))

There are 5309 texts within 1580-1640 (including the first estimated date in a range), 452 texts with date ranges, and 383 texts with unknown dates (marked with a 'u' or '?').
There are 5136 texts within 1580-1640 (including the first estimated date in a range), 371 texts with date ranges, and 302 texts with unknown dates (marked with a 'u' or '?').
1284


In [12]:
relevantEPmissing = '/srv/data/relevantEPmissing.txt'
rFile = open(relevantEPmissing,'a+')
for id in relevant.keys():
    rFile.write(id+' '+str(relevant[id])+'\n')
rFile.close()