In [1]:
import os 
import re 
import matplotlib.pyplot as plt
import numpy as np

def scatter(dates,freqs,title,color,x_label,y_label):
    """Draw a scatter plot of ``freqs`` against ``dates`` and show it.

    Both sequences are converted to integers. Identical (x, y) pairs are
    collapsed into a single marker whose area is 100 times the number of
    occurrences of that pair.

    Parameters:
        dates: iterable of values convertible with ``int`` (x axis).
        freqs: iterable of values convertible with ``int`` (y axis).
        title: figure title.
        color: marker colour passed to matplotlib as ``c``.
        x_label: label for the x axis.
        y_label: label for the y axis.
    """
    xs = [int(d) for d in dates]
    ys = [int(f) for f in freqs]

    # One row per point so duplicated coordinates can be counted row-wise.
    points = np.column_stack((xs, ys))
    distinct, occurrences = np.unique(points, axis=0, return_counts=True)

    plt.scatter(distinct[:, 0], distinct[:, 1], s=occurrences * 100, c=color)
    plt.gcf().set_size_inches(15, 10)

    plt.xlabel(x_label, fontsize=20)
    plt.ylabel(y_label, fontsize=20)
    plt.title(title, fontsize=25)

    plt.show()

def bar(xData,yData,title,xlabel,ylabel):
    """Draw a purple bar chart of ``yData`` against ``xData`` and show it.

    Parameters:
        xData: iterable of bar positions/categories (dict views are accepted).
        yData: iterable of bar heights, same length as ``xData``.
        title: figure title.
        xlabel: label for the x axis. When it is exactly ``"Year"`` both
            series are converted to integers so years sort numerically.
        ylabel: label for the y axis.
    """
    # The guard must look at the axis label: comparing the data itself to
    # the string "Year" is never true, which left the conversion dead.
    if xlabel == "Year":
        xData = [int(x) for x in xData]
        yData = [int(y) for y in yData]
    else:
        # Materialise dict views / generators so they can be plotted.
        xData = list(xData)
        yData = list(yData)

    plt.figure(figsize=(15, 10))

    plt.bar(xData, yData, color='purple')

    plt.title(title, fontsize=20)

    plt.xlabel(xlabel, fontsize=15)
    plt.ylabel(ylabel, fontsize=15)

    plt.show()

In [2]:
# Folder containing the ID/date listings read by getDates.
# Note: this name shadows the builtin ``dir`` for the rest of the notebook.
dir = '/srv/data/'
# File names of the Phase I and Phase II listings (joined onto ``dir`` by
# getDates, which parses each line as tab-separated ID and date).
f1 = 'eebo_phase1_IDs_and_dates.txt'
f2 = 'EEBO_Phase2_IDs_and_dates.txt' 

def getDates(folder,f,dRanges,unknown,ourTimes):
    """Parse a tab-separated ID/date file and group text IDs by year.

    Each line is expected to hold ``<id>\\t<date>``. Question marks are
    stripped from the date. A date containing ``-`` is treated as a range and
    only its start is used. A date containing ``u`` is recorded as unknown;
    if it still contains three consecutive digits in 158-163 it is
    estimated as the first year of that decade, otherwise it is skipped.

    Parameters:
        folder: directory containing the file.
        f: file name inside ``folder``.
        dRanges: dict updated in place with ``id -> full range string``.
        unknown: dict updated in place with ``id -> date containing 'u'``.
        ourTimes: dict updated in place with ``id -> year`` for every text
            whose (start/estimated) year is within 1580-1641 inclusive.

    Returns:
        Tuple ``(dates, unknown, dRanges, ourTimes)`` where ``dates`` maps
        each year in 1580-1641 to the list of IDs dated to that year.

    Raises:
        ValueError: if a date without ``u`` is not an integer.
    """
    path = os.path.join(folder, f)
    # Context manager so the handle is closed even if parsing fails.
    with open(path, 'r') as handle:
        lines = handle.readlines()

    dates = {}
    for line in lines:
        fields = line.rstrip('\n').split('\t')
        if len(fields) < 2:
            # Blank line or line without a date column: nothing to parse.
            continue
        textId = fields[0]
        date = fields[1].replace('?', '')

        if '-' in date:
            # only look at the start date
            dRanges[textId] = date
            date = date.split('-')[0]

        if 'u' in date:
            unknown[textId] = date
            decadeMatch = re.search(r'\d{3}', date)
            if decadeMatch is None:
                continue
            # Compare as an int: a str is never "in" a range of ints.
            decade = int(decadeMatch.group())
            # NOTE(review): 164 is excluded here although 1640 is inside the
            # accepted year window below - confirm this is intended.
            if decade not in range(158, 164):
                continue
            date = str(decade) + '0'

        value = int(date)
        if value in range(1580, 1640 + 2):
            ourTimes[textId] = value
            # setdefault + append so the first ID of each year is kept too.
            dates.setdefault(value, []).append(textId)

    print(f"There are {len(ourTimes)} texts within 1580-1641 (including the first estimated date in a range), "
          f"{len(dRanges)} texts with date ranges, and "
          f"{len(unknown)} texts with unknown dates (marked with a 'u').")
    return dates,unknown,dRanges,ourTimes

def plotFreqs(dList):
    """Plot how many texts fall in each year, summed over several dicts.

    Parameters:
        dList: iterable of dicts mapping a year to a collection of text IDs.
            The sizes of the collections are added up per year and the
            totals are shown as a scatter plot and as a bar chart.
    """
    freqs = {}
    for yearToIds in dList:
        for year, ids in yearToIds.items():
            freqs[year] = freqs.get(year, 0) + len(ids)

    years, totals = freqs.keys(), freqs.values()
    scatter(years, totals, "TCP Texts Per Year", "purple", "Year", "Number of TCP Texts")
    bar(years, totals, "TCP Texts Per Year", "Year", "Number of TCP Texts")

In [3]:
# Parse each phase's listing with fresh accumulator dicts; d1 and d2 each hold
# the tuple (dates, unknown, dRanges, ourTimes) returned by getDates.
print('Phase I')
d1 = getDates(dir,f1,{},{},{})
print('Phase II')
d2 = getDates(dir,f2,{},{},{})

Phase I
There are 5861 texts within 1580-1641 (including the first estimated date in a range), 452 texts with date ranges, and 3 texts with unknown dates (marked with a 'u').
Phase II
There are 5789 texts within 1580-1641 (including the first estimated date in a range), 371 texts with date ranges, and 5 texts with unknown dates (marked with a 'u').


In [None]:
# Combine the per-year ID lists of both phases; passing d1[0] twice would
# count Phase I double and leave Phase II out of the plot.
plotFreqs([d1[0],d2[0]])

In [None]:
def plotUnknowns(list):
    """Count and plot the start years of date-range strings.

    Parameters:
        list: iterable of dicts mapping a text ID to a date string; the part
            before the first ``-`` is taken as the start date.

    Start dates containing ``?`` or ``u`` are printed and left out of the
    tally. The tally is printed sorted by count then start date, both
    descending, and then drawn as a scatter plot.
    """
    freqs = {}
    for ranges in list:
        for textId in ranges:
            start = ranges[textId].split('-')[0]
            unresolved = '?' in start or re.search('u', start)
            if unresolved:
                print(start)
            else:
                freqs[start] = freqs.get(start, 0) + 1

    ranked = sorted(freqs.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    print(ranked)
    scatter(freqs.keys(), freqs.values(), "TCP Texts w/ Date Ranges", "purple",
            "Earliest Estimated Date", "Number of TCP Texts")

# Index 2 of getDates' return value is the dRanges dict (id -> range string).
plotUnknowns([d1[2],d2[2]])

In [None]:
def getIDs(list):
    """Return the keys of every dict in ``list`` as one flat list.

    Parameters:
        list: iterable of dicts (the name shadows the builtin and is kept
            only for compatibility with existing callers).

    Returns:
        A list with the keys of each dict, in iteration order, one dict
        after another. An empty input yields an empty list.
    """
    # Accumulate across all dicts; returning inside the loop would stop
    # after the first dict and silently drop the rest.
    return [key for mapping in list for key in mapping]
# Show what getIDs extracts from the two dRanges dicts (index 2 of getDates' result).
print(getIDs([d1[2],d2[2]]))


In [None]:
# Write one ID per line for the texts returned by getIDs.
output = '/srv/data/dateRangeTextNames.txt'
names = getIDs([d1[2],d2[2]])
# Open in 'w' mode so re-running the cell rewrites the file instead of
# appending a duplicate copy of every ID; the context manager closes the
# handle even if a write fails.
with open(output, 'w') as file:
    for name in names:
        file.write(name + '\n')
count = len(names)
print(count)

In [4]:
# Re-parse both listings so this cell does not depend on earlier cell state.
d1 = getDates(dir,f1,{},{},{})
d2 = getDates(dir,f2,{},{},{})
# info = d1[3]
# for k in d2[3].keys():
#     info[k] = d2[3][k]

# Read the list of names, one per line; the context manager makes sure the
# file handle is closed once the lines are in memory.
with open('/srv/data/ECBC-Data-2022/Text_Files/EPmissing.txt','r') as epMissingFile:
    epMissingInfo = epMissingFile.readlines()
names = [name.replace('\n','') for name in epMissingInfo]
print(len(names))

# relevant = {}
# for n in names:
#     if n in info.keys(): 
#         date = info[n]
#         relevant[n] = date
     
# print(len(relevant))
# print(len(d2[3]))

There are 5861 texts within 1580-1641 (including the first estimated date in a range), 452 texts with date ranges, and 3 texts with unknown dates (marked with a 'u').
There are 5789 texts within 1580-1641 (including the first estimated date in a range), 371 texts with date ranges, and 5 texts with unknown dates (marked with a 'u').
8222


In [5]:
# Split the listed names by the phase whose in-range dict (index 3 of
# getDates' result) contains them; Phase I wins when a name is in both.
relevantI, relevantII = {}, {}
phaseITimes, phaseIITimes = d1[3], d2[3]
for textId in names:
    if textId in phaseITimes:
        relevantI[textId] = phaseITimes[textId]
    elif textId in phaseIITimes:
        relevantII[textId] = phaseIITimes[textId]

countI, countII = len(relevantI), len(relevantII)
print(countI)
print(countII)
print(countI + countII)
        

320
1062
1382


In [17]:
# Write "<id> <year>" lines for the relevant texts of each phase.
phaseImissing = '/srv/data/ECBC-Data-2022/Text_Files/relevantEPmissingPhaseI.txt'
phaseIImissing = '/srv/data/ECBC-Data-2022/Text_Files/relevantEPmissingPhaseII.txt'
# 'w' instead of 'a+': every re-run of this cell used to append another full
# copy of the records. Each file is opened only while it is being written
# and is closed by the context manager even if a write fails.
with open(phaseImissing, 'w') as IFile:
    for textId, year in relevantI.items():
        IFile.write(textId + ' ' + str(year) + '\n')
with open(phaseIImissing, 'w') as IIFile:
    for textId, year in relevantII.items():
        IIFile.write(textId + ' ' + str(year) + '\n')