In [158]:
'''
Social network visualization 
'''
import re
import pandas as pd
from pyvis.network import Network
from itertools import combinations


def authors(csv,search):
    '''
    Extract text IDs and authors' names.
    '''
    df = pd.read_csv(csv)
    authors = df['author']
    ids = df['id']
    titles = df['title']
    count = 0
    dict = {}
    for idx,TCPID in enumerate(ids):
        if TCPID.strip() in search: 
            words = authors[idx].split(';')
            newWords = []
            for w in words: 
                w = w.strip()
                if w == '': continue
                if ' aut' in w: w = re.sub(' aut','',w)
                if re.search('printer|engraver',w):continue
                if re.search('Brome, Richard, d. 1652?|Brome, Ricahrd, d. 1652?.',w):
                    # we want to retain the distinction between direct & attributed authorship  
                    if 'attributed name.' not in w: 
                        w = 'Brome, Richard, d. 1652?'
                if 'Marston, John, 1575?-1634.' in w: 
                    if 'attributed name.' not in w: 
                        w = 'Marston, John, 1575?-1634'
                if 'Newcastle, William Cavendish, Duke of, 1592-1676.' in w: 
                    if 'attributed name.' not in w: 
                        w='Newcastle, William Cavendish, Duke of, 1592-1676.'
                newWords.append(w)
            dict[TCPID] = [list(set(newWords)),titles[idx]]
            count += 1
    return dict 

In [162]:
'''Get n-gram features'''
inFile = open('/Users/amycweng/Digital Humanities/Early-Modern-London/gramsEachplaysText.txt','r')

inFileLines = inFile.readlines()
inFile.close()
ngrams = {}
for line in inFileLines: 
    line = line.split(':')
    tcpID = line[0].strip()
    if '_' in tcpID: 
        tcpID = tcpID.split('_')[0]
    features = line[1].strip()
    if features == '': continue
    allfeatures = features.strip().split(' ')
    ngrams[tcpID] = []
    for f in allfeatures: 
        if re.search('thrift|thrifty|commodity|profit|wealth',f): 
            ngrams[tcpID].append(f)
print(len(ngrams))

29


In [163]:
'''Catalog authors, grams, and TCPIDs'''
file = '/Users/amycweng/Digital Humanities/Early-Modern-London/playwrightsTCP.csv'
tcpIDs = ngrams.keys()
auths = authors(file, tcpIDs)
uniqueAuths = []
for TCPID,infoList in auths.items():
    uniqueAuths.extend(infoList[0])
uniqueAuths = list(set(uniqueAuths))
print(len(uniqueAuths))
print(uniqueAuths)

pubdf = pd.read_csv('spreadsheet_Plays_PublishersTCP.csv')
textToPub = {}
uniquePubs = []
for idx,TCPID in enumerate(pubdf['id']):
    pubs = pubdf['editedpublisher'][idx]
    if pubs == 'None': continue
    pubs = pubs.split('; ')
    for i,p in enumerate(pubs):
        pubs[i] = p.strip() 
        uniquePubs.append(p)
    textToPub[TCPID] = pubs
uniquePubs = list(set(uniquePubs))
print(len(uniquePubs))
print(uniquePubs)

21
['Heywood, Thomas, d. 1641.', 'Fletcher, John, 1579-1625.', 'Sharpham, Edward, 1576-1608.', 'Rowley, William, 1585?-1642?', 'Jonson, Ben, 1573?-1637.', 'Greene, Thomas, d. 1612.', 'Shirley, James, 1596-1666.', 'Newcastle, William Cavendish, Duke of, 1592-1676.', 'Brome, Alexander, 1620-1666.', 'Cooke, Jo., fl. 1614.', 'Beaumont, Francis, 1584-1616.', 'Webster, John, 1580?-1625?', 'Rowley, Samuel, d. 1633?', 'Marmion, Shackerley, 1603-1639.', 'Greene, Robert, 1558?-1592.', 'Brome, Richard, d. 1652?', 'Marston, John, 1575?-1634', 'Dekker, Thomas, ca. 1572-1632.', 'Chapman, George, 1559?-1634.', 'Heywood, Thomas, d. 1641, attributed name.', 'Middleton, Thomas, d. 1627.']
27
['T Gubbin', 'Francis Constable', 'John Oxenbridge', 'Walter Burre', 'William Holme', 'Thomas Dring', 'John Hodgets', 'Hum Robinson', 'A Johnson', 'Humrey Lownes', 'Hum Hoseley', 'John Trundle', 'H Brome', 'Valentine Sims', 'John Grove', 'Richard Marriot', 'William Cooke', 'Robert Allot', 'Nathaniel Butter', 'Thomas

In [164]:
aToGramText = {}
authGrams = {}

# authpubedges stores all possible Author-Author, Author-Publisher, Publisher-Publisher edges
authpubedges = []

for TCPID, infoList in auths.items(): 
    allList = infoList[0].copy()
    if TCPID in textToPub.keys():
        allList.extend(textToPub[TCPID])
    combos = list(combinations(allList,2))
    for combo in combos: 
        if combo[1] == combo[0]: continue #avoids edges that point back to self 
        if (combo[1],combo[0]) in authpubedges: continue
        if (combo[0],combo[1]) in authpubedges: continue
        authpubedges.append(combo)

    for auth in infoList[0]: 
        auth = auth.strip(r'\[').strip(r'\]')
        if auth not in authGrams.keys():
            authGrams[auth] = []
        authGrams[auth].extend(list(set(ngrams[TCPID]))) 
        if auth not in aToGramText.keys(): 
            aToGramText[auth] = {TCPID: list(set(ngrams[TCPID]))}
        else: aToGramText[auth] = aToGramText[auth] | {TCPID: list(set(ngrams[TCPID]))}

gramToAuths = {}
for auth, grams in authGrams.items(): 
    for gram in grams: 
        if gram not in gramToAuths.keys(): gramToAuths[gram] = [auth]
        else: gramToAuths[gram].append(auth)

In [165]:
'''
Color Legend: 
- author nodes are purple 
- publisher nodes are palevioletred
- ngram edges are pink
- aut-pub edges are light blue #80B1D3 
- aut-aut edges are light purple #bebada
- put-pub edges are light green #ccebc5
'''

def gramAuthEdges(gramAuthDict):
    '''
    Catalogs edges (as well as the edge value, i.e., n-gram) between nodes
    Returns a list of tuples (combo,gram) 
    '''
    edgelist = []    
    for gram,authList in gramAuthDict.items():
        combos = list(combinations(authList,2))
        for combo in combos: 
            # add only edges between DIFFERENT authors. For each different gram, add a new edge  
            if combo[1] == combo[0]: continue #avoids edges that point back to self 
            if ((combo[1],combo[0]),gram) in edgelist: continue
            if ((combo[0],combo[1]),gram) in edgelist: continue
            edgelist.append((combo,gram))  
    return edgelist

'''Creates the information that will pop-up when a cursor hovers over an author node in the network'''
def createTitle(auth,authInfo):
    items = ''
    for TCPID,gramList in authInfo.items(): 
        count = 0
        for item in gramList: 
            if count == 0: gramString = f'{item}'
            count += 1 
            if not count % 2: 
                if count == len(gramList): gramString += f', {item}'
                else: gramString += f', {item},\n'
            elif count>1: gramString += f'\t{item}'
        items += f'{TCPID}: {gramString}\n'
    title = f'{auth}:\n{items}'
    return title
    
def gramGraph(edgelist,title,heading,aToGramText):
    # Create pyvis graph
    aa, ap, pp = 0,0,0
    g = Network(width=800,height=1000,notebook=True,heading=heading,bgcolor='white',font_color='black',directed=True)
    for authPair,gram in edgelist:        
        a1Title = createTitle(authPair[0],aToGramText[authPair[0]])
        a2Title = createTitle(authPair[1],aToGramText[authPair[1]])
        g.add_node(authPair[0], authPair[0], title=a1Title, color='purple',labelHighlightBold=True,chosen=True)
        g.add_node(authPair[1], authPair[1], title=a2Title, color='purple',labelHighlightBold=True,chosen=True)
        if gram != None: 
            g.add_edge(authPair[0],authPair[1],title=gram,color='pink',width=5,arrows='hide')
    
    for p1,p2 in authpubedges:
        if p1 in uniqueAuths: 
            g.add_node(p1, p1, title=p1, color='purple',labelHighlightBold=True,chosen=True)
            p1role = 'auth'
        else: 
            g.add_node(p1, p1, title=p1, color='palevioletred',labelHighlightBold=True,chosen=True)
            p1role = 'pub'
        if p2 in uniqueAuths: 
            g.add_node(p2, p2, title=p2, color='purple',labelHighlightBold=True,chosen=True)
            p2role = 'auth'
        else: 
            g.add_node(p2, p2, title=p2, color='palevioletred',labelHighlightBold=True,chosen=True)
            p2role = 'pub'
        if (p1role == 'auth' and p2role == 'pub') or (p1role == 'pub' and p2role == 'auth'):
            g.add_edge(p1,p2,color='#80B1D3',width=5,arrows='hide')
            ap += 1
        elif (p1role == 'auth' and p2role == 'auth'): 
            g.add_edge(p1,p2,color='#bebada',width=5,arrows='hide')
            aa += 1
        else: 
            g.add_edge(p1,p2,color='#ccebc5',width=5,arrows='hide')
            pp += 1
    print(f'There are {aa} author-author edges, {ap} author-publisher edges, and {pp} publisher-publisher edges.')
    g.set_edge_smooth('dynamic')
    g.repulsion()
    g.show(title+".html")


In [166]:
'''Create and output network'''
gaEdges = gramAuthEdges(gramToAuths)
print(f'There are {len(gaEdges)} n-gram edges')

There are 105 n-gram edges


In [167]:
gramGraph(gaEdges,'spreadsheet plays network','Spreadsheet Plays - Network',aToGramText)

Local cdn resources have problems on chrome/safari when used in jupyter-notebook. 
There are 12 author-author edges, 44 author-publisher edges, and 7 publisher-publisher edges.


In [168]:
'''Get context windows for n-grams'''
import os,re

def getTexts(folder):
    fileToText = {}
    underscores = {}
    for root,dirs,files in os.walk(folder):
        for file in files:
                if '.txt' not in file: continue
                path = os.path.join(folder,file)
                f = open(path,'r')
                text = f.readlines()[0]
                if '_' in file: 
                        name = file.split('_')[0]
                        if name not in underscores.keys(): 
                                underscores[name] = text
                        else: 
                                underscores[name] = underscores[name] + ' ' + text
                else: 
                        name = file.split('.')[0]
                        fileToText[name] = text
                f.close()
    for name,text in underscores.items():
        fileToText[name] = text
    return fileToText

In [169]:
'''Map n-grams to TCPIDs'''
bigramdata = getTexts('/Users/amycweng/Digital Humanities/playsTXT')

textGrams = {}
for TCPID in ngrams.keys(): 
    nGrams = list(set(ngrams[TCPID]))
    if '_' in TCPID: 
        TCPID = TCPID.split('_')[0]
    textGrams[TCPID] = []
    for gram in nGrams: 
        gram = re.sub('_',' ',gram)
        textGrams[TCPID].append(gram)
print(len(textGrams))

29


In [170]:
def context(searchgram,textName):
    text = bigramdata[textName]
    if (searchgram in text):
        indices = [i for i in range(len(text)) if text.startswith(searchgram, i)]
        windows = []
        for index in indices:
            if index > 120: window = text[(index-120):(index+120)].split(' ')
            else: window = text[:index+120].split(' ')
            del window[0]
            del window[-1]
            window = ' '.join(window)
            windows.append(window)
        return windows 

In [173]:
'''Get context windows into a dictionary'''
contexts = {}
for TCPID,gramList in textGrams.items():
    contexts[TCPID] = {}
    for gram in gramList: 
        formatGram = re.sub(' ','_',gram)
        contexts[TCPID][formatGram] = context(gram,TCPID)

In [178]:
with open("playsNgramContexts.txt","w+") as file: 
    for tcpid, cDict in contexts.items(): 
        file.write(f'{tcpid}:\n')
        for gram,cList in cDict.items(): 
            file.write(f'{gram}:\n')
            for c in cList: 
                file.write(f'{c}\n')
            file.write('\n\n')
        file.write('\n\n')