In [1]:
'''
Social network visualization of the language around tobacco from 1625-1634
'''
import re
import pandas as pd
from pyvis.network import Network

def authors(csv,search):
    '''
    Extract text IDs and authors' names.

    Reads the metadata CSV at `csv` (it must have `id`, `author`, `title`
    and `date` columns) and keeps the rows whose whitespace-stripped `id`
    is found in `search`.

    Parameters:
        csv: path (or anything `pandas.read_csv` accepts) of the metadata file.
        search: container of text IDs; only membership (`in`) is used.

    Returns:
        A dictionary mapping each matching, stripped text ID to
        `[authorNames, title, date]`. `authorNames` is the list of distinct
        `;`-separated entries of the author cell, stripped, in first-seen
        order, without blank entries and without entries matching
        `printer|engraver`.
    '''
    df = pd.read_csv(csv)
    authorCol = df['author']
    ids = df['id']
    titles = df['title']
    dates = df['date']
    textInfo = {}
    for idx,TCPID in enumerate(ids):
        # Store the same stripped ID that is matched against `search`, so the
        # returned keys can be used to index whatever `search` came from.
        textId = TCPID.strip()
        if textId not in search:
            continue
        rawAuthors = authorCol[idx]
        # A blank author cell is not a string (pandas reads it as missing),
        # so it has no .split; treat it as "no authors".
        pieces = rawAuthors.split(';') if isinstance(rawAuthors, str) else []
        names = []
        for piece in pieces:
            name = piece.strip()
            # Strip BEFORE testing for emptiness so '; ' does not leave a '' author.
            if not name or name in names:
                continue
            if re.search('printer|engraver',name):
                continue
            names.append(name)
        textInfo[textId] = [names,titles[idx],dates[idx]]
    return textInfo

In [2]:
'''Get tobacco n-gram features'''
# Each usable line has the form "<TCPID>: gram gram ...". The text before the
# first ':' is the key and the text between the first and second ':' is the
# space-separated feature list.
with open('/srv/data/periodFeatures/period4features/newtobaccograms.txt','r') as inFile:
    inFileLines = inFile.readlines()
tobaccoNgrams = {}
for line in inFileLines:
    parts = line.split(':')
    # A line without ':' (e.g. a trailing blank line) has no feature field;
    # skip it instead of failing on parts[1].
    if len(parts) < 2: continue
    features = parts[1].strip()
    if features == '': continue
    tobaccoNgrams[parts[0].strip()] = features.split(' ')
print(len(tobaccoNgrams))

'''Clustering results for tobacco Period 4'''
# One list of text IDs per cluster; `groups` indexes them by cluster number
# and `groupColors` gives the colour name associated with each cluster number.
Group0 = [
    'B12280', 'A10187', 'A03411', 'A19036', 'A68568', 'A11626', 'A14301',
    'A14916', 'A15627', 'A09156', 'A00583', 'A19765', 'A20769', 'A12481',
    'A13534', 'A14305', 'A08017', 'A09118', 'A08578',
]
Group1 = [
    'A22447', 'A22574', 'A22435', 'A22537', 'A22567', 'A22439', 'A22571',
]
Group2 = [
    'A01055', 'A20065', 'A20356', 'A69225', 'A11151', 'A12155', 'A13520',
    'A08977', 'A19012', 'A03559', 'A13948', 'A07024', 'A04633', 'B00490',
    'A72399', 'A68252', 'A20075', 'A02827', 'A02836', 'A03807', 'A13415',
    'A13436', 'A05320', 'A16489', 'A06692', 'A07025', 'A09242', 'B01146',
    'A72254', 'A10402', 'A21075', 'A11153', 'A02909', 'A03240', 'A04658',
    'A16924', 'A08690',
]
Group3 = [
    'A02758', 'A06902', 'A16627', 'A01425', 'A03065', 'A05657', 'A11156',
    'A01552', 'A06768', 'A01424',
]
Group4 = [
    'A15685', 'A16507', 'A72911', 'A02325', 'A12471', 'A09010', 'A12458',
    'A17595',
]
groups = {
    0: Group0,
    1: Group1,
    2: Group2,
    3: Group3,
    4: Group4,
}
groupColors = {
    0: 'pink',
    1: 'purple',
    2: 'darkblue',
    3: 'plum',
    4: 'palevioletred',
}


81


In [19]:
'''N-gram categorization colors'''
from tobaccoGramGroups import plant,recreation,region,medical,trade,commodity,ethics,misc,maleGeneral,female
# Category name -> collection of n-grams in that category (the collections are
# the names imported from tobaccoGramGroups).
gramGroups = {
    'plant': plant,
    'recreation': recreation,
    'region': region,
    'trade': trade,
    'commodity': commodity,
    'ethics': ethics,
    'misc': misc,
    'maleGeneral': maleGeneral,
    'medical': medical,
    'female': female,
}
# Category name -> colour used for edges whose n-gram is in that category.
edgeColors = {
    'plant': '#90C8AC',
    'recreation': '#E8D7AD',
    'region': '#CCF1FF',
    'trade': '#edc7ed',
    'commodity': '#FFD9C0',
    'ethics': '#FFCCE1',
    'misc': 'lemonchiffon',
    'maleGeneral': '#E0D7FF',
    'medical': '#8C88BA',
    'female': '#BCD2E3',
}
from itertools import combinations
def gramNetwork(gramAuthDict):
    '''
    Catalogs edges (as well as the edge value, i.e., n-gram) between nodes.

    Parameters:
        gramAuthDict: mapping of n-gram -> list of authors using it. An
            author may appear more than once in a list; authors must be
            hashable.

    Returns:
        A list of tuples (combo, gram), where combo is a 2-tuple of two
        DIFFERENT authors. For each gram every unordered author pair appears
        exactly once, oriented and ordered by the authors' first appearance
        in that gram's list.
    '''
    edgelist = []
    for gram,authList in gramAuthDict.items():
        # Dropping repeated authors up front (dict.fromkeys keeps first-seen
        # order) means combinations() can yield neither a self-loop nor the
        # same pair twice, so no scan of the growing edge list is needed.
        distinctAuths = list(dict.fromkeys(authList))
        edgelist.extend((combo,gram) for combo in combinations(distinctAuths,2))
    return edgelist

def createTitle(auth,authInfo):
    '''
    Creates the information that will pop up when a cursor hovers over a node in the network.

    Parameters:
        auth: author name, used as the first line of the text.
        authInfo: mapping of TCPID -> list of n-grams for that text.

    Returns:
        A string of the form "<auth>:\\n" followed by one "<TCPID>: <grams>\\n"
        entry per text. Grams are laid out two per line: the pair is joined
        by ", ", a non-final pair ends with ",\\n", and the first gram of
        each following pair is preceded by a tab. A text with no grams
        yields "<TCPID>: \\n".
    '''
    items = ''
    for TCPID,gramList in authInfo.items():
        # Reset for every text: otherwise an empty gram list would reuse the
        # previous text's string (or fail if it is the first text).
        gramString = ''
        total = len(gramList)
        for position,item in enumerate(gramList,start=1):
            if position == 1:
                gramString = f'{item}'
            elif position % 2 == 0:
                # Second gram of a pair: close the line unless it is the last gram.
                if position == total: gramString += f', {item}'
                else: gramString += f', {item},\n'
            else:
                # First gram of a new pair.
                gramString += f'\t{item}'
        items += f'{TCPID}: {gramString}\n'
    title = f'{auth}:\n{items}'
    return title
    
def gramGraph(edgelist,title,authMap,heading,aToGramText):
    '''
    Builds a pyvis Network from the author edges and writes it to "<title>.html".

    Parameters:
        edgelist: iterable of (authPair, gram) tuples; authPair[0] and
            authPair[1] are the two author names.
        title: stem of the output file name passed to Network.show.
        authMap: mapping of author name -> node colour.
        heading: heading passed to the Network constructor.
        aToGramText: mapping of author name -> {TCPID: grams}, used to build
            each node's hover text with createTitle.

    Edges whose gram is 'tobacco_tobacco' are skipped entirely. For every
    other edge both author nodes are added, and one edge is added for each
    category in gramGroups that contains the gram, coloured via edgeColors.
    '''
    net = Network(width=800,height=1000,notebook=True,heading=heading,bgcolor='white',font_color='black',directed=True)
    for authPair,gram in edgelist:
        if gram == 'tobacco_tobacco':
            continue
        first = authPair[0]
        second = authPair[1]
        firstColor = authMap[first]
        secondColor = authMap[second]

        # Hover text listing each author's texts and grams.
        firstTitle = createTitle(first,aToGramText[first])
        secondTitle = createTitle(second,aToGramText[second])

        net.add_node(first, first, title=firstTitle, color=firstColor,labelHighlightBold=True,chosen=True)
        net.add_node(second, second, title=secondTitle, color=secondColor,labelHighlightBold=True,chosen=True)

        # One edge per category the gram belongs to.
        for groupName,gramList in gramGroups.items():
            if gram not in gramList:
                continue
            net.add_edge(first,second,title=gram,color=edgeColors[groupName],width=10,arrows='hide')
    net.set_edge_smooth('dynamic')
    net.repulsion()
    net.show(title+".html")


In [5]:
'''Catalog authors, grams, and TCPIDs'''
file = '/srv/data/metadata/tuning/relevant.csv'
auths = authors(file,tobaccoNgrams.keys())
# Every author name of every selected text (repeats included); the printed
# figure is the number of distinct names.
uniqueAuths = [name for info in auths.values() for name in info[0]]
print(len(set(uniqueAuths)))

# aToGramText: author -> {TCPID: distinct grams of that text}
# authGrams:   author -> grams of all of the author's texts (a gram repeats
#              when several of the author's texts contain it)
aToGramText = {}
authGrams = {}
for TCPID, infoList in auths.items():
    for auth in infoList[0]:
        # Author names are keyed with leading/trailing brackets removed.
        auth = auth.strip(r'\[').strip(r'\]')
        authGrams.setdefault(auth, []).extend(list(set(tobaccoNgrams[TCPID])))
        aToGramText.setdefault(auth, {})[TCPID] = list(set(tobaccoNgrams[TCPID]))

# gramToAuths: gram -> authors using it (one entry per occurrence in authGrams)
gramToAuths = {}
for auth, grams in authGrams.items():
    for gram in grams:
        gramToAuths.setdefault(gram, []).append(auth)


84


In [71]:
'''Get node colors based on clustering results'''
# authCluster: author -> (sum of the cluster numbers of the author's texts,
#                         number of texts counted)
authCluster = {}
for TCPID,infoList in auths.items():
    # Use the cluster number stored as the key of `groups` (the same number
    # that keys `groupColors`) rather than the position in the dict.
    for groupIdx,group in groups.items():
        if TCPID not in group:
            continue
        for auth in infoList[0]:
            # Key by the same bracket-stripped name that the n-gram catalog
            # (aToGramText / gramToAuths) uses, so that the colour lookup for
            # an edge's authors cannot miss for a name written as "[Name]".
            auth = auth.strip(r'\[').strip(r'\]')
            clusterSum,textCount = authCluster.get(auth,(0,0))
            authCluster[auth] = (clusterSum+groupIdx, textCount+1)

# authMap: author -> node colour of the rounded mean cluster number.
# NOTE(review): averaging cluster numbers can give an author a colour of a
# cluster none of their texts is in (e.g. texts in 0 and 4 -> 2), and round()
# sends exact halves to the even neighbour - confirm this is intended.
authMap = {}
for auth,nums in authCluster.items():
    avg = round(nums[0]/nums[1])
    authMap[auth] = groupColors[avg]
print(len(authMap))

84


In [74]:
'''Create and output network'''
# Build the (author pair, gram) edge list, report its size, then render the graph.
edges = gramNetwork(gramToAuths)
print(len(edges))
gramGraph(
    edgelist=edges,
    title='Period 4 authors',
    authMap=authMap,
    heading='1625-1634 Tobacco N-Gram Author Network',
    aToGramText=aToGramText,
)

538


In [7]:
'''Get context windows for n-grams'''
import os,re

def getTexts(folder,searchList):
    '''
    Collect the first line of every wanted text file under `folder`.

    A file is considered when its name contains '.txt'. Its text name is the
    part of the file name before the first '_' (multi-part texts such as
    NAME_1.txt, NAME_2.txt) or, without an underscore, before the first '.'.
    Only names found in `searchList` are read.

    Parameters:
        folder: directory to search; sub-directories are searched as well.
        searchList: container of wanted text names (only `in` is used).

    Returns:
        A dictionary of text name -> text. The parts of a multi-part text are
        joined with a single space in sorted file-name order, and replace a
        plain NAME.txt of the same name. An empty file contributes ''. The
        result is {} when nothing is found (including a missing folder).
    '''
    fileToText = {}
    underscores = {}
    for root,dirs,files in os.walk(folder):
        # Sort so the traversal, and therefore the order in which parts are
        # concatenated, does not depend on directory listing order.
        dirs.sort()
        for file in sorted(files):
            if '.txt' not in file: continue
            isPart = '_' in file
            name = file.split('_')[0] if isPart else file.split('.')[0]
            if name not in searchList: continue
            # Join with `root` (the directory being visited), not `folder`,
            # so files inside sub-directories resolve to a real path.
            with open(os.path.join(root,file),'r') as f:
                # readline() gives '' for an empty file instead of failing.
                text = f.readline()
            if not isPart:
                fileToText[name] = text
            elif name in underscores:
                underscores[name] = underscores[name] + ' ' + text
            else:
                underscores[name] = text
    # Merge only after the whole tree has been walked.
    fileToText.update(underscores)
    return fileToText
   
def context(searchgram,textName,texts=None,radius=120):
    '''
    Return the context windows around every occurrence of `searchgram` in a text.

    Parameters:
        searchgram: the string to look for.
        textName: key of the text to search.
        texts: mapping of text name -> text; defaults to the notebook-level
            `bigramdata` when omitted.
        radius: number of characters kept before and after the start of each
            occurrence (default 120).

    Returns:
        A list with one window string per occurrence (overlapping occurrences
        included), or [] when `searchgram` does not occur. Each window is the
        character slice around the occurrence, split on single spaces, with
        its first and last word dropped (at a cut they are partial words) and
        re-joined with spaces.

    Raises:
        KeyError: if `textName` is not a key of `texts`.
    '''
    if texts is None:
        texts = bigramdata
    text = texts[textName]
    # Always hand back a list so callers can iterate without a None check.
    if searchgram not in text:
        return []
    windows = []
    for index in range(len(text)):
        if not text.startswith(searchgram, index):
            continue
        start = index - radius if index > radius else 0
        words = text[start:index+radius].split(' ')
        # Slicing (unlike two `del`s) is safe when the window is a single word.
        windows.append(' '.join(words[1:-1]))
    return windows

In [8]:
'''Map n-grams to TCPIDs'''
from tobaccoTexts import per4
# Text of every Period 4 document, keyed by text name.
bigramdata = getTexts('/srv/data/relevantEPBodyNOSTOP', per4)

file = '/srv/data/metadata/tuning/relevant.csv'
# textGrams: TCPID -> the text's distinct n-grams with '_' turned into ' '.
textGrams = {}
for TCPID, gramFeatures in tobaccoNgrams.items():
    nGrams = list(set(gramFeatures))
    textGrams[TCPID] = [re.sub('_',' ',gram) for gram in nGrams]
print(len(textGrams))

81


In [48]:
'''Get context windows into a dictionary'''
# contexts: TCPID -> {gram with spaces turned back into '_': result of
# context() for the spaced gram in that text}
contexts = {}
for TCPID,gramList in textGrams.items():
    contexts[TCPID] = {
        re.sub(' ','_',gram): context(gram,TCPID)
        for gram in gramList
    }

In [55]:
# textToCluster: TCPID -> number of the cluster whose list contains it (the
# last such cluster in `groups` order, should a TCPID be listed in several).
textToCluster = {
    TCPID: groupIdx
    for TCPID in auths.keys()
    for groupIdx, group in groups.items()
    if TCPID in group
}


In [65]:
'''Write out documentation for each text in HTML format (ready to be copied and pasted into a HTML doc)'''
# For every text: a bold heading with its cluster, then a list holding the
# author, title and date, followed by one item per n-gram whose context
# windows are each written as a nested one-item list.
# NOTE: the file is opened in 'a+' mode, so running this cell again appends
# another copy of the documentation to the existing file.
# The `with` block guarantees the file is flushed and closed.
with open('/srv/data/amy/per4doc.txt','a+') as outfile:
    for TCPID, infoList in auths.items():
        outfile.write(f'<b>{TCPID}:</b> Cluster {textToCluster[TCPID]} <ul><li><b>Author: </b> {infoList[0]}</li><li><b>Title: </b> {infoList[1]}</li><li><b>Date: </b> {infoList[2]}</li>')
        gramDict = contexts[TCPID]
        for gram, windows in gramDict.items():
            outfile.write(f'<li><b>{gram}</b>')
            # `windows` may be None when the gram was not found in the text.
            for w in windows or []:
                outfile.write(f'<ul><li>{w}</li></ul>')
            # Close the gram's list item so the markup is well formed.
            outfile.write('</li>')
        outfile.write('</ul><br>')
    outfile.write('\n\n\n')
