In [1]:
'''
Social network visualization
'''
import re
import pandas as pd
from pyvis.network import Network

def authors(csv,search):
    '''
    Extract text IDs and standardize authors' names.

    Parameters
    ----------
    csv : anything accepted by ``pandas.read_csv`` (path or file-like).
        The table must contain an ``id`` column and an ``author`` column;
        several authors in one cell are separated by ``;``.
    search : container of text IDs to keep (only ``in`` is used on it).

    Returns
    -------
    dict
        Maps each matching text ID (whitespace-stripped) to the list of its
        unique author names, in the order they first appear in the cell.
        Names containing ``printer`` or ``engraver`` are dropped.
    '''
    df = pd.read_csv(csv)
    found = {}
    for TCPID, rawAuthors in zip(df['id'], df['author']):
        # Store the same stripped form that is compared against `search`,
        # otherwise later lookups keyed by the search IDs can miss.
        TCPID = TCPID.strip()
        if TCPID not in search:
            continue
        names = []
        # An empty author cell is read by pandas as NaN (a float), not a str.
        if isinstance(rawAuthors, str):
            for name in rawAuthors.split(';'):
                name = name.strip()
                # Skip blanks (including whitespace-only pieces) and repeats.
                if not name or name in names:
                    continue
                if re.search('printer|engraver', name):
                    continue
                names.append(name)
        found[TCPID] = names
    return found

In [2]:
# Get tobacco n-gram features.
# Each useful line has the form "<text id>:<gram> <gram> ..."; the text
# between the first and second colon is split on single spaces.
with open('/srv/data/periodFeatures/period4features/newtobaccograms.txt','r') as inFile:
    inFileLines = inFile.readlines()
tobaccoNgrams = {}
for line in inFileLines:
    line = line.split(':')
    # Blank or colon-less lines carry no features; skip instead of raising IndexError.
    if len(line) < 2: continue
    features = line[1].strip()
    if features == '': continue
    tobaccoNgrams[line[0].strip()] = features.split(' ')
print(len(tobaccoNgrams))

'''Clustering results for tobacco Period 4'''
# Each GroupN list holds the text IDs assigned to one cluster; the IDs are
# compared against the keys of `auths` when authors are coloured.
Group0 = ['B12280', 'A10187', 'A03411', 'A19036', 'A68568', 'A11626', 'A14301', 'A14916', 'A15627', 'A09156', 'A00583', 'A19765', 'A20769', 'A12481', 'A13534', 'A14305', 'A08017', 'A09118', 'A08578']
Group2 = ['A01055', 'A20065', 'A20356', 'A69225', 'A11151', 'A12155', 'A13520', 'A08977', 'A19012', 'A03559', 'A13948', 'A07024', 'A04633', 'B00490', 'A72399', 'A68252', 'A20075', 'A02827', 'A02836', 'A03807', 'A13415', 'A13436', 'A05320', 'A16489', 'A06692', 'A07025', 'A09242', 'B01146', 'A72254', 'A10402', 'A21075', 'A11153', 'A02909', 'A03240', 'A04658', 'A16924', 'A08690']
Group1 = ['A22447', 'A22574', 'A22435', 'A22537', 'A22567', 'A22439', 'A22571']
Group3 = ['A02758', 'A06902', 'A16627', 'A01425', 'A03065', 'A05657', 'A11156', 'A01552', 'A06768', 'A01424']
Group4 = ['A15685', 'A16507', 'A72911', 'A02325', 'A12471', 'A09010', 'A12458', 'A17595']
# Insertion order matters: later code enumerates `groups.values()`, so the
# position of an entry here (Group0 first ... Group4 last) is the cluster index.
groups = {'christianShip':Group0,'tobaccoPlantation':Group1,'shipMoney':Group2,'sugarMedicine':Group3,'plantationEngland':Group4}
# Node colour per cluster index (0-4), looked up with that enumeration index.
groupColors = {0:'pink',1:'purple',2:'darkblue',3:'plum',4:'palevioletred'}


81


In [11]:
# Regex alternations, one per edge-colour category. gramGraph runs
# re.search(pattern, gram) over gramGroups in insertion order and stops at
# the first hit, so the order of the entries below decides ties.
# NOTE(review): the patterns are unanchored, so they also match inside longer
# words (e.g. 'his' inside 'this', 'use' inside 'house') - confirm intended.
green = 'plant|strong|growth|sort|sophisticate|call|leaf'
beige = 'whiff|smoke|pipe|use|take|have|light|keep'
blue = 'spanish|indian|foreign|english'
purple = 'sell|quantity|ounce|piece|shop|alehouse|tavern|retail|house|two|sack'
orange = 'wine|ale|drink|other|weed'
pink = 'concern|well|love|good|much|such'
yellow = 'when|almost|often|upon|there|all|still|out|may|any|than|whereas|this|nor|itself|which|that|not|will|into|but|within|shall|each|through|but|only|come'
lavender = 'your|they|doctor|our|his|their|man'

# Category name -> pattern.
gramGroups = dict(
    green=green,
    beige=beige,
    blue=blue,
    purple=purple,
    orange=orange,
    pink=pink,
    yellow=yellow,
    lavender=lavender,
)

# Category name -> colour actually drawn on the edge.
edgeColors = dict(
    green='#90C8AC',
    beige='#F9EBC8',
    blue='#CCF1FF',
    purple='#edc7ed',
    orange='#FFD9C0',
    pink='#FFCCE1',
    yellow='lemonchiffon',
    lavender='#E0D7FF',
)
from itertools import combinations
def gramNetwork(auths):
    '''
    Catalogs edges (as well as the edge value, i.e., n-gram) between nodes.

    Parameters
    ----------
    auths : dict mapping an n-gram to the list of authors who use it.

    Returns
    -------
    list of tuples ``((authorA, authorB), gram)``, one per unordered pair of
    distinct authors per n-gram, in first-seen order.
    '''
    edgelist = []
    seen = set()  # (unordered pair, gram) already emitted; O(1) duplicate check
    for gram, authorList in auths.items():
        for first, second in combinations(authorList, 2):
            # A name repeated in the list would pair with itself; an author
            # sharing a gram with themselves is not an edge between nodes.
            if first == second:
                continue
            key = (frozenset((first, second)), gram)
            if key in seen:
                continue
            seen.add(key)
            edgelist.append(((first, second), gram))
    return edgelist

def createTitle(auth,authInfo):
    '''
    Build the multi-line text describing one author.

    Parameters
    ----------
    auth : author name, used as the first line (followed by ``:``).
    authInfo : dict mapping a text ID to the list of n-grams of that text.

    Returns
    -------
    str
        ``"<auth>:\\n"`` followed by one entry per text of the form
        ``"<id>: g1, g2,\\n\\tg3, g4,\\n\\tg5\\n"`` - grams are laid out two per
        line, continuation lines start with a tab. A text with no grams
        yields ``"<id>: \\n"``.
    '''
    entries = []
    for TCPID, gramList in authInfo.items():
        grams = [str(item) for item in gramList]
        # Chunk into pairs; building the string fresh for every text means an
        # empty list can neither fail nor inherit the previous text's grams.
        rows = [', '.join(grams[start:start + 2]) for start in range(0, len(grams), 2)]
        gramString = ',\n\t'.join(rows)
        entries.append(f'{TCPID}: {gramString}\n')
    items = ''.join(entries)
    return f'{auth}:\n{items}'

def gramGraph(edgelist,title,authMap,heading,aToGramText):
    '''
    Draw the author network with pyvis and write it to ``<title>.html``.

    Parameters
    ----------
    edgelist : iterable of ``((authorA, authorB), gram)`` tuples.
    title : output file name without the ``.html`` extension.
    authMap : dict mapping author name to node colour.
    heading : heading text passed to the pyvis ``Network``.
    aToGramText : dict mapping author name to ``{text id: [grams]}``; used
        for the node hover text.

    Returns
    -------
    Whatever ``Network.show`` returns. With ``notebook=True`` pyvis is
    expected to return an IFrame, so calling this as the last expression of
    a cell renders the graph inline - TODO confirm for the installed pyvis.

    Raises
    ------
    KeyError if an author in ``edgelist`` is missing from ``authMap`` or
    ``aToGramText``.
    '''
    # Create pyvis graph
    g = Network(width=800,height=1000,notebook=True,heading=heading,bgcolor='white',font_color='black')
    hoverText = {}  # author -> tooltip, built once instead of once per edge
    for authPair,gram in edgelist:
        if gram=='tobacco_tobacco':continue
        a1,a2 = authPair[0],authPair[1]
        a1Color,a2Color = authMap[a1],authMap[a2]

        for auth in (a1,a2):
            if auth not in hoverText:
                hoverText[auth] = createTitle(auth,aToGramText[auth])

        # Both endpoints are registered even when no colour group matches
        # the gram below (i.e. when no edge is drawn for this pair).
        g.add_node(a1, a1, title=hoverText[a1], color=a1Color,labelHighlightBold=True,chosen=True)
        g.add_node(a2, a2, title=hoverText[a2], color=a2Color,labelHighlightBold=True,chosen=True)
        # First matching category wins; gramGroups order therefore matters.
        for color,gramList in gramGroups.items():
            if re.search(gramList,gram):
                g.add_edge(a1,a2,title=gram,color=edgeColors[color],width=10)
                break
    return g.show(title+".html")

In [4]:
file = '/srv/data/metadata/tuning/relevant.csv'
auths = authors(file,tobaccoNgrams.keys())
uniqueAuths = []
for authList in auths.values():
    uniqueAuths.extend(authList)
print(len(set(uniqueAuths)))

# aToGramText: author -> {text id: [grams of that text]}
# authGrams:   author -> [every gram used in any of the author's texts]
aToGramText = {}
authGrams = {}
for TCPID, authList in auths.items():
    # Order-preserving de-duplication keeps the result stable between runs
    # (list(set(...)) ordering changes with string hashing).
    textGrams = list(dict.fromkeys(tobaccoNgrams[TCPID]))
    for auth in authList:
        # Drop surrounding square brackets (and backslashes) from the name.
        auth = auth.strip(r'\[').strip(r'\]')
        known = authGrams.setdefault(auth, [])
        # Record each gram once per author; repeats would list the author
        # several times under one gram and pair the author with themselves.
        known.extend([gram for gram in textGrams if gram not in known])
        aToGramText.setdefault(auth, {})[TCPID] = list(textGrams)

# Invert: gram -> [authors using it], each author at most once.
gramToAuths = {}
for auth, grams in authGrams.items():
    for gram in grams:
        gramToAuths.setdefault(gram, []).append(auth)


84


In [5]:
# authCluster: author -> (sum of cluster indices, number of clustered texts)
authCluster = {}
for TCPID,authList in auths.items():
    for idx,group in enumerate(groups.values()):
        if TCPID in group:
            for auth in authList:
                # Normalise the name the same way the n-gram index does, so
                # the keys of authMap match the node names used in the graph.
                auth = auth.strip(r'\[').strip(r'\]')
                total,texts = authCluster.get(auth,(0,0))
                authCluster[auth] = (total+idx, texts+1)

# Colour each author by the rounded mean cluster index of their texts.
# NOTE(review): for an author spread over several clusters the mean can land
# on a cluster none of their texts belongs to, and round() rounds halves to
# the nearest even integer - confirm this is the intended colouring.
authMap = {}
for auth,nums in authCluster.items():
    avg = round(nums[0]/nums[1])
    authMap[auth] = groupColors[avg]
print(len(authMap))

84


In [12]:
# Build the shared-n-gram edge list, report its size, then render the network.
edges = gramNetwork(gramToAuths)
print(len(edges))
gramGraph(
    edgelist=edges,
    title='Period 4 authors',
    authMap=authMap,
    heading='1625-1634 Tobacco N-Gram Author Network',
    aToGramText=aToGramText,
)

581


In [17]:
# Scratch helper: turn a comma-separated word list into a regex alternation.
new = [word.strip() for word in 'your, they, doctor, our, his, their, man'.split(',')]
merge = '|'.join(new)
merge

'your|they|doctor|our|his|their|man'