In [1]:
import sys; sys.path.append(_dh[0].split("knowknow")[0])
from knowknow import *

database_name = 'wos'

In [2]:
import string_grouper
import editdistance

In [3]:
# the final variable we are constructing
groups = {}

# tracks the last group-id assigned
new_gid = 0

In [4]:
try:
    cysum = load_variable("%s.cysum" % database_name)
except VariableNotFound:
    print("You need to generate cysum before running this notebook.")

In [5]:
def isarticle(x):
    sp = x.split("|")
    if len(sp) < 2:
        return False
    
    try:
        int(sp[1])
        return True
    except ValueError:
        return False

strings = [x for x in cysum if '[no title captured]' not in x]

articles = [x for x in strings if isarticle(x)]
books = [x for x in strings if not isarticle(x)]

In [6]:
print("%s articles, %s books to group" % (len(articles), len(books)))

234198 articles, 185089 books to group


# grouping books

In [7]:
# this cell may take quite a while to run.
# on Intel i7-9700F this runs in about a minute on 185k names.

books_grouped = string_grouper.match_strings(
    pd.Series(books), 
    number_of_processes=8, 
    min_similarity=0.7
)

In [8]:
books_grouped[(books_grouped.similarity<1-1e-8)].sort_values("similarity")

Unnamed: 0,left_side,right_side,similarity
80862,hill|human relations,coser|human relations,0.700000
306001,coser|human relations,hill|human relations,0.700000
230482,campbell|methodological issue,denton|methodological issue,0.700003
106423,denton|methodological issue,campbell|methodological issue,0.700003
2384,hardin|collective action,sandler|collective action,0.700003
...,...,...,...
95728,castellani|pathological gamblin,castellani|pathological gambling,0.994837
370660,mcgivering|management britain g,mcgivering|management britain,0.994962
358036,mcgivering|management britain,mcgivering|management britain g,0.994962
76429,minichiello|in depth interviewin,minichiello|indepth interviewing,0.995253


In [9]:
# for books, we require that the authors are no more than 1 edit from each other
# even after limiting the comparisons necessary, this takes about 20s on Intel i7-9700F

ft = defaultdict(set)

for i,r in books_grouped.iterrows():
    ls = r.left_side
    rs = r.right_side
    
    if ls == rs:
        continue
    
    la = ls.split("|")[0]
    ra = rs.split("|")[0]
    
    if editdistance.eval(la,ra) > 1:
        continue
    
    ft[ls].add(rs)
    ft[rs].add(ls)
    
print("%s books have some connection to others in a group" % len(ft))

In [10]:
# assigns group-ids based on the relational structure derived thus far
# the code propagates ids through the network, assuming transitivity of equality

def traverse(x, gid):
    global groups
    groups[x] = gid
    
    neighbors = ft[x]
    for n in neighbors:
        if n not in groups:
            traverse(n, gid)
      
for i,k in enumerate(strings):
    if k in groups:
        continue
        
    traverse(k, new_gid)
    new_gid += 1

# grouping articles

In [11]:
# this cell may take quite a while to run.
# on Intel i7-9700F this runs in five minutes on 234k entries.

articles_grouped = string_grouper.match_strings(
    pd.Series(articles), 
    number_of_processes=8, # decrease this number to 1 or 2 for slower computers or laptops (the fan might start screaming)
    min_similarity=0.8 # the similarity cutoff is tighter for articles than for books
)

In [12]:
articles_grouped[(articles_grouped.similarity<1-1e-8)].sort_values("similarity")

Unnamed: 0,left_side,right_side,similarity
176458,"kian|2009| j broadcast electron, v53, p477","hoffman|2009| j broadcast electron, v53, p3",0.800017
129037,"hoffman|2009| j broadcast electron, v53, p3","kian|2009| j broadcast electron, v53, p477",0.800017
74449,"oliver|1992| disability handicap, v7, p1011","keith|1992| disability handicap, v7, p167",0.800018
218140,"keith|1992| disability handicap, v7, p167","oliver|1992| disability handicap, v7, p1011",0.800018
118937,"samson|2014| ethnic racial stud, v37, p467","platt|2014| ethnic racial stud, v37, p46",0.800028
...,...,...,...
188026,"o'connor|1988| capitalism nature so, v1, p1","o'connor|1988| capitalism nature so, v1, p11",0.992325
40699,"jasanoff|2004| states knowledge cop, p13","jasanoff|2004| states knowledge cop, p1",0.992505
145879,"jasanoff|2004| states knowledge cop, p1","jasanoff|2004| states knowledge cop, p13",0.992505
168139,crompton|1991| journal of park and recreation ...,crompton|1991| journal of park and recreation ...,0.994068


In [13]:
# for articles, we require that the entire citations is only 1 edit apart.
# even after limiting the comparisons necessary, this takes about 20s on Intel i7-9700F

# this cell produces the `ft` variable, which maps from each term to the set of terms equivalent. I.e., `ft[A] = {B1,B2,B3}`

ft = defaultdict(set)

for i,r in articles_grouped.iterrows():
    ls = r.left_side
    rs = r.right_side
    
    if ls == rs:
        continue
    
    la = ls.split("|")[0]
    ra = rs.split("|")[0]
        
    if editdistance.eval(ls,rs) > 1:
        continue
    
    ft[ls].add(rs)
    ft[rs].add(ls)
    #print(ls,"|||",rs)

print("%s articles have some connection to others in a group" % len(ft))

In [15]:
# assigns group-ids based on the relational structure derived thus far
# the code propagates ids through the network, assuming transitivity of equality

def traverse(x, gid):
    global groups
    groups[x] = gid
    
    neighbors = ft[x]
    for n in neighbors:
        if n not in groups:
            traverse(n, gid)

for i,k in enumerate(articles):
    if k in groups:
        continue
        
    traverse(k, new_gid)
    new_gid += 1

In [16]:
# this line will break execution if there aren't as many groups assigned as we have articles and books
assert( len(articles) + len(books) == len(groups) )

In [17]:
len(set(groups.values())) - len(books) - len(articles)

-21400

In [18]:
len(books)

185089

In [19]:
len(articles)

234198

In [20]:
# saving the variable for later
save_variable("%s.groups" % database_name, groups)