# Creeps. 

  <font color='fuchsia'>Reading a .bib file, matching by name, and returning a message.</font>


TODO
  - if it's got a "." kill the "."
  - if it's got a word that is only 1 letter long, kill the word (that's likely gonna be an initial).
  - kill " Jr "; " Sr " .... any others like that?

In [None]:
from urllib.request import urlretrieve

# Fetch the ASMD spreadsheet export and store it in the working directory.
filename = "asmd_incidents.xls"
url = "https://academic-sexual-misconduct-database.org/incidents/download_excel"
urlretrieve(url=url, filename=filename)

In [None]:
import pandas as pd
from datetime import datetime, date 

# Take the database of offenders and return names. Turn this into a csv, just rip out what i want, so that later, i can do this without pandas.

# `usecols` only selects columns; the resulting order follows the spreadsheet.
# Index with the same list afterwards so the positional rename below is always right.
columns = ['Person', 'Institution', 'Original Link(s)']
df = pd.read_excel('asmd_incidents.xls', usecols=columns)[columns]  # Just load the usable columns.

# Drop rows without a usable name; reset_index returns a fresh frame, so the
# column assignments below do not operate on a slice of the original.
df = df[(df.Person != 'NAME UNKNOWN') & ~df.Person.isna()].reset_index(drop=True)

df['Person'] = df['Person'].astype(str).str.lower()
# df['Person'] = df['Person'].str.replace(',',"") # This is necessary for later on. There's one bloke who is so and so Junior, and has a comma in his name.

df.columns = ['person', 'institution', 'source']

# The csv is later read line by line and split on tabs without a csv parser,
# so a tab or line break inside a cell would corrupt a row. Flatten them to a space.
df = df.replace(r'[\t\r\n]+', ' ', regex=True)

# Add a final line that serves as a datestamp.
df.loc[len(df)] = [date.today(), "", ""]

df.to_csv('asmd.csv', index=False, sep="\t")  # Some of the names have a comma, and even some of URLs, so this helps for the string splits.
print(len(df))
df.tail()

In [None]:
# Check the ASMD database for non-latin characters which may cause issues in matching.
filepath = "asmd.csv"
entries = []

# Decode explicitly as UTF-8 (the encoding pandas writes by default); relying on
# the platform default would garble exactly the characters this cell looks for.
with open(filepath, 'r', encoding='utf-8') as file:
    for line in file:
        entries.append([x.strip() for x in line.split('\t')])

offenders, institutions, sources = zip(*entries)

# Every distinct character used in a name; the last row is the datestamp, so skip it.
chars = sorted({x for offender in offenders[:-1] for x in offender})

# show the non-latin characters.
chars = "".join(x for x in chars if x not in ". abcdefghijklmnopqrstuvwxyz")
print(chars)

In [None]:
# Stand-ins for the three arguments of the \creeps{} latex command; in real use
# the .sty file supplies them.
filepath = "examples.bib"
exceptions = 'dittmann1976strindberg,naqvi1995power,batra1987urban'
update = "checkforupdates"


In [None]:
import sys 

# Take a .bib file, return the authors.
# The extension is optional in the user input, so append it when it is missing.
filepath = filepath if filepath.endswith(".bib") else filepath + ".bib"

class keyauthor:
    """Collect citation keys and author lists while scanning a .bib file line by line.

    ``keys`` and ``authors`` are parallel lists: ``authors[i]`` holds the raw
    author strings of the entry whose key is ``keys[i]``, and is an empty list
    when that entry has no author field.
    """

    def __init__(self, keys=None, authors=None):
        # Build fresh lists per instance; list literals as defaults would be
        # shared by every instance created without arguments.
        self.keys = [] if keys is None else keys
        self.authors = [] if authors is None else authors

    def update(self, line):
        """Feed one line of the .bib file into the collector.

        A line starting with "@" opens an entry: the lower-cased text between
        the first and second "{" becomes the key. A line of the form
        ``author = {...}`` or ``author = "..."`` supplies that entry's authors,
        split on " and ". Every other line is ignored.
        """
        stripped = line.strip()

        if stripped.startswith("@"):
            pieces = stripped.split("{")
            if len(pieces) < 2:
                # No brace on the line, so there is no key to read.
                return
            self.keys.append(pieces[1].rstrip(',').strip().lower())
            # Reserve the author slot now so both lists stay the same length
            # even when the entry turns out to have no author field.
            self.authors.append([])
            return

        # Require the field name to be exactly "author" so that other fields
        # (or free text that merely mentions the word) are not picked up.
        field, equals, value = stripped.partition("=")
        if not equals or field.strip().lower() != "author":
            return

        value = value.strip()
        if value.startswith('"'):
            body = value.split('"', 1)[1].rsplit('"', 1)[0]
        elif "{" in value:
            body = value.split("{", 1)[1].rsplit("}", 1)[0]
        else:
            return
        names = body.split(" and ")

        if self.keys and len(self.authors) == len(self.keys):
            self.authors[-1] = names
        else:
            # Author field with no open entry slot: fall back to appending.
            self.authors.append(names)

# Scan the .bib file, then drop the entries the user asked to ignore.
ka = keyauthor()
try:
    # errors="replace" keeps a stray non-UTF-8 byte from aborting the scan.
    with open(filepath, encoding="utf-8", errors="replace") as refs:
        for line in refs:
            try:
                ka.update(line)
            except IndexError:
                # A line the parser cannot split is skipped, rather than being
                # reported below as a missing file.
                continue
except OSError:
    # Only genuine file-access failures produce the "not found" message.
    print("\\PackageError{creeps}{Ooops. Your bibfile was not found}{If your bibfile isn't in the same folder as your .tex file, use a relative filepath.}")
    sys.exit()

if not ka.authors:
    sys.exit()

exceptions = [x.strip() for x in exceptions.split(',')]
for x in exceptions:
    # Keys are stored lower-cased, so compare case-insensitively.
    key = x.lower()
    if key not in ka.keys:
        continue
    pm = ka.keys.index(key)
    ka.keys.pop(pm)
    if pm < len(ka.authors):
        ka.authors.pop(pm)

# There are special characters in names. The way .bib files deal with these is different from vanilla LaTeX. I break these down into two categories of characters. single_purpose characters, like "ae", stand for just one single letter. The others are composite, and modify the letter that comes after them.
# They are entered like this, e.g. {\\L} or {\\c c}

# These three links explain what's going on.
      # https://tex.stackexchange.com/questions/8857/how-to-type-special-accented-letters-in-latex
      # https://en.wikibooks.org/wiki/LaTeX/Special_Characters#Escaped_codes
      # https://tex.stackexchange.com/questions/57743/how-to-write-ä-and-other-umlauts-and-accented-letters-in-bibliography

single_purpose = ["\\L", "\\l", "\\O", "\\o", "\\AA", "\\aa", "\\AE", "\\ae", "\\OE", "\\oe", "\\ss", "\\i", "\\j"]

# For dealing with special (in the sense that they're not used in the english language) characters, i bastardise as follows.
# Both cases are listed so that cleanup() works whether or not the name was lower-cased first.
sp = {
    "\\L": 'L',
    "\\l": 'l',
    "\\O": "O",
    "\\o": 'o',
    "\\AA": 'A',
    "\\aa": 'a',
    "\\AE": 'AE',
    "\\ae": 'ae',
    "\\OE": 'OE',
    "\\oe": 'oe',
    "\\ss": 'ss',  # not technically correct, but very much good enough.
    "\\i": 'i',
    "\\j": 'j',
}

# if these exist, remove them.
multi_purpose = ["\\t", "\\b", "\\d", "\\c", "\\H", "\\v", "\\r", "\\u", '\\"', "\\.", "\\=", "\\~", "\\^", "\\'", "\\`"]
# Names are usually lower-cased before cleanup, which turns "\H" into "\h";
# keep the original spellings and add the lower-cased ones that differ.
mp = multi_purpose + [x.lower() for x in multi_purpose if x.lower() not in multi_purpose]


def cleanup(name, mp=mp, sp=sp):
    """Return ``name`` with .bib accent markup reduced to plain letters.

    Braces are dropped, every accent command in ``mp`` is removed so that only
    the letter it modifies remains, and every command in ``sp`` is replaced by
    its plain-letter stand-in.
    """
    name = name.replace("{", "").replace("}", "")
    for accent in mp:
        # "{\c c}" separates the command from its letter with a blank; remove
        # that blank too, otherwise the name is split in two ("fran cois").
        name = name.replace(accent + " ", "").replace(accent, "")
    for command, plain in sp.items():
        name = name.replace(command, plain)
    return name

# If the name is of the form: "last_name, first_name (middle_names)", flip it around and remove the comma, so it is of the form: "first_name (middle names) last_name"
# Every name is lower-cased before cleanup, including names that contain no
# comma: the database names are lower-case, so nothing else can match.
for i, entry_authors in enumerate(ka.authors):
    flipped = []
    for x in entry_authors:
        if "," in x:
            parts = x.split(",")
            # "Last, First" and "Last, Jr, First": the first name is always the
            # final part, the last name always the first part.
            x = parts[-1].strip() + " " + parts[0].strip()
        flipped.append(cleanup(x.strip().lower()))
    ka.authors[i] = flipped

names = [item for sublist in ka.authors for item in sublist]

# Turn to dictionary
bibd = dict(zip(ka.keys, ka.authors))

# Take the csv, using vanilla python, return a dictionary with names and links.
entries = []

# Read asmd.csv as UTF-8 so accented names decode the same on every platform;
# errors="replace" keeps a stray bad byte from aborting the run.
try:
    with open('asmd.csv', 'r', encoding='utf-8', errors='replace') as file:
        for line in file:
            if not line.strip():
                # A blank line carries no row.
                continue
            entries.append([x.strip() for x in line.split('\t')])
except OSError:
    # Only file-access failures are reported as a missing database.
    print("\\PackageError{creeps}{Ooops. asmd.csv was not found}{Import it from: https://raw.githubusercontent.com/alistaircameron/creeps/main/asmd.csv}")
    sys.exit()


# Grab the date the file was downloaded, if it was a long time ago, ask them to update.
from datetime import datetime, date

# The last row is expected to hold the download date in its first field.
try:
    downloaded = datetime.strptime(entries[-1][0], "%Y-%m-%d").date()
except (IndexError, ValueError):
    # No rows at all, or the last row is not a datestamp.
    downloaded = None

if downloaded is not None:
    # Remove the datestamp row only when it really is one, so a file without
    # it does not silently lose its last record.
    entries = entries[:-1]
    # Age is today minus the download date; the reverse is negative for any
    # past date and could never exceed the threshold.
    if update.strip().lower() != 'false' and (date.today() - downloaded).days > 90:
        print("\\PackageWarningNoLine{creeps}{You have not updated the offenders database in 3+ months. To do so, navigate to asmd.csv, and click refresh}")

# Create a dictionary of the above, but first creating separate lists.
# Rows with fewer than three fields are skipped, and an empty file gives empty
# tuples, instead of failing while unpacking.
rows = [row for row in entries if len(row) >= 3]
offenders = tuple(row[0] for row in rows)
institutions = tuple(row[1] for row in rows)
sources = tuple(row[2] for row in rows)
offd = dict(zip(offenders, zip(institutions, sources)))

# This works nicely only for exact name matching.
# The dictionary's keys are the offender names, so test membership there
# rather than scanning the whole tuple once per author.
warnings = {name for name in names if name in offd}

if warnings:
    # Characters LaTeX treats specially; escape them in everything printed as
    # running text (an "&" in an institution or "_" in a bib key would
    # otherwise stop the compile). str.translate does this in a single pass.
    tex_escape = str.maketrans({
        "\\": "\\textbackslash{}",
        "&": "\\&",
        "%": "\\%",
        "$": "\\$",
        "#": "\\#",
        "_": "\\_",
        "{": "\\{",
        "}": "\\}",
        "~": "\\textasciitilde{}",
        "^": "\\textasciicircum{}",
    })

    # Now, match the database and the .bib file.
    # Return the relevant bibliography identifiers for each offender.
    # Fix the order once so the parallel lists below line up.
    matched = list(warnings)
    article_ids, institutions, sources = [], [], []  # this overwrites previous stuff. ugly, i know.
    for w in matched:
        article_ids.append([k for k, v in bibd.items() if w in v])
        institution, source = offd[w]
        institutions.append(institution)
        sources.append(source)

    print("\\color{red}")
    print("\\begin{itemize}")
    for w, institution, art_id, source in zip(matched, institutions, article_ids, sources):
        who = w.translate(tex_escape)
        where = institution.translate(tex_escape)
        cited = ', '.join(k.translate(tex_escape) for k in art_id)
        message = f"{who} ({where}; {cited}) is matched to the database of offenders. For details, see "

        print("\\item", message)

        # Now, cycle through, print it if it's the final source in the list with a full stop after, otherwise print with a comma.
        links = source.split(";")
        for k, j in enumerate(links):
            tail = "." if k == len(links) - 1 else ", "
            # URLs go into \href unescaped; only surrounding blanks are trimmed.
            print(f"\\href{{{j.strip()}}}{{here}}{tail}")

        print('\n')
    print("\\end{itemize}")
    print("\\color{black}")

# Strip the above, so that latex runs a bit faster.

<font color = 'fuchsia'>This greatly reduces overleaf compile time. This is the only python input for the .sty file.</font>

In [None]:
import sys 
# Append the .bib extension when the user left it off.
filepath = filepath if filepath.endswith(".bib") else filepath + ".bib"
class keyauthor:
    """Collect parallel lists of citation keys and author lists from .bib lines."""
    def __init__(self, keys=None, authors=None):
        # Fresh lists per instance; list defaults would be shared between instances.
        self.keys = [] if keys is None else keys
        self.authors = [] if authors is None else authors
    def update(self, line):
        """Record the key of an "@" line, or the authors of an author field."""
        stripped = line.strip()
        if stripped.startswith("@"):
            pieces = stripped.split("{")
            if len(pieces) < 2:
                return
            self.keys.append(pieces[1].rstrip(',').strip().lower())
            # Reserve the author slot so both lists stay the same length.
            self.authors.append([])
            return
        # Only a field named exactly "author" counts.
        field, equals, value = stripped.partition("=")
        if not equals or field.strip().lower() != "author":
            return
        value = value.strip()
        if value.startswith('"'):
            body = value.split('"', 1)[1].rsplit('"', 1)[0]
        elif "{" in value:
            body = value.split("{", 1)[1].rsplit("}", 1)[0]
        else:
            return
        names = body.split(" and ")
        if self.keys and len(self.authors) == len(self.keys):
            self.authors[-1] = names
        else:
            self.authors.append(names)
# Scan the .bib file, then drop the entries the user asked to ignore.
ka = keyauthor()
try:
    with open(filepath, encoding="utf-8", errors="replace") as refs:
        for line in refs:
            try:
                ka.update(line)
            except IndexError:
                # Skip a line the parser cannot split, rather than reporting a missing file.
                continue
except OSError:
    print("\\PackageError{creeps}{Ooops. Your bibfile was not found}{If your bibfile isn't in the same folder as your .tex file, use a relative filepath.}")
    sys.exit()
if not ka.authors:
    sys.exit()
exceptions = [x.strip() for x in exceptions.split(',')]
for x in exceptions:
    # Keys are stored lower-cased, so compare case-insensitively.
    key = x.lower()
    if key not in ka.keys:
        continue
    pm = ka.keys.index(key)
    ka.keys.pop(pm)
    if pm < len(ka.authors):
        ka.authors.pop(pm)
single_purpose = ["\\L", "\\l", "\\O", "\\o", "\\AA", "\\aa", "\\AE", "\\ae", "\\OE", "\\oe", "\\ss", "\\i", "\\j"]
# Both cases are listed so cleanup() works whether or not the name was lower-cased first.
sp = {"\\L" : 'L', "\\l" : 'l', "\\O" : "O", "\\o" : 'o', "\\AA" : 'A', "\\aa" : 'a', "\\AE" : 'AE', "\\ae" : 'ae', "\\OE" : 'OE', "\\oe" : 'oe', "\\ss" : 'ss', "\\i" : 'i', "\\j" : 'j'}
multi_purpose = ["\\t", "\\b", "\\d", "\\c", "\\H", "\\v", "\\r", "\\u", '\\"', "\\.", "\\=", "\\~" , "\\^", "\\'", "\\`"]
# Keep the original spellings and add the lower-cased ones that differ ("\H" -> "\h").
mp = multi_purpose + [x.lower() for x in multi_purpose if x.lower() not in multi_purpose]
def cleanup(name, mp=mp, sp=sp):
    """Return name with braces and accent commands removed and special letters replaced."""
    name = name.replace("{", "").replace("}", "")
    for accent in mp:
        # Also remove the blank in "{\c c}", otherwise the name is split in two.
        name = name.replace(accent + " ", "").replace(accent, "")
    for command, plain in sp.items():
        name = name.replace(command, plain)
    return name
# Rewrite every author as lower-case "first last"; names without a comma are
# lower-cased too, otherwise they can never match the lower-case database.
for i, entry_authors in enumerate(ka.authors):
    flipped = []
    for x in entry_authors:
        if "," in x:
            parts = x.split(",")
            # "Last, First" and "Last, Jr, First": first name is the final part.
            x = parts[-1].strip() + " " + parts[0].strip()
        flipped.append(cleanup(x.strip().lower()))
    ka.authors[i] = flipped
names = [item for sublist in ka.authors for item in sublist]
bibd = dict(zip(ka.keys, ka.authors))
# Read asmd.csv as UTF-8 into a list of stripped, tab-separated rows.
entries = []
try:
    with open('asmd.csv', 'r', encoding='utf-8', errors='replace') as file:
        for line in file:
            if not line.strip():
                continue
            entries.append([x.strip() for x in line.split('\t')])
except OSError:
    print("\\PackageError{creeps}{Ooops. asmd.csv was not found}{Import it from: https://raw.githubusercontent.com/alistaircameron/creeps/main/asmd.csv}")
    sys.exit()
# The last row is expected to hold the download date; warn when it is over 90 days old.
from datetime import datetime, date
try:
    downloaded = datetime.strptime(entries[-1][0], "%Y-%m-%d").date()
except (IndexError, ValueError):
    downloaded = None
if downloaded is not None:
    # Remove the row only when it really is a datestamp.
    entries = entries[:-1]
    # Age is today minus the download date; the reverse could never exceed 90.
    if update.strip().lower() != 'false' and (date.today() - downloaded).days > 90:
        print("\\PackageWarningNoLine{creeps}{You have not updated the offenders database in 3+ months. To do so, navigate to asmd.csv, and click refresh}")
# Skip rows with fewer than three fields so unpacking cannot fail.
rows = [row for row in entries if len(row) >= 3]
offenders = tuple(row[0] for row in rows)
institutions = tuple(row[1] for row in rows)
sources = tuple(row[2] for row in rows)
offd = dict(zip(offenders, zip(institutions, sources)))
# offd's keys are the offender names; a dict lookup replaces a scan of the tuple.
warnings = {name for name in names if name in offd}
if warnings:
    # Escape LaTeX special characters in everything printed as running text.
    tex_escape = str.maketrans({"\\": "\\textbackslash{}", "&": "\\&", "%": "\\%", "$": "\\$", "#": "\\#", "_": "\\_", "{": "\\{", "}": "\\}", "~": "\\textasciitilde{}", "^": "\\textasciicircum{}"})
    # Fix the order once so the parallel lists below line up.
    matched = list(warnings)
    article_ids, institutions, sources = [], [], []
    for w in matched:
        article_ids.append([k for k, v in bibd.items() if w in v])
        institution, source = offd[w]
        institutions.append(institution)
        sources.append(source)
    print("\\color{red}")
    print("\\begin{itemize}")
    for w, institution, art_id, source in zip(matched, institutions, article_ids, sources):
        who = w.translate(tex_escape)
        where = institution.translate(tex_escape)
        cited = ', '.join(k.translate(tex_escape) for k in art_id)
        message = f"{who} ({where}; {cited}) is matched to the database of offenders. For details, see "
        print("\\item", message)
        # One link per source; the last ends with a full stop, the others with a comma.
        links = source.split(";")
        for k, j in enumerate(links):
            tail = "." if k == len(links) - 1 else ", "
            print(f"\\href{{{j.strip()}}}{{here}}{tail}")
        print('\n')
    print("\\end{itemize}")
    print("\\color{black}")