# Create all_references files

In [1]:
## Inport packages
from pathlib import Path
import os
import codecs
import re
import logging
import locale
import dataclasses
import copy
from itertools import groupby
from itertools import compress
from datetime import datetime
from typing import List
import jinja2
import pandas as pd
import numpy as np
from DABI_databases import *


## Functions
# Remove known non-keywords from keyword list
def filter_words(variable):
    remove_words = ['a','and','of','the','in', "The", "an", "An", "by", "le", "les", "on", "for", "this",
                    "their", "most", "\n  \n", "\n   \n", "\n     \n", "these", "that", "many", "some", "these", "also",
                    "such", "which", "palestine._id%c3%a9ologies_religieuses_entre_ugarit_et_le_monde_ph%c3%a9nicien_ugarit"]
    if (variable in remove_words):
        return False
    else:
        return True



In [2]:
## From Jonah's file

MYPATH = "./raw_data/"

#reference copy of previous working implementation of spaCy

#this is a new NLP attempt, implementing spaCy to output one large csv with topics inside it
#this version outputs parsed_dabi_filesNLP2.csv a csv with filename, author, year, and topic list

import fnmatch
import os
from pathlib import Path
import pandas as pd
import csv
import sys
from gensim.summarization import keywords
import spacy

#make a function to check for certain strings
def check_if_string_in_file(file_name, string_to_search):
    # Check if any line in the file contains given string
    # Open the file in read only mode
    with open(file_name, 'r') as read_obj:
        # Read all lines in the file one by one
        for line in read_obj:
            # For each line, check if line contains the string
            if string_to_search in line:
                return True
    return False

def search_string_in_file(file_name, string_to_search):
    #Search for the given string in file and return the lines containing that string
    #only works for unique strings in file
    line_number = 0
   
    # Open the file in read only mode
    with open(file_name, 'r') as read_obj:
        # Read all lines in the file one by one
        for line in read_obj:
            # For each line, check if line contains the string
            line_number += 1
            if string_to_search in line:
                # If yes, then add the line to the list
               return line.rstrip()
    

#clear main lists
filename = []
author = []
year = []
topics = []
nlp = spacy.load("en_core_web_sm")

#main program loop
#loop through all the .d files in the directory
for currentfile in os.listdir(MYPATH):
    if fnmatch.fnmatch(currentfile, '*.d'):
        data_folder = Path(MYPATH)
        file_to_open = data_folder / currentfile

        f = open(file_to_open)
        # Check if string '@@@R' is found in file, which means we are referring to material for the Mes/Rel site
        if check_if_string_in_file(file_to_open, '@@@R'):

            #take the filename for each @@@R file and populate the lists (cutting off .d extension)
            filename.append(currentfile[:-2])
            
            #choose AU (author) line and append to list
            au = search_string_in_file(file_to_open, 'AU ')
            if au is None: #make sure we don't get null values in list 
                au = 'AU None'
            author.append(au[3:])
            
            #choose Y (year) line and append to list    
            y = search_string_in_file(file_to_open, 'Y ')
            if y is None: #make sure we don't get null values in list 
                y = 'Y None'
            year.append(y[2:6]) #at most allow four digits for year
               
            #NLP routine to parse all nouns in the current file
            doc = nlp(f.read())
                
            t = []
            topic_list = []
    
            for t in doc:
                
                if len(t)<4: #get rid of short words
                    continue
                if t.pos_ == 'NUM' or t.pos_ == 'AUX' or t.pos_ == 'ADP' or t.pos_ == 'SYM' or t.pos_ == 'AUX': #get rid of numbers, auxiliary verbs, prepositions, symbols
                    continue
                if t.text.find("http") != -1: #get rid of hyperlinks
                    continue
                if t.text.find("@") != -1: #get rid of site references
                    continue
                if t.text.find(".pdf") != -1: #get rid of file references
                    continue
                if t.text.find(".html") != -1: #get rid of file references
                    continue
                if t.text.find("R/") != -1: #get rid of reference codes
                    continue     
                if t.text.find("\n\n") != -1: #get rid of carriage returns
                    continue
                if t.pos_ == "PROPN":  #put proper names in the list with capital letter
                    topic_list.append(t.lemma_)
                    continue
                
                topicToAdd = t.lemma_
                topicToAdd = topicToAdd.lower()  #make everything except proper names lowercase
                topic_list.append(topicToAdd)    #append the lemma of each word to the list
                
                # Remove words

                
            topics.append(topic_list)
 
a = {'filename': filename, 'author': author, 'year': year, 'keywords': [list(set(list(filter(filter_words,topic)))) for topic in topics]}

df = pd.DataFrame(a, columns=['filename','author','year', 'keywords'])
df.head()


Unnamed: 0,filename,author,year,keywords
0,AMGG,,2011,"[access, information, express, research, play,..."
1,Abusch2020Corpus,"Abusch, Tzvi; Daniel Schwemer, Mikko Lukko, an...",2020,"[proceed, dealing, January, interested, Brill,..."
2,Achtemeier1996Harper,"Achtemeier, Paul J. *et al.* (eds)",1996,"[Dictionary, contextualize, dictionary, colour..."
3,Albani2000Horoscopes,"Albani, Matthias",2000,"[University, scroll, investigate, Schiffman, H..."
4,Albright1940ANEreligion,"Albright, W.F.",1940,"[inherent, contrast, cross, Hittite, methodolo..."


In [3]:
## Filter based on filename and write to csv
df.to_csv("./data/all_filenames.csv")

In [4]:
## Determine edges -- very slow

# Loop through filenames and add an edge if they share a keyword
nNodes = df.shape[0]
sourceList = []
targetList = []
keywordList = []

for i in np.arange(nNodes):

    for j in np.arange(i+1,nNodes):
        
        keywords_in_i = df.keywords[i]
        keywords_in_j = df.keywords[j]

        # If any of keywords in i and j
        if any([keywords_in_i[ind] in keywords_in_j for ind in np.arange(len(keywords_in_i))]):
            sourceList.append(df.filename[i])
            targetList.append(df.filename[j])
            keywordList.append(list(set(list(compress(keywords_in_i, [keywords_in_i[ind] in keywords_in_j for ind in np.arange(len(keywords_in_i))])))))
                

            
edge_df = pd.DataFrame({'source': sourceList, 'target': targetList, 'keywords': keywordList})
print(f'Identified {edge_df.shape[0]} edges.')

Identified 72451 edges.


In [5]:
edge_df.head()


Unnamed: 0,source,target,keywords
0,AMGG,Abusch2020Corpus,"[corpus, August, mesopotamian, literature]"
1,AMGG,Achtemeier1996Harper,"[further, offer]"
2,AMGG,Albani2000Horoscopes,"[religion, August]"
3,AMGG,Albright1940ANEreligion,"[research, literature, polytheistic, ancient, ..."
4,AMGG,Allegro1968DJD5,"[offer, literature, part]"


In [6]:
## Write to csv
edge_df.to_csv("./data/all_filenames_edges.csv")