# Library Tagger

# IN PROGRESS NOT FUNCTIONAL

This notebook is intended to function as a one-time script for rationalizing a collection of PDFs scattered across a filesystem.

### Motivation
- Every project has a collection of PDFs.
- Closely related projects duplicate PDFs, wasting space.
- Even carefully curated PDFs in this context are either:
    - Dropped into a giant folder organized by author/year
    - Split across subfolders denoting topic (e.g. `logic`) but topics are task-specific and debatable

The purpose of this project is to collect PDFs across a filesystem and perform the following operations:

1. Rename the PDF by

    a. Extracting Author/Title/Year using [GROBID](https://github.com/kermitt2/grobid)
    
    b. Renaming the file [in TagSpaces style](https://docs.tagspaces.org/tagging#file-tagging-based-on-filename) with
    
    - FirstAuthorLastName~Year Title1 Title2 Title3 (where each TitleN is a title word that is not a stopword) followed by
    
    - [Tag1 ... TagN].pdf where TagX is
    
      - Each directory of the PDF's current file path
      - Author lastnames of the PDF
      - Keywords of the PDF
      - Year of the PDF
      - Title words of the PDF (stopwords removed)
      - Eventually it would be good for this to include all my publications that cite this paper

2. Move the PDF to a common folder, e.g. `/z/aolney/library`

3. Deposit a symbolic link in the present directory with the old name linking to the new file

At this point only a subset of [TagSpaces features](https://docs.tagspaces.org/userinterface.htm) will be needed to organize/search PDFs. 


## Get PDFs

Set one or more root directories and recursively find all PDFs. 
Save this information as a flat file and then copy all PDFs to a large working directory for GROBID.

In [5]:
//The root directories of all PDFs
//Each entry is searched recursively by GetAllFiles.
let rootDirectories = [ "/home/aolney/Downloads" ]

//The working folders for GROBID
//grobidPdfFolder: flat folder for the copied PDFs; it is the same path the GROBID shell cell passes as its input folder.
let grobidPdfFolder = "/y/pdf/"
//grobidOutputFolder: folder that is later scanned for the XML files GROBID produces.
let grobidOutputFolder = "/y/grobid/"

open System.IO
/// Lazily enumerates every file below the given directories whose name ends
/// with the requested extension.
///
/// extension: file extension with or without the leading dot ("PDF", ".pdf");
///            matching ignores case.
/// dirs:      directories to search; their sub-directories are searched too.
///
/// Returns a lazy sequence of file paths. Nothing touches the disk until the
/// sequence is enumerated, and every enumeration walks the disk again.
/// Exceptions raised by Directory.EnumerateFiles / EnumerateDirectories
/// (e.g. for an unreadable directory) surface during enumeration.
let GetAllFiles (extension:string) (dirs:seq<string>) : seq<string> =
    // Require a real extension: "scanPDF" must not be mistaken for "scan.pdf".
    let suffix = if extension.StartsWith(".") then extension else "." + extension
    // Walk one directory level at a time: first the files of this level, then
    // everything found below its sub-directories.
    let rec walk (folders:seq<string>) : seq<string> =
        if Seq.isEmpty folders then Seq.empty else
            seq { yield! folders |> Seq.collect (fun dir -> Directory.EnumerateFiles dir)
                  yield! folders |> Seq.collect (fun dir -> Directory.EnumerateDirectories dir) |> walk }
    // Filter once over the whole walk; an ordinal, case-insensitive comparison
    // does not depend on the current culture's upper-casing rules.
    walk dirs
    |> Seq.filter( fun filePath -> filePath.EndsWith( suffix, System.StringComparison.OrdinalIgnoreCase ))


In [None]:
// Every PDF below the root directories. The listing is cached because it is
// consumed twice below (once for the flat file, once for the copy); without
// the cache the whole directory tree would be walked twice.
let allPdfs = rootDirectories |> GetAllFiles "PDF" |> Seq.cache

// Record the original locations as a flat file.
File.WriteAllLines("pdfList.txt", allPdfs )

// Make sure both GROBID working folders exist (no-op when they already do).
Directory.CreateDirectory( grobidPdfFolder ) |> ignore
Directory.CreateDirectory( grobidOutputFolder ) |> ignore

// Copy every PDF into the flat GROBID input folder. The original path is kept
// in the file name by replacing each directory separator with '@'.
allPdfs
|> Seq.iter( fun filePath ->
    let newName = filePath.Replace(Path.DirectorySeparatorChar.ToString(), "@")
    let newPath = Path.Combine( grobidPdfFolder, newName )
    // Skip files copied by an earlier run.
    if File.Exists(newPath) |> not then File.Copy( filePath, newPath )
           )

## PDF to XML

Convert the copied PDFs to TEI XML using the [GROBID command line](https://grobid.readthedocs.io/en/latest/Grobid-batch/).

In [1]:
# Runs the GROBID 0.5.1 batch jar with the processHeader command.
#   $1 - input folder containing the PDFs (passed as -dIn)
#   $2 - output folder for the results (passed as -dOut)
# Both arguments are quoted so that paths containing spaces or glob
# characters reach java as a single argument.
function doGrobid() {
    java -Xmx4G -jar /z/aolney/repos/grobid-0.5.1/grobid-core/build/libs/grobid-core-0.5.1-onejar.jar -gH /z/aolney/repos/grobid-0.5.1/grobid-home -dIn "$1" -dOut "$2" -exe processHeader
    #processFullText
}

# Capture stdout and stderr of the whole run in grobid.log.
doGrobid /y/pdf/ /y/grobid > grobid.log 2>&1

## Grobid XML to tags

- Author lastnames of the PDF
- Keywords of the PDF
- Year of the PDF
- Title words of the PDF (stopwords removed)

In [6]:
// Every XML file found below the GROBID output folder (lazy sequence).
let allXML = GetAllFiles "XML" [ grobidOutputFolder ]
// Force the enumeration so the notebook displays the paths.
Seq.toArray allXML


[|"/y/grobid/@home@aolney@Downloads@0013189x17725519.tei.xml";
  "/y/grobid/@home@aolney@Downloads@0034654317726529.tei.xml";
  "/y/grobid/@home@aolney@Downloads@033.tei.xml";
  "/y/grobid/@home@aolney@Downloads@0402.tei.xml";
  "/y/grobid/@home@aolney@Downloads@1-s2.0-S000437020300122X-main.tei.xml";
  "/y/grobid/@home@aolney@Downloads@1-s2.0-S0163638311000713-main.tei.xml";
  "/y/grobid/@home@aolney@Downloads@1-s2.0-S0749596X17300013-main.tei.xml";
  "/y/grobid/@home@aolney@Downloads@1-s2.0-S0895435617313069-main.tei.xml";
  "/y/grobid/@home@aolney@Downloads@1-s2.0-S1364661317301316-main.tei.xml";
  "/y/grobid/@home@aolney@Downloads@1-s2.0-S1364661319300610-main.tei.xml";
  "/y/grobid/@home@aolney@Downloads@10.1.1.467.9994.tei.xml";
  "/y/grobid/@home@aolney@Downloads@10.1.1.720.6456.tei.xml";
  "/y/grobid/@home@aolney@Downloads@1308.5499.tei.xml";
  "/y/grobid/@home@aolney@Downloads@13_Mayer.tei.xml";
  "/y/grobid/@home@aolney@Downloads@1465.full.tei.xml";
  "/y/grobid/@home@aolney@

In [None]:
/// One author of one parsed document, as produced by ExtractInfo.
type ReviewerInfo =
    {
        /// Author name assembled by GetName from the forename and surname elements.
        Name: string
        /// Zero-based position of the author among the document's persName nodes.
        Order: int
        /// Document identifier: the hash of the XML file path when first created,
        /// later replaced by a small sequential integer for compactness.
        Hash : int
        //Some of these are really only useful for debugging
        /// Inner text of the first title element found in the document.
        Title : string
        /// Whitespace-normalised, lower-cased paragraph text of the document.
        Text : string
        /// Path of the XML file the record was read from.
        File : string
    }
    
/// Builds a display name from optional first and last names: the parts that
/// are present are joined with a single space, and "" is returned when both
/// are missing.
let GetName first last =
    [ first; last ]
    |> List.choose id
    |> String.concat " "
    
//let xml = files |> Seq.head |> System.IO.File.ReadAllText

// Matches any run of one or more whitespace characters.
let spaceRegex = System.Text.RegularExpressions.Regex(@"\s+")

/// Collapses every whitespace run to a single space, strips leading and
/// trailing whitespace and lower-cases the result.
let NormalizeText ( text : string ) =
    let collapsed = spaceRegex.Replace( text, " " )
    let trimmed = collapsed.Trim()
    trimmed.ToLower()

/// Reads one GROBID TEI XML file and builds a ReviewerInfo for every persName
/// element found below sourceDesc.
///
/// xmlFile: path of the TEI XML file.
///
/// Returns None when the file lists no authors, otherwise Some sequence with
/// one record per author in document order. A missing title yields an empty
/// Title instead of a NullReferenceException, and the abstract (which was read
/// but never used) is no longer required to be present.
/// File and XML parsing exceptions are not caught.
let ExtractInfo xmlFile = 
    let doc = new System.Xml.XmlDocument()
    let xml = xmlFile |> System.IO.File.ReadAllText
    doc.LoadXml(xml)

    let nsmgr = new System.Xml.XmlNamespaceManager(doc.NameTable)
    nsmgr.AddNamespace("tei",  "http://www.tei-c.org/ns/1.0")

    // SelectSingleNode returns null when nothing matches, so guard before
    // touching InnerText.
    let theTitle =
        match doc.SelectSingleNode(@"//tei:title", nsmgr) with
        | null -> ""
        | titleNode -> titleNode.InnerText

    let theText = 
        doc.SelectNodes(@"//tei:text//tei:p", nsmgr) 
        |> Seq.cast< System.Xml.XmlNode> 
        |> Seq.map( fun paragraph -> 
                   //unlike InnerText, ignores child descendent nodes
                   paragraph.ChildNodes
                   |> Seq.cast< System.Xml.XmlNode>
                   |> Seq.map (fun child -> child.Value)
                   |> String.concat " "
                  )
        |> String.concat " "
        |> NormalizeText

    // Inner text of the first child element with the given name, if any.
    let childText (node : System.Xml.XmlNode) (name : string) =
        match node.[name] with
        | null -> None
        | child -> Some(child.InnerText)

    let authors = doc.SelectNodes(@"//tei:sourceDesc//tei:persName", nsmgr)
    let reviewerInfos =
        authors 
        |> Seq.cast< System.Xml.XmlNode> 
        |> Seq.mapi( fun i author -> 
                   let forename = childText author "forename"
                   let surname = childText author "surname"
                   //originally we hashed on the text, but the file name has the full path, so that is probably better
                   {Name = (GetName forename surname); File = xmlFile; Text = theText; Order = i; Title = theTitle; Hash = (hash xmlFile) }
                  )
    //
    if reviewerInfos |> Seq.isEmpty then
        None
    else
        Some( reviewerInfos )


In [None]:
// One record per (document, author). The source is the GROBID XML listing
// (allXML); the previously referenced name `files` is not defined in this
// notebook. The result is cached because it is enumerated several times below
// and every uncached enumeration would re-read and re-parse every XML file.
let reviewerInfosWithHash =
    allXML
    |> Seq.choose ExtractInfo 
    |> Seq.collect id
    |> Seq.cache

//for compression purposes, map hash to small integer
let hashHash =
    reviewerInfosWithHash
    |> Seq.map( fun ri -> ri.Hash )
    |> Seq.distinct
    |> Seq.mapi( fun i h -> h,i)
    |> Map.ofSeq

// Same records with the path hash replaced by its small integer id.
let reviewerInfos = 
    reviewerInfosWithHash
    |> Seq.map( fun ri -> { ri with Hash=hashHash.[ri.Hash]})
    
// authors.tsv rows: document id, author name, author position.
let authorOutput = 
    reviewerInfos
    |> Seq.map( fun ri ->  ri.Hash.ToString() + "\t" + ri.Name + "\t" + ri.Order.ToString()  )

//the data layout is expected by pke
// texts.tsv rows: document id, text, title, file (one row per document).
// The title comes straight from the XML, so its whitespace is collapsed here;
// an embedded tab or line break would otherwise corrupt the row.
let textOutput =
    reviewerInfos
    |> Seq.distinctBy( fun ri -> ri.Hash )
    |> Seq.map( fun ri -> ri.Hash.ToString() + "\t" + ri.Text + "\t" + spaceRegex.Replace( ri.Title, " " ).Trim() + "\t" + ri.File)
    
System.IO.File.WriteAllLines( "authors.tsv", authorOutput )
System.IO.File.WriteAllLines( "texts.tsv", textOutput )
// One plain-text file per document, named after the document id.
System.IO.Directory.CreateDirectory( "texts") |> ignore
for ri in reviewerInfos |> Seq.distinctBy( fun ri -> ri.Hash ) do
    let outputPath = System.IO.Path.Combine( "texts", ri.Hash.ToString() + ".txt" )
    System.IO.File.WriteAllText( outputPath, ri.Text )


## Get Keyphrases

We use [PKE KP-Miner](https://boudinfl.github.io/pke/build/html/unsupervised.html#kpminer) because it has [the best unsupervised performance on SemEval2010](http://aclweb.org/anthology/C16-2015).

### First we build a document frequency model based on our corpus.

In [None]:
#NLTK dependencies that must be installed first
import nltk

# Fetch each required NLTK resource in turn.
for resource in ('punkt', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(resource)

In [None]:
from pke import compute_document_frequency
from string import punctuation

# Folder holding the collection of documents.
# NOTE(review): this absolute path points into another repository, while the
# export cell writes to a relative "texts" folder -- confirm it is the same place.
input_dir = '/z/aolney/repos/jedm-reviewer-finder/texts/'

# Destination of the DF counts dictionary (gzip, tab separated values).
output_file = 'edm-df.gz'

# Compute the DF counts and store stem -> weight values.
compute_document_frequency(
    input_dir=input_dir,
    output_file=output_file,
    # input files format
    format="raw",
    # do not use Stanford lemmas
    use_lemmas=False,
    # use porter stemmer
    stemmer="porter",
    # stoplist
    stoplist=list(punctuation),
    # tab separated output
    delimiter='\t',
    # input files extension
    extension='txt',
    # compute n-grams up to 5-grams
    n=5,
)

### Use KPMiner with our custom DF

In [None]:
# nltk is imported here as well so the cell does not depend on the earlier
# download cell having been run in the same kernel session.
import nltk
import pke

output = []

#load df
df = pke.load_document_frequency_file(input_file='/z/aolney/repos/jedm-reviewer-finder/edm-df.gz')

# Settings shared by every document; built once instead of once per line.
# Candidate selection: least allowable seen frequency and the number of words
# after which candidates are filtered out (the commented values are the
# previous settings).
lasf = 3 #5
cutoff = 400 #200
stoplist = nltk.corpus.stopwords.words("english")
# KPMiner weighting parameters.
alpha = 2.3
sigma = 3.0

#must loop over each document in texts
# Each row is read as: document id, text, ... (only the first two columns are used).
with open('/z/aolney/repos/jedm-reviewer-finder/texts.tsv', encoding='utf-8') as inputFile:
    for line in inputFile:
        split = line.rstrip("\r\n").split("\t")
        # Skip blank or truncated rows instead of failing with IndexError.
        if len(split) < 2 or not split[1].strip():
            continue
        hashCode = split[0]
        text = split[1]

        # 1. create a KPMiner extractor.
        extractor = pke.unsupervised.KPMiner(language='english')

        # 2. load the content of the document.
        #extractor.read_document(format='raw')
        extractor.read_text(text)

        # 3. select candidates that do not contain punctuation marks or
        #    stopwords, using the lasf and cutoff values set above.
        extractor.candidate_selection(lasf=lasf, cutoff=cutoff, stoplist=stoplist)

        # 4. weight the candidates using KPMiner weighting function.
        extractor.candidate_weighting(df=df, alpha=alpha, sigma=sigma)

        # 5. get the 10-highest scored candidates as keyphrases
        keyphrases = extractor.get_n_best(n=10)

        #NOTE: we convert score float to int for compression purposes
        for phrase, score in keyphrases:
            output.append( hashCode + "\t" + phrase + "\t" + str(int(score)) + "\n" )

#write out
with open("keys.tsv", "w", encoding='utf-8') as f:
    f.writelines(output)