In [None]:
# Enable IPython's autoreload extension (mode 2) so that edited modules are
# reloaded automatically before code in later cells is executed.
%load_ext autoreload
%autoreload 2

In [None]:
from nlppln import WorkflowGenerator
# Working directory that is handed to every WorkflowGenerator in this notebook.
# Set the CWL_WORKING_DIR environment variable to point it at your own
# directory (e.g. '/home/dafne/cwl-working-dir/') instead of editing this cell;
# when the variable is not set, the previously hard-coded path is used.
import os

cwl_working_dir = os.environ.get('CWL_WORKING_DIR', '/home/jvdzwaan/cwl-working-dir/')

In [None]:
# Overview of the CWL steps that are available for building workflows.
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    for steps_dir in ('../adhtools/cwl/', '../java/cwl/'):
        wf.load(steps_dir=steps_dir)
    available_steps = wf.list_steps()
    print(available_steps)

In [None]:
# Workflow: remove the OpenITI metadata from a single file and divide the
# text into smaller files (books/chapters).
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    for steps_dir in ('../adhtools/cwl/', '../java/cwl/'):
        wf.load(steps_dir=steps_dir)

    openiti_file = wf.add_input(txt_file='File')

    # Text that is passed to set_documentation for this workflow.
    workflow_doc = """Split a text in OpenITI format in smaller pieces.
    
    First, the OpenITI metadata is removed. Next, the file is split on
    OpenITI markers, to be able to retain information about headers and
    quotes. Finally, the files are split based on file size, to make
    sure SAFAR does not crash on big input files.
    
    Input:
        txt_file (File): The name of the input file, a text in OpenITI format.
        
    Output:
        A list of text files, that can be analyzed or stemmed using SAFAR.
    """
    wf.set_documentation(workflow_doc)

    # Metadata removal -> split on OpenITI markers -> split on file size
    # (the size-based split is scattered over the marker-based parts).
    plain_text = wf.openiti2txt(in_file=openiti_file)
    marker_parts = wf.split_text_openiti_markers(in_file=plain_text)
    sized_parts = wf.split_text_size(in_file=marker_parts,
                                     scatter='in_file',
                                     scatter_method='dotproduct')

    # Flatten the scattered result before exposing it as the workflow output.
    wf.add_outputs(out_files=wf.flatten_list(list=sized_parts))

    wf.save('../adhtools/cwl/split-file-chapters.cwl', wd=True, relative=False)

In [None]:
# Workflow: run the split_file_chapters step on every file in a directory.
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    for steps_dir in ('../adhtools/cwl/', '../java/cwl/'):
        wf.load(steps_dir=steps_dir)

    input_dir = wf.add_input(in_dir='Directory')

    # Text that is passed to set_documentation for this workflow.
    workflow_doc = """Call split-file-chapters.cwl for a Directory of files.
    
    Scattered version of split-file-chapters.cwl.
    
    Input:
        in_dir (Directory): The directory containing texts to be processed.
        
    Output:
        A list (of lists) of text files, that can be analyzed or stemmed using SAFAR.
    """
    wf.set_documentation(workflow_doc)

    # List the directory and scatter split_file_chapters over the listed files.
    listed_files = wf.ls(in_dir=input_dir)
    split_texts = wf.split_file_chapters(txt_file=listed_files,
                                         scatter='txt_file',
                                         scatter_method='dotproduct')

    wf.add_outputs(texts=split_texts)

    wf.save('../adhtools/cwl/split-dir-chapters.cwl', wd=True, relative=False)

In [None]:
# Workflow: analyze a single file after splitting it into multiple smaller
# files based on regexes; the merged analyses are not filtered.
# Obsolete? (uses splitting on regexes and does not retain information about
# headers and quotes)
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    for steps_dir in ('../adhtools/cwl/', '../java/cwl/'):
        wf.load(steps_dir=steps_dir)

    analyzer = wf.add_input(analyzer='enum',
                            symbols=['Alkhalil', 'BAMA'],
                            default='Alkhalil')
    openiti_file = wf.add_input(txt_file='File')
    metadata = wf.add_input(metadata='File')
    cp = wf.add_input(cp='string')
    split_regex_small = wf.add_input(split_regex_small='string[]',
                                     default=['Milestone300', '### |', '### ||'])

    plain_text = wf.openiti2txt(in_file=openiti_file)
    snippets = wf.split_text(in_file=plain_text, regex=split_regex_small)

    analyses = wf.SafarAnalyze(in_files=snippets, analyzer=analyzer, cp=cp)
    merged_xml = wf.merge_safar_xml(in_files=analyses)

    # No safar_filter_analyses step in this variant: the merged file goes
    # straight to the metadata step. The workflow output is one xml file.
    wf.add_outputs(out_file=wf.safar_add_metadata_file(in_file=merged_xml,
                                                       in_file_meta=metadata))

    wf.save('../adhtools/cwl/safar-split-and-analyze-file-no-filtering.cwl', wd=True, relative=False)

In [None]:
# Workflow: analyze a single file after splitting it into multiple smaller
# files based on file size.
# Obsolete? (does not retain information about headers and quotes)
# NOTE(review): a later cell in this notebook saves to the same
# safar-split-and-analyze-file.cwl path, so running both overwrites the file
# written here.
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    for steps_dir in ('../adhtools/cwl/', '../java/cwl/'):
        wf.load(steps_dir=steps_dir)

    analyzer = wf.add_input(analyzer='enum',
                            symbols=['Alkhalil', 'BAMA'],
                            default='Alkhalil')
    openiti_file = wf.add_input(txt_file='File')
    metadata = wf.add_input(metadata='File')
    cp = wf.add_input(cp='string')
    size = wf.add_input(size='int?')

    plain_text = wf.openiti2txt(in_file=openiti_file)
    snippets = wf.split_text_size(in_file=plain_text, size=size)

    # Analyze the snippets, merge the results and filter the merged analyses.
    analyses = wf.SafarAnalyze(in_files=snippets, analyzer=analyzer, cp=cp)
    merged_xml = wf.merge_safar_xml(in_files=analyses)
    filtered_xml = wf.safar_filter_analyses(in_file=merged_xml)

    # The workflow output is one xml file, produced by the metadata step.
    wf.add_outputs(out_file=wf.safar_add_metadata_file(in_file=filtered_xml,
                                                       in_file_meta=metadata))

    wf.save('../adhtools/cwl/safar-split-and-analyze-file.cwl', wd=True, relative=False)

In [None]:
# Workflow: analyze a single file
# - remove openiti metadata
# - divide in chapters (separate headings)
# - split based on file size
# - analyze
# - merge xml files
# - filter analyses
# - add metadata
# Result: an xml file for a book
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    wf.load(steps_dir='../adhtools/cwl/')
    wf.load(steps_dir='../java/cwl/')

    analyzer = wf.add_input(analyzer='enum', symbols=['Alkhalil', 'BAMA'], default='Alkhalil')
    txt_file = wf.add_input(txt_file='File')
    metadata = wf.add_input(metadata='File')
    cp = wf.add_input(cp='string')
    # NOTE(review): `size` is declared and documented (and the directory
    # workflow in this notebook passes it), but it is not connected to any
    # step in this cell - confirm whether it should reach the splitting step.
    size = wf.add_input(size='int?')

    doc = """Analyze a text file in OpenITI format using SAFAR.
    
    To be able to retain information about headers and quotes, the text is first 
    split into files for text, headers and quotes. Next, the resulting files are
    split based on file size, because SAFAR crashes if the output XML files become
    too large. Next, the small files are analyzed using SAFAR and the resulting XML
    files are merged into one big file, containing metadata and information about 
    which words are part of headers and quotes. Finally, to reduce the size of the 
    output XML, redundant information is filtered out.
    
    Inputs:
        analyzer (enum): The SAFAR analyzer to use. Options are (Alkhalil, BAMA).
        txt_file (File): The name of the file to analyze, should be a text in 
            OpenITI format.
        metadata (File): The name of the csv file containing the corpus metadata.
        cp (string): The class path including where the SAFAR binaries can be found.
        size (int): Maximum file size in bytes. The text is split on the first 
            space after the desired file size is reached. So the file size slightly 
            differs between files.
    
    Output:
        File in SAFAR analyzer output, containing metadata and information about headers
            and quotes.
    """
    wf.set_documentation(doc)

    # Metadata removal and splitting are delegated to the split_file_chapters step.
    txt_files = wf.split_file_chapters(txt_file=txt_file)

    analyzed_files = wf.SafarAnalyze(in_files=txt_files, analyzer=analyzer, cp=cp)
    merged_file = wf.merge_safar_xml(in_files=analyzed_files)
    filtered_file = wf.safar_filter_analyses(in_file=merged_file)

    out_file = wf.safar_add_metadata_file(in_file=filtered_file, in_file_meta=metadata)

    # Output is one xml file
    wf.add_outputs(out_file=out_file)

    wf.save('../adhtools/cwl/safar-split-and-analyze-file.cwl', wd=True, relative=False)

In [None]:
# Workflow: analyze a single file without adding corpus metadata
# - remove openiti metadata, divide in chapters and split based on file size
#   (via the split_file_chapters step)
# - analyze
# - merge xml files
# - filter analyses
# Result: an xml file for a book (the filtered analyses)
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    for steps_dir in ('../adhtools/cwl/', '../java/cwl/'):
        wf.load(steps_dir=steps_dir)

    analyzer = wf.add_input(analyzer='enum',
                            symbols=['Alkhalil', 'BAMA'],
                            default='Alkhalil')
    book = wf.add_input(txt_file='File')
    cp = wf.add_input(cp='string')
    # Declared as a workflow input, but not connected to any step in this cell.
    size = wf.add_input(size='int?')

    book_parts = wf.split_file_chapters(txt_file=book)

    analyses = wf.SafarAnalyze(in_files=book_parts, analyzer=analyzer, cp=cp)
    merged_xml = wf.merge_safar_xml(in_files=analyses)

    # There is no metadata input and no safar_add_metadata_file step in this
    # variant: the filtered file itself is the (single xml) workflow output.
    wf.add_outputs(out_file=wf.safar_filter_analyses(in_file=merged_xml))

    wf.save('../adhtools/cwl/safar-split-and-analyze-file-no-merge-metadata.cwl', wd=True, relative=False)

In [None]:
# Split and analyze multiple books: scatters the safar_split_and_analyze_file
# step over all files in a directory.
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    wf.load(steps_dir='../adhtools/cwl/')
    wf.load(steps_dir='../java/cwl/')

    analyzer = wf.add_input(analyzer='enum', symbols=['Alkhalil', 'BAMA'], default='Alkhalil')
    in_dir = wf.add_input(in_dir='Directory')
    metadata = wf.add_input(metadata='File')
    cp = wf.add_input(cp='string')
    size = wf.add_input(size='int?')

    doc = """Analyze a directory of text files in OpenITI format using SAFAR.
    
    Calls `safar-split-and-analyze-file.cwl` for each file in the input directory.
    
    Inputs:
        analyzer (enum): The SAFAR analyzer to use. Options are (Alkhalil, BAMA).
        in_dir (Directory): Directory containing files to analyze.
        metadata (File): The name of the csv file containing the corpus metadata.
        cp (string): The class path including where the SAFAR binaries can be found.
        size (int): Maximum file size in bytes. The text is split on the first 
            space after the desired file size is reached. So the file size slightly 
            differs between files.
    
    Output:
        A list of files in SAFAR analyzer XML. There is an output file for each file 
            in the input directory.
    """
    wf.set_documentation(doc)

    books = wf.ls(in_dir=in_dir)

    # Only txt_file is scattered; the other inputs are passed unchanged to
    # every run of the single-file workflow.
    safar_output = wf.safar_split_and_analyze_file(analyzer=analyzer, txt_file=books, cp=cp,
                                                   size=size, metadata=metadata,
                                                   scatter='txt_file', scatter_method='dotproduct')

    wf.add_outputs(safar_output=safar_output)

    wf.save('../adhtools/cwl/safar-split-and-analyze-dir.cwl', wd=True, relative=False)

In [None]:
# Workflow: split and analyze all books in a directory, using the
# single-file variant that does not add corpus metadata.
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    for steps_dir in ('../adhtools/cwl/', '../java/cwl/'):
        wf.load(steps_dir=steps_dir)

    analyzer = wf.add_input(analyzer='enum',
                            symbols=['Alkhalil', 'BAMA'],
                            default='Alkhalil')
    book_dir = wf.add_input(in_dir='Directory')
    cp = wf.add_input(cp='string')
    size = wf.add_input(size='int?')

    book_files = wf.ls(in_dir=book_dir)

    # Scatter over the listed files; no metadata input is passed in this variant.
    per_book_xml = wf.safar_split_and_analyze_file_no_merge_metadata(
        analyzer=analyzer,
        txt_file=book_files,
        cp=cp,
        size=size,
        scatter='txt_file',
        scatter_method='dotproduct')

    wf.add_outputs(safar_output=per_book_xml)

    wf.save('../adhtools/cwl/safar-split-and-analyze-dir-no-merge-metadata.cwl', wd=True, relative=False)

In [None]:
# Workflow: split and stem a single file
# - remove openiti metadata
# - divide in chapters (separate headings)
# - split based on file size
# - stem
# - merge xml files
# - add metadata
# Result: an xml file for a book
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    wf.load(steps_dir='../adhtools/cwl/')
    wf.load(steps_dir='../java/cwl/')

    stemmer = wf.add_input(stemmer='enum',
                           symbols=['KHOJA', 'LIGHT10', 'ISRI', 'MOTAZ', 'TASHAPHYNE'],
                           default='LIGHT10')
    txt_file = wf.add_input(txt_file='File')
    metadata = wf.add_input(metadata='File')
    cp = wf.add_input(cp='string')

    doc = """Stem a text file in OpenITI format using SAFAR.
    
    To be able to retain information about headers and quotes, the text is first 
    split into files for text, headers and quotes. Next, the resulting files are
    split based on file size, because SAFAR crashes if the output XML files become
    too large. Next, the small files are stemmed using SAFAR and the resulting XML
    files are merged into one big file, containing metadata and information about 
    which words are part of headers and quotes.
    
    Inputs:
        stemmer (enum): The SAFAR stemmer to use. Options are (KHOJA, LIGHT10, 
            ISRI, MOTAZ, TASHAPHYNE).
        txt_file (File): The name of the file to stem, should be a text in 
            OpenITI format.
        metadata (File): The name of the csv file containing the corpus metadata.
        cp (string): The class path including where the SAFAR binaries can be found.
    
    Output:
        File in SAFAR stemmer output, containing metadata and information about headers
            and quotes.
    """
    wf.set_documentation(doc)

    # Metadata removal and splitting are delegated to the split_file_chapters step.
    txt_files = wf.split_file_chapters(txt_file=txt_file)

    stemmed_files = wf.SafarStem(in_files=txt_files, stemmer=stemmer, cp=cp)
    merged_file = wf.merge_safar_xml(in_files=stemmed_files)

    # No safar_filter_analyses step here: for stemming there is always a single
    # analysis, so the merged file goes straight to the metadata step.
    out_file = wf.safar_add_metadata_file(in_file=merged_file, in_file_meta=metadata)

    # Output is one xml file
    wf.add_outputs(out_file=out_file)

    wf.save('../adhtools/cwl/safar-split-and-stem-file.cwl', wd=True, relative=False)

In [None]:
# Split and stem a directory containing text files: scatters the
# safar_split_and_stem_file step over all files in the directory.
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    wf.load(steps_dir='../adhtools/cwl/')
    wf.load(steps_dir='../java/cwl/')

    stemmer = wf.add_input(stemmer='enum',
                           symbols=['KHOJA', 'LIGHT10', 'ISRI', 'MOTAZ', 'TASHAPHYNE'],
                           default='LIGHT10')
    in_dir = wf.add_input(in_dir='Directory')
    metadata = wf.add_input(metadata='File')
    cp = wf.add_input(cp='string')

    # The documentation used to say "Analyze"/"files to analyze" (copied from
    # the analyze workflow); this workflow stems.
    doc = """Stem a directory of text files in OpenITI format using SAFAR.
    
    Calls `safar-split-and-stem-file.cwl` for each file in the input directory.
       
    Inputs:
        stemmer (enum): The SAFAR stemmer to use. Options are (KHOJA, LIGHT10, 
            ISRI, MOTAZ, TASHAPHYNE).
        in_dir (Directory): Directory containing files to stem.
        metadata (File): The name of the csv file containing the corpus metadata.
        cp (string): The class path including where the SAFAR binaries can be found.
    
    Output:
        A list of files in SAFAR stemmer XML. There is an output file for each file 
            in the input directory.
    """
    wf.set_documentation(doc)

    txt_files = wf.ls(in_dir=in_dir)
    # Only txt_file is scattered; metadata, stemmer and cp are passed unchanged
    # to every run of the single-file workflow.
    out_files = wf.safar_split_and_stem_file(txt_file=txt_files, metadata=metadata, stemmer=stemmer, cp=cp,
                                             scatter='txt_file', scatter_method='dotproduct')

    wf.add_outputs(out_files=out_files)

    wf.save('../adhtools/cwl/safar-split-and-stem-dir.cwl', wd=True, relative=False)

In [None]:
# Add corpus metadata to all xml files in a directory: scatters the
# safar_add_metadata_file step over the files in the directory.
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    wf.load(steps_dir='../adhtools/cwl/')
    wf.load(steps_dir='../java/cwl/')

    in_dir = wf.add_input(in_dir='Directory')
    metadata = wf.add_input(metadata='File')

    # The documentation used to name `safar-split-and-stem-file.cwl` (copied
    # from the stem workflow); the step that is actually scattered below is
    # safar_add_metadata_file.
    doc = """Add metadata to all xml files in a directory
    
    Calls the `safar_add_metadata_file` step for each file in the input directory.
       
    Inputs:
        in_dir (Directory): The directory containing the input files.
        metadata (File): The name of the csv file containing the corpus metadata.
    
    Output:
        A list of xml files with metadata. There is an output file for each file 
            in the input directory.
    """
    wf.set_documentation(doc)

    in_files = wf.ls(in_dir=in_dir)
    # Only in_file is scattered; the same metadata file is used for every run.
    out_files = wf.safar_add_metadata_file(in_file=in_files, in_file_meta=metadata,
                                          scatter='in_file', scatter_method='dotproduct')

    wf.add_outputs(out_files=out_files)
    wf.save('../adhtools/cwl/add-metadata-dir.cwl', wd=True, relative=False)