In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os

from nlppln import WorkflowGenerator
# Working directory handed to every WorkflowGenerator in this notebook
# (working_dir=cwl_working_dir).
# Set the CWL_WORKING_DIR environment variable to point it at your own
# location (e.g. '/home/dafne/cwl-working-dir/') instead of editing this cell;
# without the variable the path used so far is kept.
cwl_working_dir = os.environ.get('CWL_WORKING_DIR', '/home/jvdzwaan/cwl-working-dir/')

In [None]:
# Show the steps that become available after loading both step directories
with WorkflowGenerator(working_dir=cwl_working_dir) as wf_sub:
    for steps_dir in ('../adhtools/cwl/', '../java/cwl/'):
        wf_sub.load(steps_dir=steps_dir)
    available_steps = wf_sub.list_steps()
    print(available_steps)

In [None]:
# Split a single book with the split_text_size step and analyze the pieces.
# The merge and add-metadata steps are left out, so the analyzed files are
# returned as they come out of SafarAnalyze.
with WorkflowGenerator(working_dir=cwl_working_dir) as wf_sub:
    for steps_dir in ('../adhtools/cwl/', '../java/cwl/'):
        wf_sub.load(steps_dir=steps_dir)

    # Workflow inputs
    analyzer = wf_sub.add_input(
        analyzer='enum',
        symbols=['Alkhalil', 'BAMA'],
        default='Alkhalil',
    )
    book = wf_sub.add_input(book='File')
    cp = wf_sub.add_input(cp='string')

    # meta_file is produced but not consumed by any later step in this workflow
    meta_file, txt_file = wf_sub.extract_metadata(in_file=book)
    text_parts = wf_sub.split_text_size(in_file=txt_file)
    analyzed_parts = wf_sub.SafarAnalyze(in_files=text_parts, analyzer=analyzer, cp=cp)

    # Output: whatever SafarAnalyze returns for the pieces (not merged)
    wf_sub.add_outputs(out_files=analyzed_parts)

    wf_sub.save(
        '../adhtools/cwl/safar-split-and-analyze-file-no-merge.cwl',
        wd=True,
        relative=False,
    )

In [None]:
# Split a single book on a regex, analyze the pieces and save the xml files
with WorkflowGenerator(working_dir=cwl_working_dir) as wf_sub:
    for steps_dir in ('../adhtools/cwl/', '../java/cwl/'):
        wf_sub.load(steps_dir=steps_dir)

    # Workflow inputs
    analyzer = wf_sub.add_input(
        analyzer='enum',
        symbols=['Alkhalil', 'BAMA'],
        default='Alkhalil',
    )
    book = wf_sub.add_input(book='File')
    cp = wf_sub.add_input(cp='string')
    split_regex_small = wf_sub.add_input(
        split_regex_small='string',
        default='Milestone300',
    )

    # meta_file is produced but not consumed by any later step in this workflow
    meta_file, txt_file = wf_sub.extract_metadata(in_file=book)
    text_parts = wf_sub.split_text(in_file=txt_file, regex=split_regex_small)
    xml_parts = wf_sub.SafarAnalyze(in_files=text_parts, analyzer=analyzer, cp=cp)

    wf_sub.add_outputs(analyzed_files=xml_parts)

    wf_sub.save(
        '../adhtools/cwl/safar-analyze-file-save-xml.cwl',
        wd=True,
        relative=False,
    )

In [None]:
# Split and analyze all books found in a directory
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    for steps_dir in ('../adhtools/cwl/', '../java/cwl/'):
        wf.load(steps_dir=steps_dir)

    # Workflow inputs
    analyzer = wf.add_input(
        analyzer='enum',
        symbols=['Alkhalil', 'BAMA'],
        default='Alkhalil',
    )
    in_dir = wf.add_input(in_dir='Directory')
    cp = wf.add_input(cp='string')
    split_regex_small = wf.add_input(
        split_regex_small='string',
        default='Milestone300',
    )

    book_files = wf.ls(in_dir=in_dir)

    # The single-book workflow is scattered over the 'book' input
    per_book_output = wf.safar_split_and_analyze_file(
        analyzer=analyzer,
        book=book_files,
        cp=cp,
        split_regex_small=split_regex_small,
        scatter='book',
        scatter_method='dotproduct',
    )

    wf.add_outputs(safar_output=per_book_output)

    wf.save(
        '../adhtools/cwl/safar-split-and-analyze-dir.cwl',
        wd=True,
        relative=False,
    )

In [None]:
# Analyze a single book
# This cell is self-contained: every name it uses is created inside the cell,
# so it also works on a fresh kernel (Restart & Run All).
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    wf.load(steps_dir='../adhtools/cwl/')
    wf.load(steps_dir='../java/cwl/')

    analyzer = wf.add_input(analyzer='enum', symbols=['Alkhalil', 'BAMA'], default='Alkhalil')
    book = wf.add_input(book='File')
    cp = wf.add_input(cp='string')

    # Prepare the SAFAR input from the book. This step was missing here, which
    # left txt_dir, meta_dir and meta_file undefined (or stale from another
    # cell). It mirrors the "Stem a single book" workflow.
    txt_dir, meta_dir, meta_file = wf.txt2safar_input(in_file=book)
    analyzed_files = wf.SafarAnalyze(in_dir=txt_dir, analyzer=analyzer, cp=cp)
    safar_output_dir = wf.safar_add_metadata(in_files=analyzed_files, in_dir_meta=meta_dir, in_file_meta=meta_file)

    # Output is a directory containing xml files. The name of the directory is the name of the book
    # Use this cell's generator (wf); wf_sub would be a leftover from another cell.
    wf.add_outputs(safar_output_dir=safar_output_dir)

    # NOTE(review): other cells call this workflow as `safar_analyze_book`, while
    # the stemming counterpart is saved as 'safar-stem-book.cwl' - confirm
    # whether this file name should be 'safar-analyze-book.cwl'.
    wf.save('../adhtools/cwl/safar-analyze-file.cwl', wd=True, relative=False)

In [None]:
# Analyze a directory: list its files (optionally recursively) and hand them
# to SafarAnalyze
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    for steps_dir in ('../adhtools/cwl/', '../java/cwl/'):
        wf.load(steps_dir=steps_dir)
    print(wf.list_steps())

    # Workflow inputs
    analyzer = wf.add_input(
        analyzer='enum',
        symbols=['Alkhalil', 'BAMA'],
        default='Alkhalil',
    )
    in_dir = wf.add_input(in_dir='Directory')
    recursive = wf.add_input(recursive='boolean?', default=False)
    cp = wf.add_input(cp='string')

    listed_files = wf.ls(in_dir=in_dir, recursive=recursive)
    analyzed = wf.SafarAnalyze(analyzer=analyzer, in_files=listed_files, cp=cp)

    wf.add_outputs(out_files=analyzed)

    wf.save(
        '../adhtools/cwl/safar-analyze-dir.cwl',
        wd=True,
        relative=False,
    )

In [None]:
# Scattered version: analyze every book in a directory and gather the
# per-book results into one directory
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    for steps_dir in ('../adhtools/cwl/', '../java/cwl/'):
        wf.load(steps_dir=steps_dir)
    print(wf.list_steps())

    # Workflow inputs
    analyzer = wf.add_input(
        analyzer='enum',
        symbols=['Alkhalil', 'BAMA'],
        default='Alkhalil',
    )
    in_dir = wf.add_input(in_dir='Directory')
    cp = wf.add_input(cp='string')
    # The workflow input is called index_name; it is used as the name of the gathered directory
    corpus_name = wf.add_input(index_name='string', default='corpus')

    book_files = wf.ls(in_dir=in_dir)
    per_book_dirs = wf.safar_analyze_book(
        analyzer=analyzer,
        book=book_files,
        cp=cp,
        scatter='book',
        scatter_method='dotproduct',
    )
    corpus_dir = wf.gather_dirs(in_dirs=per_book_dirs, dir_name=corpus_name)

    wf.add_outputs(safar_output=corpus_dir)

    wf.save(
        '../adhtools/cwl/safar-analyze-corpus.cwl',
        wd=True,
        relative=False,
    )

In [None]:
# Analyze the texts in an input archive and return the results as one archive
# (made during the team sprint for creating a corpus upload service)
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    for steps_dir in ('../adhtools/cwl/', '../java/cwl/'):
        wf.load(steps_dir=steps_dir)
    print(wf.list_steps())

    wf.set_label('Analyze Arabic texts using SAFAR.')

    # Workflow inputs
    analyzer = wf.add_input(
        analyzer='enum',
        symbols=['Alkhalil', 'BAMA'],
        default='Alkhalil',
        label='Analyzer',
    )
    archive = wf.add_input(
        archive='File',
        label='Zip file containing texts in OpenITI format',
    )
    # NOTE(review): the default below points into one user's home directory
    cp = wf.add_input(
        cp='string',
        default='.:/home/jvdzwaan/data/tmp/adh/jars/*:/home/jvdzwaan/code/research-scripts/bin/',
    )
    corpus_name = wf.add_input(index_name='string', default='corpus')

    # archive -> directory -> analyzed corpus -> archive
    unpacked_dir = wf.archive2dir(archive=archive)
    analyzed_dir = wf.safar_analyze_corpus(
        cp=cp,
        in_dir=unpacked_dir,
        analyzer=analyzer,
        index_name=corpus_name,
    )
    result_archive = wf.zip_dir_flat(in_dir=analyzed_dir)

    wf.add_outputs(result=result_archive)

    wf.save(
        '../adhtools/cwl/safar-analyze-archive.cwl',
        wd=True,
        relative=False,
    )

In [None]:
# Scattered version: analyze a directory of books and index the result
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    for steps_dir in ('../adhtools/cwl/', '../java/cwl/'):
        wf.load(steps_dir=steps_dir)
    wf.load(step_file='https://raw.githubusercontent.com/arabic-digital-humanities/BlackLabIndexer-docker/master/blacklabindexer.cwl')

    print(wf.list_steps())

    # Inputs for the analysis
    analyzer = wf.add_input(
        analyzer='enum',
        symbols=['Alkhalil', 'BAMA'],
        default='Alkhalil',
    )
    in_dir = wf.add_input(in_dir='Directory')
    cp = wf.add_input(cp='string')

    # Inputs passed on to the blacklabindexer step
    index_name = wf.add_input(index_name='string', default='corpus')
    action = wf.add_input(action='string', default='create')
    index_format = wf.add_input(index_format='string', default='safar-analyzer')
    text_direction = wf.add_input(text_direction='string', default='rtl')
    content_viewable = wf.add_input(content_viewable='boolean', default=True)

    # Name of the directory in which the per-book output is gathered
    xml_dir_name = wf.add_input(xml_dir_name='string', default='xml')

    book_files = wf.ls(in_dir=in_dir)
    per_book_dirs = wf.safar_analyze_book(
        analyzer=analyzer,
        book=book_files,
        cp=cp,
        scatter='book',
        scatter_method='dotproduct',
    )
    merged_dir = wf.gather_dirs(in_dirs=per_book_dirs, dir_name=xml_dir_name)
    indexed = wf.blacklabindexer(
        action=action,
        index_format=index_format,
        index_name=index_name,
        in_dir=merged_dir,
        text_direction=text_direction,
        content_viewable=content_viewable,
    )

    # Only the index and the merged directory are outputs. Exposing the
    # per-book directories next to merged_dir doesn't work (probably because
    # wf.gather_dirs does not copy files, but manipulates symlinks).
    wf.add_outputs(indexed=indexed)
    wf.add_outputs(merged_dir=merged_dir)

    wf.save(
        '../adhtools/cwl/analyze-and-index-dir.cwl',
        wd=True,
        relative=False,
    )

In [None]:
# Analyze directory of texts using different analyzers
# DOESN'T WORK ANYMORE SINCE WE UPDATED THE ANALYZER TO AN ENUM, WON'T FIX FOR NOW
# (this cell still declares `analyzer` as a string[] input, whereas the other
# workflows in this notebook declare it as an enum with symbols Alkhalil/BAMA)
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    wf.load(steps_dir='../adhtools/cwl/')
    wf.load(steps_dir='../java/cwl/')
    wf.load(step_file='https://raw.githubusercontent.com/arabic-digital-humanities/BlackLabIndexer-docker/master/blacklabindexer.cwl')
    
    print(wf.list_steps())
    
    # One entry per analyzer; the same list is reused below for the index and xml directory names
    analyzers = wf.add_input(analyzer='string[]', default=['Alkhalil', 'BAMA'])
    in_dir = wf.add_input(in_dir='Directory')
    cp = wf.add_input(cp='string')
    index_dir_name = wf.add_input(index_dir_name='string', default='index')
    xml_dir_name = wf.add_input(xml_dir_name='string', default='xml')
    
    # Scatter the analyze-and-index workflow over the analyzers. analyzer,
    # index_name and xml_dir_name all receive the `analyzers` list and are
    # combined with the dotproduct scatter method.
    indexed, merged_dir = wf.analyze_and_index_dir(cp=cp, 
                                                   in_dir=in_dir, 
                                                   analyzer=analyzers, 
                                                   index_name=analyzers,
                                                   xml_dir_name=analyzers,
                                                   scatter=['analyzer', 'index_name', 'xml_dir_name'],
                                                   scatter_method='dotproduct')
    # Gather the scattered results into one index directory and one xml directory
    index_dir = wf.gather_dirs(in_dirs=indexed, dir_name=index_dir_name)
    xml_dir = wf.gather_dirs(in_dirs=merged_dir, dir_name=xml_dir_name)

    # output: a directory containing all indices (one for each analyzer) and 
    # a directory containing the xml files (contains a subdirectory for each 
    # analyzer which contains a directory for each book)
    wf.add_outputs(indexed=index_dir)
    wf.add_outputs(xml=xml_dir)
    
    wf.save('../adhtools/cwl/analyze-and-index-dir-all-analyzers.cwl', wd=True, relative=False)

In [None]:
# Divide a single text file into books/chapters and store them in a directory
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    for steps_dir in ('../adhtools/cwl/', '../java/cwl/'):
        wf.load(steps_dir=steps_dir)
    print(wf.list_steps())

    # Workflow inputs
    txt_file = wf.add_input(txt_file='File')
    regex = wf.add_input(regex='string[]', default=['### |', '### ||'])
    # Optional name for the output directory (workflow input: dir_name)
    chapter_dir_name = wf.add_input(dir_name='string?')

    chapter_files = wf.split_text(in_file=txt_file, regex=regex)
    chapter_dir = wf.save_files_to_dir(
        dir_name=chapter_dir_name,
        in_files=chapter_files,
    )

    # Only the chapter directory is exposed; there is no metadata output
    wf.add_outputs(chapters=chapter_dir)

    wf.save(
        '../adhtools/cwl/split-books-chapters-file.cwl',
        wd=True,
        relative=False,
    )

In [None]:
# Split every text file in a directory into books/chapters
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    for steps_dir in ('../adhtools/cwl/', '../java/cwl/'):
        wf.load(steps_dir=steps_dir)
    print(wf.list_steps())

    # Workflow inputs
    in_dir = wf.add_input(in_dir='Directory')
    regex = wf.add_input(regex='string[]', default=['### |', '### ||'])

    listed_files = wf.ls(in_dir=in_dir)

    # The single-file workflow is scattered over the 'txt_file' input
    chapter_dirs = wf.split_books_chapters_file(
        txt_file=listed_files,
        regex=regex,
        scatter='txt_file',
        scatter_method='dotproduct',
    )

    # Only the split texts are exposed; there is no metadata output
    wf.add_outputs(texts=chapter_dirs)

    wf.save(
        '../adhtools/cwl/split-books-chapters-dir.cwl',
        wd=True,
        relative=False,
    )

In [None]:
# Split a single book (txt file), stem the pieces and merge them into one xml file
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    for steps_dir in ('../adhtools/cwl/', '../java/cwl/'):
        wf.load(steps_dir=steps_dir)

    # Workflow inputs
    stemmer_symbols = ['KHOJA', 'LIGHT10', 'ISRI', 'MOTAZ', 'TASHAPHYNE']
    stemmer = wf.add_input(stemmer='enum', symbols=stemmer_symbols, default='LIGHT10')
    book_file = wf.add_input(txt_file='File')
    cp = wf.add_input(cp='string')
    split_regex_small = wf.add_input(
        split_regex_small='string[]',
        default=['Milestone300'],
    )

    # meta_file is produced but not consumed: no metadata is added to the result here
    meta_file, txt_file = wf.extract_metadata(in_file=book_file)
    text_parts = wf.split_text(in_file=txt_file, regex=split_regex_small)
    stemmed_parts = wf.SafarStem(in_files=text_parts, stemmer=stemmer, cp=cp)
    merged_xml = wf.merge_safar_xml(in_files=stemmed_parts)

    # Output is the merged xml file
    wf.add_outputs(xml_file=merged_xml)

    wf.save(
        '../adhtools/cwl/safar-split-and-stem-file.cwl',
        wd=True,
        relative=False,
    )

In [None]:
# Split and stem all text files in a directory.
# Every text file in the input directory yields an xml file.
# Metadata is ignored for now.
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    for steps_dir in ('../adhtools/cwl/', '../java/cwl/'):
        wf.load(steps_dir=steps_dir)

    # Workflow inputs
    stemmer_symbols = ['KHOJA', 'LIGHT10', 'ISRI', 'MOTAZ', 'TASHAPHYNE']
    stemmer = wf.add_input(stemmer='enum', symbols=stemmer_symbols, default='LIGHT10')
    in_dir = wf.add_input(in_dir='Directory')
    recursive = wf.add_input(recursive='boolean?', default=False)
    cp = wf.add_input(cp='string')
    split_regex_small = wf.add_input(
        split_regex_small='string[]',
        default=['Milestone300'],
    )

    listed_files = wf.ls(in_dir=in_dir, recursive=recursive)

    # The single-file workflow is scattered over the 'txt_file' input
    per_file_output = wf.safar_split_and_stem_file(
        stemmer=stemmer,
        txt_file=listed_files,
        cp=cp,
        split_regex_small=split_regex_small,
        scatter='txt_file',
        scatter_method='dotproduct',
    )

    wf.add_outputs(safar_output=per_file_output)

    wf.save(
        '../adhtools/cwl/safar-split-and-stem-dir.cwl',
        wd=True,
        relative=False,
    )

In [None]:
# Stem one book and attach its metadata to the stemmed files
with WorkflowGenerator(working_dir=cwl_working_dir) as wf_sub:
    for steps_dir in ('../adhtools/cwl/', '../java/cwl/'):
        wf_sub.load(steps_dir=steps_dir)
    print(wf_sub.list_steps())

    # Workflow inputs
    stemmer_symbols = ['KHOJA', 'LIGHT10', 'ISRI', 'MOTAZ', 'TASHAPHYNE']
    stemmer = wf_sub.add_input(stemmer='enum', symbols=stemmer_symbols, default='LIGHT10')
    book = wf_sub.add_input(book='File')
    cp = wf_sub.add_input(cp='string')

    # book -> SAFAR input (text dir + metadata) -> stemmed files -> files with metadata
    txt_dir, meta_dir, meta_file = wf_sub.txt2safar_input(in_file=book)
    stemmed = wf_sub.SafarStem(in_dir=txt_dir, stemmer=stemmer, cp=cp)
    safar_output_dir = wf_sub.safar_add_metadata(
        in_files=stemmed,
        in_dir_meta=meta_dir,
        in_file_meta=meta_file,
    )

    wf_sub.add_outputs(safar_output_dir=safar_output_dir)

    wf_sub.save(
        '../adhtools/cwl/safar-stem-book.cwl',
        wd=True,
        relative=False,
    )

In [None]:
# Scattered version: stem every book in a directory
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    for steps_dir in ('../adhtools/cwl/', '../java/cwl/'):
        wf.load(steps_dir=steps_dir)
    print(wf.list_steps())

    # Workflow inputs
    stemmer_symbols = ['KHOJA', 'LIGHT10', 'ISRI', 'MOTAZ', 'TASHAPHYNE']
    stemmer = wf.add_input(stemmer='enum', symbols=stemmer_symbols, default='LIGHT10')
    in_dir = wf.add_input(in_dir='Directory')
    cp = wf.add_input(cp='string')

    book_files = wf.ls(in_dir=in_dir)

    # The single-book workflow is scattered over the 'book' input
    per_book_dirs = wf.safar_stem_book(
        stemmer=stemmer,
        book=book_files,
        cp=cp,
        scatter='book',
        scatter_method='dotproduct',
    )

    wf.add_outputs(safar_output=per_book_dirs)

    wf.save(
        '../adhtools/cwl/safar-stem-dir.cwl',
        wd=True,
        relative=False,
    )

In [None]:
# Scattered version: stem a directory of books and index the result
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    for steps_dir in ('../adhtools/cwl/', '../java/cwl/'):
        wf.load(steps_dir=steps_dir)
    wf.load(step_file='https://raw.githubusercontent.com/arabic-digital-humanities/BlackLabIndexer-docker/master/blacklabindexer.cwl')

    print(wf.list_steps())

    # Inputs for the stemming
    stemmer_symbols = ['KHOJA', 'LIGHT10', 'ISRI', 'MOTAZ', 'TASHAPHYNE']
    stemmer = wf.add_input(stemmer='enum', symbols=stemmer_symbols, default='LIGHT10')
    in_dir = wf.add_input(in_dir='Directory')
    cp = wf.add_input(cp='string')

    # Inputs passed on to the blacklabindexer step
    index_name = wf.add_input(index_name='string', default='corpus')
    action = wf.add_input(action='string', default='create')
    index_format = wf.add_input(index_format='string', default='safar-stemmer')
    text_direction = wf.add_input(text_direction='string', default='rtl')
    content_viewable = wf.add_input(content_viewable='boolean', default=True)

    book_files = wf.ls(in_dir=in_dir)
    safar_output_dirs = wf.safar_stem_book(
        stemmer=stemmer,
        book=book_files,
        cp=cp,
        scatter='book',
        scatter_method='dotproduct',
    )
    merged_dir = wf.gather_dirs(in_dirs=safar_output_dirs)
    indexed = wf.blacklabindexer(
        action=action,
        index_format=index_format,
        index_name=index_name,
        in_dir=merged_dir,
        text_direction=text_direction,
        content_viewable=content_viewable,
    )

    # Three outputs: the index, the per-book directories and the merged directory.
    # NOTE(review): the analyze-and-index workflow in this notebook warns that
    # exposing both the per-book directories and merged_dir doesn't work -
    # confirm whether that also applies here.
    wf.add_outputs(indexed=indexed)
    wf.add_outputs(safar_output_dirs=safar_output_dirs)
    wf.add_outputs(merged_dir=merged_dir)

    wf.save(
        '../adhtools/cwl/safar-stem-and-index-dir.cwl',
        wd=True,
        relative=False,
    )

In [None]:
# Stem directory of texts using different stemmers
# DOESN'T WORK ANYMORE SINCE WE UPDATED THE ANALYZER TO AN ENUM, WON'T FIX FOR NOW
# NOTE(review): "ANALYZER" above presumably means the stemmer input - this cell
# declares `stemmer` as string[], whereas the other stemming workflows in this
# notebook declare it as an enum.
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    wf.load(steps_dir='../adhtools/cwl/')
    wf.load(steps_dir='../java/cwl/')
    wf.load(step_file='https://raw.githubusercontent.com/arabic-digital-humanities/BlackLabIndexer-docker/master/blacklabindexer.cwl')
    
    print(wf.list_steps())
    
    # One entry per stemmer; the same list is reused below as the index names
    stemmers = wf.add_input(stemmer='string[]', default=['KHOJA', 'LIGHT10', 'ISRI', 'MOTAZ', 'TASHAPHYNE'])
    in_dir = wf.add_input(in_dir='Directory')
    cp = wf.add_input(cp='string')
    
    # Scatter the stem-and-index workflow over the stemmers; stemmer and
    # index_name both receive the `stemmers` list (dotproduct scatter method).
    # NOTE(review): the stem-and-index workflow in this notebook is saved as
    # 'safar-stem-and-index-dir.cwl' - confirm that a step named
    # `stem_and_index_dir` is actually available.
    indexed, merged_dir, safar_output_dirs = wf.stem_and_index_dir(cp=cp, 
                                                                      in_dir=in_dir, 
                                                                      stemmer=stemmers, 
                                                                      index_name=stemmers,
                                                                      scatter=['stemmer', 'index_name'],
                                                                      scatter_method='dotproduct')
    # Only the indices are exposed; merged_dir and safar_output_dirs are not used further
    wf.add_outputs(indexed=indexed)
    
    wf.save('../adhtools/cwl/stem-and-index-dir-all-stemmers.cwl', wd=True, relative=False)

In [None]:
# Add metadata (taken from the metadata csv) to every xml file in a directory
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    for steps_dir in ('../adhtools/cwl/', '../java/cwl/'):
        wf.load(steps_dir=steps_dir)

    # Workflow inputs
    in_dir = wf.add_input(in_dir='Directory')
    in_file_meta = wf.add_input(in_file_meta='File')

    xml_files = wf.ls(in_dir=in_dir)

    # Scattered over 'in_file'; the same metadata file is passed to each run
    xml_with_meta = wf.safar_add_metadata_file(
        in_file=xml_files,
        in_file_meta=in_file_meta,
        scatter='in_file',
        scatter_method='dotproduct',
    )

    wf.add_outputs(xml=xml_with_meta)
    wf.save(
        '../adhtools/cwl/add_metadata_wf.cwl',
        wd=True,
        relative=False,
    )
