In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell is executed, so edits to them are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from nlppln import WorkflowGenerator
import os

# Working directory handed to every WorkflowGenerator in this notebook.
# Set the CWL_WORKING_DIR environment variable to use a different location
# (e.g. '/home/dafne/cwl-working-dir/') instead of editing this cell; when the
# variable is not set, the previous hard-coded default is used.
cwl_working_dir = os.environ.get('CWL_WORKING_DIR', '/home/jvdzwaan/cwl-working-dir/')

In [None]:
# Sub-workflow: analyze one book.
# Saved as safar-analyze-book.cwl.
with WorkflowGenerator(working_dir=cwl_working_dir) as wf_sub:
    # Register the steps found in both step directories.
    wf_sub.load(steps_dir='../adhtools/cwl/')
    wf_sub.load(steps_dir='../java/cwl/')

    # --- workflow inputs ---
    analyzer = wf_sub.add_input(
        analyzer='enum',
        symbols=['Alkhalil', 'BAMA'],
        default='Alkhalil',
    )
    book = wf_sub.add_input(book='File')
    cp = wf_sub.add_input(cp='string')

    # --- steps: txt2safar_input -> SafarAnalyze -> safar_add_metadata ---
    txt_dir, meta_dir, meta_file = wf_sub.txt2safar_input(in_file=book)
    analyzed_files = wf_sub.SafarAnalyze(
        in_dir=txt_dir,
        analyzer=analyzer,
        cp=cp,
    )
    safar_output_dir = wf_sub.safar_add_metadata(
        in_files=analyzed_files,
        in_dir_meta=meta_dir,
        in_file_meta=meta_file,
    )

    # --- workflow output ---
    # A directory of xml files; the directory is named after the book.
    wf_sub.add_outputs(safar_output_dir=safar_output_dir)

    wf_sub.save(
        '../adhtools/cwl/safar-analyze-book.cwl',
        wd=True,
        relative=False,
    )

In [None]:
# Scattered workflow: run safar_analyze_book on every file listed from a directory.
# Saved as safar-analyze-dir.cwl.
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    # Register the steps found in both step directories.
    wf.load(steps_dir='../adhtools/cwl/')
    wf.load(steps_dir='../java/cwl/')
    print(wf.list_steps())

    # --- workflow inputs ---
    analyzer = wf.add_input(
        analyzer='enum',
        symbols=['Alkhalil', 'BAMA'],
        default='Alkhalil',
    )
    in_dir = wf.add_input(in_dir='Directory')
    cp = wf.add_input(cp='string')
    index_name = wf.add_input(index_name='string', default='corpus')

    # --- steps ---
    books = wf.ls(in_dir=in_dir)

    # One safar_analyze_book invocation per entry in `books`.
    safar_output_dirs = wf.safar_analyze_book(
        analyzer=analyzer,
        book=books,
        cp=cp,
        scatter='book',
        scatter_method='dotproduct',
    )

    # --- workflow output ---
    wf.add_outputs(safar_output=safar_output_dirs)

    wf.save(
        '../adhtools/cwl/safar-analyze-dir.cwl',
        wd=True,
        relative=False,
    )

In [None]:
# Scattered workflow: analyze every book in a directory and gather the
# per-book output directories into a single directory.
# Saved as safar-analyze-corpus.cwl.
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    # Register the steps found in both step directories.
    wf.load(steps_dir='../adhtools/cwl/')
    wf.load(steps_dir='../java/cwl/')
    print(wf.list_steps())

    # --- workflow inputs ---
    analyzer = wf.add_input(
        analyzer='enum',
        symbols=['Alkhalil', 'BAMA'],
        default='Alkhalil',
    )
    in_dir = wf.add_input(in_dir='Directory')
    cp = wf.add_input(cp='string')
    # The workflow input is called `index_name`; its value is used below as
    # the name of the gathered directory.
    corpus_name = wf.add_input(index_name='string', default='corpus')

    # --- steps ---
    books = wf.ls(in_dir=in_dir)
    safar_output_dirs = wf.safar_analyze_book(
        analyzer=analyzer,
        book=books,
        cp=cp,
        scatter='book',
        scatter_method='dotproduct',
    )
    merged_dir = wf.gather_dirs(
        in_dirs=safar_output_dirs,
        dir_name=corpus_name,
    )

    # --- workflow output ---
    wf.add_outputs(safar_output=merged_dir)

    wf.save(
        '../adhtools/cwl/safar-analyze-corpus.cwl',
        wd=True,
        relative=False,
    )

In [None]:
# Analyze input archive and output a single archive
# Part of the teamsprint for creating a corpus upload service
# Saved as safar-analyze-archive.cwl.
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    # Register the steps found in both step directories.
    wf.load(steps_dir='../adhtools/cwl/')
    wf.load(steps_dir='../java/cwl/')
    print(wf.list_steps())

    # --- workflow inputs ---
    analyzer = wf.add_input(analyzer='enum', symbols=['Alkhalil', 'BAMA'], default='Alkhalil')
    archive = wf.add_input(archive='File')
    cp = wf.add_input(cp='string')
    # The workflow input is called `index_name`; the Python variable holding
    # the reference to it is `corpus_name`.
    corpus_name = wf.add_input(index_name='string', default='corpus')

    # --- steps: archive2dir -> safar_analyze_corpus -> zip_dir_flat ---
    in_dir = wf.archive2dir(archive=archive)

    # Pass this workflow's own input reference. The previous code passed
    # `index_name`, a name this cell never defines, so it only ran when an
    # earlier cell had left that variable behind in the kernel.
    results_dir = wf.safar_analyze_corpus(cp=cp, in_dir=in_dir, analyzer=analyzer, index_name=corpus_name)

    # Separate name for the output archive so the `archive` input reference
    # is not overwritten.
    result_archive = wf.zip_dir_flat(in_dir=results_dir)

    # --- workflow output ---
    wf.add_outputs(result=result_archive)

    wf.save('../adhtools/cwl/safar-analyze-archive.cwl', wd=True, relative=False)

In [None]:
# Scattered workflow: analyze a directory of books, gather the results into
# one directory and index that directory with the blacklabindexer step.
# Saved as analyze-and-index-dir.cwl.
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    # Local steps plus the blacklabindexer step, which is loaded from GitHub.
    wf.load(steps_dir='../adhtools/cwl/')
    wf.load(steps_dir='../java/cwl/')
    wf.load(step_file='https://raw.githubusercontent.com/arabic-digital-humanities/BlackLabIndexer-docker/master/blacklabindexer.cwl')

    print(wf.list_steps())

    # --- workflow inputs ---
    analyzer = wf.add_input(
        analyzer='enum',
        symbols=['Alkhalil', 'BAMA'],
        default='Alkhalil',
    )
    in_dir = wf.add_input(in_dir='Directory')
    cp = wf.add_input(cp='string')
    index_name = wf.add_input(index_name='string', default='corpus')
    action = wf.add_input(action='string', default='create')
    index_format = wf.add_input(index_format='string', default='safar-analyzer')
    text_direction = wf.add_input(text_direction='string', default='rtl')
    content_viewable = wf.add_input(content_viewable='boolean', default=True)
    xml_dir_name = wf.add_input(xml_dir_name='string', default='xml')

    # --- steps: ls -> safar_analyze_book (scattered) -> gather_dirs -> blacklabindexer ---
    books = wf.ls(in_dir=in_dir)
    safar_output_dirs = wf.safar_analyze_book(
        analyzer=analyzer,
        book=books,
        cp=cp,
        scatter='book',
        scatter_method='dotproduct',
    )
    merged_dir = wf.gather_dirs(
        in_dirs=safar_output_dirs,
        dir_name=xml_dir_name,
    )
    indexed = wf.blacklabindexer(
        action=action,
        index_format=index_format,
        index_name=index_name,
        in_dir=merged_dir,
        text_direction=text_direction,
        content_viewable=content_viewable,
    )

    # --- workflow outputs ---
    # Do not expose both safar_output_dirs and merged_dir as outputs: that does
    # not work (probably because wf.gather_dirs manipulates symlinks instead of
    # copying files).
    wf.add_outputs(indexed=indexed)
    wf.add_outputs(merged_dir=merged_dir)

    wf.save(
        '../adhtools/cwl/analyze-and-index-dir.cwl',
        wd=True,
        relative=False,
    )

In [None]:
# Analyze a directory of texts once per analyzer.
# KNOWN BROKEN: does not work anymore since the analyzer input was changed to
# an enum (this cell still declares it as a string array). Won't fix for now.
# Saved as analyze-and-index-dir-all-analyzers.cwl.
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    # Local steps plus the blacklabindexer step, which is loaded from GitHub.
    wf.load(steps_dir='../adhtools/cwl/')
    wf.load(steps_dir='../java/cwl/')
    wf.load(step_file='https://raw.githubusercontent.com/arabic-digital-humanities/BlackLabIndexer-docker/master/blacklabindexer.cwl')

    print(wf.list_steps())

    # --- workflow inputs ---
    analyzers = wf.add_input(analyzer='string[]', default=['Alkhalil', 'BAMA'])
    in_dir = wf.add_input(in_dir='Directory')
    cp = wf.add_input(cp='string')
    index_dir_name = wf.add_input(index_dir_name='string', default='index')
    xml_dir_name = wf.add_input(xml_dir_name='string', default='xml')

    # --- steps ---
    # Scatter over the analyzers; the same array also supplies the index name
    # and the xml directory name of each invocation.
    indexed, merged_dir = wf.analyze_and_index_dir(
        cp=cp,
        in_dir=in_dir,
        analyzer=analyzers,
        index_name=analyzers,
        xml_dir_name=analyzers,
        scatter=['analyzer', 'index_name', 'xml_dir_name'],
        scatter_method='dotproduct',
    )
    index_dir = wf.gather_dirs(in_dirs=indexed, dir_name=index_dir_name)
    xml_dir = wf.gather_dirs(in_dirs=merged_dir, dir_name=xml_dir_name)

    # --- workflow outputs ---
    # `indexed`: a directory containing all indices (one for each analyzer).
    # `xml`: a directory containing the xml files, with a subdirectory for each
    # analyzer which in turn contains a directory for each book.
    wf.add_outputs(indexed=index_dir)
    wf.add_outputs(xml=xml_dir)

    wf.save(
        '../adhtools/cwl/analyze-and-index-dir-all-analyzers.cwl',
        wd=True,
        relative=False,
    )

In [None]:
# Sub-workflow: stem one book.
# Saved as safar-stem-book.cwl.
with WorkflowGenerator(working_dir=cwl_working_dir) as wf_sub:
    # Register the steps found in both step directories.
    wf_sub.load(steps_dir='../adhtools/cwl/')
    wf_sub.load(steps_dir='../java/cwl/')
    print(wf_sub.list_steps())

    # --- workflow inputs ---
    stemmer = wf_sub.add_input(
        stemmer='enum',
        symbols=['KHOJA', 'LIGHT10', 'ISRI', 'MOTAZ', 'TASHAPHYNE'],
        default='LIGHT10',
    )
    book = wf_sub.add_input(book='File')
    cp = wf_sub.add_input(cp='string')

    # --- steps: txt2safar_input -> SafarStem -> safar_add_metadata ---
    txt_dir, meta_dir, meta_file = wf_sub.txt2safar_input(in_file=book)
    stemmed_files = wf_sub.SafarStem(
        in_dir=txt_dir,
        stemmer=stemmer,
        cp=cp,
    )
    safar_output_dir = wf_sub.safar_add_metadata(
        in_files=stemmed_files,
        in_dir_meta=meta_dir,
        in_file_meta=meta_file,
    )

    # --- workflow output ---
    wf_sub.add_outputs(safar_output_dir=safar_output_dir)

    wf_sub.save(
        '../adhtools/cwl/safar-stem-book.cwl',
        wd=True,
        relative=False,
    )

In [None]:
# Scattered workflow: run safar_stem_book on every file listed from a directory.
# Saved as safar-stem-dir.cwl.
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    # Register the steps found in both step directories.
    wf.load(steps_dir='../adhtools/cwl/')
    wf.load(steps_dir='../java/cwl/')
    print(wf.list_steps())

    # --- workflow inputs ---
    stemmer = wf.add_input(
        stemmer='enum',
        symbols=['KHOJA', 'LIGHT10', 'ISRI', 'MOTAZ', 'TASHAPHYNE'],
        default='LIGHT10',
    )
    in_dir = wf.add_input(in_dir='Directory')
    cp = wf.add_input(cp='string')
    index_name = wf.add_input(index_name='string', default='corpus')

    # --- steps ---
    books = wf.ls(in_dir=in_dir)

    # One safar_stem_book invocation per entry in `books`.
    safar_output_dirs = wf.safar_stem_book(
        stemmer=stemmer,
        book=books,
        cp=cp,
        scatter='book',
        scatter_method='dotproduct',
    )

    # --- workflow output ---
    wf.add_outputs(safar_output=safar_output_dirs)

    wf.save(
        '../adhtools/cwl/safar-stem-dir.cwl',
        wd=True,
        relative=False,
    )

In [None]:
# Scattered workflow: stem a directory of books, gather the results into one
# directory and index that directory with the blacklabindexer step.
# Saved as stem-and-index-dir.cwl.
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    # Local steps plus the blacklabindexer step, which is loaded from GitHub.
    wf.load(steps_dir='../adhtools/cwl/')
    wf.load(steps_dir='../java/cwl/')
    wf.load(step_file='https://raw.githubusercontent.com/arabic-digital-humanities/BlackLabIndexer-docker/master/blacklabindexer.cwl')

    print(wf.list_steps())

    # --- workflow inputs ---
    stemmer = wf.add_input(
        stemmer='enum',
        symbols=['KHOJA', 'LIGHT10', 'ISRI', 'MOTAZ', 'TASHAPHYNE'],
        default='LIGHT10',
    )
    in_dir = wf.add_input(in_dir='Directory')
    cp = wf.add_input(cp='string')
    index_name = wf.add_input(index_name='string', default='corpus')
    action = wf.add_input(action='string', default='create')
    index_format = wf.add_input(index_format='string', default='safar-stemmer')
    text_direction = wf.add_input(text_direction='string', default='rtl')
    content_viewable = wf.add_input(content_viewable='boolean', default=True)

    # --- steps: ls -> safar_stem_book (scattered) -> gather_dirs -> blacklabindexer ---
    books = wf.ls(in_dir=in_dir)
    safar_output_dirs = wf.safar_stem_book(
        stemmer=stemmer,
        book=books,
        cp=cp,
        scatter='book',
        scatter_method='dotproduct',
    )
    merged_dir = wf.gather_dirs(in_dirs=safar_output_dirs)
    indexed = wf.blacklabindexer(
        action=action,
        index_format=index_format,
        index_name=index_name,
        in_dir=merged_dir,
        text_direction=text_direction,
        content_viewable=content_viewable,
    )

    # --- workflow outputs ---
    # NOTE(review): the analyzer counterpart of this workflow warns that
    # exposing both safar_output_dirs and merged_dir as outputs does not work;
    # this workflow exposes both - confirm that this is intended.
    wf.add_outputs(indexed=indexed)

    wf.add_outputs(safar_output_dirs=safar_output_dirs)
    wf.add_outputs(merged_dir=merged_dir)

    wf.save(
        '../adhtools/cwl/stem-and-index-dir.cwl',
        wd=True,
        relative=False,
    )

In [None]:
# Stem a directory of texts once per stemmer.
# KNOWN BROKEN: does not work anymore since the analyzer/stemmer input was
# changed to an enum (this cell still declares it as a string array).
# Won't fix for now.
# Saved as stem-and-index-dir-all-stemmers.cwl.
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    # Local steps plus the blacklabindexer step, which is loaded from GitHub.
    wf.load(steps_dir='../adhtools/cwl/')
    wf.load(steps_dir='../java/cwl/')
    wf.load(step_file='https://raw.githubusercontent.com/arabic-digital-humanities/BlackLabIndexer-docker/master/blacklabindexer.cwl')

    print(wf.list_steps())

    # --- workflow inputs ---
    stemmers = wf.add_input(
        stemmer='string[]',
        default=['KHOJA', 'LIGHT10', 'ISRI', 'MOTAZ', 'TASHAPHYNE'],
    )
    in_dir = wf.add_input(in_dir='Directory')
    cp = wf.add_input(cp='string')

    # --- steps ---
    # Scatter over the stemmers; the same array also supplies the index name
    # of each invocation.
    indexed, merged_dir, safar_output_dirs = wf.stem_and_index_dir(
        cp=cp,
        in_dir=in_dir,
        stemmer=stemmers,
        index_name=stemmers,
        scatter=['stemmer', 'index_name'],
        scatter_method='dotproduct',
    )

    # --- workflow output ---
    wf.add_outputs(indexed=indexed)

    wf.save(
        '../adhtools/cwl/stem-and-index-dir-all-stemmers.cwl',
        wd=True,
        relative=False,
    )