In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to local packages are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os

from nlppln import WorkflowGenerator
# Directory passed as `working_dir` to every WorkflowGenerator in this notebook.
# Override it per user/machine with the CWL_WORKING_DIR environment variable
# (e.g. CWL_WORKING_DIR=/home/dafne/cwl-working-dir/) instead of editing this
# cell; an unset or empty variable falls back to the original default.
cwl_working_dir = os.environ.get('CWL_WORKING_DIR') or '/home/jvdzwaan/cwl-working-dir/'

In [None]:
# Sanity check: load both step libraries and print the steps they provide.
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    for steps_dir in ('../adhtools/cwl/', '../java/cwl/'):
        wf.load(steps_dir=steps_dir)
    available_steps = wf.list_steps()
    print(available_steps)

In [None]:
# Workflow: remove the OpenITI metadata from one text file and divide the
# result into books/chapters.
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    for steps_dir in ('../adhtools/cwl/', '../java/cwl/'):
        wf.load(steps_dir=steps_dir)

    # Workflow inputs.
    openiti_file = wf.add_input(txt_file='File')
    split_markers = wf.add_input(regex='string[]', default=['### |', '### ||'])
    # NOTE(review): `dir_name` is declared as an optional input but is not
    # connected to any step in this cell -- confirm whether it is still needed.
    chapter_dir_name = wf.add_input(dir_name='string?')

    # Steps: strip the metadata, then split the text on the markers.
    plain_text = wf.openiti2txt(in_file=openiti_file)
    chapter_files = wf.split_text(in_file=plain_text, regex=split_markers)

    wf.add_outputs(chapters=chapter_files)

    wf.save(
        '../adhtools/cwl/split-books-chapters-file.cwl',
        wd=True,
        relative=False,
    )

In [None]:
# Workflow: split books/chapters for every text file in a directory.
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    for steps_dir in ('../adhtools/cwl/', '../java/cwl/'):
        wf.load(steps_dir=steps_dir)

    # Workflow inputs.
    source_dir = wf.add_input(in_dir='Directory')
    split_markers = wf.add_input(regex='string[]', default=['### |', '### ||'])

    # List the directory and scatter the split_books_chapters_file step over
    # the listed files (the regex is shared by all of them).
    listed_files = wf.ls(in_dir=source_dir)
    split_texts = wf.split_books_chapters_file(
        txt_file=listed_files,
        regex=split_markers,
        scatter='txt_file',
        scatter_method='dotproduct',
    )

    wf.add_outputs(texts=split_texts)

    wf.save(
        '../adhtools/cwl/split-books-chapters-dir.cwl',
        wd=True,
        relative=False,
    )

In [None]:
# Workflow: analyze a single file. The text is first split into multiple
# smaller sub-files, which are analyzed and merged back into one XML file.
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    for steps_dir in ('../adhtools/cwl/', '../java/cwl/'):
        wf.load(steps_dir=steps_dir)

    # Workflow inputs.
    analyzer = wf.add_input(
        analyzer='enum',
        symbols=['Alkhalil', 'BAMA'],
        default='Alkhalil',
    )
    openiti_file = wf.add_input(txt_file='File')
    metadata = wf.add_input(metadata='File')
    cp = wf.add_input(cp='string')
    split_regex_small = wf.add_input(
        split_regex_small='string[]',
        default=['Milestone300', '### |', '### ||'],
    )

    # Strip the OpenITI metadata and cut the text into small snippets.
    plain_text = wf.openiti2txt(in_file=openiti_file)
    snippets = wf.split_text(in_file=plain_text, regex=split_regex_small)

    # Analyze each snippet and merge the results; the safar_filter_analyses
    # step is deliberately not applied in this "no-filtering" variant.
    analyzed_files = wf.SafarAnalyze(in_files=snippets, analyzer=analyzer, cp=cp)
    merged_xml = wf.merge_safar_xml(in_files=analyzed_files)

    xml_with_metadata = wf.safar_add_metadata_file(
        in_file=merged_xml,
        in_file_meta=metadata,
    )

    # The workflow produces a single XML file.
    wf.add_outputs(out_file=xml_with_metadata)

    wf.save(
        '../adhtools/cwl/safar-split-and-analyze-file-no-filtering.cwl',
        wd=True,
        relative=False,
    )

In [None]:
# Workflow: split and analyze multiple books (all files in a directory).
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    for steps_dir in ('../adhtools/cwl/', '../java/cwl/'):
        wf.load(steps_dir=steps_dir)

    # Workflow inputs.
    analyzer = wf.add_input(
        analyzer='enum',
        symbols=['Alkhalil', 'BAMA'],
        default='Alkhalil',
    )
    books_dir = wf.add_input(in_dir='Directory')
    metadata = wf.add_input(metadata='File')
    cp = wf.add_input(cp='string')
    split_regex_small = wf.add_input(
        split_regex_small='string[]',
        default=['Milestone300', '### |', '### ||'],
    )

    book_files = wf.ls(in_dir=books_dir)

    # Scatter the per-file workflow over the listed books.
    # NOTE(review): the step name does not match the
    # 'safar-split-and-analyze-file-no-filtering.cwl' file saved elsewhere in
    # this notebook; presumably a separate 'safar-split-and-analyze-file.cwl'
    # exists in the steps directory -- confirm.
    analysis_results = wf.safar_split_and_analyze_file(
        analyzer=analyzer,
        txt_file=book_files,
        cp=cp,
        split_regex_small=split_regex_small,
        metadata=metadata,
        scatter='txt_file',
        scatter_method='dotproduct',
    )

    wf.add_outputs(safar_output=analysis_results)

    wf.save(
        '../adhtools/cwl/safar-split-and-analyze-dir.cwl',
        wd=True,
        relative=False,
    )

In [None]:
# Workflow: split and stem a single book (txt file).
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    for steps_dir in ('../adhtools/cwl/', '../java/cwl/'):
        wf.load(steps_dir=steps_dir)

    # Workflow inputs.
    stemmer = wf.add_input(
        stemmer='enum',
        symbols=['KHOJA', 'LIGHT10', 'ISRI', 'MOTAZ', 'TASHAPHYNE'],
        default='LIGHT10',
    )
    book_file = wf.add_input(in_file='File')
    metadata = wf.add_input(metadata='File')
    split_markers = wf.add_input(regex='string[]', default=['### |', '### ||'])
    cp = wf.add_input(cp='string')

    # Strip the OpenITI metadata and split the text into snippets.
    plain_text = wf.openiti2txt(in_file=book_file)
    snippets = wf.split_text(in_file=plain_text, regex=split_markers)

    # Stem each snippet, merge the results and attach the metadata.
    stemmed_files = wf.SafarStem(in_files=snippets, stemmer=stemmer, cp=cp)
    merged_xml = wf.merge_safar_xml(in_files=stemmed_files)
    xml_with_metadata = wf.safar_add_metadata_file(
        in_file=merged_xml,
        in_file_meta=metadata,
    )

    wf.add_outputs(out_file=xml_with_metadata)

    wf.save(
        '../adhtools/cwl/safar-split-and-stem-file.cwl',
        wd=True,
        relative=False,
    )

In [None]:
# Split and stem every book (txt file) in a directory.
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    wf.load(steps_dir='../adhtools/cwl/')
    wf.load(steps_dir='../java/cwl/')

    # Workflow inputs.
    stemmer = wf.add_input(stemmer='enum',
                           symbols=['KHOJA', 'LIGHT10', 'ISRI', 'MOTAZ', 'TASHAPHYNE'],
                           default='LIGHT10')
    in_dir = wf.add_input(in_dir='Directory')
    metadata = wf.add_input(metadata='File')
    regex = wf.add_input(regex='string[]', default=['### |', '### ||'])
    cp = wf.add_input(cp='string')

    txt_files = wf.ls(in_dir=in_dir)

    # Scatter the single-file workflow over the listed files. `regex` is
    # forwarded explicitly: without it the workflow-level `regex` input is
    # declared but never reaches the step, so a user-supplied value would be
    # silently ignored.
    out_files = wf.safar_split_and_stem_file(in_file=txt_files, metadata=metadata, regex=regex,
                                             stemmer=stemmer, cp=cp,
                                             scatter='in_file', scatter_method='dotproduct')

    wf.add_outputs(out_files=out_files)

    wf.save('../adhtools/cwl/safar-split-and-stem-dir.cwl', wd=True, relative=False)

In [None]:
# Workflow: run safar_add_metadata_file on every file in a directory.
with WorkflowGenerator(working_dir=cwl_working_dir) as wf:
    for steps_dir in ('../adhtools/cwl/', '../java/cwl/'):
        wf.load(steps_dir=steps_dir)

    # Workflow inputs.
    source_dir = wf.add_input(in_dir='Directory')
    metadata = wf.add_input(metadata='File')

    # List the directory and scatter the step over the listed files; the same
    # metadata file is passed to every invocation.
    listed_files = wf.ls(in_dir=source_dir)
    files_with_metadata = wf.safar_add_metadata_file(
        in_file=listed_files,
        in_file_meta=metadata,
        scatter='in_file',
        scatter_method='dotproduct',
    )

    wf.add_outputs(out_files=files_with_metadata)
    wf.save(
        '../adhtools/cwl/add-metadata-dir.cwl',
        wd=True,
        relative=False,
    )