In [2]:
import PyPDF2
import re
import pandas as pd
import numpy as np

In [3]:

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        One string built from each page's ``extract_text()`` result, joined
        with no separator between pages.
    """
    with open(pdf_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # Pull the text while the file is still open; pages are read lazily.
        page_texts = [
            reader.pages[idx].extract_text()
            for idx in range(len(reader.pages))
        ]
    return ''.join(page_texts)

In [4]:
# Location of the source dictionary PDF (absolute, machine-specific path).
pdf_path = r"/mnt/disk/yrajcoomar/kreol-benchmark/data_collection/books/diksioner.pdf"
# Whole-document text as a single string, built by extract_text_from_pdf.
extracted_text = extract_text_from_pdf(pdf_path)
# Separate reader over the same file; the parsing cell below indexes its
# pages one at a time.
pdf_reader = PyPDF2.PdfReader(pdf_path)

### Parsing Strategy



In [5]:
# Three buckets, each collecting (page_index, raw_fragment) tuples.
word_def_list = []      # fragments containing '[' (expected: word definitions)
cr_en_bitext_list = []  # fragments containing '=' but no '[' (expected: sentence pairs)
triage_list = []        # remaining fragments longer than one character

# Number of characters dropped from the start of every page before the
# page-number digits -- presumably the running header; verify against the PDF.
len_header = len('DIKS YON ER KREOL MORISYEN  \n \n')

# Index 0 is skipped: the range starts at 1.
for page_idx in range(1, len(pdf_reader.pages)):
    raw_page = pdf_reader.pages[page_idx].extract_text().strip(' ')
    # Also skip one character per digit of (page_idx + 1).
    skip = len_header + len(str(page_idx + 1))
    body = raw_page[skip:].strip(' ').lstrip('\n')

    # Fragments are delimited by '.'.
    for fragment in body.split('.'):
        # Author's heuristic: definitions carry square brackets for their
        # etymological source.
        if '[' in fragment:
            word_def_list.append((page_idx, fragment))
            continue
        # Author's heuristic: bitext lines carry an equals sign.
        if '=' in fragment:
            cr_en_bitext_list.append((page_idx, fragment))
            continue
        # Per the original note, the two checks above are expected to catch at
        # least 80% of the data; whatever is left is kept for manual review.
        if len(fragment) > 1:
            triage_list.append((page_idx, fragment))

## Post-parsing cleanup

### Definitions

In [6]:
# Separate fragments with at most one '[' (tidied and kept) from fragments
# holding several '[' (set aside untouched for re-splitting).
clean_word_def_list = []
double_brackets_list = []
for page_no, raw in word_def_list:
    if raw.count('[') <= 1:
        # Drop line breaks, squeeze double spaces once, trim outer spaces.
        tidy = raw.replace('\n', '').replace('  ', ' ').strip(' ')
        clean_word_def_list.append((page_no, tidy))
    else:
        double_brackets_list.append((page_no, raw))

In [7]:
# Re-split the multi-bracket fragments: line breaks become spaces, then each
# run of three spaces is treated as an entry boundary. Empty pieces are
# dropped and the survivors are trimmed of outer spaces.
single_bkt_list = [
    (page_no, piece.strip(' '))
    for page_no, raw in double_brackets_list
    for piece in raw.replace('\n', ' ').split('   ')
    if piece
]

In [8]:
# Inspect the (page_index, entry) tuples recovered from multi-bracket fragments.
single_bkt_list

[(2, 'abi [from Fre abus] : abuse'),
 (2, 'abitan  [from Fre habitant] : inhabitant'),
 (3, 'abitiye  [from Fre habituer] : to accustom'),
 (3, 'abiye  [from Fre habiller] : to dress'),
 (3, 'abize  [from Fre abuser] :  to abuse'),
 (3, 'abor  [from Fre a bord] : on board'),
 (3, 'absan  [from Fre absent] : absent'),
 (3, 'absans  [from Fre absence] : absence'),
 (4, 'absorbe  [from Fre absorber] : to absorb'),
 (4, 'adimans  [from Fre a dimanche] : to  Sunday'),
 (5, 'akonpanye  (akonpayn) [from Fre  accompagner] : to accompany'),
 (5, 'akonpli  [from Fre accomplir] : to  accomplish'),
 (10, 'alorizinn  [from Fre a l’origine] : originally'),
 (10, 'alwil  [from Fre a l’huile] : of oil'),
 (11, 'amize  [from Fre amuser] : to have fun'),
 (11, 'amizir  [from Fre a mesure] : to the extent  (of)'),
 (13, 'anbarase  (anbaras) [from Fre embarrasser]  :  to embarrass'),
 (13, 'anbasader  [from Fre ambassadeur] :  ambassador'),
 (13, 'anbrase  [from Fre embrasser] : to embrace;  to kiss'),
 (

In [9]:
# Keep only the text of each (page_index, text) tuple: single-bracket entries
# first, then the entries recovered from multi-bracket fragments.
all_definitions = [text for _, text in clean_word_def_list]
all_definitions += [text for _, text in single_bkt_list]

In [10]:
import re

def remove_brackets(text):
    """Strip bracketed annotations from a string and tidy the gaps they leave.

    Every ``[...]`` and ``(...)`` group is removed, brackets included. Each
    run of two or more spaces is then collapsed to a single space, so that
    removing adjacent groups (e.g. ``'word  (alt) [from ...] : gloss'``) does
    not leave a double space behind. Other whitespace, such as newlines and
    leading/trailing single spaces, is left untouched.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Square-bracket groups or round-bracket groups, non-nested.
    without_brackets = re.sub(r'\[[^]]*\]|\([^)]*\)', '', text)
    # A single str.replace('  ', ' ') pass only halves long runs; a regex
    # collapses runs of any length in one go.
    return re.sub(r' {2,}', ' ', without_brackets)



In [11]:
# Overwrite the first entry with a hand-corrected version of the 'abandone'
# definition.
all_definitions[0] = 'abandone (abandon) [from Fre abandonner] : to abandon'

In [12]:
# Definitions with every [...] and (...) group stripped out.
cleaned_list = list(map(remove_brackets, all_definitions))

In [13]:
def convert_list(lst,sep):
    """Split ``"<input><sep><target>"`` strings into input/target records.

    Args:
        lst: Iterable of strings to convert.
        sep: Separator string. An item is converted only when it contains
            ``sep`` exactly once.

    Returns:
        A ``(converted, triage)`` tuple. ``converted`` is a list of
        ``{'input': ..., 'target': ...}`` dicts, in input order; ``triage`` is
        the list of items that did not contain ``sep`` exactly once, unchanged.
    """
    converted_lst = []
    triage_l = []

    for item in lst:
        # Check the separator actually passed in. Counting a hard-coded ':'
        # mis-triages items for any other separator and can make the
        # two-way unpacking below raise ValueError.
        if item.count(sep) != 1:
            triage_l.append(item)
            continue

        input_str, target_str = item.split(sep)
        converted_lst.append({
            # Trim whitespace, then any bracket characters left on the ends.
            'input': input_str.strip().strip('[]()'),
            'target': target_str.strip(),
        })

    return converted_lst,triage_l

# Split the bracket-free definitions on ':'. Entries that do not contain
# exactly one ':' are returned separately in cleaner_triage_defs.
export_json,cleaner_triage_defs = convert_list(cleaned_list,':')

In [14]:
# import json
# with open('dict1.json', 'w') as f:
#     for item in export_json:
#         # Write dictionary to file on new line
#         f.write(json.dumps(item) + '\n')

### cr_en bitext list

In [15]:
# Inspect the raw (page_index, fragment) tuples that contained '='.
cr_en_bitext_list

[(1,
  ' Abbreviations  \nAra = Arabic  \nBan = Bantu  \nEng = English  \nFre = French  \nHin = Hindi  \nMal = Malagasy  \nredupl'),
 (1, ' = reduplication  \n \n \n \n \n \n \n'),
 (2,
  ' Akoz samem, \nler mo ti ena sizan mo ti abandon \nposibilite vinn enn gran pent = Because \n(of) that, when I was six years old I \nmissed the  chance to become a great \npainter'),
 (2, '  Mo finn abandon li tusel lakaz = I \nleft him alone at home'),
 (2, " Mo'nn seye me abba = I've \ntried but in vain"),
 (2, '  So \nabitan zot katorz = Its inhab itants number \nfourteen'),
 (2, ' \nDezyem  planet la ti abite par enn vantar ='),
 (3,
  '  Maladi \nla ti abor, me lekipaz pa finn deklar nanye \n= Disease (went) on board, but the crew \ndidn’t declare anything'),
 (3,
  ' Malgre \nso puvwar absoli, li ti bon = In spite of his \nabsolute power, he was good'),
 (3, ' \nAbsans konpetisyon = Absence (of) \ncompetition'),
 (3, ' Absans predater = Absence \n(of) predators'),
 (3,
  '  Bann \ndimunn rod so

In [16]:
# Remove the first entry, which the recorded output shows to be the
# abbreviations legend rather than a sentence pair.
# NOTE: pop(0) mutates the list in place, so re-running this cell removes one
# more entry each time.
cr_en_bitext_list.pop(0)

(1,
 ' Abbreviations  \nAra = Arabic  \nBan = Bantu  \nEng = English  \nFre = French  \nHin = Hindi  \nMal = Malagasy  \nredupl')

In [17]:
# Remove the entry now at the front, which the recorded output shows to be
# the ' = reduplication' tail of the abbreviations legend.
# NOTE: pop(0) mutates the list in place, so re-running this cell removes one
# more entry each time.
cr_en_bitext_list.pop(0)

(1, ' = reduplication  \n \n \n \n \n \n \n')

In [18]:
# Tidy each bitext fragment: drop line breaks, squeeze double spaces once,
# and trim outer spaces. The page index is carried through unchanged.
bitext_list = [
    (page_no, raw.replace('\n', '').replace('  ', ' ').strip(' '))
    for page_no, raw in cr_en_bitext_list
]

In [19]:
# Text only, page indices dropped.
bitext_list_txt = [text for _, text in bitext_list]

In [20]:
def convert_list(lst,sep):
    """Turn ``"<input><sep><target>"`` strings into input/target records.

    Args:
        lst: Iterable of strings to convert.
        sep: Separator string; only items containing it exactly once are
            converted.

    Returns:
        A ``(records, rejected)`` tuple: ``records`` is a list of
        ``{'input': ..., 'target': ...}`` dicts in input order, ``rejected``
        holds the items that did not contain ``sep`` exactly once, unchanged.
    """
    records, rejected = [], []

    for entry in lst:
        if entry.count(sep) != 1:
            rejected.append(entry)
            continue

        # Exactly one separator, so partition yields the two halves.
        left, _, right = entry.partition(sep)
        # Input side: trim whitespace, then bracket characters on the ends.
        cleaned_left = left.strip().strip('[]()')
        cleaned_right = right.strip()
        records.append({'input': cleaned_left, 'target': cleaned_right})

    return records, rejected

# Same call as the one that follows the earlier convert_list definition, now
# executed with the redefined function; it rebuilds export_json and
# cleaner_triage_defs from cleaned_list.
export_json,cleaner_triage_defs = convert_list(cleaned_list,':')

In [21]:
# Split each cleaned bitext string on '=' into an input/target record.
# Strings that do not contain exactly one '=' are returned in bitext_triage.
bitext_export_json, bitext_triage = convert_list(bitext_list_txt,'=')

In [22]:
import json

In [None]:
import json

In [26]:
# Write the sentence pairs as JSON lines, one object per line. The two fields
# trade places on the way out: the record's 'target' is written as 'input'
# and its 'input' as 'target'.
with open('/mnt/disk/yrajcoomar/kreol-benchmark/data_collection/notebooks/cr_en_dict_sentences.json', 'w') as f:
    f.writelines(
        json.dumps({'input': pair['target'], 'target': pair['input']}) + '\n'
        for pair in bitext_export_json
    )

In [71]:
# Number of sentence pairs that split cleanly on '='.
len(bitext_export_json)

2383

In [1]:
import pandas as pd

In [3]:
import json

# Load the JSON-lines file of definition records; the code below expects it
# to provide 'input' and 'target' columns.
dict_json = pd.read_json('/mnt/disk/yrajcoomar/kreol-benchmark/data_collection/notebooks/dict_definitions.json',lines=True)
# Swap the two columns. Explicit copies are taken first so the result does not
# depend on whether column selection hands back a view of the frame's data,
# in which case the first assignment could overwrite the values the second
# assignment is about to use.
dict_json['input'], dict_json['target'] = dict_json['target'].copy(), dict_json['input'].copy()

# Write the swapped records back out as JSON lines, one object per line.
with open('/mnt/disk/yrajcoomar/kreol-benchmark/data_collection/notebooks/cr_en_dict_definitions.json', 'w') as f:
    for _, row in dict_json.iterrows():
        item_dict = {'input': row['input'], 'target': row['target']}
        f.write(json.dumps(item_dict) + '\n')

In [14]:
# Load the JSON-lines file of sentence records; the code below expects it to
# provide 'input' and 'target' columns.
dict_json = pd.read_json('/mnt/disk/yrajcoomar/kreol-benchmark/data_collection/notebooks/dict_sentences.json',lines=True)
# Swap the two columns. Explicit copies are taken first so the result does not
# depend on whether column selection hands back a view of the frame's data,
# in which case the first assignment could overwrite the values the second
# assignment is about to use.
dict_json['input'], dict_json['target'] = dict_json['target'].copy(), dict_json['input'].copy()

# Write the swapped records as JSON lines, one object per line. This is the
# same output path the earlier bitext export cell writes to, so that file is
# overwritten here.
with open('/mnt/disk/yrajcoomar/kreol-benchmark/data_collection/notebooks/cr_en_dict_sentences.json', 'w') as f:
    for _, row in dict_json.iterrows():
        item_dict = {'input': row['input'], 'target': row['target']}
        f.write(json.dumps(item_dict) + '\n')