In [1]:
import numpy as np
import pandas as pd    


from tokenizers import Tokenizer, SentencePieceBPETokenizer
from transformers import PreTrainedTokenizerFast

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Vocabulary size handed to the tokenizer trainer.
VOCAB_SIZE_SRC = 50_000

### Join Training Data

In [3]:
# Language codes used to build the corpus file names.
# presumably: source = English, target = Creole ("kreol" in the data path), pivot = French — confirm.
src_lang, tgt_lang, pvt_lang = 'en', 'cr', 'fr'

In [6]:
import re
from unicodedata import normalize
import string


def clean_text(text):
    """Normalise raw corpus text before it is used for tokenizer training.

    Steps, in order:
      1. NFD-decompose the text and drop every character that is not ASCII.
         This strips accents, but it also removes any other non-ASCII
         character outright.
      2. Delete ASCII punctuation, keeping underscores.
      3. Lowercase the result.

    Whitespace (including newlines) is left untouched.
    """
    ascii_only = normalize('NFD', text).encode('ascii', 'ignore').decode('utf-8')
    punctuation_to_drop = string.punctuation.replace('_', '')
    deletion_table = str.maketrans('', '', punctuation_to_drop)
    return ascii_only.translate(deletion_table).lower()

In [7]:
def join_corpus(src, pvt, tgt, mode,
                data_dir=r"C:\Users\yush2\OneDrive\Desktop\kreol\experiments\data"):
    r"""Concatenate every sentence of the src-tgt and pvt-tgt corpora for one split.

    Each corpus is read from the JSON-lines file
    ``<data_dir>/<lang>-<tgt>/<lang>-<tgt>_<mode>.jsonl`` and must provide
    ``input`` and ``target`` columns. For each file all ``input`` values come
    first, followed by all ``target`` values; the src-tgt file precedes the
    pvt-tgt file.

    Args:
        src: Language code of the first corpus' input side.
        pvt: Language code of the second corpus' input side.
        tgt: Language code shared by both corpora as the target side.
        mode: Split name used in the file name (the notebook uses
            'train', 'test' and 'dev').
        data_dir: Root directory holding the ``<lang>-<tgt>`` folders.
            Defaults to the location the notebook was written against.

    Returns:
        A single string in which every sentence is separated by ``' \n '``.
    """
    import os  # local import: only needed here for portable path joining

    separator = ' \n '

    def _pair_sentences(first_lang):
        # Inputs followed by targets of one language-pair file.
        pair = f"{first_lang}-{tgt}"
        frame = pd.read_json(
            path_or_buf=os.path.join(data_dir, pair, f"{pair}_{mode}.jsonl"),
            lines=True,
        )
        return list(frame["input"]) + list(frame["target"])

    # Join everything in one pass so the boundary between the two corpora gets
    # a separator as well; concatenating the two joined strings directly would
    # fuse the last sentence of the first corpus with the first of the second.
    return separator.join(_pair_sentences(src) + _pair_sentences(pvt))

In [6]:
# Build one cleaned corpus for tokenizer training from every split of both language pairs.
train_text = join_corpus(src_lang,pvt_lang,tgt_lang,'train')
test_text = join_corpus(src_lang,pvt_lang,tgt_lang,'test')
dev_text = join_corpus(src_lang,pvt_lang,tgt_lang,'dev')
# Join the splits with the ' \n ' separator used between sentences; plain `+`
# would glue the last sentence of one split onto the first sentence of the next.
all_text = clean_text(' \n '.join([train_text, test_text, dev_text]))

In [7]:
# Peek at the first 400 characters of the cleaned corpus.
all_text[:400]

'to determine the thickness and dynamics of the moons ice shell and \n characterize the surface geology of europa in detail \n a science definition team nasa report has reported \n in a new study in the journal astrobiology \n if one day humans send a robotic lander to the surface of europa \n study lead author robert pappalardo of nasas  laboratory \n pappalardo added \n there is still a lot of preparati'

In [8]:
def get_training_data(text,fixed_window):
    """Yield consecutive chunks of ``text``, each at most ``fixed_window`` characters.

    A chunk normally ends after ``fixed_window`` characters. When that cut
    would fall inside a word, it is moved back to just after the last
    whitespace character in the window so that no word is split across two
    chunks. A whitespace-free run longer than the window is still cut at
    exactly ``fixed_window`` characters. Concatenating the yielded chunks
    reproduces ``text``.

    Args:
        text: The string to slice.
        fixed_window: Maximum chunk length in characters; must be positive.

    Raises:
        ValueError: If ``fixed_window`` is not positive (raised when the
            generator is first advanced).
    """
    if fixed_window <= 0:
        raise ValueError("fixed_window must be a positive integer")

    total = len(text)
    start = 0
    while start < total:
        end = min(start + fixed_window, total)
        # Only adjust when the character right after the cut continues a word.
        if end < total and not text[end].isspace():
            boundary = end
            while boundary > start and not text[boundary - 1].isspace():
                boundary -= 1
            # boundary == start means the window holds no whitespace at all:
            # keep the fixed-size cut so the generator always makes progress.
            if boundary > start:
                end = boundary
        yield text[start:end]
        start = end

### Create Tokenizer Pipeline

In [9]:
from tokenizers.implementations import SentencePieceBPETokenizer

In [10]:
# Output locations for the trained tokenizer: the directory first, then the
# language-pair-specific model file name inside it.
tokenizer_save_path_dir = rf"C:\Users\yush2\OneDrive\Desktop\kreol\preprocessing"
tokenizer_save_path = rf"{tokenizer_save_path_dir}\spm_tokenizer_{src_lang}_{tgt_lang}.model"

In [11]:
# Special tokens reserved in the vocabulary, in the order they are passed to the trainer.
special_tokens = [
    "<bos>",
    "<pad>",
    "</s>",
    "<unk>",
    "<cls>",
    "<sep>",
    "<mask>",
]

In [12]:
# Train a fresh BPE tokenizer on 1000-character slices of the cleaned corpus.
tokenizer = SentencePieceBPETokenizer()
tokenizer.train_from_iterator(
    get_training_data(text=all_text.lower(), fixed_window=1000),
    special_tokens=special_tokens,
    vocab_size=VOCAB_SIZE_SRC,
)

TypeError: SentencePieceBPETokenizer.__init__() got an unexpected keyword argument 'padding'

In [309]:
# Wrap the trained tokenizer so it can be used through the transformers API.
# NOTE(review): model_max_length is set to the character count of train_text — confirm this is intended.
pt_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    model_max_length=len(train_text),
    special_tokens=special_tokens,
)

# Register every special token and its vocabulary id under the matching
# `<role>_token` / `<role>_token_id` attribute.
for _role, _token in (
    ("bos", "<bos>"),
    ("pad", "<pad>"),
    ("eos", "</s>"),
    ("unk", "<unk>"),
    ("cls", "<cls>"),
    ("sep", "<sep>"),
    ("mask", "<mask>"),
):
    setattr(pt_tokenizer, f"{_role}_token", _token)
    setattr(pt_tokenizer, f"{_role}_token_id", tokenizer.token_to_id(_token))

# and save for later!
pt_tokenizer.save_pretrained(tokenizer_save_path_dir)

('C:\\Users\\yush2\\OneDrive\\Desktop\\kreol\\preprocessing\\tokenizer_config.json',
 'C:\\Users\\yush2\\OneDrive\\Desktop\\kreol\\preprocessing\\special_tokens_map.json',
 'C:\\Users\\yush2\\OneDrive\\Desktop\\kreol\\preprocessing\\tokenizer.json')

In [310]:
def write_keys_to_txt(dictionary, file_path):
    """Write every key of ``dictionary`` to ``file_path`` as ``"<key> 1"`` lines.

    Keys are written in the mapping's own iteration order; values are ignored.
    (presumably the constant ``1`` is a placeholder count for a
    dictionary-file format — confirm against the consumer of this file.)

    Args:
        dictionary: Mapping whose keys are written.
        file_path: Destination path; an existing file is overwritten.
    """
    # Encode explicitly as UTF-8 so keys containing non-ASCII characters are
    # written the same way on every platform instead of depending on the
    # locale's default codec.
    with open(file_path, 'w', encoding='utf-8') as file:
        # Mapping keys are already unique, so they can be iterated directly.
        file.writelines(f"{key} 1\n" for key in dictionary.keys())

# Dump the tokenizer vocabulary to a text file, one entry per line.
# NOTE(review): this is a relative path, so the file lands in the current
# working directory — confirm that matches where it is read back from.
output_file_path = "dict.en_fr_cr.txt"
write_keys_to_txt(dictionary=pt_tokenizer.vocab, file_path=output_file_path)

In [4]:
from transformers import AutoTokenizer

In [5]:
# Reload the tokenizer from the preprocessing directory through AutoTokenizer.
wrapped_tokenizer = AutoTokenizer.from_pretrained(
    r'C:\Users\yush2\OneDrive\Desktop\kreol\preprocessing',
    use_fast=False,
)

### Convert train, dev, test files

In [74]:
def read_jsonl(path):
    """Load a JSON-lines file and return its ``input`` and ``target`` columns.

    Args:
        path: Location of the ``.jsonl`` file.

    Returns:
        An ``(inputs, targets)`` pair of lists, each in file order.
    """
    frame = pd.read_json(path_or_buf=path, lines=True)
    inputs, targets = (list(frame[column].values) for column in ("input", "target"))
    return inputs, targets

In [75]:
# Load the dev split of the source-target pair: a holds the `input` column, b the `target` column.
a, b = read_jsonl(
    rf'C:\Users\yush2\OneDrive\Desktop\kreol\experiments\data\{src_lang}-{tgt_lang}\{src_lang}-{tgt_lang}_dev.jsonl'
)

In [95]:
# Encode the dev sentences, padding/truncating both sides to exactly 100 tokens.
oo = wrapped_tokenizer(
    list(a),
    text_target=list(b),
    padding='max_length',
    max_length=100,
    truncation=True,
)

In [98]:
# Longest label sequence over every encoded example. Iterating the sequences
# directly avoids a hard-coded row count, which would raise IndexError on a
# smaller split and silently skip rows on a larger one.
max(len(label_ids) for label_ids in oo['labels'])

100

In [97]:
# Length of the first encoded label sequence.
len(oo['labels'][0])

100

In [66]:
# List the fields produced by the tokenizer call.
list(oo.keys())

['input_ids', 'token_type_ids', 'attention_mask', 'labels']

In [101]:
def encode_save(tokenizer, mode, src_lang, tgt_lang,
                data_dir=r'C:\Users\yush2\OneDrive\Desktop\kreol\experiments\data',
                max_length=50):
    """Tokenize one split of a language pair and save the token ids as text files.

    Reads ``<data_dir>/<src>-<tgt>/<src>-<tgt>_<mode>.jsonl`` with ``read_jsonl``,
    encodes inputs and targets padded/truncated to ``max_length`` tokens,
    flattens the ids of all sentences into one sequence per side, and writes
    them one integer per line under the ``indices`` sub-folder. Two summary
    lines with the number of ids and a confirmation message are printed.

    Args:
        tokenizer: Callable accepting ``(texts, text_target=..., padding=...,
            max_length=..., truncation=...)`` and returning a mapping with
            ``input_ids`` and ``labels``.
        mode: Split name used in the file names (the notebook uses 'train',
            'test' and 'dev').
        src_lang: Source-language code.
        tgt_lang: Target-language code.
        data_dir: Root directory holding the ``<src>-<tgt>`` folders. Defaults
            to the location the notebook was written against.
        max_length: Fixed number of tokens per sentence. Defaults to 50.
    """
    import os  # local import: only needed here for path handling

    pair = f'{src_lang}-{tgt_lang}'
    pair_dir = os.path.join(data_dir, pair)

    src_voc, tgt_voc = read_jsonl(os.path.join(pair_dir, f'{pair}_{mode}.jsonl'))
    tokenizing_dict = tokenizer(
        src_voc,
        text_target=tgt_voc,
        padding='max_length',
        max_length=max_length,
        truncation=True,
    )

    # Flatten the per-sentence id lists into a single integer sequence per side.
    src_tokens = np.array(
        [token_id for row in tokenizing_dict['input_ids'] for token_id in row],
        dtype=np.int64,
    )
    tgt_tokens = np.array(
        [token_id for row in tokenizing_dict['labels'] for token_id in row],
        dtype=np.int64,
    )

    # np.savetxt does not create missing directories.
    indices_dir = os.path.join(pair_dir, 'indices')
    os.makedirs(indices_dir, exist_ok=True)
    save_path_src = os.path.join(indices_dir, f'{pair}_{mode}.spm.{src_lang}')
    save_path_tgt = os.path.join(indices_dir, f'{pair}_{mode}.spm.{tgt_lang}')

    # fmt='%d' keeps the ids as integers; np.savetxt's default format would
    # write every id as a float in scientific notation.
    np.savetxt(save_path_src, src_tokens, fmt='%d')
    np.savetxt(save_path_tgt, tgt_tokens, fmt='%d')

    print(f"src indices: {len(src_tokens)}")
    print(f"tgt indices: {len(tgt_tokens)}")
    print(rf'{src_lang} & {tgt_lang} indices have been set in {src_lang}-{tgt_lang}\indices')

In [102]:
# Encode and save each split in turn: train, then test, then dev.
for split in ('train', 'test', 'dev'):
    encode_save(wrapped_tokenizer, split, src_lang, tgt_lang)


src indices: 1090500
tgt indices: 1090500
en & cr indices have been set in en-cr\indices
src indices: 50000
tgt indices: 50000
en & cr indices have been set in en-cr\indices
src indices: 25000
tgt indices: 25000
en & cr indices have been set in en-cr\indices


In [100]:
# Quick sanity check: encode a short lower-case, punctuation-free sentence.
cc = wrapped_tokenizer.encode("i dont know whats goo")

In [23]:
# Display the encoded ids held in cc. The encoded result is a plain list of
# ids and has no `encode_save` attribute, so that lookup would raise AttributeError.
cc

[3498, 122, 303, 646, 496, 82, 390, 78]

In [25]:
# Scratch check of how str.join places the separator between items.
'\n'.join(['Line 1', 'Line 2', 'Line 3'])

'Line 1\nLine 2\nLine 3'

In [314]:


# Read the dictionary file back to inspect its contents.
# Decode explicitly as UTF-8: vocabulary entries may contain non-ASCII
# characters, which the platform's default codec is not guaranteed to decode.
with open(r'C:\Users\yush2\OneDrive\Desktop\kreol\preprocessing\dict.en_fr_cr.txt', 'r', encoding='utf-8') as file:
    lines_read = file.readlines()

print(lines_read)



### Vectorizing Text

### Setting up transfer learning weights

### Training Transformer Models

In [3]:
import sentencepiece as spm

In [5]:
# Try to load the tokenizer as a native SentencePiece model.
# NOTE(review): no cell in this notebook writes a SentencePiece `.model` file —
# the trained tokenizer is only saved through `save_pretrained` — so confirm
# where this file is expected to come from; the recorded run failed to parse it.
sp = spm.SentencePieceProcessor(model_file=r'C:\Users\yush2\OneDrive\Desktop\kreol\preprocessing\spm_tokenizer_en_cr.model')

RuntimeError: Internal: C:\Users\yush2\AppData\Local\Temp\pip-install-fdbfenl3\sentencepiece_092b7357e86e436f8273a293d4febb2f\sentencepiece\src\sentencepiece_processor.cc(1102) [model_proto->ParseFromArray(serialized.data(), serialized.size())] 