getting error while importing from IndicTransTokenizer import IndicProcessor, IndicTransTokenizer #41

kartikgupta321 · 2024-01-21T11:42:17Z

The same code was working about a week ago, but now I get this error. I am running the code on Modal Labs remote GPUs.

code

import modal
# Modal app handle; the decorators below (@stub.function, @stub.local_entrypoint)
# register functions against it.
stub = modal.Stub()


# Network file system obtained via NetworkFileSystem.persisted under the label "data".
# NOTE(review): presumably this survives across runs — confirm against Modal docs.
volume = modal.NetworkFileSystem.persisted("data")
# Path used as the key in network_file_systems={MODEL_DIR: volume} on the remote function.
MODEL_DIR = "/data"

@stub.function(cpu=2, memory=4276, gpu='A10G', timeout=1200, network_file_systems={MODEL_DIR: volume})
def loadIndicTrans2(dataset_name):
    """Set up IndicTrans2 in the remote container and run a generated loader script.

    Steps performed, in order:
      1. Install bitsandbytes and git, then clone the AI4Bharat/IndicTrans2 repo.
      2. Change into ``IndicTrans2/huggingface_interface`` and run ``install.sh``.
      3. Write ``importIndic.py`` (a script that imports the IndicTrans2
         tokenizer/processor, loads the en-indic model and, for ``ai2_arc``,
         loads the dataset's parquet slices) and run it in a child process.

    Args:
        dataset_name: Name of the dataset to load inside the generated script.
            Only ``"ai2_arc"`` has handling in the generated script.

    Returns:
        None. The child script's stdout is printed so that its diagnostics
        (including its ``An error occurred:`` message) are visible.

    Raises:
        FileNotFoundError: If the cloned repository directory is missing when
            changing into it (e.g. because the clone failed).
    """
    import os
    import subprocess

    setup_commands = [
        "pip install -q bitsandbytes",
        "apt update ",
        "apt install -y git",
        "git clone https://github.com/AI4Bharat/IndicTrans2",
    ]
    for command in setup_commands:
        completed = subprocess.run(command, shell=True)
        # Best effort: keep going, but do not hide a failed setup step.
        if completed.returncode != 0:
            print(f"warning: '{command}' exited with code {completed.returncode}")

    os.chdir("IndicTrans2/huggingface_interface")
    install = subprocess.run("bash install.sh", shell=True)
    if install.returncode != 0:
        print(f"warning: 'bash install.sh' exited with code {install.returncode}")

    # The script is produced by an f-string: single braces are filled in here,
    # doubled braces survive as single braces in the generated file.
    # dataset_name is inserted with !r so that quotes/backslashes in the name
    # cannot break the generated source.
    script_source = f'''
try:
    import torch
    import os
    import pandas as pd
    import csv
    print(torch.cuda.get_device_name(0))
    import sys
    from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig
    print('from transformers imported')
    from IndicTransTokenizer import IndicProcessor, IndicTransTokenizer
    print('from indictranstokenizer imported')

    en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-1B"  # ai4bharat/indictrans2-en-indic-dist-200M

    BATCH_SIZE = 4
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    if len(sys.argv) > 1:
        quantization = sys.argv[1]
    else:
        quantization = ""


    def initialize_model_and_tokenizer(ckpt_dir, direction, quantization):
        if quantization == "4-bit":
            qconfig = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
            )
        elif quantization == "8-bit":
            qconfig = BitsAndBytesConfig(
                load_in_8bit=True,
                bnb_8bit_use_double_quant=True,
                bnb_8bit_compute_dtype=torch.bfloat16,
            )
        else:
            qconfig = None

        tokenizer = IndicTransTokenizer(direction=direction)
        model = AutoModelForSeq2SeqLM.from_pretrained(
            ckpt_dir,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            quantization_config=qconfig,
        )

        if qconfig is None:
            model = model.to(DEVICE)
            model.half()
        model.eval()
        return tokenizer, model

    def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):
        translations = []
        for i in range(0, len(input_sentences), BATCH_SIZE):
            batch = input_sentences[i : i + BATCH_SIZE]

            batch = ip.preprocess_batch(batch, src_lang=src_lang, tgt_lang=tgt_lang)

            inputs = tokenizer(
                batch,
                src=True,
                truncation=True,
                padding="longest",
                return_tensors="pt",
                return_attention_mask=True,
            ).to(DEVICE)

            with torch.no_grad():
                generated_tokens = model.generate(
                    **inputs,
                    use_cache=True,
                    min_length=0,
                    max_length=256,
                    num_beams=5,
                    num_return_sequences=1,
                )

            generated_tokens = tokenizer.batch_decode(generated_tokens.detach().cpu().tolist(), src=False)

            translations += ip.postprocess_batch(generated_tokens, lang=tgt_lang)
            del inputs
            torch.cuda.empty_cache()
        return translations


    ip = IndicProcessor(inference=True)
    en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(en_indic_ckpt_dir, "en-indic", quantization)


    from datasets import load_dataset
    dataset_name = {dataset_name!r}
    if(dataset_name == "ai2_arc"):
        possible_configs = [
        'ARC-Challenge',
        'ARC-Easy'
        ]
        # columns to translate
        columns = ['question','choices']
        # columns not to translate, to keep in converted dataset as is.
        columns_asis = ['id','answerKey']

    dataset = []
    if(dataset_name == 'ai2_arc'):
        for config in possible_configs:
            # f-string so that the current config name is substituted into the URL
            base_url = f'https://huggingface.co/api/datasets/allenai/ai2_arc/parquet/{{config}}'
            data_files = {{'train': base_url + '/train/0.parquet','test':base_url + '/test/0.parquet', 'validation': base_url + '/validation/0.parquet'}}
            dataset_slice = load_dataset('parquet', data_files=data_files)
            dataset.append(dataset_slice)


except Exception as e:
    # Handle the exception
    print('An error occurred:'+ str(e))
        '''
    with open('importIndic.py', 'w', encoding='utf-8') as file:
        file.write(script_source)

    result = subprocess.run(['python', 'importIndic.py'], stdout=subprocess.PIPE)
    # stdout was captured above; print it so the script's progress and error
    # messages are not silently dropped.
    print(result.stdout.decode('utf-8', errors='replace'))
    if result.returncode != 0:
        print(f"warning: importIndic.py exited with code {result.returncode}")


@stub.local_entrypoint()
def main():
    """Local entry point: invoke ``loadIndicTrans2`` remotely for one dataset.

    The dataset name is fixed here; the author's note lists ai2_arc, gsm8k and
    lukaemon/mmlu as the intended choices.
    """
    # provide dataset name among ai2_arc, gsm8k, lukaemon/mmlu
    loadIndicTrans2.remote("ai2_arc")

The error says: `An error occurred:[Errno 2] No such file or directory: '/usr/local/lib/python3.11/site-packages/RESOURCES/script/all_script_phonetic_data.csv'`

Bhanu191 · 2024-01-22T16:47:43Z

I am facing the same error; it was working for me 4 days ago.

kartikgupta321 · 2024-01-22T17:49:34Z

> Facing the same error , it was working 4 days before for me

It is working now; see the issue https://github.com/VarunGumma/IndicTransTokenizer/issues/2

Bhanu191 · 2024-01-23T03:54:36Z

Thanks, it's working now.

kartikgupta321 closed this as completed Jan 22, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

getting error while importing from IndicTransTokenizer import IndicProcessor, IndicTransTokenizer #41

getting error while importing from IndicTransTokenizer import IndicProcessor, IndicTransTokenizer #41

kartikgupta321 commented Jan 21, 2024

Bhanu191 commented Jan 22, 2024

kartikgupta321 commented Jan 22, 2024

Bhanu191 commented Jan 23, 2024

getting error while importing from IndicTransTokenizer import IndicProcessor, IndicTransTokenizer #41

getting error while importing from IndicTransTokenizer import IndicProcessor, IndicTransTokenizer #41

Comments

kartikgupta321 commented Jan 21, 2024

The same code was working about a week ago, but now I get this error. I am running the code on Modal Labs remote GPUs.

code

Bhanu191 commented Jan 22, 2024

kartikgupta321 commented Jan 22, 2024

Bhanu191 commented Jan 23, 2024