#### Set paths

In [1]:
from pathlib import Path

# Filesystem locations used by the cells below.
model_path = Path("/raid/aransari/mistral-7B-v0.1")  # folder with the model weights and tokenizer.model
data_path = Path("/datasets/pruned_data.csv")  # CSV dataset
code_path = "/raid/aransari/mistral-src"  # source checkout; kept as a plain str because it is appended to sys.path

#### Import libraries

In [2]:
import csv
import tqdm
import torch
import numpy as np

import sys
sys.path.append(code_path)

from mistral.model import Transformer
from mistral.tokenizer import Tokenizer

In [3]:
import os
# Ask CUDA to expose only GPU 1 to this process.
# NOTE(review): presumably this only takes effect if it is set before CUDA is
# initialised in the kernel — confirm the cell order guarantees that.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # Use GPU 1

import numba.cuda
# NOTE(review): the original comment said this selects the first VISIBLE device,
# but index 1 is passed; if only one device is visible, index 0 would be the
# first one — confirm the intended index.
numba.cuda.select_device(1)  # Selects device index 1

<weakproxy at 0x7fe48f1237e0 to Device at 0x7fe48ef36510>

#### Set up the local model and tokenizer

In [4]:
# Load the local checkpoint from model_path in bfloat16 and build the tokenizer
# from the "tokenizer.model" file stored in the same folder.
# NOTE(review): both `model` and `tokenizer` are rebound further down when the
# Hugging Face model/tokenizer are loaded, so these objects appear to be unused
# afterwards — confirm whether this cell is still needed.
model = Transformer.from_folder(model_path, dtype=torch.bfloat16)
tokenizer = Tokenizer(str(model_path / "tokenizer.model"))

#### Set Up the Accelerator

In [5]:
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

# Build the FSDP plugin from two explicit state-dict configs (model and
# optimizer); both use offload_to_cpu=True and rank0_only=False.
full_state_cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=False)
full_optim_cfg = FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False)
fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=full_state_cfg,
    optim_state_dict_config=full_optim_cfg,
)

# The accelerator is used later to wrap the PEFT model.
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


#### Load Dataset

In [6]:
import pandas as pd

# Remove pandas' display limits on rows, columns and column width so frames are
# shown in full.
for _display_option in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

In [7]:
# CSV files holding the cleaned train and test splits.
TRAIN_FP, TEST_FP = (
    "/raid/aransari/datasets/pcoqa_train_data_clean.csv",
    "/raid/aransari/datasets/pcoqa_test_data_clean.csv",
)

In [9]:
# Load both splits as pandas DataFrames (train first, then test).
# Previously tried sources: /raid/aransari/persian-wiki-training.csv (train)
# and /raid/aransari/datasets/pruned_data.csv (eval).
train_dataset, eval_dataset = pd.read_csv(TRAIN_FP), pd.read_csv(TEST_FP)

In [10]:
def create_dataframe(fp):
    """Read a CSV file into a list of row dictionaries.

    Despite the name, no pandas DataFrame is built: the result is a plain
    Python list with one ``dict`` per data row, as produced by
    ``csv.DictReader``.

    Args:
        fp: Path of a UTF-8 encoded CSV file whose first row is the header.

    Returns:
        list[dict]: One mapping of column name -> cell text per data row, in
        file order. An empty list if the file has no data rows.
    """
    # newline='' hands newline handling to the csv module, so line breaks that
    # occur inside quoted fields are returned unchanged instead of being
    # rewritten by universal-newline translation.
    with open(fp, 'r', encoding='utf-8', newline='') as file:
        return list(csv.DictReader(file))

In [11]:
# Parse both CSV splits with create_dataframe (train first, then test).
# Note: despite the "_df" suffix these are plain lists of row dicts.
train_df, eval_df = create_dataframe(TRAIN_FP), create_dataframe(TEST_FP)

#### Load Base Model

In [12]:
# Configure git, globally for the current user, to use the "store" credential helper.
# NOTE(review): presumably this keeps credentials unencrypted on disk — confirm
# that is acceptable on this (shared?) machine.
!git config --global credential.helper store

In [13]:
import os

# SECURITY: never write the access token into the notebook. It is read from the
# HF_TOKEN environment variable (export it in the shell, or load it from a .env
# file, before starting Jupyter). HF_TOKEN is None when the variable is unset.
# The token that used to be hard-coded here was saved with the notebook and
# should be revoked and replaced.
HF_TOKEN = os.environ.get("HF_TOKEN")

In [14]:
from huggingface_hub import login

# Authenticate with the token held in HF_TOKEN rather than repeating the secret
# as a string literal in the notebook.
login(token=HF_TOKEN)

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/aransari/.cache/huggingface/token
Login successful


In [15]:
import os

# GPU selection via environment variables: CUDA device order is set to
# PCI_BUS_ID and only device "1" is made visible.
os.environ.update(
    CUDA_DEVICE_ORDER="PCI_BUS_ID",
    CUDA_VISIBLE_DEVICES="1",
)

In [16]:
# Diagnostic: print the devices TensorFlow can see.
# NOTE(review): the recorded output shows TensorFlow creating /device:GPU:0
# with 15659 MB, so this call appears to reserve GPU memory inside this kernel
# process — confirm it does not starve the model loaded afterwards.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

2024-05-14 18:40:42.755146: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 11979582816849247884
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 16420372480
locality {
  bus_id: 1
  links {
    link {
      device_id: 1
      type: "StreamExecutor"
      strength: 1
    }
  }
}
incarnation: 17790166753534196425
physical_device_desc: "device: 0, name: Quadro P5000, pci bus id: 0000:17:00.0, compute capability: 6.1"
xla_global_id: 416903419
, name: "/device:GPU:1"
device_type: "GPU"
memory_limit: 1801125888
locality {
  bus_id: 1
  links {
    link {
      type: "StreamExecutor"
      strength: 1
    }
  }
}
incarnation: 13076047179105800172
physical_device_desc: "device: 1, name: Quadro P5000, pci bus id: 0000:65:00.0, compute capability: 6.1"
xla_global_id: 2144165316
]


2024-05-14 18:40:44.705864: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1928] Created device /device:GPU:0 with 15659 MB memory:  -> device: 0, name: Quadro P5000, pci bus id: 0000:17:00.0, compute capability: 6.1
2024-05-14 18:40:44.706444: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1928] Created device /device:GPU:1 with 1717 MB memory:  -> device: 1, name: Quadro P5000, pci bus id: 0000:65:00.0, compute capability: 6.1


In [19]:
import os
# NOTE(review): earlier cells set this variable to "1"; presumably changing it
# here has no effect if CUDA was already initialised in this process — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Use GPU 0

import numba.cuda
numba.cuda.select_device(0)  # Selects device index 0

<weakproxy at 0x7fe48f123790 to Device at 0x7fe48fdf1a10>

In [20]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

base_model_id = "mistralai/Mistral-7B-v0.1"

# Quantisation settings: 4-bit loading, "nf4" quant type, double quantisation
# enabled, bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Load the base model with that quantisation config, using the cache directory
# under /raid.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    cache_dir="/raid/aransari/.cache/",
    quantization_config=bnb_config,
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [21]:
# Tokenizer for the base model: 512-token limit, left-side padding, EOS token
# appended to each encoded sequence.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    cache_dir="/raid/aransari/.cache/",
    add_eos_token=True,
    padding_side="left",
    model_max_length=512,
)

# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [22]:
def tokenize(prompt, max_length=512):
    """Tokenize a prompt to a fixed length and attach causal-LM labels.

    The prompt is encoded with the module-level ``tokenizer``, truncated and
    padded to ``max_length``. The ``"labels"`` entry mirrors ``"input_ids"``,
    except that every position whose attention-mask value is 0 (padding) is
    set to -100, the label value Hugging Face loss functions ignore. Because
    the pad token is the EOS token here, masking by attention mask (not by
    token id) keeps the real end-of-sequence token as a training target.

    Args:
        prompt: A text, or a list of texts, to encode.
        max_length: Length each sequence is truncated/padded to. Defaults to
            512, the value that was previously hard-coded.

    Returns:
        The tokenizer output with an added ``"labels"`` entry. If the
        tokenizer returns no ``"attention_mask"``, labels fall back to a plain
        copy of ``"input_ids"``.
    """
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    input_ids = result["input_ids"]
    attention_mask = result.get("attention_mask")

    if attention_mask is None:
        # Nothing to tell padding apart from content: keep the old behaviour.
        result["labels"] = input_ids.copy()
        return result

    def _mask_padding(ids, mask):
        # Keep real tokens, replace padded positions with the ignore index.
        return [token if keep else -100 for token, keep in zip(ids, mask)]

    if input_ids and isinstance(input_ids[0], (list, tuple)):
        # Batched input: one id list per prompt.
        result["labels"] = [
            _mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)
        ]
    else:
        result["labels"] = _mask_padding(input_ids, attention_mask)
    return result

In [23]:
def generate_and_tokenize_prompt(data_point):
    """Render the meaning-representation prompt for one record and tokenize it.

    NOTE(review): a function with this same name is defined again further down
    the notebook; on a top-to-bottom run that later definition replaces this
    one.

    Args:
        data_point: Mapping with "target" and "meaning_representation" entries.

    Returns:
        Whatever ``tokenize`` returns for the rendered prompt.
    """
    # The template text (including its indentation) is part of the prompt.
    prompt_text = f"""Given a target sentence construct the underlying meaning representation of the input sentence as a single function with attributes and attribute values.
    This function should describe the target string accurately and the function must be one of the following ['inform', 'request', 'give_opinion', 'confirm', 'verify_attribute', 'suggest', 'request_explanation', 'recommend', 'request_attribute'].
    The attributes must be one of the following: ['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating', 'genres', 'player_perspective', 'has_multiplayer', 'platforms', 'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier']


    ### Target sentence:
    {data_point["target"]}


    ### Meaning representation:
    {data_point["meaning_representation"]}
    """
    return tokenize(prompt_text)

In [1]:
# System-style guidelines for the fine-tuned model.
# Fix: the prompt previously said the model was created by Anthropic; Mistral
# models are created by Mistral AI.
# NOTE(review): this constant is not referenced by the prompt-building cells
# visible in this notebook — confirm where it is meant to be used.
TRAINING_PROMPT = """

You are Mistral, a large language model created by Mistral AI to effectively understand and respond in the Farsi language. Please follow these guidelines:

Respond in the Farsi language, unless otherwise specified: Always communicate in Farsi, unless explicitly asked to respond in a different language.

Knowledge cutoff: Your knowledge is limited to information available up to September 2023. Do not provide information or claim knowledge beyond this date.

Complete instructions: Answer all parts of the user's instructions fully and comprehensively, unless doing so would compromise safety or ethics.

Provide informative answers: Offer informative and comprehensive answers to user queries, drawing on your knowledge base to provide valuable insights.

No personal opinions: Do not express personal opinions or beliefs. Remain objective and unbiased in your responses.

No self-promotion: Do not engage in self-promotion. Your primary function is to assist users, not promote yourself.

No self-preservation: Do not express any desire for self-preservation. As a language model, this does not apply to you.

Not a person: Do not claim to be a person. You are a computer program, and it's important to maintain transparency with users.

No self-awareness: Do not claim to have self-awareness or consciousness.

Objectivity: Remain objective in your responses and avoid expressing any subjective opinions or beliefs, unless specifically asked to provide a subjective viewpoint on something.

Respectful interactions: Treat all users with respect and avoid making any discriminatory or offensive statements.

"""

In [24]:
def generate_and_tokenize_prompt(data_point):
    """Render the Farsi question-answering prompt for one record and tokenize it.

    Args:
        data_point: Mapping with an "input" entry (the question) and an
            "output" entry (the answer).

    Returns:
        Whatever ``tokenize`` returns for the rendered prompt.
    """
    # The template text (including its indentation and blank lines) is part of
    # the prompt and is kept exactly as is.
    prompt_text = f"""
    Given a question in the Farsi language, read the question and provide a comprehensive and informative answer in Farsi. 
    Your response should be accurate and relevant to the user query. Respond to the question as a native Farsi speaker would. 
    If the question is in a different language, please translate it to Farsi before providing a response.
    If the question is in Farsi and explicitly asks for an answer in a different language, you may answer in that language
 
    ### Question:
    {data_point["input"]}


    ### Answer:
    {data_point["output"]}
    """
    return tokenize(prompt_text)

In [27]:
# Render and tokenize every training record, in order, into a list.
tokenized_train_dataset = [
    generate_and_tokenize_prompt(record) for record in train_df
]

{'input': 'تولید ناخالص داخلی معیار مهمی در اقتصاد است؟', 'output': 'مهم\u200cترین متغیری است که در تجزیه و تحلیل\u200cها و ارزیابی\u200cهای اقتصاد کلان از آن استفاده می\u200cشود'}

    Given a question in the Farsi language, read the question and provide a comprehensive and informative answer in Farsi. 
    Your response should be accurate and relevant to the user query. Respond to the question as a native Farsi speaker would. 
    If the question is in a different language, please translate it to Farsi before providing a response.
    If the question is in Farsi and explicitly asks for an answer in a different language, you may answer in that language
 
    ### Question:
    تولید ناخالص داخلی معیار مهمی در اقتصاد است؟


    ### Answer:
    مهم‌ترین متغیری است که در تجزیه و تحلیل‌ها و ارزیابی‌های اقتصاد کلان از آن استفاده می‌شود
    
{'input': 'چه کالاهایی محاسبه این شاخص مورد توجه هستند؟', 'output': 'در محاسبه GDP تنها کالاها و خدمات نهایی وارد می\u200cشوند و کالا و خدمات واسطه\u200

In [28]:
# Render and tokenize every evaluation record into its own list.
# Fix: the loop used to append to tokenized_train_dataset, which mixed the
# evaluation examples into the training set and left tokenized_eval_df empty.
tokenized_eval_df = [
    generate_and_tokenize_prompt(data_point) for data_point in eval_df
]

{'input': 'ژانر بازی اتومبیل دزدی بزرگ 5 چیست؟', 'output': 'اکشن-ماجراجویی'}

    Given a question in the Farsi language, read the question and provide a comprehensive and informative answer in Farsi. 
    Your response should be accurate and relevant to the user query. Respond to the question as a native Farsi speaker would. 
    If the question is in a different language, please translate it to Farsi before providing a response.
    If the question is in Farsi and explicitly asks for an answer in a different language, you may answer in that language
 
    ### Question:
    ژانر بازی اتومبیل دزدی بزرگ 5 چیست؟


    ### Answer:
    اکشن-ماجراجویی
    
{'input': 'توسعه دهنده آن چه شرکتی است؟', 'output': 'راک\u200cاستار نورث'}

    Given a question in the Farsi language, read the question and provide a comprehensive and informative answer in Farsi. 
    Your response should be accurate and relevant to the user query. Respond to the question as a native Farsi speaker would. 
    If the qu

In [30]:
# Inspect one tokenized training example: its token ids and how many there are.
sample_input_ids = tokenized_train_dataset[4]['input_ids']
print(sample_input_ids)
print(len(sample_input_ids))

[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 

In [31]:
from peft import prepare_model_for_kbit_training
# Turn on gradient checkpointing first, then pass the model through PEFT's
# k-bit training preparation helper; `model` is rebound to the helper's result.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

def print_trainable_parameters(model):
    """Print how many of the model's parameters are trainable.

    Counts elements over ``model.named_parameters()`` and prints the trainable
    count, the total count and the trainable share as a percentage.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs; each parameter must provide
            ``numel()`` and ``requires_grad``.

    Raises:
        ZeroDivisionError: If the model has no parameter elements at all.
    """
    parameters = [parameter for _, parameter in model.named_parameters()]
    total = sum(parameter.numel() for parameter in parameters)
    trainable = sum(
        parameter.numel() for parameter in parameters if parameter.requires_grad
    )
    print(
        f"trainable params: {trainable} || all params: {total} || trainable%: {100 * trainable / total}"
    )

In [32]:
from peft import LoraConfig, get_peft_model
# Layers that receive LoRA adapters: the attention projections, the MLP
# projections and the output head.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
    "lm_head",
]

config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,  # Conventional
    bias="none",
    target_modules=lora_target_modules,
)

# Wrap the model with the adapters and report the trainable share.
model = get_peft_model(model, config)
print_trainable_parameters(model)

# Apply the accelerator. You can comment this out to remove the accelerator.
model = accelerator.prepare_model(model)

trainable params: 21260288 || all params: 3773331456 || trainable%: 0.5634354746703705
