<a href="https://colab.research.google.com/github/Sriram-code/Intel-Hackathon/blob/main/Zephyr-7B-fineTuned-summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing Dependencies

In [None]:
# Install the libraries this notebook relies on; no versions are pinned.
! pip install datasets transformers trl peft accelerate bitsandbytes auto-gptq optimum modin

In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
# The training cell later calls trainer.push_to_hub(), which needs these credentials.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Importing Dependencies

In [None]:
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig, TrainingArguments
from trl import SFTTrainer
import modin.pandas as pd #modin.pandas --- Intel's libray for effeciently loading csv file from Datasets
import shutil
from peft import AutoPeftModelForCausalLM
from transformers import GenerationConfig
from transformers import AutoTokenizer
import torch

# Config

In [None]:
class Config:
    '''
    Single source of settings for the fine-tuning code in this notebook.

    Every value is a plain class attribute. ZephyrTrainer creates an instance
    of this class and reads the attributes through self.config.
    '''

    # ---- model / dataset identifiers ----
    MODEL_ID = "TheBloke/zephyr-7B-beta-GPTQ"
    DATASET_ID = "sriramahesh2000/summarization"

    # ---- dataset columns ----
    INSTRUCTION_FIELD = "Article Text"  # column inserted into the user turn of the prompt
    TARGET_FIELD = "Summary"            # column inserted into the assistant turn of the prompt
    CONTEXT_FIELD = ""                  # not read by any code visible in this notebook
    DATASET_TEXT_FIELD = "text"         # column that holds the assembled prompt

    # ---- quantized model loading (GPTQConfig / from_pretrained) ----
    BITS = 4
    DISABLE_EXLLAMA = True
    DEVICE_MAP = "auto"
    USE_CACHE = False                   # assigned to model.config.use_cache

    # ---- LoRA adapter (LoraConfig) ----
    LORA_R = 16
    LORA_ALPHA = 16
    LORA_DROPOUT = 0.05
    BIAS = "none"
    TARGET_MODULES = ["q_proj", "v_proj"]
    TASK_TYPE = "CAUSAL_LM"

    # ---- training loop (TrainingArguments) ----
    OUTPUT_DIR = "sample2"
    BATCH_SIZE = 8
    GRAD_ACCUMULATION_STEPS = 1
    OPTIMIZER = "paged_adamw_32bit"
    LR = 2e-4
    LR_SCHEDULER = "cosine"
    LOGGING_STEPS = 50
    SAVE_STRATEGY = "epoch"
    # NOTE(review): both values below are forwarded to TrainingArguments. Per the
    # transformers documentation a positive max_steps takes precedence over
    # num_train_epochs - confirm which limit is intended.
    NUM_TRAIN_EPOCHS = 20
    MAX_STEPS = 1000
    FP16 = True
    PUSH_TO_HUB = True

    # ---- SFTTrainer ----
    MAX_SEQ_LENGTH = 512
    PACKING = False

Loading Drive

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

# Zephyr Trainer
The model is trained here with the appropriate parameters and the required configuration.

In [None]:
class ZephyrTrainer:
    '''
    Fine-tunes the GPTQ-quantized Zephyr 7B model with LoRA adapters.

    All settings (model id, dataset id, column names, LoRA and training
    hyper-parameters) are read from Config.

    Attributes:
    config: Config instance with the parameters used to build the dataset, the model and the trainer
    tokenizer: Tokenizer loaded from config.MODEL_ID, padding with the EOS token
    '''

    def __init__(self):

        '''
        Loads the configuration and the tokenizer of the base model.
        '''

        self.config = Config()
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.MODEL_ID)
        # Padding reuses the EOS token.
        self.tokenizer.pad_token = self.tokenizer.eos_token

    @staticmethod
    def _print_banner(title):

        '''
        Prints a progress title framed by two separator lines.

        Args:
        title: Text shown between the separators
        '''

        separator = "\n====================================================================\n"
        print(separator)
        print("\t\t\t" + title)
        print(separator)

    def process_data_sample(self, example):

        '''
        Builds the training prompt for one dataset row.

        Args:
        example: Mapping (dict or DataFrame row) that contains the
                 config.INSTRUCTION_FIELD and config.TARGET_FIELD keys, both strings

        Returns:
        processed_example: System turn, user turn (the article) and assistant turn
                           (the summary) joined into a single string

        Raises:
        TypeError: If either field is not a string
        '''

        system_turn = "<|system|>\n You are a AI powered summarization Assistant. also use your own knolwedge about indian law and give the output accordingly.</s>\n"
        user_turn = "<|user|>\n" + example[self.config.INSTRUCTION_FIELD] + "</s>\n"
        # NOTE(review): nothing is appended after the target, so the sample has no
        # closing "</s>" - confirm whether the trainer adds one during tokenization.
        assistant_turn = "<|assistant|>\n" + example[self.config.TARGET_FIELD]

        processed_example = system_turn + user_turn + assistant_turn
        return processed_example

    def create_dataset(self):

        '''
        Downloads the dataset and turns every row into a training prompt.

        Rows with a missing article or summary are skipped, because they cannot be
        concatenated into a prompt.

        Returns:
        processed_data: Dataset with the single column config.DATASET_TEXT_FIELD
        '''

        data = load_dataset(self.config.DATASET_ID, split="train")
        self._print_banner("DOWNLOADED DATASET")

        df = data.to_pandas()
        print(df)

        source_columns = [self.config.INSTRUCTION_FIELD, self.config.TARGET_FIELD]

        # Drop incomplete rows up front. The index is reset so the frame keeps a plain
        # range index and no index column ends up in the resulting Dataset.
        complete = df.dropna(subset=source_columns).reset_index(drop=True)
        skipped = len(df) - len(complete)
        if skipped:
            print(f"Skipped {skipped} row(s) with a missing article or summary")

        # A list over the row records also works when no rows are left, unlike a
        # row-wise apply on an empty frame.
        complete[self.config.DATASET_TEXT_FIELD] = [
            self.process_data_sample(row)
            for row in complete[source_columns].to_dict("records")
        ]

        self._print_banner("PROCESSED DATASET")
        print(complete[[self.config.DATASET_TEXT_FIELD]])

        processed_data = Dataset.from_pandas(complete[[self.config.DATASET_TEXT_FIELD]])
        return processed_data

    def prepare_model(self):

        '''
        Loads the quantized base model and wraps it with LoRA adapters.

        Returns:
        model - Model ready for finetuning
        peft_config - LoRA Adapter config
        '''

        gptq_config = GPTQConfig(
            bits=self.config.BITS,
            disable_exllama=self.config.DISABLE_EXLLAMA,
            tokenizer=self.tokenizer,
        )

        model = AutoModelForCausalLM.from_pretrained(
            self.config.MODEL_ID,
            quantization_config=gptq_config,
            device_map=self.config.DEVICE_MAP,
        )
        self._print_banner("DOWNLOADED MODEL")

        model.config.use_cache = self.config.USE_CACHE
        model.config.pretraining_tp = 1
        model.gradient_checkpointing_enable()
        model = prepare_model_for_kbit_training(model)
        self._print_banner("MODEL CONFIG UPDATED")

        peft_config = LoraConfig(
            r=self.config.LORA_R,
            lora_alpha=self.config.LORA_ALPHA,
            lora_dropout=self.config.LORA_DROPOUT,
            bias=self.config.BIAS,
            task_type=self.config.TASK_TYPE,
            target_modules=self.config.TARGET_MODULES,
        )

        model = get_peft_model(model, peft_config)
        self._print_banner("PREPARED MODEL FOR FINETUNING")

        return model, peft_config

    def set_training_arguments(self):

        '''
        Builds the TrainingArguments for the training loop from the config.

        Returns:
        training_arguments: TrainingArguments instance passed to the trainer
        '''

        # NOTE(review): num_train_epochs and max_steps are both set. Per the
        # transformers documentation a positive max_steps takes precedence.
        training_arguments = TrainingArguments(
            output_dir=self.config.OUTPUT_DIR,
            per_device_train_batch_size=self.config.BATCH_SIZE,
            gradient_accumulation_steps=self.config.GRAD_ACCUMULATION_STEPS,
            optim=self.config.OPTIMIZER,
            learning_rate=self.config.LR,
            lr_scheduler_type=self.config.LR_SCHEDULER,
            save_strategy=self.config.SAVE_STRATEGY,
            logging_steps=self.config.LOGGING_STEPS,
            num_train_epochs=self.config.NUM_TRAIN_EPOCHS,
            max_steps=self.config.MAX_STEPS,
            fp16=self.config.FP16,
            push_to_hub=self.config.PUSH_TO_HUB,
        )

        return training_arguments

    def train(self):

        '''
        Trains the model on the dataset specified in the config.

        After training, the model is pushed to the Hugging Face Hub when
        config.PUSH_TO_HUB is set; otherwise it is only saved to config.OUTPUT_DIR.
        '''

        data = self.create_dataset()
        model, peft_config = self.prepare_model()
        training_args = self.set_training_arguments()
        self._print_banner("PREPARED FOR FINETUNING")

        trainer = SFTTrainer(
            model=model,
            train_dataset=data,
            peft_config=peft_config,
            dataset_text_field=self.config.DATASET_TEXT_FIELD,
            args=training_args,
            tokenizer=self.tokenizer,
            packing=self.config.PACKING,
            max_seq_length=self.config.MAX_SEQ_LENGTH,
        )
        trainer.train()
        self._print_banner("FINETUNING COMPLETED")

        # Honour the PUSH_TO_HUB switch instead of always uploading. When it is off,
        # still write the final model to the output directory.
        if self.config.PUSH_TO_HUB:
            trainer.push_to_hub()
        else:
            trainer.save_model()

In [None]:
# Entry point: build the trainer and run the whole fine-tuning pipeline
# (dataset creation, model preparation, training, upload).
if __name__ == "__main__":
    zephyr_trainer = ZephyrTrainer()
    zephyr_trainer.train()

# Saving model
Move the folder to Google Drive

In [None]:
# Google Drive must be mounted before anything can be moved into it.
# Mounting here keeps this cell runnable on its own.
from google.colab import drive

drive.mount('/content/drive')

folder_to_move = '/content/sample2'
destination_folder = '/content/drive/MyDrive/'
# NOTE(review): the inference cells load from
# '/content/drive/MyDrive/intel hackathon/sample2' - confirm the folder is
# relocated there after this move.
shutil.move(folder_to_move, destination_folder)

In [None]:
# Mount Google Drive at /content/drive.
# The inference cells below read the fine-tuned adapter from a path under this mount.
from google.colab import drive
drive.mount('/content/drive')

# Inference
Loading the model locally


In [None]:
def process_data_sample(example):
    '''
    Builds the inference prompt for one input document.

    Args:
    example: Mapping with an "instruction" key holding the text to summarize

    Returns:
    The system turn, the user turn containing the text, and an open assistant
    turn for the model to complete, joined into a single string
    '''

    system_turn = "<|system|>\n You are document sumarizer who is going to sumarise the content without missing any keypoints in a concise manner.truncate the input if it it beyond length you can handle.always give a complete sentence which makes sense and inform how much word you can handle.</s>\n"
    user_turn = "<|user|>\n" + example["instruction"] + "</s>\n"
    assistant_opening = "<|assistant|>\n"

    return system_turn + user_turn + assistant_opening
# Load the tokenizer from the fine-tuned output folder on Google Drive (the same path the model is loaded from below).
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/intel hackathon/sample2")
sentence='''
appeal no. lxvi of 1949. appeal from the high court of judicature, bombay, in a reference under section 66 of the indian income tax act, 1022. k.m. munshi (n. p. nathvani, with him), for the appel lant. ' m.c. setalvad, attorney general for india (h. j. umrigar, with him), for the respondent. 1950. may 26. the judgment of the court was delivered by mehr chand mahajan j. this is an appeal against a judgment of the high court of judicature at bombay in an income tax matter and it raises the question whether munici pal property tax and urban immoveable property tax payable under the relevant bombay acts are allowable deductions under section 9 (1) (iv) of the indian income tax act. the assessee company is an investment company deriving its income from properties in the city of bombay. for the assessment year 1940 41 the net income of the assessee under the head "property" was computed by the income tax officer in the sum of rs. 6,21,764 after deducting from gross rents certain payments. the company had paid during the relevant year rs. 1,22,675 as municipal property tax and rs. 32,760 as urban property tax. deduction of these two sums was claimed under the provisions of section 9 the act. out of the first item a deduction in the sum of rs. 48,572 was allowed on the ground that this item represented tenants ' burdens paid by the assessee, otherwise the claim was disal lowed. the, appeals of the assessee to the appellate as sistant commissioner and to the income tax appellate tribu nal were unsuccessful. the tribunal, however, agreed to refer two questions of law to the high court of judicature at bombay, namely, (1) whether the municipal taxes paid by the applicant company are an allowable deduction under 555 the provisions of section 9 (1) (iv) of the indian income tax act; (2) whether the urban immoveable property taxes paid by the applicant company are an allowable deduction under section 9 (1) (iv) or under section 9 (1) (v) of the indian income tax act. 
a supplementary reference was made covering a third question which was not raised before us and it is not there fore necessary to refer to it. the high court answered all the three questions in the negative and hence this appeal. the question for our determination is whether the munic ipal property tax and urban immoveable property tax can be deducted as an allowance under clause (iv) of sub section (1) of section 9 of the act. the decision of the point depends firstly on the construction of the language employed in sub clause (iv) of sub section (1) of section 9 of the act, and secondly, on a finding as to the true nature and character of the liability of the owner under the relevant bombay acts for the payment of these taxes. section 9 along with the relevant clause runs thus: (1) the tax shall be payable by an assessee under the head ' income from property ' in respect of the bona fide annual value of property consisting of any buildings or lands appurtenant thereto of which he is the owner, . . subject to the following allowances, namely : (iv) where the property is subject to a mortgage or other capital charge, the amount of any interest on such mortgage or charge; where the property is subject to an annual charge not being a capital charge, the. amount of such charge; where the property is subject to a ground rent, the amount of such ground rent; and, where the property has been acquired, constructed, repaired, renewed or recon structed with borrowed capital, the amount of any interest payable on such capital; . . . " it will be seen that clause (iv) consists of four sub clauses corresponding to the four deductions allowed 556 under the clause. before the amending act of 1939, clause (iv) contained only the first, third and fourth sub clauses. under the first sub clause interest is deductible whether the amount borrowed on the security of the property was spent on the property or not
'''
# Wrap the sample document in the inference prompt.
inp_str = process_data_sample(
    {
        "instruction": sentence,
    }
)

# Tokenize the prompt into PyTorch tensors and move them to the GPU.
# NOTE(review): no truncation or max_length is passed here, while training used
# MAX_SEQ_LENGTH = 512 - confirm that long inputs are intended to go in whole.
inputs = tokenizer(inp_str, return_tensors="pt").to("cuda")

# Load the fine-tuned PEFT adapter from Drive in float16, placed on the GPU.
model = AutoPeftModelForCausalLM.from_pretrained(
    "/content/drive/MyDrive/intel hackathon/sample2",
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="cuda")

# Generation settings used by the next cell: at most 256 new tokens, padding with the EOS id.
# NOTE(review): sampling is enabled but top_k=1 presumably leaves only the most
# likely token to choose from, so the output should be effectively greedy - confirm.
generation_config = GenerationConfig(
    do_sample=True,
    top_k=1,
    temperature=0.1,
    max_new_tokens=256,
    pad_token_id=tokenizer.eos_token_id
)

In [None]:
import time

# perf_counter is a monotonic high-resolution clock, so the measured interval is
# not affected by system clock adjustments the way time.time() can be.
st_time = time.perf_counter()
outputs = model.generate(**inputs, generation_config=generation_config)
# The decoded text starts with the prompt itself, followed by the generated summary.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
# Elapsed seconds for generation plus decoding and printing.
print(time.perf_counter()-st_time)

<|system|>
 You are document sumarizer who is going to sumarise the content without missing any keypoints in a concise manner.truncate the input if it it beyond length you can handle.always give a complete sentence which makes sense and inform how much word you can handle. 
<|user|>

appeal no. lxvi of 1949. appeal from the high court of judicature, bombay, in a reference under section 66 of the indian income tax act, 1022. k.m. munshi (n. p. nathvani, with him), for the appel lant. ' m.c. setalvad, attorney general for india (h. j. umrigar, with him), for the respondent. 1950. may 26. the judgment of the court was delivered by mehr chand mahajan j. this is an appeal against a judgment of the high court of judicature at bombay in an income tax matter and it raises the question whether munici pal property tax and urban immoveable property tax payable under the relevant bombay acts are allowable deductions under section 9 (1) (iv) of the indian income tax act. the assessee company is an 