In [1]:
!git clone https://github.com/ryanzhumich/AESLC

Cloning into 'AESLC'...
remote: Enumerating objects: 17469, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 17469 (delta 1), reused 0 (delta 0), pack-reused 17461[K
Receiving objects: 100% (17469/17469), 7.36 MiB | 23.05 MiB/s, done.
Resolving deltas: 100% (48/48), done.


In [None]:
!nvcc --version

In [None]:
!rm -r /kaggle/working

In [2]:
!pip install -q torch
!pip install -q git+https://github.com/huggingface/transformers #huggingface transformers for downloading models weights
!pip install -U datasets #huggingface datasets to download and manipulate datasets
!pip install -q peft #Parameter efficient finetuning - for qLora Finetuning
!pip install -q bitsandbytes #For Model weights quantisation
!pip install -q trl #Transformer Reinforcement Learning - For Finetuning using Supervised Fine-tuning
!pip install -q wandb -U #Used to monitor the model score during training

Collecting datasets
  Obtaining dependency information for datasets from https://files.pythonhosted.org/packages/ec/93/454ada0d1b289a0f4a86ac88dbdeab54921becabac45da3da787d136628f/datasets-2.16.1-py3-none-any.whl.metadata
  Downloading datasets-2.16.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow-hotfix (from datasets)
  Obtaining dependency information for pyarrow-hotfix from https://files.pythonhosted.org/packages/e4/f4/9ec2222f5f5f8ea04f66f184caafd991a39c8782e31f5b0266f101cb68ca/pyarrow_hotfix-0.6-py3-none-any.whl.metadata
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Installing collected packages: pyarrow-hotfix, datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 2.1.0
    Unins

In [3]:
#!pip3 install -q git+https://github.com/casper-hansen/AutoAWQ
!pip3 install -q optimum
!pip3 install -q auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/  # Use cu117 if on CUDA 11.7
#!pip install auto-gptq

In [None]:
pip install --upgrade huggingface_hub datasets

In [None]:
pip show huggingface_hub

In [4]:
import json
import re
from pprint import pprint

import pandas as pd
import torch
from datasets import Dataset, load_dataset
from huggingface_hub import notebook_login
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from trl import SFTTrainer # For supervised finetuning

In [5]:
DOWN_DATA_PATH = 'AESLC/enron_subject_line'
LINES_DATA_PATH = 'enron_lines'
#MODEL_KEY = 'olm/olm-gpt2-dec-2022'
MODEL_KEY = 'mistral-7B'
#MODEL_KEY = 'gpt2'
EXP_NAME = f'enron-subgen-{MODEL_KEY}'

In [6]:
import random
import numpy as np
import os
import json
SEED_VALUE = 15
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

In [7]:
random.seed(SEED_VALUE)
np.random.seed(SEED_VALUE)
torch.manual_seed(SEED_VALUE)

<torch._C.Generator at 0x7baed1735010>

In [8]:
def clean_text(text):
    """Normalise the whitespace of ``text``.

    Every run of whitespace characters (spaces, tabs, newlines, ...) is
    collapsed into a single space and leading/trailing whitespace is removed.
    Letter case and punctuation are left untouched.

    Args:
        text: The string to normalise.

    Returns:
        The whitespace-normalised string.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

def create_text_row(email, subject):
    """Build one training example in the ``[INST] ... [/INST]`` prompt format.

    The instruction and email body are wrapped in ``<s>[INST] ... [/INST]``,
    followed by the subject and the closing ``</s>`` tag. Note that the
    separator between ``[/INST]`` and the subject is a literal backslash
    followed by ``n`` (two characters), not a newline character.

    Args:
        email: The (cleaned) email body.
        subject: The (cleaned) target subject line.

    Returns:
        The full training string for this email/subject pair.
    """
    return (
        "<s>[INST] Generate a subject for this email content, "
        f"{email} [/INST] \\n {subject} </s>"
    )

def test_row(email):
    text_row = f"""<s>[INST] Generate a subject for this email content, {email} [/INST] """
    return text_row

In [10]:
os.makedirs(LINES_DATA_PATH,exist_ok=True)
def prepare_train_dataset(down_data_path,lines_data_path,split):
    """Convert one raw training split into a JSON-lines file of training rows.

    Every file in ``<down_data_path>/<split>`` is read, split into email body
    and subject at the ``@subject`` marker, cleaned with ``clean_text`` and
    written as one ``{"text": ...}`` JSON object per line (built with
    ``create_text_row``) to ``<lines_data_path>/<split>.json``.

    Args:
        down_data_path: Directory that contains one sub-directory per split.
        lines_data_path: Directory the JSON-lines file is written to; it is
            created if it does not exist yet.
        split: Name of the split sub-directory (also the output file stem).

    Returns:
        None. The result is the file written to disk.

    Raises:
        ValueError: If a file does not contain exactly one ``@subject`` marker.
    """
    split_dir = os.path.join(down_data_path, split)
    if lines_data_path:
        # Make the function usable on its own, without a prior makedirs cell.
        os.makedirs(lines_data_path, exist_ok=True)

    with open(os.path.join(lines_data_path, f'{split}.json'), 'w', encoding='utf-8') as writer:
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # generated file identical from run to run.
        for filename in sorted(os.listdir(split_dir)):
            with open(os.path.join(split_dir, filename), 'r', encoding='utf-8') as reader:
                parts = reader.read().split('@subject')

            if len(parts) != 2:
                # Same exception type as the bare tuple-unpack would raise,
                # but naming the offending file.
                raise ValueError(
                    f"{filename}: expected exactly one '@subject' marker, "
                    f"found {len(parts) - 1}"
                )

            email, subject = (clean_text(part) for part in parts)
            json_object = {"text": create_text_row(email, subject)}
            writer.write(json.dumps(json_object) + "\n")

    return

In [11]:
def _split_on_marker(text, marker, filename):
    """Split ``text`` at the single ``marker`` it must contain.

    Returns the two-element list ``[before, after]``; raises ``ValueError``
    naming ``filename`` when the marker is missing or occurs more than once.
    """
    pieces = text.split(marker)
    if len(pieces) != 2:
        raise ValueError(
            f"{filename}: expected exactly one '{marker}' marker, "
            f"found {len(pieces) - 1}"
        )
    return pieces


def prepare_validation_dataset(down_data_path,lines_data_path,split):
    """Convert one raw dev/test split into a JSON-lines evaluation file.

    Every file in ``<down_data_path>/<split>`` is expected to hold, in order,
    the email body, ``@subject``, the original subject, ``@ann0``, ``@ann1``
    and ``@ann2`` with one annotator subject after each ``@annN`` marker. Each
    field is cleaned with ``clean_text`` and one JSON object per file is
    written to ``<lines_data_path>/<split>.json`` with the keys ``text`` (the
    inference prompt from ``test_row``), ``subject``, ``ann0``, ``ann1`` and
    ``ann2``.

    Args:
        down_data_path: Directory that contains one sub-directory per split.
        lines_data_path: Directory the JSON-lines file is written to; it is
            created if it does not exist yet.
        split: Name of the split sub-directory (also the output file stem).

    Returns:
        None. The result is the file written to disk.

    Raises:
        ValueError: If a file does not contain each marker exactly once.
    """
    split_dir = os.path.join(down_data_path, split)
    if lines_data_path:
        # Make the function usable on its own, without a prior makedirs cell.
        os.makedirs(lines_data_path, exist_ok=True)

    with open(os.path.join(lines_data_path, f'{split}.json'), 'w', encoding='utf-8') as writer:
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # generated file identical from run to run.
        for filename in sorted(os.listdir(split_dir)):
            with open(os.path.join(split_dir, filename), 'r', encoding='utf-8') as reader:
                remainder = reader.read()

            # Peel the fields off left to right: whatever precedes a marker is
            # one field, whatever follows is parsed for the next marker.
            fields = []
            for marker in ('@subject', '@ann0', '@ann1', '@ann2'):
                field, remainder = _split_on_marker(remainder, marker, filename)
                fields.append(field)
            fields.append(remainder)

            email, subject, ann0, ann1, ann2 = (clean_text(field) for field in fields)

            json_object = {
                "text": test_row(email),
                "subject": subject,
                "ann0": ann0,
                "ann1": ann1,
                "ann2": ann2,
            }
            writer.write(json.dumps(json_object) + '\n')

    return

In [12]:
prepare_train_dataset(DOWN_DATA_PATH,LINES_DATA_PATH,'train')
prepare_validation_dataset(DOWN_DATA_PATH,LINES_DATA_PATH,'dev')

prepare_validation_dataset(DOWN_DATA_PATH,LINES_DATA_PATH,'test')

In [None]:
# def load_dataset(file_path,tokenizer):
#     block_size = 1024

#     dataset = LineByLineTextDataset(
#              tokenizer=tokenizer,
#              file_path=file_path,
#              block_size=block_size
#     )
#     return dataset

In [13]:
def load_data_collator(tokenizer,mlm=False):
    """Create a ``DataCollatorForLanguageModeling`` for ``tokenizer``.

    Args:
        tokenizer: Tokenizer handed to the collator.
        mlm: Forwarded unchanged as the collator's ``mlm`` argument
            (``False`` by default).

    Returns:
        The constructed ``DataCollatorForLanguageModeling`` instance.
    """
    # Imported locally: the notebook's import cell pulls several names from
    # `transformers` but not this class, so calling the function would
    # otherwise fail with NameError.
    from transformers import DataCollatorForLanguageModeling

    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

In [None]:
#from datasets import Dataset, load_dataset

In [None]:
%cd /kaggle/input/mistral-7b

In [14]:
# Load the JSON-lines files from the directory named by LINES_DATA_PATH (the
# same constant the prepare_* functions were called with) instead of repeating
# the directory name as a literal. Each file is loaded on its own, so its rows
# are requested through the 'train' split regardless of the file's role.
train_dataset = load_dataset('json', data_files=os.path.join(LINES_DATA_PATH, 'train.json'), split='train')
test_dataset = load_dataset('json', data_files=os.path.join(LINES_DATA_PATH, 'test.json'), split='train')
dev_dataset = load_dataset('json', data_files=os.path.join(LINES_DATA_PATH, 'dev.json'), split='train')

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [16]:
test_dataset

Dataset({
    features: ['text', 'subject', 'ann0', 'ann1', 'ann2'],
    num_rows: 1906
})

In [None]:
model_name = "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"

# Fine-tuned model name
new_model = "mistralai-Email-Instruct"

In [None]:
################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule (constant a bit better than cosine)
lr_scheduler_type = "constant"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 100

# Log every X updates steps
logging_steps = 100

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

In [None]:
# compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=use_4bit,
#     bnb_4bit_quant_type=bnb_4bit_quant_type,
#     bnb_4bit_compute_dtype=compute_dtype,
#     bnb_4bit_use_double_quant=use_nested_quant,
# )

In [15]:
#from awq import AutoAWQForCausalLM
import optimum

In [None]:
# Load the checkpoint named by `model_name` and place the whole model on GPU 0.
# The bitsandbytes `quantization_config` argument below is commented out, so no
# on-the-fly quantization config is passed here.
# NOTE(review): `model_name` points at a "-GPTQ" repository, so the weights are
# presumably already quantized — confirm the QLoRA/bitsandbytes settings defined
# earlier are intentionally unused.

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    #quantization_config=bnb_config,
    device_map={"": 0},
)

# PEFT helper applied to the quantized model before LoRA adapters are attached.
# NOTE(review): presumably freezes the base weights and enables gradient
# checkpointing — verify against the installed PEFT version.
base_model = prepare_model_for_kbit_training(base_model)

# Turn off the generation cache for training.
# NOTE(review): presumably needed because of gradient checkpointing — confirm.
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1
# When several GPUs are visible, mark the model as parallelizable/model-parallel.
# NOTE(review): the device_map above pins everything to GPU 0 — confirm these
# flags are only meant to stop the trainer from wrapping the model itself.
if torch.cuda.device_count() > 1: # If more than 1 GPU
    base_model.is_parallelizable = True
    base_model.model_parallel = True

# Load MistralAI tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
eval_prompt = """Print hello world in python, C, and C++"""

model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

base_model.eval()
with torch.no_grad():
    print(tokenizer.decode(base_model.generate(**model_input, max_new_tokens=256, pad_token_id=2)[0], skip_special_tokens=True))

In [None]:
import wandb
wandb.login()
#run = wandb.init(project="Email_subject_MIstral")
run = wandb.init(project='Email_subject_MIstral', id='fstwjwem', resume="must")

In [None]:
!pip install evaluate sacrebleu
!pip install rouge_score

In [None]:
import evaluate

In [None]:
bleu = evaluate.load('bleu')
rouge = evaluate.load('rouge')
#meteor = evaluate.load('meteor')


def preprocess_logits_for_metrics(logits, labels):
    """Reduce logits to predicted ids before they are handed to the metrics.

    Args:
        logits: Tensor of scores; the arg-max is taken over its last dimension.
        labels: Label tensor, passed through untouched.

    Returns:
        A ``(pred_ids, labels)`` tuple, where ``pred_ids`` holds the index of
        the largest score along the last dimension of ``logits``.
    """
    return logits.argmax(dim=-1), labels



def compute_metrics(eval_preds):
    """Score generated subject lines for the dev set with BLEU and ROUGE.

    The reference subjects (original subject plus the three annotator
    subjects) only exist as columns of ``dev_dataset``; they cannot be
    recovered from ``eval_preds``, which carries token ids. The prompts and
    references are therefore read from ``dev_dataset`` directly, a subject is
    generated for every prompt with ``base_model`` and the generations are
    compared against the four references of the same row.

    Relies on the notebook globals ``dev_dataset``, ``tokenizer``,
    ``base_model``, ``per_device_eval_batch_size``, ``bleu`` and ``rouge``.

    Args:
        eval_preds: The ``(predictions, labels)`` pair supplied by the trainer.
            Accepted to satisfy the ``compute_metrics`` callback signature;
            its contents are not needed for generation-based scoring.

    Returns:
        Dict with the keys ``BLEU``, ``R1``, ``R2``, ``RL`` and ``RLsum``,
        each expressed as a percentage.
    """
    # Local import: `math` is not imported by any cell of the notebook.
    import math

    prompts = list()
    references = list()
    for row in dev_dataset:
        try:
            reference_set = [row["subject"], row["ann0"], row["ann1"], row["ann2"]]
            prompts.append(row["text"])
            references.append(reference_set)
        except KeyError:
            # Skip rows that lack one of the expected columns.
            continue

    # Left padding keeps every prompt flush against the generated tokens when
    # prompts of different lengths are batched for a decoder-only model.
    previous_padding_side = tokenizer.padding_side
    tokenizer.padding_side = 'left'
    outputs = list()
    try:
        n_batches = math.ceil(len(prompts) / per_device_eval_batch_size)
        for i in range(n_batches):
            prompts_batch = prompts[i*per_device_eval_batch_size : (i+1)*per_device_eval_batch_size]
            prompts_batch_ids = tokenizer(prompts_batch,
                padding=True, truncation=True, return_tensors='pt').to("cuda")
            output_ids = base_model.generate(
                **prompts_batch_ids, max_new_tokens=10,
                pad_token_id=tokenizer.pad_token_id)
            # The returned sequences start with the (padded) prompt tokens;
            # keep only what was generated after them. This does not depend on
            # any marker text appearing in the decoded string.
            prompt_length = prompts_batch_ids["input_ids"].shape[1]
            generated_ids = output_ids[:, prompt_length:]
            outputs.extend(
                tokenizer.batch_decode(generated_ids, skip_special_tokens=True))
    finally:
        # Restore the tokenizer even if generation fails part-way through.
        tokenizer.padding_side = previous_padding_side

    bleu_score = bleu.compute(predictions=outputs, references=references)
    rouge_score = rouge.compute(predictions=outputs, references=references)

    return {
        'BLEU': round(bleu_score['bleu'], 4) * 100,
        'R1': round(rouge_score['rouge1'], 4) * 100,
        'R2': round(rouge_score['rouge2'], 4) * 100,
        'RL': round(rouge_score['rougeL'], 4) * 100,
        'RLsum': round(rouge_score['rougeLsum'], 4) * 100,
        }

In [None]:
run.id

In [None]:
%ls /kaggle/working/artifacts

In [None]:
!rm -r ./artifacts

In [None]:
%cd /kaggle/working

In [None]:
!rm -r ./results

In [None]:
last_run_id = 'fstwjwem'
my_checkpoint_name = f"checkpoint-{last_run_id}:v37"
my_checkpoint_artifact = run.use_artifact(my_checkpoint_name)
checkpoint_dir = my_checkpoint_artifact.download()
#base_model = AutoModelForCausalLM.from_pretrained('/kaggle/working/artifacts/checkpoint-jxdoo6gc:v6', quantization_config=bnb_config,device_map={"": 0})

In [None]:
# Load LoRA configuration

# Set before the trainer is built so the W&B reporting picks it up.
# NOTE(review): presumably this is what makes W&B store the `checkpoint-<run id>`
# artifacts that an earlier cell downloads — confirm.
os.environ["WANDB_LOG_MODEL"] = "checkpoint"
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    # Names of the model sub-modules that receive LoRA adapters.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    task_type="CAUSAL_LM",
)





# Set training parameters
# NOTE(review): the `gradient_checkpointing` value from the hyper-parameter cell
# is not passed to TrainingArguments here — confirm that is intended.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    # NOTE(review): literal value rather than a config variable, and no
    # evaluation strategy is passed in this call — confirm periodic evaluation
    # on `eval_dataset` actually runs.
    eval_steps = 100,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    # NOTE(review): literal 1000 is used instead of the `max_steps = -1` value
    # from the hyper-parameter cell, whose comment says max_steps overrides
    # num_train_epochs.
    max_steps=1000, # the number of training steps the model will take
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="wandb"
)

# Set supervised fine-tuning parameters
# Training rows come from the "text" column of `train_dataset`; `dev_dataset`
# is supplied for evaluation. The two metric callbacks at the end of the call
# are commented out, so BLEU/ROUGE are not computed during training.
trainer = SFTTrainer(
    model=base_model,
    train_dataset=train_dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
    eval_dataset=dev_dataset,
#     preprocess_logits_for_metrics=preprocess_logits_for_metrics,
#     compute_metrics=compute_metrics
)

In [None]:
# Resume from the checkpoint artifact fetched by the download cell.
# `checkpoint_dir` is the path returned by `my_checkpoint_artifact.download()`,
# so the run id and artifact version are defined in exactly one place instead
# of being repeated here as a path literal.
# For a fresh (non-resumed) run, call `trainer.train()` with no arguments.
trainer.train(resume_from_checkpoint=checkpoint_dir)

# Save the trained model into the `new_model` directory; a later cell reloads
# it from there with `PeftModel.from_pretrained`.
trainer.model.save_pretrained(new_model)

In [None]:
!rm -r ./results

In [None]:
import gc
#del base_model
gc.collect()

del trainer
gc.collect()

In [None]:
torch.cuda.empty_cache() # PyTorch thing

In [None]:
gc.collect()

In [None]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    #low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
)

In [None]:

merged_model= PeftModel.from_pretrained(base_model, new_model)
merged_model= merged_model.merge_and_unload()

# Save the merged model
merged_model.save_pretrained("merged_model",safe_serialization=True)
tokenizer.save_pretrained("merged_model")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
prompt = "<s>[INST] Generate a subject for this email content, Thanks in advance for agreeing to speak at the Global Operations Controller Forum. There will be approximately 30 Enron business controllers present at the meeting. All have responsibility for mid and back office operations for the following Enron entities: Enron North America, Enron Europe, Enron South America, Enron Global Markets, Enron Industrial Markets, Enron Broadband Services and Enron Energy Services. Attendees will be here from Houston, Calgary, Tokyo, Sydney, London and New York (metals business). Attached for your reference is the agenda. There may be some slight changes before the forum begins, but this will give you a good idea of the topics to be covered and the other speakers who will address the group. You are scheduled to address the group as follows: [/INST] "

input_ids = tokenizer(prompt, return_tensors="pt", truncation=False).input_ids.cuda()

outputs = merged_model.generate(input_ids=input_ids, max_new_tokens=100, do_sample=True, top_p=0.9,temperature=0.5)

In [None]:
print(f"Prompt:\n{prompt}\n")
print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))

# **Testing purposes, separately executed**

In [None]:
def test_data(test_path="/kaggle/input/mistral-7b/enron_lines/test.json"):
    """Generate subjects for the test set, save them and score them.

    Reads one JSON object per line from ``test_path`` (keys ``text``,
    ``subject``, ``ann0``, ``ann1``, ``ann2``), generates a subject for every
    prompt with ``merged_model``, writes prompts, references and generations
    to ``test_output.csv`` and returns BLEU/ROUGE scores of the generations
    against the four references of each row.

    Generation uses sampling (``do_sample=True``), so outputs and scores can
    differ between runs.

    Relies on the notebook globals ``tokenizer``, ``merged_model``, ``bleu``
    and ``rouge``.

    Args:
        test_path: Path of the JSON-lines test file. Defaults to the location
            the notebook originally used.

    Returns:
        Dict with the keys ``BLEU``, ``R1``, ``R2``, ``RL`` and ``RLsum``,
        each expressed as a percentage.
    """
    prompts = list()
    references = list()
    with open(test_path, "r") as sequences:
        for line in sequences:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            record = json.loads(line)
            try:
                reference_set = [record["subject"], record["ann0"],
                                 record["ann1"], record["ann2"]]
                prompts.append(record["text"])
                references.append(reference_set)
            except KeyError:
                # A missing key raises KeyError (not ValueError); skip the row.
                continue

    previous_padding_side = tokenizer.padding_side
    tokenizer.padding_side='left'
    outputs = list()
    try:
        for prompt in prompts:
            input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
            output_ids = merged_model.generate(input_ids=input_ids, max_new_tokens=100, do_sample=True, top_p=0.9,temperature=0.5)
            # The returned sequence starts with the prompt tokens; decode only
            # the newly generated ones. Slicing the decoded text by
            # len(prompt) is fragile because the decoded string need not
            # reproduce the prompt character for character once special tokens
            # are skipped.
            generated_ids = output_ids[0, input_ids.shape[1]:]
            output = tokenizer.decode(generated_ids, skip_special_tokens=True)
            print(output)
            # append, not extend: extend would add one list entry per
            # character and break the prompt/reference/output alignment.
            outputs.append(output)
    finally:
        # Restore the tokenizer even if generation fails part-way through.
        tokenizer.padding_side = previous_padding_side

    df = pd.DataFrame({
        'prompt':prompts,
        'references':references,
        'model_outputs':outputs
    })

    df.to_csv('test_output.csv', index=False)

    bleu_score = bleu.compute(predictions=outputs, references=references)
    rouge_score = rouge.compute(predictions=outputs, references=references)

    return {
        'BLEU': round(bleu_score['bleu'], 4) * 100,
        'R1': round(rouge_score['rouge1'], 4) * 100,
        'R2': round(rouge_score['rouge2'], 4) * 100,
        'RL': round(rouge_score['rougeL'], 4) * 100,
        'RLsum': round(rouge_score['rougeLsum'], 4) * 100,
        }

In [None]:
test_data()