In [None]:
# !pip install transformers datasets peft accelerate bitsandbytes trl safetensors torch --no-cache

In [None]:
# Run the `nvidia-smi` shell command (via IPython's `!` escape) to inspect the GPU
# available to this runtime before any model is loaded.
!nvidia-smi

### Pushing the data to the Hugging Face Hub ###

In [None]:
import pandas as pd

# Read the "Classified" sheet of the annotated workbook, discard every row that
# contains a missing value in any column, renumber the remaining rows from 0,
# and keep only the two columns used to build the training data.
annotated_sheet = pd.read_excel('Annotated file', sheet_name='Classified')
complete_rows = annotated_sheet.dropna().reset_index(drop=True)
df = complete_rows[['Abstract', 'Extracted Dictionary']]
df.head()

In [None]:
# Build the instruction-tuning table with one row per abstract:
#   input       - the abstract text
#   instruction - the fixed extraction instruction (identical for every row)
#   output      - the annotated dictionary for that abstract
#
# The table is built in one step from the existing columns instead of appending
# to Python lists row by row; this also avoids binding a list to the name `input`,
# which would shadow the builtin for the rest of the session.
#
# NOTE: this cell reads the 'Abstract' / 'Extracted Dictionary' columns and then
# rebinds `df`, so re-run the loading cell before running this one a second time.
instruction_text = 'Extract all the Battery material/Battery type, Recycling process used, Process conditions/Battery properties and Recovered materials from the given input text and provide them in a dictionary format with keys as "Battery_Material/Battery_Type", "Recycling_Process_Used", "Process_Conditions/Battery_Properties" and "Recovered_Materials":'

df = pd.DataFrame(
    {
        # .to_numpy() drops the source index so the new frame gets a fresh 0..n-1 index.
        'input': df['Abstract'].to_numpy(),
        'instruction': instruction_text,
        'output': df['Extracted Dictionary'].to_numpy(),
    }
)
df.tail()

In [None]:
import os

from huggingface_hub import login

# Authenticate with the Hugging Face Hub.
#
# The token is read from the HUGGINGFACE_TOKEN environment variable, which must be
# set outside the notebook (shell, runtime secrets, ...). It is deliberately NOT
# assigned in this cell, so a real token is never typed into, or saved with, the
# notebook.
token = os.getenv('HUGGINGFACE_TOKEN')

# When the variable is unset `token` is None, and `login` falls back to its
# interactive prompt instead of failing on a placeholder string.
login(token)

In [None]:
# Write `df` to NER_data_battery.csv, omitting the DataFrame index column.
df.to_csv("NER_data_battery.csv", index=False)

In [None]:
from datasets import load_dataset
# Load the CSV file written above through the `datasets` CSV loader.
data_train = load_dataset("csv", data_files="NER_data_battery.csv")
# Only needed if the CSV carries a saved index column:
# data_train = data_train.remove_columns("Unnamed: 0")
data_train

In [None]:
from datasets import load_dataset
import re

# Shuffle the rows with a fixed seed so the order is the same on every run.
dataset = data_train.shuffle(seed=42)
dataset

In [None]:
# Upload the dataset to the Hugging Face Hub.
# NOTE(review): "Name of file" looks like a placeholder for the target repository
# name — replace it before running.
dataset.push_to_hub("Name of file" )

### Data loading for NER training ###

In [None]:
from datasets import load_dataset
from random import randrange

# Load dataset from the hub
dataset = load_dataset("Name of file", split="train")

# Record the row count now: after `train_test_split` the object holds the two
# splits, and `len()` of it would report the number of splits, not of examples.
full_dataset_size = len(dataset)

# Hold out 30% of the rows for evaluation. The split is seeded so the same rows
# land in train/test on every run (an unseeded split changes each time).
dataset = dataset.train_test_split(test_size=0.3, seed=42)
small_train_dataset = dataset["train"].shuffle(seed=42)
small_eval_dataset = dataset["test"].shuffle(seed=42)

print(f"Dataset Size: {full_dataset_size}")

In [None]:
# Print the number of examples in the evaluation split, then in the training split.
for split in (small_eval_dataset, small_train_dataset):
    print(len(split))

In [None]:
def format_prompt(sample):
    """Render one dataset row as an instruction-style training prompt.

    Args:
        sample: Mapping with the keys "instruction", "input" and "output".

    Returns:
        A string that starts with a newline, followed by the fixed preamble and
        the "### Instruction:", "### Input:" and "### Response:" sections filled
        from ``sample``, and ends with a newline.

    Raises:
        KeyError: If ``sample`` lacks one of the three keys.
    """
    preamble = (
        "Below is an instruction that describes a task, paired with an input "
        "that provides further context. Write a response that appropriately "
        "completes the request."
    )
    # The empty first and last entries reproduce the leading and trailing newline
    # of the template once the pieces are joined.
    pieces = (
        "",
        preamble,
        "",
        "### Instruction:",
        f"{sample['instruction']}",
        "",
        "### Input:",
        f"{sample['input']}",
        "",
        "### Response:",
        f"{sample['output']}",
        "",
    )
    return "\n".join(pieces)

In [None]:
from random import randrange

# Pick one training example at random and show how it is rendered as a prompt.
random_index = randrange(len(small_train_dataset))
random_sample = small_train_dataset[random_index]
print(format_prompt(random_sample))

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hugging Face model name
model_name = "NousResearch/Llama-2-7b-chat-hf"
# Local directory the trained LoRA adapter is saved to and reloaded from.
# NOTE(review): "batterr" looks like a typo for "battery"; left unchanged because
# the same variable is later rebound to the Hub repository name — confirm before renaming.
new_model = "llama_NER_batterr_KG"
use_flash_attention = False

# Single dtype shared by the 4-bit compute path and the non-quantized weights.
# Training runs with bf16=True, so loading the remaining weights as float16 would
# mix two half-precision formats in one model.
compute_dtype = torch.bfloat16

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    use_cache=False,  # the KV cache is not used during training
    use_flash_attention_2=use_flash_attention,
    device_map="auto",
    torch_dtype=compute_dtype,
)

model.config.pretraining_tp = 1

# Load tokenizer; the EOS token doubles as the padding token and padding is
# applied on the right.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:

from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

# LoRA hyperparameters (the original author notes they follow the QLoRA paper).
lora_settings = dict(
    r=16,
    lora_alpha=64,
    lora_dropout=0.10,
    bias="none",
    task_type="CAUSAL_LM",
)
peft_config = LoraConfig(**lora_settings)

# Prepare the quantized model for k-bit training, then wrap it with the LoRA adapter.
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)

In [None]:
from transformers import TrainingArguments

# Hyperparameters are grouped by concern and merged into a single
# TrainingArguments object.

# Batch shape and memory.
batch_options = dict(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
)

# Optimizer and learning-rate schedule.
optimizer_options = dict(
    num_train_epochs=3,
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    max_grad_norm=0.3,
)

# Numeric precision: bfloat16 on, float16 off.
precision_options = dict(
    fp16=False,
    bf16=True,
)

# Log, evaluate and checkpoint every 2 optimizer steps.
reporting_options = dict(
    logging_steps=2,
    evaluation_strategy="steps",
    eval_steps=2,
    save_strategy="steps",
    save_steps=2,
    disable_tqdm=False,
)

training_arguments = TrainingArguments(
    output_dir="./finetuned-llama-7b-chat-hf-ner2",
    **batch_options,
    **optimizer_options,
    **precision_options,
    **reporting_options,
)

In [None]:
from trl import SFTTrainer

max_seq_length = 1024 # max sequence length for model and packing of the dataset

# `model` has already been wrapped with the LoRA adapter via `get_peft_model`
# in the PEFT setup cell, so `peft_config` is intentionally NOT passed again here:
# handing the trainer both an adapter-wrapped model and a config is redundant, and
# how it is handled differs between TRL versions.
trainer = SFTTrainer(
    model=model,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,  # pack formatted examples into sequences of `max_seq_length`
    formatting_func=format_prompt,
    args=training_arguments,
)

In [None]:
# Train
trainer.train()

# Evaluate on the held-out split. The metrics are printed explicitly because a
# bare call in the middle of a cell is not displayed by the notebook.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Save model (the trained adapter) to the local directory named by `new_model`
trainer.model.save_pretrained(new_model)

In [None]:
# Empty VRAM
import gc

import torch

# Drop the notebook's references to the training objects ...
del model
#del pipe
del trainer

# ... collect the now-unreachable Python objects ...
gc.collect()
gc.collect()

# ... and hand PyTorch's cached GPU blocks back to the driver. Without this the
# memory stays reserved by the caching allocator and `nvidia-smi` keeps reporting
# it as in use.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
import transformers
# Device placement passed to `from_pretrained` when the base model is reloaded for
# merging: the empty-string key stands for the whole model, placed on device 0.
device_map = {"": 0}

In [None]:
# Run `nvidia-smi` again to inspect GPU memory after the training objects were deleted.
!nvidia-smi

In [None]:
# Reload the base checkpoint in FP16, attach the LoRA adapter stored under
# `new_model`, and merge the adapter weights into the base weights.
fp16_load_options = dict(
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
base_model = AutoModelForCausalLM.from_pretrained(model_name, **fp16_load_options)
model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()

# Reload the tokenizer so it can be saved next to the merged model; it pads on the
# right and reuses the EOS token as padding.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [None]:
import locale

# Make `locale.getpreferredencoding` always report UTF-8 for the rest of the session.
# The replacement keeps the optional `do_setlocale` parameter of the real function,
# so callers that pass it (for example `locale.getpreferredencoding(False)`) keep
# working instead of raising TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

### END OF MODEL TRAINING ###

In [None]:
# !pip install huggingface_hub

In [None]:
import os

from huggingface_hub import login

# Authenticate with the Hugging Face Hub before uploading the model.
#
# The token is read from the HUGGINGFACE_TOKEN environment variable, which must be
# set outside the notebook (shell, runtime secrets, ...). It is deliberately NOT
# assigned in this cell, so a real token is never typed into, or saved with, the
# notebook.
token = os.getenv('HUGGINGFACE_TOKEN')

# When the variable is unset `token` is None, and `login` falls back to its
# interactive prompt instead of failing on a placeholder string.
login(token)

In [None]:
# Switch sampling on in the merged model's generation config.
# NOTE(review): presumably needed so the stored generation config is consistent
# with its sampling parameters when the model is saved/pushed — confirm.
model.generation_config.do_sample = True

In [None]:
# Repository name used for the Hub upload. This rebinds `new_model`, which until
# now held the local adapter directory name ("llama_NER_batterr_KG").
new_model = 'llama_NER_battery_KG'

In [None]:
#!huggingface-cli login
# new_model = "test_llama_class"
# Upload the merged model and its tokenizer to the Hub repository named by `new_model`.
model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

### Load trained model ###

In [None]:
from transformers import AutoTokenizer
import transformers
import torch

# Hub repository id (or local path) of the fine-tuned model,
# e.g. meta-llama/Llama-2-7b-chat-hf
model_id = "Enter your model address"

# The identifier gets its own name so that `model` only ever refers to the loaded
# network. Previously `model` first held this string and was then checked with
# `isinstance(..., LlamaForCausalLM)`, a test that can never be true for a string,
# so the model was always loaded anyway.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
model = transformers.AutoModelForCausalLM.from_pretrained(model_id, use_auth_token=True)

In [None]:
# Instruction text inserted into the inference prompt; same wording as the
# instruction stored with every training example.
instruct = 'Extract all the Battery material/Battery type, Recycling process used, Process conditions/Battery properties and Recovered materials from the given input text and provide them in a dictionary format with keys as "Battery_Material/Battery_Type", "Recycling_Process_Used", "Process_Conditions/Battery_Properties" and "Recovered_Materials":'

In [None]:
import pandas as pd

# Read the "Classified" sheet of the workbook used for inference, discard every
# row that contains a missing value in any column, renumber the remaining rows
# from 0, and keep only the abstract and its annotated dictionary.
inference_sheet = pd.read_excel('Enter file address', sheet_name='Classified')
complete_rows = inference_sheet.dropna().reset_index(drop=True)
df = complete_rows[['Abstract', 'Extracted Dictionary']]
df.head()

In [None]:
# Stand-alone example abstract, kept for quick manual checks. It is not used by
# default; assign it to `sample_i` below to try the model on it.
example_abstract = """Multi-steps sequential alkaline and acid treatments were applied on EU-1 zeolite to develop more mesoporosity surface area and to offer more active sites.
Different concentrations of NaOH (0.1, 0.25 and 0.5 M) were used to treat parent EU-1 with different cyclic times, followed by a treatment with HNO3 (4 M) also for different cycles.
The treated and the parent EU-1 samples were assessed on the conversion of methanol to olefins. The multi-step sequential treatments succeed to create more mesopores as compared with
single-step treatment. This improvement was confirmed by BET analysis and NLDFT calculations for pore size distribution. The total acidity was measured using TPD and the quality of
the active sites was characterized by in situ FTIR spectroscopy of adsorbed pyridine. Solid state NMR was used to study the position of Al in the EU-1 framework. The main advantage
of this multi-step treatment is the preservation of EU-1 crystallinity, which was calculated using XRD. The multi-step treatment yielded an increase in the mesoporosity surface area
 by 24% and an enhancement of the total acidity, up to five times larger than the parent EU-1 crystals. Furthermore, the selectivity of
propylene was increased from 0%, in the case of parent EU-1, to 50% of the product stream together with large enhancement of the stability of methanol conversion and life time."""

# Abstract to run the extraction on: the first row of the loaded sheet.
sample_i = df['Abstract'][0]

# The prompt uses exactly the layout the model was fine-tuned on (see
# `format_prompt`): no leading indentation on any line, and it stops right after
# the response header so the model writes the answer.
prompt = f"""
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruct}

### Input:
{sample_i}

### Response:
"""

# Tokenize and place the ids on the same device as the model.
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.to(model.device)
outputs = model.generate(input_ids=input_ids, max_new_tokens=512, do_sample=True, top_p=0.6, temperature=0.9)

# Keep only the newly generated tokens. Slicing by token position is robust,
# whereas cutting the decoded text by `len(prompt)` characters breaks as soon as
# the tokenize/decode round trip changes whitespace or the prompt is truncated.
prompt_token_count = input_ids.shape[1]
output_str = tokenizer.decode(outputs[0][prompt_token_count:], skip_special_tokens=True)

print(type(output_str), output_str)