In [3]:
!pip install datasets



In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("avimittal30/dataset-fin")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/dataset-fin


In [3]:
 !pip install huggingface_hub



In [4]:
import pandas as pd
df=pd.read_csv('/kaggle/input/dataset-fin/dataset_fin.csv')

In [5]:
def generate_prompt(data_point):
    return f"""
            Analyze the sentiment of the news headline enclosed in square brackets, 
            determine if it is positive, neutral, or negative, and return the answer as 
            the corresponding sentiment label "positive" or "neutral" or "negative".

            [{data_point["text"]}] = {data_point["sentiment"]}
            """.strip()

def generate_test_prompt(data_point):
    return f"""
            Analyze the sentiment of the news headline enclosed in square brackets, 
            determine if it is positive, neutral, or negative, and return the answer as 
            the corresponding sentiment label "positive" or "neutral" or "negative".

            [{data_point["text"]}] = """.strip()

In [6]:
from sklearn.model_selection import train_test_split
from datasets import Dataset

In [8]:
df['text'] = df['input']
df['sentiment'] = df['output'].map({'moderately positive': 'positive',
                                    'moderately negative': 'negative',
                                    'strong positive': 'positive',
                                    'strong negative': 'negative',
                                    'mildly positive': 'positive',
                                    'mildly negative': 'negative',
                                    'neutral': 'neutral',
                                    'negative': 'negative',
                                    'positive': 'positive'
                                    })

X_train = list()
X_test = list()
for sentiment in ["positive", "neutral", "negative"]:
    train, test = train_test_split(df[df.sentiment == sentiment],
                                    train_size=300,
                                    test_size=300,
                                    random_state=42)
    X_train.append(train)
    X_test.append(test)

X_train = pd.concat(X_train).sample(frac=1, random_state=10)
X_test = pd.concat(X_test)

eval_idx = [idx for idx in df.index if idx not in list(train.index) + list(test.index)]
X_eval = df[df.index.isin(eval_idx)]
X_eval = (X_eval
          .groupby('sentiment', group_keys=False)
          .apply(lambda x: x.sample(n=50, random_state=10, replace=True)))
X_train = X_train.reset_index(drop=True)

X_train = pd.DataFrame(X_train.apply(generate_prompt, axis=1),
                        columns=["text"])
X_eval = pd.DataFrame(X_eval.apply(generate_prompt, axis=1),
                      columns=["text"])

y_true = X_test.sentiment
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])

train_data = Dataset.from_pandas(X_train)
eval_data = Dataset.from_pandas(X_eval)


  .apply(lambda x: x.sample(n=50, random_state=10, replace=True)))


In [16]:
print(train_data['text'][0])
print(X_test['text'][46461])

Analyze the sentiment of the news headline enclosed in square brackets, 
            determine if it is positive, neutral, or negative, and return the answer as 
            the corresponding sentiment label "positive" or "neutral" or "negative".

            [Teleste BK Optiflex amplifier products will be used in ongoing capacity upgrade of KDG 's network to bi-directionality .] = neutral
Analyze the sentiment of the news headline enclosed in square brackets, 
            determine if it is positive, neutral, or negative, and return the answer as 
            the corresponding sentiment label "positive" or "neutral" or "negative".

            [3 Top-Ranked Cheap Tech Stocks Under $10 to Buy Before Thanksgiving - Nasdaq] =


In [8]:
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline,
                          logging)
import torch

In [9]:
!pip install -U bitsandbytes
!pip install trl



In [13]:
import os
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")

# Use the token directly with login
from huggingface_hub import login
login(token=hf_token)



In [11]:
compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=compute_dtype,
bnb_4bit_use_double_quant=False,
)

model_name = "meta-llama/Llama-2-7b-chat-hf"
model = AutoModelForCausalLM.from_pretrained(
model_name,
device_map="auto",
quantization_config=bnb_config,
trust_remote_code=True
)

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [12]:
model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          trust_remote_code=True,
                                          )
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [13]:
from trl import SFTConfig, SFTTrainer

In [14]:
from peft import LoraConfig
peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.1,
        r=64,
        bias="none",
        task_type="CAUSAL_LM",
    )

training_arguments = TrainingArguments(
        output_dir="logs",
        num_train_epochs=3,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8, # 4
        optim="paged_adamw_32bit",
        save_steps=0,
        logging_steps=25,
        learning_rate=2e-4,
        weight_decay=0.001,
        fp16=True,
        bf16=False,
        max_grad_norm=0.3,
        max_steps=-1,
        warmup_ratio=0.03,
        group_by_length=True,
        lr_scheduler_type="cosine",
        report_to="tensorboard",
        evaluation_strategy="epoch"
    )

trainer = SFTTrainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=eval_data,
        peft_config=peft_config,

        tokenizer=tokenizer,
        args=training_arguments,


    )

    # Train model
trainer.train()


  trainer = SFTTrainer(


Converting train dataset to ChatML:   0%|          | 0/900 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/900 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/900 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/900 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/150 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/150 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/150 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/150 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss
1,7.1369,0.886576
2,6.2307,0.862621


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=336, training_loss=7.6038046677907305, metrics={'train_runtime': 2387.2445, 'train_samples_per_second': 1.131, 'train_steps_per_second': 0.141, 'total_flos': 1.0586441523462144e+16, 'train_loss': 7.6038046677907305})

In [16]:
output_dir ="/kaggle/output/trained-model"
trainer.save_model(output_dir)

In [17]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig
import torch

peft_model = "/kaggle/output/trained-model"
compute_dtype = getattr(torch, "float16")

config = PeftConfig.from_pretrained(peft_model)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=bnb_config,
    use_auth_token=False,
    torch_dtype=compute_dtype,
    device_map="auto",
)
model.config.use_cache = False
model.config.pretraining_tp = 1
# model = AutoPeftModelForCausalLM.from_pretrained(peft_model)

tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path,
                                          trust_remote_code=True,
                                          )
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = PeftModel.from_pretrained(model, peft_model, device_map="auto")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Now, using the peft configuration

In [18]:
model_name = "meta-llama/Llama-2-7b-chat-hf"
base_model = AutoModelForCausalLM.from_pretrained(
model_name,
device_map="auto",
quantization_config=bnb_config,
trust_remote_code=True
)
    

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [20]:
from tqdm import tqdm

In [21]:
X_test_dataset = Dataset.from_pandas(X_test)


In [33]:
y_pred = []
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=1,
    temperature=0.01,
    # device='cuda'
)

# Extract texts from the dataset
texts = X_test_dataset["text"]

# Process in batches
batch_size = 8  # Adjust based on your memory constraints
for i in tqdm(range(0, len(texts), batch_size)):
    batch_texts = texts[i:i+batch_size]
    results = pipe(batch_texts)
    
    # Handle both single and batch results
    if not isinstance(results[0], list):
        results = [results]
    
    for result in results:
        if isinstance(result, list):
            # If result is a list (for batch processing)
            answer = result[0]['generated_text'].split("=")[-1]
        else:
            # If result is a single item
            answer = result['generated_text'].split("=")[-1]
        
        if "positive" in answer:
            y_pred.append("positive")
        elif "negative" in answer:
            y_pred.append("negative")
        elif "neutral" in answer:
            y_pred.append("neutral")
        else:
            y_pred.append("none")

Device set to use cuda:0
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianFor

[[{'generated_text': 'Analyze the sentiment of the news headline enclosed in square brackets, \n            determine if it is positive, neutral, or negative, and return the answer as \n            the corresponding sentiment label "positive" or "neutral" or "negative".\n\n            [Converted Boeing freighters have historically been the jet of choice for Amazon Air, but the new contract will see Airbus planes used for the first time.] = neutral'}],
 [{'generated_text': 'Analyze the sentiment of the news headline enclosed in square brackets, \n            determine if it is positive, neutral, or negative, and return the answer as \n            the corresponding sentiment label "positive" or "neutral" or "negative".\n\n            [3 Top-Ranked Cheap Tech Stocks Under $10 to Buy Before Thanksgiving - Nasdaq] = positive'}],
 [{'generated_text': 'Analyze the sentiment of the news headline enclosed in square brackets, \n            determine if it is positive, neutral, or negative, and r

In [23]:
import numpy as np
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)

In [24]:
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy:.4f}")

accuracy

0.76

In [25]:
y_pred = []
pipe = pipeline(
    task="text-generation",
    model=base_model,
    tokenizer=tokenizer,
    max_new_tokens=1,
    temperature=0.01,
    # device='cuda'
)

# Extract texts from the dataset
texts = X_test_dataset["text"]

# Process in batches
batch_size = 8  # Adjust based on your memory constraints
for i in tqdm(range(0, len(texts), batch_size)):
    batch_texts = texts[i:i+batch_size]
    results = pipe(batch_texts)
    
    # Handle both single and batch results
    if not isinstance(results[0], list):
        results = [results]
    
    for result in results:
        if isinstance(result, list):
            # If result is a list (for batch processing)
            answer = result[0]['generated_text'].split("=")[-1]
        else:
            # If result is a single item
            answer = result['generated_text'].split("=")[-1]
        
        if "positive" in answer:
            y_pred.append("positive")
        elif "negative" in answer:
            y_pred.append("negative")
        elif "neutral" in answer:
            y_pred.append("neutral")
        else:
            y_pred.append("none")

Device set to use cuda:0
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 113/113 [03:38<00:00,  1.94s/it]


In [1]:
accuracy = accuracy_score(y_true, y_pred)
accuracy

NameError: name 'accuracy_score' is not defined