In [54]:
import os
import pandas as pd
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

# Ensure GPU is visible
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Check if CUDA is available
print("CUDA is available" if torch.cuda.is_available() else "CUDA is not available")

# Load your dataset
data = pd.read_csv(r'C:\Users\admin\Desktop\SQ\try.csv')

# Preprocess function
def preprocess_text(text):
    return text.lower().replace('\n', ' ').replace('\r', '')

# Apply preprocessing
data['text'] = data['text'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
data['target'] = label_encoder.fit_transform(data['target'])
num_labels = len(label_encoder.classes_)

# Split the dataset
train_data, test_data = train_test_split(data, test_size=0.2, stratify=data['target'], random_state=42)

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_data.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_data.reset_index(drop=True))

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("MoritzLaurer/roberta-large-zeroshot-v2.0-c")

# Tokenize the texts
def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Rename the target column to labels
train_dataset = train_dataset.rename_column("target", "labels")
test_dataset = test_dataset.rename_column("target", "labels")

# Print the tokenized datasets to check
print("Tokenized Test Dataset Sample:")
print(test_dataset[:5])
print("Tokenized Train Dataset Sample:")
print(train_dataset[:5])

# Set format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Print the formatted datasets to verify
print("Formatted Train Dataset Sample:")
print(train_dataset)
print("Formatted Test Dataset Sample:")
print(test_dataset)

# Load the model
model = AutoModelForSequenceClassification.from_pretrained("MoritzLaurer/roberta-large-zeroshot-v2.0-c", num_labels=num_labels, ignore_mismatched_sizes=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./saved_model',
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=30,
    no_cuda=not torch.cuda.is_available(),  # Use CUDA if available
    report_to=[]  
)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print(f"Evaluation results: {results}")

# Save the model
trainer.save_model('./saved_model')

# Optionally, save the tokenizer as well
tokenizer.save_pretrained('./saved_model')


CUDA is not available


Map: 100%|██████████| 30/30 [00:00<00:00, 7533.78 examples/s]
Map: 100%|██████████| 8/8 [00:00<00:00, 1591.99 examples/s]


Tokenized Test Dataset Sample:
{'text': ['def unveils latest tech product', 'abc reports strong financial results for q2', 'abc corp declares bankruptcy after revenue drop', 'abc corp secures debt financing for expansion', 'company xyz launches new esg initiatives'], 'labels': [5, 7, 0, 1, 2], 'input_ids': [[0, 9232, 36685, 5290, 665, 2903, 1152, 2, 1, 1, 1, 1], [0, 36822, 690, 670, 613, 775, 13, 2231, 176, 2, 1, 1], [0, 36822, 44086, 26460, 7388, 71, 903, 1874, 2, 1, 1, 1], [0, 36822, 44086, 15636, 4123, 1126, 5200, 13, 2919, 2, 1, 1], [0, 24233, 3023, 42006, 10158, 92, 2714, 571, 5287, 2, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]}
Tokenized Train Dataset Sample:
{'text': ['xyz ltd declares bankruptcy amidst financial turmoil', 'def inc. secures $30 million loan from bank', 'abc and xyz form strategic alliance', 

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at MoritzLaurer/roberta-large-zeroshot-v2.0-c and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([8]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([2, 1024]) in the checkpoint and torch.Size([8, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  3%|▎         | 4/120 [00:07<03:18,  1.71s/it]
[A

[A[A                               
                                               
  3%|▎         | 4/120 [00:07<03:18,  1.71s/it]
[A

{'eval_loss': 2.1254987716674805, 'eval_runtime': 0.1965, 'eval_samples_per_second': 40.711, 'eval_steps_per_second': 5.089, 'epoch': 1.0}


  7%|▋         | 8/120 [00:15<03:38,  1.95s/it]
[A

[A[A                               
                                               
  7%|▋         | 8/120 [00:15<03:38,  1.95s/it]
[A

{'eval_loss': 2.072481632232666, 'eval_runtime': 0.2549, 'eval_samples_per_second': 31.38, 'eval_steps_per_second': 3.923, 'epoch': 2.0}


 10%|█         | 12/120 [00:23<03:50,  2.14s/it]
[A

[A[A                               
                                                
 10%|█         | 12/120 [00:24<03:50,  2.14s/it]
[A

{'eval_loss': 2.0217044353485107, 'eval_runtime': 0.2571, 'eval_samples_per_second': 31.111, 'eval_steps_per_second': 3.889, 'epoch': 3.0}


 13%|█▎        | 16/120 [00:32<03:44,  2.16s/it]
[A

[A[A                               
                                                
 13%|█▎        | 16/120 [00:33<03:44,  2.16s/it]
[A

{'eval_loss': 2.0028555393218994, 'eval_runtime': 0.2676, 'eval_samples_per_second': 29.891, 'eval_steps_per_second': 3.736, 'epoch': 4.0}


 17%|█▋        | 20/120 [00:41<03:35,  2.16s/it]
[A

[A[A                               
                                                
 17%|█▋        | 20/120 [00:42<03:35,  2.16s/it]
[A

{'eval_loss': 1.9656320810317993, 'eval_runtime': 0.2609, 'eval_samples_per_second': 30.669, 'eval_steps_per_second': 3.834, 'epoch': 5.0}


 20%|██        | 24/120 [00:50<03:28,  2.17s/it]
[A

[A[A                               
                                                
 20%|██        | 24/120 [00:50<03:28,  2.17s/it]
[A

{'eval_loss': 1.9148555994033813, 'eval_runtime': 0.2712, 'eval_samples_per_second': 29.497, 'eval_steps_per_second': 3.687, 'epoch': 6.0}


 23%|██▎       | 28/120 [00:59<03:20,  2.18s/it]
[A

[A[A                               
                                                
 23%|██▎       | 28/120 [00:59<03:20,  2.18s/it]
[A

{'eval_loss': 1.8713290691375732, 'eval_runtime': 0.2602, 'eval_samples_per_second': 30.749, 'eval_steps_per_second': 3.844, 'epoch': 7.0}


 27%|██▋       | 32/120 [01:08<03:11,  2.18s/it]
[A

[A[A                               
                                                
 27%|██▋       | 32/120 [01:08<03:11,  2.18s/it]
[A

{'eval_loss': 1.7546573877334595, 'eval_runtime': 0.2681, 'eval_samples_per_second': 29.839, 'eval_steps_per_second': 3.73, 'epoch': 8.0}


 30%|███       | 36/120 [01:17<03:02,  2.18s/it]
[A

[A[A                               
                                                
 30%|███       | 36/120 [01:17<03:02,  2.18s/it]
[A

{'eval_loss': 1.6826660633087158, 'eval_runtime': 0.254, 'eval_samples_per_second': 31.5, 'eval_steps_per_second': 3.938, 'epoch': 9.0}


 33%|███▎      | 40/120 [01:26<02:53,  2.16s/it]
[A

[A[A                               
                                                
 33%|███▎      | 40/120 [01:26<02:53,  2.16s/it]
[A

{'eval_loss': 1.5735118389129639, 'eval_runtime': 0.2629, 'eval_samples_per_second': 30.435, 'eval_steps_per_second': 3.804, 'epoch': 10.0}


 37%|███▋      | 44/120 [01:35<02:45,  2.18s/it]
[A

[A[A                               
                                                
 37%|███▋      | 44/120 [01:35<02:45,  2.18s/it]
[A

{'eval_loss': 1.4754071235656738, 'eval_runtime': 0.2676, 'eval_samples_per_second': 29.901, 'eval_steps_per_second': 3.738, 'epoch': 11.0}


 40%|████      | 48/120 [01:44<02:38,  2.20s/it]
[A

[A[A                               
                                                
 40%|████      | 48/120 [01:44<02:38,  2.20s/it]
[A

{'eval_loss': 1.4091920852661133, 'eval_runtime': 0.2645, 'eval_samples_per_second': 30.242, 'eval_steps_per_second': 3.78, 'epoch': 12.0}


 43%|████▎     | 52/120 [01:53<02:30,  2.21s/it]
[A

[A[A                               
                                                
 43%|████▎     | 52/120 [01:53<02:30,  2.21s/it]
[A

{'eval_loss': 1.3501088619232178, 'eval_runtime': 0.2624, 'eval_samples_per_second': 30.482, 'eval_steps_per_second': 3.81, 'epoch': 13.0}


 47%|████▋     | 56/120 [02:02<02:19,  2.19s/it]
[A

[A[A                               
                                                
 47%|████▋     | 56/120 [02:02<02:19,  2.19s/it]
[A

{'eval_loss': 1.317955732345581, 'eval_runtime': 0.2653, 'eval_samples_per_second': 30.152, 'eval_steps_per_second': 3.769, 'epoch': 14.0}


 50%|█████     | 60/120 [02:11<02:11,  2.19s/it]
[A

[A[A                               
                                                
 50%|█████     | 60/120 [02:11<02:11,  2.19s/it]
[A

{'eval_loss': 1.2650952339172363, 'eval_runtime': 0.2627, 'eval_samples_per_second': 30.455, 'eval_steps_per_second': 3.807, 'epoch': 15.0}


 53%|█████▎    | 64/120 [02:20<02:02,  2.18s/it]
[A

[A[A                               
                                                
 53%|█████▎    | 64/120 [02:20<02:02,  2.18s/it]
[A

{'eval_loss': 1.1375560760498047, 'eval_runtime': 0.2708, 'eval_samples_per_second': 29.538, 'eval_steps_per_second': 3.692, 'epoch': 16.0}


 57%|█████▋    | 68/120 [02:29<01:53,  2.19s/it]
[A

[A[A                               
                                                
 57%|█████▋    | 68/120 [02:29<01:53,  2.19s/it]
[A

{'eval_loss': 1.0662130117416382, 'eval_runtime': 0.2753, 'eval_samples_per_second': 29.059, 'eval_steps_per_second': 3.632, 'epoch': 17.0}


 60%|██████    | 72/120 [02:38<01:45,  2.20s/it]
[A

[A[A                               
                                                
 60%|██████    | 72/120 [02:38<01:45,  2.20s/it]
[A

{'eval_loss': 1.008046269416809, 'eval_runtime': 0.2617, 'eval_samples_per_second': 30.57, 'eval_steps_per_second': 3.821, 'epoch': 18.0}


 63%|██████▎   | 76/120 [02:47<01:36,  2.20s/it]
[A

[A[A                               
                                                
 63%|██████▎   | 76/120 [02:48<01:36,  2.20s/it]
[A

{'eval_loss': 0.9565284848213196, 'eval_runtime': 0.2642, 'eval_samples_per_second': 30.279, 'eval_steps_per_second': 3.785, 'epoch': 19.0}


 67%|██████▋   | 80/120 [02:56<01:27,  2.19s/it]
[A

[A[A                               
                                                
 67%|██████▋   | 80/120 [02:57<01:27,  2.19s/it]
[A

{'eval_loss': 0.9272826313972473, 'eval_runtime': 0.2663, 'eval_samples_per_second': 30.037, 'eval_steps_per_second': 3.755, 'epoch': 20.0}


 70%|███████   | 84/120 [03:05<01:19,  2.21s/it]
[A

[A[A                               
                                                
 70%|███████   | 84/120 [03:06<01:19,  2.21s/it]
[A

{'eval_loss': 0.9141188859939575, 'eval_runtime': 0.2775, 'eval_samples_per_second': 28.831, 'eval_steps_per_second': 3.604, 'epoch': 21.0}


 73%|███████▎  | 88/120 [03:15<01:13,  2.30s/it]
[A

[A[A                               
                                                
 73%|███████▎  | 88/120 [03:15<01:13,  2.30s/it]
[A

{'eval_loss': 0.8514343500137329, 'eval_runtime': 0.2843, 'eval_samples_per_second': 28.141, 'eval_steps_per_second': 3.518, 'epoch': 22.0}


 77%|███████▋  | 92/120 [03:24<01:02,  2.22s/it]
[A

[A[A                               
                                                
 77%|███████▋  | 92/120 [03:24<01:02,  2.22s/it]
[A

{'eval_loss': 0.7918561697006226, 'eval_runtime': 0.2933, 'eval_samples_per_second': 27.277, 'eval_steps_per_second': 3.41, 'epoch': 23.0}


 80%|████████  | 96/120 [03:33<00:53,  2.21s/it]
[A

[A[A                               
                                                
 80%|████████  | 96/120 [03:33<00:53,  2.21s/it]
[A

{'eval_loss': 0.7495076656341553, 'eval_runtime': 0.2608, 'eval_samples_per_second': 30.675, 'eval_steps_per_second': 3.834, 'epoch': 24.0}


 83%|████████▎ | 100/120 [03:42<00:44,  2.20s/it]
[A

[A[A                               
                                                 
 83%|████████▎ | 100/120 [03:43<00:44,  2.20s/it]
[A

{'eval_loss': 0.724989652633667, 'eval_runtime': 0.2624, 'eval_samples_per_second': 30.484, 'eval_steps_per_second': 3.811, 'epoch': 25.0}


 87%|████████▋ | 104/120 [03:51<00:35,  2.20s/it]
[A

[A[A                               
                                                 
 87%|████████▋ | 104/120 [03:52<00:35,  2.20s/it]
[A

{'eval_loss': 0.708335816860199, 'eval_runtime': 0.2742, 'eval_samples_per_second': 29.178, 'eval_steps_per_second': 3.647, 'epoch': 26.0}


 90%|█████████ | 108/120 [04:00<00:26,  2.20s/it]
[A

[A[A                               
                                                 
 90%|█████████ | 108/120 [04:01<00:26,  2.20s/it]
[A

{'eval_loss': 0.6902002692222595, 'eval_runtime': 0.2678, 'eval_samples_per_second': 29.871, 'eval_steps_per_second': 3.734, 'epoch': 27.0}


 93%|█████████▎| 112/120 [04:09<00:17,  2.22s/it]
[A

[A[A                               
                                                 
 93%|█████████▎| 112/120 [04:10<00:17,  2.22s/it]
[A

{'eval_loss': 0.6742333173751831, 'eval_runtime': 0.2633, 'eval_samples_per_second': 30.38, 'eval_steps_per_second': 3.798, 'epoch': 28.0}


 97%|█████████▋| 116/120 [04:19<00:08,  2.21s/it]
[A

[A[A                               
                                                 
 97%|█████████▋| 116/120 [04:19<00:08,  2.21s/it]
[A

{'eval_loss': 0.6637663245201111, 'eval_runtime': 0.3298, 'eval_samples_per_second': 24.257, 'eval_steps_per_second': 3.032, 'epoch': 29.0}


100%|██████████| 120/120 [04:28<00:00,  2.20s/it]
[A

[A[A                               
                                                 
100%|██████████| 120/120 [04:28<00:00,  2.20s/it]
[A
100%|██████████| 120/120 [04:28<00:00,  2.24s/it]


{'eval_loss': 0.6590226888656616, 'eval_runtime': 0.2561, 'eval_samples_per_second': 31.232, 'eval_steps_per_second': 3.904, 'epoch': 30.0}
{'train_runtime': 268.4704, 'train_samples_per_second': 3.352, 'train_steps_per_second': 0.447, 'train_loss': 1.2363861083984375, 'epoch': 30.0}


100%|██████████| 1/1 [00:00<00:00, 1003.42it/s]


Evaluation results: {'eval_loss': 0.6590226888656616, 'eval_runtime': 0.2647, 'eval_samples_per_second': 30.226, 'eval_steps_per_second': 3.778, 'epoch': 30.0}


('./saved_model\\tokenizer_config.json',
 './saved_model\\special_tokens_map.json',
 './saved_model\\vocab.json',
 './saved_model\\merges.txt',
 './saved_model\\added_tokens.json',
 './saved_model\\tokenizer.json')

In [56]:
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from datasets import Dataset, load_metric

# Ensure GPU is visible
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Load your test dataset
data = pd.read_csv(r'C:\Users\admin\Desktop\SQ\try.csv')

# Preprocess function
def preprocess_text(text):
    return text.lower().replace('\n', ' ').replace('\r', '')

# Apply preprocessing
data['text'] = data['text'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
data['target'] = label_encoder.fit_transform(data['target'])

# Split the dataset
_, test_data = train_test_split(data, test_size=0.5, stratify=data['target'], random_state=42)

# Convert to Hugging Face Dataset
test_dataset = Dataset.from_pandas(test_data.reset_index(drop=True))

# Load the saved model and tokenizer
model_path = './saved_model'
tokenizer = AutoTokenizer.from_pretrained(r"C:\Users\admin\Desktop\traini\saved_model")
model = AutoModelForSequenceClassification.from_pretrained(r"C:\Users\admin\Desktop\traini\saved_model")

# Tokenize the texts
def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True)

test_dataset = test_dataset.map(tokenize, batched=True)

# Rename the target column to labels
test_dataset = test_dataset.rename_column("target", "labels")

# Set format for PyTorch
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Define the compute_metrics function
metric = load_metric("accuracy")

def compute_metrics(p):
    predictions, labels = p
    predictions = torch.tensor(predictions)  # Convert predictions to tensor
    predictions = torch.argmax(predictions, dim=1)
    return metric.compute(predictions=predictions, references=torch.tensor(labels))

# Define evaluation arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_eval_batch_size=8,
    no_cuda=True,
    report_to=[]  # Disabling all reporting integrations including codecarbon
)

# Create a Trainer instance for evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Evaluate the model
results = trainer.evaluate()

# Print the results
print(results)


Map: 100%|██████████| 19/19 [00:00<00:00, 5524.94 examples/s]
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
100%|██████████| 3/3 [00:02<00:00,  1.13it/s]

{'eval_loss': 0.49643397331237793, 'eval_accuracy': 1.0, 'eval_runtime': 2.869, 'eval_samples_per_second': 6.622, 'eval_steps_per_second': 1.046}



