In [1]:
import os
import pandas as pd
import torch
from transformers import AlbertForSequenceClassification, AlbertTokenizer, Trainer, TrainingArguments
from torch.quantization import quantize_dynamic
from transformers import Trainer, TrainingArguments, RobertaForSequenceClassification, RobertaTokenizer
from datasets import load_dataset

from pathlib import Path
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix


  from .autonotebook import tqdm as notebook_tqdm


In [2]:


# Load the model from the checkpoint
checkpoint_path = "./results/roberta/checkpoint-9900/"

# Load the tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")

# Load the model from the checkpoint directory
model = RobertaForSequenceClassification.from_pretrained(checkpoint_path)




In [3]:

# Load data
export_dir = Path(os.getcwd())
data_path = Path(export_dir, "data")
train_file = Path(data_path, 'train_data_only_text_and_labels.csv')
eval_file = Path(data_path, 'eval_data_only_text_and_labels.csv')
test_file = Path(data_path, 'test_data_only_text_and_labels.csv')

train_df = pd.read_csv(train_file)
eval_df = pd.read_csv(eval_file)
test_df = pd.read_csv(test_file)


In [6]:
train_df

Unnamed: 0,text_combined,label
0,mr benson eko bbennisadinetcomuy benson eko ch...,1
1,millicent boston helgagermanflintcochraneorg r...,1
2,jason ling vytekdemisehotmailcom _nextpart_001...,0
3,vladimir antalik ohrbzoznamsk would like purch...,0
4,lennart regebro hyiffbigmailcom wed mar 26 200...,0
...,...,...
52785,justin shore listuserneopittstateedu 116 pm 04...,0
52786,charles philip chan optgpesympaticoca aaron ku...,0
52787,isador kee catimen84swannejp dear d59ebf6a0f14...,1
52788,global risk management operations weekly opera...,0


In [11]:
!pip install --upgrade torch torchvision

Collecting torch
  Using cached torch-2.4.0-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting torchvision
  Using cached torchvision-0.19.0-cp310-cp310-manylinux1_x86_64.whl.metadata (6.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (f

In [7]:
import torch
from transformers import Trainer, TrainingArguments, RobertaForSequenceClassification, RobertaTokenizer
from torch.quantization import quantize_dynamic
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import pandas as pd
import numpy as np
import os
import copy


# Load the model from the checkpoint
checkpoint_path = "./results/roberta/checkpoint-9900/"
# Load the tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")

# Load the model from the checkpoint directory
model = RobertaForSequenceClassification.from_pretrained(checkpoint_path)

# Tokenize and prepare datasets
def tokenize_and_encode(df):
    encodings = tokenizer(list(df['text_combined']), truncation=True, padding=True, return_tensors='pt')
    labels = torch.tensor(df['label'].values)
    return encodings, labels

train_encodings, train_labels = tokenize_and_encode(train_df)
eval_encodings, eval_labels = tokenize_and_encode(eval_df)
test_encodings, test_labels = tokenize_and_encode(test_df)

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = CustomDataset(train_encodings, train_labels)
eval_dataset = CustomDataset(eval_encodings, eval_labels)
test_dataset = CustomDataset(test_encodings, test_labels)

# Evaluate original model size
def get_model_size(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad) * 4 / (1024 ** 2)  # Size in MB

original_model_size = get_model_size(model)

# Apply dynamic quantization to the linear layers
model.to('cpu')  # Ensure model is on CPU
quantized_model = quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

# Evaluate quantized model size
quantized_model_size = get_model_size(quantized_model)




In [8]:
print(quantized_model_size)
total_params = sum(p.numel() for p in model.parameters())

print(f"Total number of parameters: {total_params}")

198.7421875
Total number of parameters: 355361794


In [14]:
# quantized_model_save_path = "./results/roberta/fine_tune/quant/quantized_model.pth"
# torch.save(quantized_model.state_dict(), quantized_model_save_path)
# print(f"Quantized model saved to {quantized_model_save_path}")

# # Optionally, load the quantized model later for inference



Quantized model saved to ./results/roberta/fine_tune/quant/quantized_model.pth


In [11]:
torch.save(quantized_model, "./results/roberta/fine_tune/quant/quantized_model.pth")



In [15]:
# # 1. Recreate the model architecture
# model_architecture = "roberta-large"  # Use the same architecture that was originally quantized
# quantized_model = RobertaForSequenceClassification.from_pretrained(model_architecture)

# # 2. Load the saved quantized state dictionary
# quantized_model_save_path = "./results/roberta/quantized_model.pth"
# quantized_model.load_state_dict(torch.load(quantized_model_save_path))

In [4]:
model_architecture = "roberta-large"  # Use the same architecture that was originally quantized
quantized_model_save_path = "./results/roberta/fine_tune/quant/quantized_model.pth"
quantized_model2 = torch.load("./results/roberta/fine_tune/quant/quantized_model.pth")

  device=storage.device,


In [5]:
total_params = sum(p.numel() for p in quantized_model2.parameters())

print(f"Total number of parameters: {total_params}")

Total number of parameters: 52099072


In [6]:
def get_model_size(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad) * 4 / (1024 ** 2)  # Size in MB
get_model_size(quantized_model2)

198.7421875

In [7]:
test= pd.read_csv(Path(data_path,'test_data_only_text_and_labels.csv'))
test.head()
X_test = test["text_combined"].tolist()
y_test = test["label"]


In [10]:
test.shape

(16498, 2)

In [12]:
16498/2


8249.0

In [10]:
import torch
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, precision_recall_curve
import matplotlib.pyplot as plt

# Define device
device = torch.device("cpu")  # Change to "cuda" if GPU is available and required

# Move model to device
quantized_model2.to(device)

# Convert DataFrame column to list of strings
texts = X_test

# Tokenize the texts
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
inputs = {key: value.to(device) for key, value in inputs.items()}

# Put model in evaluation mode
quantized_model2.eval()

# Get logits from the model
with torch.no_grad():
    outputs = quantized_model2(**inputs)
    logits = outputs.logits
    logits = logits.cpu()  # Ensure logits are on CPU
    probs = torch.nn.functional.softmax(logits, dim=-1).cpu()  # Ensure probs are on CPU

# Get the predicted labels
predictions = torch.argmax(probs, dim=1).cpu()  # Ensure predictions are on CPU

# Assuming y_true and y_pred are the true and predicted labels respectively
y_true = y_test.values  # True labels (convert to numpy array)
y_pred = predictions.numpy()  # Predicted labels (convert to numpy array)
y_scores = probs[:, 1].numpy()  # Probabilities for the positive class (convert to numpy array)

# Confusion Matrix
cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(cm)

# Accuracy
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy)

# Precision, Recall, F1-Score
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

# ROC Curve and AUC
fpr, tpr, thresholds = roc_curve(y_true, y_scores)
auc = roc_auc_score(y_true, y_scores)
print("AUC:", auc)
plt.figure()
plt.plot(fpr, tpr, label=f'ROC curve (area = {auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

# Precision-Recall Curve
precision_vals, recall_vals, thresholds = precision_recall_curve(y_true, y_scores)
plt.figure()
plt.plot(recall_vals, precision_vals, label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc="lower left")
plt.show()


KeyboardInterrupt: 

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

#Define the data matrix
data = cm

# Define axis labels
x_labels = ['Predicted Negative', 'Predicted Positive']
y_labels = ['Actual Negative', 'Actual Positive']

# Create the heatmap with values inside cells
ax = sns.heatmap(data, annot=True, fmt='d', cmap='coolwarm', cbar=True, 
                 xticklabels=x_labels, yticklabels=y_labels)

# Set axis labels
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')

# Show the plot
plt.show()
