In [1]:
!pip install seaborn
!pip install wordcloud
!pip install plotly 
!pip install textstat
!pip install transformers[torch]
!pip install -U datasets
!pip install fsspec==2023.9.2
!pip install accelerate -U
!pip install nltk

[0m

# Imports

In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F
import string
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [3]:
from collections import Counter
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textstat import flesch_reading_ease, flesch_kincaid_grade

#### Import model from HF and train it 

##### model: "roberta-large"

In [4]:
# RoBERTa tokenizer and model
# roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
# roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-large', num_labels=2).to(device)

In [5]:
from transformers import Trainer, TrainingArguments, RobertaForSequenceClassification, RobertaTokenizer
from datasets import load_dataset

# Load dataset
dataset = load_dataset('csv', data_files={'train': "./data/train_data_only_text_and_labels.csv", 'eval': "./data/eval_data_only_text_and_labels.csv"})

# Load pre-trained model and tokenizer
model_name = 'roberta-large'
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = RobertaTokenizer.from_pretrained(model_name)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text_combined'], padding='max_length', truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results/roberta_20_epochs',
    save_strategy="epoch",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.01,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['eval'],
)

# Fine-tune the model
trainer.train()


  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 52790 examples [00:00, 81782.48 examples/s]
Generating eval split: 13198 examples [00:00, 73851.20 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 52790/52790 [01:10<00:00, 744.38 examples/s]
Map: 100%|██████████| 13198/13198 [00:15<00:00, 845.54 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
[34m[1mwandb[0m: Currently logged in as: [33mronelias[0m ([33mronelias-tel-aviv-university[0m). Use [1m`wandb login --rel

Epoch,Training Loss,Validation Loss
1,0.0949,0.252298
2,0.6994,0.69254
3,0.7,0.692734
4,0.6962,0.697845
5,0.6976,0.693054
6,0.6939,0.692794
7,0.6761,0.696675
8,0.6702,0.693593
9,0.6651,0.69254
10,0.6648,0.70112


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



TrainOutput(global_step=66000, training_loss=0.6104286856795802, metrics={'train_runtime': 52187.6197, 'train_samples_per_second': 20.231, 'train_steps_per_second': 1.265, 'total_flos': 9.839331334017024e+17, 'train_loss': 0.6104286856795802, 'epoch': 20.0})

#### Evaluate results using the test dataset:

In [9]:
# Load the model from the checkpoint
from transformers import RobertaForSequenceClassification, RobertaTokenizer

# Path to the saved checkpoint
checkpoint_path = "./results/roberta/checkpoint-9900/"

# Load the tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")

# Load the model from the checkpoint directory
model = RobertaForSequenceClassification.from_pretrained(checkpoint_path)



In [10]:
export_dir = Path(os.getcwd())
data_path = Path(export_dir, "data")
test= pd.read_csv(Path(data_path,'test_data_only_text_and_labels.csv'))
test.head()
X_test = test["text_combined"].tolist()
y_test = test["label"]

Unnamed: 0,text_combined,label
0,press release dimitri paris 10 th december new...,1
1,ronan waide waiderwaiderie july 19 kevindated1...,0
2,adminviagracom unable see images email please ...,1
3,tyree puckett rollandduluthmcmahontypepadcom r...,1
4,location reminder wholesale retail power syste...,0


In [8]:
import torch
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, precision_recall_curve
import matplotlib.pyplot as plt
import time

# Define device
device = torch.device("cpu")  # Change to "cuda" if GPU is available and required

# Move model to device
model.to(device)

# Convert DataFrame column to list of strings
texts = X_test

# Tokenize the texts
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
inputs = {key: value.to(device) for key, value in inputs.items()}

# Put model in evaluation mode
model.eval()
time_start = time.time()
# Get logits from the model
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    logits = logits.cpu()  # Ensure logits are on CPU
    probs = torch.nn.functional.softmax(logits, dim=-1).cpu()  # Ensure probs are on CPU
end_time = time.time()
running_time = end_time - time_start

# Get the predicted labels
predictions = torch.argmax(probs, dim=1).cpu()  # Ensure predictions are on CPU

# Assuming y_true and y_pred are the true and predicted labels respectively
y_true = y_test.values  # True labels (convert to numpy array)
y_pred = predictions.numpy()  # Predicted labels (convert to numpy array)
y_scores = probs[:, 1].numpy()  # Probabilities for the positive class (convert to numpy array)

# Compute metrics
cm = confusion_matrix(y_true, y_pred)
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
fpr, tpr, _ = roc_curve(y_true, y_scores)
auc = roc_auc_score(y_true, y_scores)
precision_vals, recall_vals, _ = precision_recall_curve(y_true, y_scores)

# Save results to a text file
with open('./results/roberta/checkpoint-9900/evaluation_results.txt', 'w') as f:
    f.write("Confusion Matrix:\n")
    f.write(f"{cm}\n\n")
    
    f.write("Accuracy:\n")
    f.write(f"{accuracy:.4f}\n\n")
    
    f.write("Precision:\n")
    f.write(f"{precision:.4f}\n\n")
    
    f.write("Recall:\n")
    f.write(f"{recall:.4f}\n\n")
    
    f.write("F1-Score:\n")
    f.write(f"{f1:.4f}\n\n")
    
    f.write("AUC:\n")
    f.write(f"{auc:.4f}\n\n")
    
    f.write(f"Running Time: {running_time:.2f} seconds\n")
    


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Define the data matrix
data = np.array([[7875, 44],
                 [54, 8525]])

# Define axis labels
x_labels = ['Predicted Negative', 'Predicted Positive']
y_labels = ['Actual Negative', 'Actual Positive']

# Create the heatmap with values inside cells
ax = sns.heatmap(data, annot=True, fmt='d', cmap='coolwarm', cbar=True, 
                 xticklabels=x_labels, yticklabels=y_labels)

# Set axis labels
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')

# Show the plot
plt.show()
