In [5]:
# Transcribe Audio

import torch
from transformers import pipeline

asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base",
    device=0 if torch.cuda.is_available() else -1
)

# Transcribe the audio file
audio_file = "audio.wav"
transcription = asr_pipeline(audio_file, return_timestamps=True)

with open("transcription.txt", "w") as file:
    file.write(transcription["text"])



Device set to use cuda:0


In [1]:
## Fine Tuning the model

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import Dataset
import os
os.environ["WANDB_DISABLED"] = "true"

training_data = [
    
    {"text": "The witness claimed I orchestrated a fraudulent scheme.", "label": 1},
    {"text": "Critics accused me of betraying every trust with calculated deceit.", "label": 1},
    {"text": "They insisted that my actions were nothing short of scandalous.", "label": 1},
    {"text": "A prominent influencer declared I was a master of deception.", "label": 1},
    {"text": "Reports alleged that I manipulated facts for personal gain.", "label": 1},
    {"text": "The opposition argued that my reputation was built entirely on lies.", "label": 1},
    {"text": "Observers noted that my public persona masked a cleverly orchestrated fraud.", "label": 1},
    {"text": "A court statement implied I deliberately tarnished a respected legacy.", "label": 1},
    {"text": "Media pundits painted me as the villain in a theatrically staged drama.", "label": 1},
    {"text": "Accusations surfaced that I engineered controversy to ruin careers.", "label": 1},
    
    {"text": "I calmly presented the facts during the hearing.", "label": 0},
    {"text": "The debate centered on clear and documented evidence.", "label": 0},
    {"text": "Observers remarked that the courtroom atmosphere remained respectful.", "label": 0},
    {"text": "I detailed the timeline of events without exaggeration.", "label": 0},
    {"text": "The press coverage focused on the orderly conduct of the trial.", "label": 0},
    {"text": "In court, I answered each question with measured clarity.", "label": 0},
    {"text": "The statements were purely factual and free of personal attacks.", "label": 0},
    {"text": "Everyone appreciated the straightforward narrative of the case.", "label": 0},
    {"text": "The discussion remained focused on verifiable details throughout.", "label": 0},
    {"text": "The dialogue emphasized legal aspects rather than dramatics.", "label": 0},

]

dataset = Dataset.from_list(training_data)

# Load tokenizer and model
model_name = "distilbert-base-uncased"

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the dataset
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=64)

tokenized_dataset = dataset.map(tokenize_function, batched=False)

# Split into train and eval
train_dataset = tokenized_dataset.select(range(16))  # first 16 examples
eval_dataset = tokenized_dataset.select(range(16, 20))  # remaining 4 examples

# Training arguments
training_args = TrainingArguments(
    output_dir="./defamation_model",
    eval_strategy="epoch",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    logging_steps=5,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

# Train
trainer.train()

# Evaluate
eval_results = trainer.evaluate()
print("Evaluation Results: ", eval_results)

# Save the model
trainer.save_model("./defamation_model")
tokenizer.save_pretrained("./defamation_model")



2025-03-28 16:20:10.504319: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-28 16:20:10.519197: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743193210.535096 3146830 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743193210.540082 3146830 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-28 16:20:10.558805: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss
1,0.7124,0.994687
2,0.5396,1.090873
3,0.4776,0.886829
4,0.1891,0.695901
5,0.1317,0.630263


Evaluation Results:  {'eval_loss': 0.6302633881568909, 'eval_runtime': 0.0104, 'eval_samples_per_second': 384.402, 'eval_steps_per_second': 192.201, 'epoch': 5.0}


('./defamation_model/tokenizer_config.json',
 './defamation_model/special_tokens_map.json',
 './defamation_model/vocab.txt',
 './defamation_model/added_tokens.json',
 './defamation_model/tokenizer.json')

In [2]:

with open("transcription.txt", 'r') as file:
    transcript_text = file.read()

import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
sentences = nltk.tokenize.sent_tokenize(transcript_text)


[nltk_data] Downloading package punkt to
[nltk_data]     /home/cpsc415_ah2575/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/cpsc415_ah2575/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the fine-tuned model and tokenizer
model_path = "./defamation_model"  # or wherever you saved it
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

def is_defamatory(sentence):
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_label = torch.argmax(logits, dim=1).item()
    return predicted_label  # 1 = defamatory, 0 = non-defamatory

results = []
for sent in sentences:
    label = is_defamatory(sent)
    results.append((sent, label))

for sent, label in results:
    print(f"Sentence: {sent}\nDefamatory? {label}\n")


Sentence:  Can you please tell the jury why you're here today?
Defamatory? 0

Sentence: Miss Herd accused of abuse.
Defamatory? 1

Sentence: My ex-husband is suing me.
Defamatory? 1

Sentence: Brutal, cruel.
Defamatory? 1

Sentence: This is humiliating for any human being to go through.
Defamatory? 1

Sentence: And all false.
Defamatory? 1

Sentence: Amber heard forever changed Mr. Dep's life and reputation.
Defamatory? 1

Sentence: Behind the fame, you're going to see who the real Johnny Dep is.
Defamatory? 0

Sentence: Dep was the one who wanted the cameras in the courtroom.
Defamatory? 0

Sentence: She did.
Defamatory? 1

Sentence: I would argue it's a PR campaign, disguised as a defamation case.
Defamatory? 1

Sentence: There's the man himself.
Defamatory? 1

Sentence: It's being a social media circus of commentary from creators and influencers.
Defamatory? 0

Sentence: Did you commit any kind of prank?
Defamatory? 1

Sentence: Absolutely not.
Defamatory? 1

Sentence: On my side of