In [None]:
!pip install transformers torch accelerate tensorflow-hub bert-tensorflow tensorflow tqdm bert-score

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM, Trainer, TrainingArguments, BertTokenizer, BertForSequenceClassification, MarianMTModel, MarianTokenizer, BertConfig
import torch
from transformers import BertTokenizer, BertForSequenceClassification, TFBertForSequenceClassification
from transformers import LlamaTokenizer, LlamaForCausalLM
from transformers import pipeline
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
import pandas as pd
import tensorflow_hub as hub
import tensorflow as tf
from datetime import datetime
from torch.utils.data import DataLoader
import re
import nltk
from nltk.corpus import wordnet
import random
from tqdm import tqdm
import concurrent.futures
from nltk.tokenize import sent_tokenize
import matplotlib.pyplot as plt
import json
import numpy as np
from bert_score import score
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths used later in this
# notebook are all located under this mount point.
drive.mount("/content/drive")


In [None]:
# Fetch the NLTK 'punkt' resource, then read the dataset CSV from Drive.
nltk.download('punkt')
dataset_path = "/content/drive/My Drive/Diss_Dataset/dataset500_cleaned.csv"
data = pd.read_csv(dataset_path)

# Expose the first three CSV columns under fixed names, regardless of their headers.
for position, column_name in enumerate(("userid", "posts", "label")):
    data[column_name] = data.iloc[:, position]

# Turn each `posts` cell into a list of strings: drop the outer brackets, split on
# the `', '` separator, then trim leftover quotes/spaces from every item.
# NOTE(review): presumably each cell is a stringified Python list; items that were
# written with double quotes would not be split by this separator - confirm.
split_posts = data['posts'].str.strip('[]').str.split("', '")
data['posts'] = split_posts.apply(lambda items: [item.strip("' ") for item in items])

def clean_post(post):
    """Normalise a single post for text comparison.

    Removes, in order: runs of asterisks, the literal ``&gt;`` entity,
    non-ASCII characters, double-quoted spans, parenthesised spans (and any
    stray parentheses), and bracketed spans (and any stray brackets). Runs of
    whitespace are then collapsed to a single space, and the result is
    stripped and lower-cased.

    Whitespace is collapsed *after* the removals: deleting a span such as
    ``(aside)`` from ``"a (aside) b"`` leaves two adjacent spaces, which would
    otherwise survive into the output.

    Args:
        post: The raw post text (``str``).

    Returns:
        The cleaned, lower-cased text.
    """
    removal_patterns = (
        r'\*+',            # markdown emphasis markers (*, **, ...)
        r'&gt;',           # HTML-escaped '>' entity
        r'[^\x00-\x7F]+',  # non-ASCII characters
        r'"[^"]*"',        # double-quoted spans
        r'\([^)]*\)',      # parenthesised spans
        r'[()]',           # unmatched parentheses left behind
        r'\[[^\]]*\]',     # bracketed spans
        r'[\[\]]',         # unmatched brackets left behind
    )
    for pattern in removal_patterns:
        post = re.sub(pattern, '', post)
    # Collapse whitespace last so gaps opened by the removals are normalised too.
    post = re.sub(r'\s+', ' ', post)
    return post.strip().lower()

# Clean every post of every user, keeping the per-user list structure.
data["clean_posts"] = data["posts"].apply(
    lambda user_posts: list(map(clean_post, user_posts))
)

# Concatenate each user's cleaned posts into one space-separated string.
data['combined_posts'] = data['clean_posts'].apply(
    lambda cleaned: ' '.join(cleaned)
)

# Sanity check: show the combined text for the row labelled 1.
print(data['combined_posts'].loc[1])

In [None]:
# Read the highlight sentences CSV from Drive and preview its first five rows.
highlights_path = "/content/drive/My Drive/Diss_Dataset/top_5_sentences_per_user.csv"
highlights_df = pd.read_csv(highlights_path)
print(highlights_df.head(n=5))

In [None]:
# Build one space-joined highlight string per user and keep a single row per user.
#
# The aggregation is guarded because this cell rebinds `highlights_df` to the
# de-duplicated frame: on a second run only one `sentence` per user is left, so
# recomputing would silently shrink every user's highlights to that one sentence.
# Re-running the cell that loads `highlights_df` resets this guard.
if 'combined_highlights' not in highlights_df.columns:
    highlights_df['combined_highlights'] = highlights_df.groupby('userid')['sentence'].transform(
        # Skip missing sentences and coerce to str so a NaN cell cannot break the join.
        lambda sentences: ' '.join(sentences.dropna().astype(str))
    )
    highlights_df = highlights_df.drop_duplicates(subset=['userid', 'combined_highlights'])

# Pair each user's highlights with their combined posts (inner join on `userid`).
merged_df = pd.merge(
    highlights_df[['userid', 'combined_highlights']],
    data[['userid', 'combined_posts']],
    on='userid',
)
print(merged_df.head())
print(highlights_df.head())

In [None]:
# Score each user's highlights (candidate) against their combined posts (reference).
# NOTE(review): combined posts can be long; presumably bert-score truncates inputs
# to the model's maximum length - confirm this is acceptable for the evaluation.
candidate_texts = merged_df['combined_highlights'].tolist()
reference_texts = merged_df['combined_posts'].tolist()
P, R, F1 = score(candidate_texts, reference_texts, lang="en", verbose=True)

# Attach the per-user scores as plain Python lists.
for score_column, score_values in (
    ('bertscore_precision', P),
    ('bertscore_recall', R),
    ('bertscore_f1', F1),
):
    merged_df[score_column] = score_values.tolist()

# Persist the full table to Drive, then display only the id and score columns.
merged_df.to_csv('/content/drive/My Drive/Diss_Dataset/bert_scores_per_user.csv', index=False)

print(merged_df[['userid', 'bertscore_precision', 'bertscore_recall', 'bertscore_f1']])

In [None]:
# Print the overall averages across users, in order: precision, recall, F1.
for metric_column in ('bertscore_precision', 'bertscore_recall', 'bertscore_f1'):
    print(merged_df[metric_column].mean())