In [None]:
import json
import time
import torch
import random
import numpy as np
import pandas as pd

from tqdm import tqdm
from collections import Counter, defaultdict
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from utils import *

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier

In [None]:
# Label granularity for this run: 'fine' selects the FINE column of the mapping
# sheet, any other value falls back to COARSE. The same value is also part of
# the CSV filename read below.
type_of_labels = 'fine'
column_name = 'FINE' if type_of_labels == 'fine' else 'COARSE'

# Mapping sheet: each row pairs an LLM-annotated aspect with a category.
category_to_aspect = pd.read_csv(f'aspects - {type_of_labels}.csv')

# aspect -> set of categories it maps to (an aspect may appear on several rows).
# Both columns are converted to lists once; calling .to_list() inside the loop
# rebuilt the full column on every iteration.
aspect_to_category = defaultdict(set)
llm_aspects = category_to_aspect['LLM annotation'].to_list()
mapped_categories = category_to_aspect[column_name].to_list()
for llm_aspect, mapped_category in zip(llm_aspects, mapped_categories):
    aspect_to_category[llm_aspect].add(mapped_category)

In [None]:
# texts   : raw review strings, one per annotated row
# labels  : per-review list of aspects after merge_synonyms
# aspects : flat list of every aspect occurrence across all reviews
# counter : occurrence count per aspect
texts, labels, aspects, counter = [], [], [], Counter()
annotation = pd.read_csv('annotation - llm.csv')

# Materialise each column once; .tolist() inside the loop rebuilt the whole
# column for every row.
reviews = annotation['review'].tolist()
raw_annotations = annotation['annotation_1'].tolist()

for review, raw_annotation in zip(reviews, raw_annotations):
    texts.append(review)
    # The annotation cell is a free-text list: ' and ' is treated like a comma
    # separator, then the string is split on ', '. str() means a non-string
    # cell (e.g. a missing value) is turned into its string form, not skipped.
    # NOTE(review): merge_synonyms is not defined in this notebook (presumably
    # from utils); assumed to map the split aspects to canonical names - confirm.
    label = merge_synonyms(str(raw_annotation).replace(' and ', ', ').split(', '))
    labels.append(label)
    aspects.extend(label)
    counter.update(label)

In [None]:
# Category vocabulary: every category reachable from some aspect, plus '-' as
# the fallback class for reviews whose aspects map to no known category.
category_pool = {'-'}
for aspect_categories in aspect_to_category.values():
    category_pool.update(aspect_categories)

# Sort so that column order is identical on every kernel restart; list(set(...))
# follows string hashing, which Python randomises per process. key=str keeps
# the sort from failing if a non-string (e.g. a missing value) slipped into the
# mapping sheet.
categories = sorted(category_pool, key=str)

# Position lookup built once instead of a linear categories.index() per hit.
category_index = {name: position for position, name in enumerate(categories)}

# One indicator vector per review, aligned with `categories`.
labels_one_hot = []
for item in labels:
    output = np.zeros(len(categories))
    for aspect in item:
        # Explicit membership test: indexing the defaultdict directly would
        # silently insert an empty set for every unknown aspect.
        if aspect in aspect_to_category:
            for category in aspect_to_category[aspect]:
                output[category_index[category]] = 1
    # No mapped category at all -> mark the fallback class.
    if not output.any():
        output[category_index['-']] = 1
    labels_one_hot.append(output)

In [None]:
# Bag-of-words features capped at the 5000 most frequent terms.
# The vocabulary is learned from the training portion only, so the evaluation
# texts cannot influence which terms are kept. Every text is then transformed
# with that vocabulary, so X still has one row per entry in `texts`.
# The boundary uses the same 90% rule as the train/eval slicing of X; keep the
# two in sync if the split ratio changes.
number_of_train_texts = int(len(texts) * 0.9)
vectorizer = CountVectorizer(max_features=5000)
vectorizer.fit(texts[:number_of_train_texts])
X = vectorizer.transform(texts)

In [None]:
# Sequential 90/10 hold-out: the first 90% of rows train the model, the
# remaining rows are kept for evaluation (no shuffling is applied here).
number_of_data = int(len(texts) * 0.9)
train_texts, eval_texts = X[:number_of_data], X[number_of_data:]
train_labels, eval_labels = (
    labels_one_hot[:number_of_data],
    labels_one_hot[number_of_data:],
)

In [None]:
# One binary random forest per category (one-vs-rest), all built from the same
# seeded base estimator.
seed = 2266
base_forest = RandomForestClassifier(n_estimators=100, random_state=seed)
classifier = OneVsRestClassifier(base_forest)
classifier.fit(train_texts, train_labels)

In [None]:
# Predict indicator vectors for the held-out rows.
predictions_one_hot = classifier.predict(eval_texts)

# Per-category breakdown; zero_division=0 reports 0 for undefined metrics.
report = classification_report(
    eval_labels,
    predictions_one_hot,
    target_names=categories,
    zero_division=0,
)
print(report)

# Weighted-average summary; the fourth returned value is discarded.
precision_weighted, recall_weighted, f1_weighted, _ = precision_recall_fscore_support(
    eval_labels,
    predictions_one_hot,
    average='weighted',
    zero_division=0,
)
print(*[round(metric, 4) for metric in (precision_weighted, recall_weighted, f1_weighted)])

In [None]:
def _decode_one_hot(rows, names):
    """Turn indicator vectors back into lists of names.

    Args:
        rows: Iterable of indicator vectors; position i of each vector
            corresponds to names[i].
        names: Sequence of names, indexed by vector position.

    Returns:
        One list per row containing the names whose position holds the value
        1, in position order. A row with no 1s yields an empty list.
    """
    return [
        [names[position] for position, flag in enumerate(row) if flag == 1]
        for row in rows
    ]


# Category-name lists for the gold labels and the model output; both are
# decoded by the same helper so they cannot drift apart.
actuals = _decode_one_hot(eval_labels, categories)
predictions = _decode_one_hot(predictions_one_hot, categories)

In [None]:
# Compare the true category lists with the predicted ones for the held-out rows.
# NOTE(review): calculate_jaccard_similarity_for_lists is not defined in this
# notebook (presumably provided by `from utils import *`); the result is later
# summed and passed to len(), so it is assumed to be a sequence of per-example
# scores - confirm against utils.
similarity = calculate_jaccard_similarity_for_lists(actuals, predictions)

In [None]:
# Average of the per-example similarity scores, shown to four decimals as the
# cell's output.
mean_similarity = sum(similarity) / len(similarity)
round(mean_similarity, 4)