# Proposta usando score ponderado

Observação: é necessário executar os baselines previamente e extrair o GoogleNews por meio do script start.sh.

In [None]:
import spacy
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, accuracy_score
from gensim.models import KeyedVectors
from tqdm import tqdm

# Word2vec embeddings loaded from the binary GoogleNews vectors file.
model = KeyedVectors.load_word2vec_format(
    '../models/GoogleNews-vectors-negative300.bin',
    binary=True,
)

# spaCy pipeline; its named entities are consumed by entity_in_text.
nlp = spacy.load('en_core_web_sm')

# One glossary term per line, with surrounding whitespace removed.
with open('./input/glossary_fortinet.txt', 'r') as glossary_file:
    cybersecurity_words = [entry.strip() for entry in glossary_file]


def accuracyScore(csv_path):
    """Print the accuracy and classification report for a results CSV.

    The file at ``csv_path`` is read with pandas; its ``annotation`` column
    is taken as the ground truth and its ``final_score`` column as the
    prediction. Nothing is returned: both metrics are written to stdout.
    """
    results = pd.read_csv(csv_path)
    expected = results['annotation']
    predicted = results['final_score']

    # Both metrics are computed before anything is printed.
    summary = classification_report(expected, predicted)
    accuracy = accuracy_score(expected, predicted)

    print(f"Acurácia: {accuracy*100:.2f}%")
    print(summary)


def normalize_value(x, min_value, max_value, new_min=0, new_max=1):
    """Linearly rescale ``x`` from [min_value, max_value] to [new_min, new_max].

    ``x`` is first clamped to the source interval, so inputs below
    ``min_value`` map to ``new_min`` and inputs above ``max_value`` map to
    ``new_max``.

    Args:
        x: Value to rescale.
        min_value: Lower bound of the source interval.
        max_value: Upper bound of the source interval.
        new_min: Lower bound of the target interval (default 0).
        new_max: Upper bound of the target interval (default 1).

    Returns:
        The rescaled value. If the source interval is degenerate
        (``max_value == min_value``), ``new_min`` is returned instead of
        dividing by zero.
    """
    span = max_value - min_value
    if span == 0:
        # Degenerate interval: after clamping x can only equal min_value,
        # which corresponds to the lower end of the target interval.
        return new_min

    clamped = max(min(x, max_value), min_value)
    return new_min + ((new_max - new_min) * (clamped - min_value)) / span


def get_word_vectors(text):
    """Return the embeddings of the whitespace-separated tokens of ``text``.

    Tokens that are not present in the module-level ``model`` are skipped,
    so the returned list may be shorter than the token count (or empty).
    """
    return [model[token] for token in text.split() if token in model]


def text_to_vector(text):
    """Represent ``text`` as the element-wise mean of its word vectors.

    When none of the tokens has a vector, a 300-element zero vector is
    returned instead.
    """
    vectors = get_word_vectors(text)
    if vectors:
        return np.mean(vectors, axis=0)
    # No token was found in the model: fall back to an all-zero vector.
    return np.zeros(300)


# Map every glossary term to its averaged embedding. text_to_vector always
# returns an array (a zero vector when no token of the term is in the model),
# so filtering on None would never drop anything; each term is therefore
# embedded exactly once.
glossary_vectors = {term: text_to_vector(term) for term in cybersecurity_words}


def cosine_similarity(vector1, vector2):
    """Return the cosine of the angle between ``vector1`` and ``vector2``.

    If either vector has zero norm the similarity is undefined, and 0 is
    returned instead.
    """
    magnitude_a = np.linalg.norm(vector1)
    magnitude_b = np.linalg.norm(vector2)

    if magnitude_a and magnitude_b:
        return np.dot(vector1, vector2) / (magnitude_a * magnitude_b)

    # At least one vector has zero length.
    return 0


def cybersecurity_context(text):
    """Score how close ``text`` is to the cybersecurity glossary.

    The text is embedded with ``text_to_vector`` and compared, by cosine
    similarity, against every vector in ``glossary_vectors``. The mean of
    those similarities, clamped to [0, 1], is returned; 0 is returned when
    no similarity could be computed.
    """
    text_vector = text_to_vector(text)
    if text_vector is None:
        return 0

    scores = [
        cosine_similarity(text_vector, term_vector)
        for term_vector in glossary_vectors.values()
        if term_vector is not None
    ]
    if not scores:
        return 0

    # A negative mean similarity is clamped up to 0.
    clamped_mean = normalize_value(np.mean(scores), 0, 1)
    return min(max(clamped_mean, 0), 1)


def entity_in_text(text):
    """Score ``text`` by the named entities spaCy finds in it.

    Every entity labelled CARDINAL, DATE or ORG contributes 0.33; entities
    with any other label contribute nothing. The accumulated total is
    clamped to [0, 1] before being returned.
    """
    label_weights = {'CARDINAL': 0.33, 'DATE': 0.33, 'ORG': 0.33}

    total = 0
    for entity in nlp(text).ents:
        total += label_weights.get(entity.label_, 0)

    clamped_total = normalize_value(total, 0, 1)
    return min(max(clamped_total, 0), 1)


def calculate_score(text, sentiment, org_entity, cybersecurity_context, score, sentiment_weight, org_entity_weight, cybersecurity_context_weight, score_weight, threshold):
    """Combine four metrics into a binary decision using a weighted sum.

    Each metric is multiplied by its weight and the products are added up.
    The total is normalized against the sum of the weights and compared with
    ``threshold``.

    Note: ``text`` is accepted but not used by the computation.

    Returns:
        1 if the normalized weighted score is at least ``threshold``,
        otherwise 0.
    """
    sentiment_part = sentiment_weight * sentiment
    entity_part = org_entity_weight * org_entity
    context_part = cybersecurity_context_weight * cybersecurity_context
    baseline_part = score_weight * score

    combined = sentiment_part + entity_part + context_part + baseline_part
    # Upper bound of the weighted sum, used as the normalization range.
    weight_total = (sentiment_weight + org_entity_weight
                    + cybersecurity_context_weight + score_weight)

    normalized = normalize_value(combined, 0, weight_total)
    return 1 if normalized >= threshold else 0


def extract_sentiment(input_path="./input/baselines/resultados.csv",
                      output_path="./output/resultados_new.csv",
                      baseline_column='SVM_TFIDF',
                      sentiment_weight=0.3,
                      org_entity_weight=0.3,
                      cybersecurity_context_weight=0.3,
                      score_weight=0.1,
                      threshold=0.9):
    """Compute a weighted final score for every row of a results CSV.

    For each row, the ``text`` column is scored with ``entity_in_text`` and
    ``cybersecurity_context``. Those scores are combined with the row's
    ``sentiment`` value and the chosen baseline column by ``calculate_score``.
    The resulting 0/1 decisions are stored in a new ``final_score`` column
    and the whole table is written to ``output_path``.

    Called with no arguments, the function behaves as it did before the
    parameters were introduced.

    Args:
        input_path: CSV file to read; it must provide the ``text`` and
            ``sentiment`` columns and the column named by ``baseline_column``.
        output_path: Destination CSV (written without the index).
        baseline_column: Column holding the baseline classifier output.
            The original comments list SVM_TFIDF, SVM_W2V, KNN_TFIDF,
            KNN_W2V, NV_TFIDF, NV_W2V, RF_TFIDF and RF_W2V as candidates;
            presumably they all exist in the input CSV (verify).
        sentiment_weight: Weight applied to the ``sentiment`` value.
        org_entity_weight: Weight applied to the entity score.
        cybersecurity_context_weight: Weight applied to the glossary
            similarity score.
        score_weight: Weight applied to the baseline column value.
        threshold: Minimum normalized score for a row to be labelled 1.
    """
    frame = pd.read_csv(input_path)
    final_scores = []

    for _, row in tqdm(frame.iterrows(), total=frame.shape[0]):
        text = row['text']
        final_scores.append(
            calculate_score(
                text,
                row['sentiment'],
                entity_in_text(text),
                cybersecurity_context(text),
                row[baseline_column],
                sentiment_weight,
                org_entity_weight,
                cybersecurity_context_weight,
                score_weight,
                threshold,
            )
        )

    frame['final_score'] = final_scores
    frame.to_csv(output_path, index=False)


# Score every row of the baseline results and write the output CSV, then
# print the accuracy and classification report computed from that file.
extract_sentiment()
accuracyScore("./output/resultados_new.csv")