# Proposal using a single vector representation

In [None]:
import spacy
import pytz
import pandas as pd
import numpy as np
import requests
import os
import pickle
from sklearn.metrics import classification_report, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors
from sklearn.svm import SVC
from collections import defaultdict
from tqdm import tqdm
from datetime import datetime

# Pretrained word2vec embeddings, read from a binary word2vec-format file.
model = KeyedVectors.load_word2vec_format(
    '../models/GoogleNews-vectors-negative300.bin',
    binary=True,
)
# spaCy English pipeline; its named-entity output is consumed by entity_in_text().
nlp = spacy.load('en_core_web_sm')

# Glossary terms, one per line of the file, with surrounding whitespace removed.
with open('./input/glossary_fortinet.txt', 'r') as f:
    cybersecurity_words = [entry.strip() for entry in f]


def train_and_evaluate_random_forest(X, y):
    """Fit a random forest on a hold-out split of ``(X, y)`` and report its quality.

    The data is split 80/20 with a fixed seed, a 100-tree
    ``RandomForestClassifier`` is fitted on the training portion, the fitted
    model is pickled to ``random_forest_model.pkl`` in the current working
    directory, and the accuracy and classification report for the test
    portion are printed.

    Args:
        X: Feature matrix, one row per sample.
        y: Target labels aligned with ``X``.

    Returns:
        None. Results are printed and the model is written to disk.
    """
    split = train_test_split(X, y, test_size=0.2, random_state=42)
    train_features, test_features, train_labels, test_labels = split

    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(train_features, train_labels)

    # Persist the fitted model so it can be reused without retraining.
    with open('random_forest_model.pkl', 'wb') as out:
        pickle.dump(forest, out)

    predictions = forest.predict(test_features)
    print("Accuracy:", accuracy_score(test_labels, predictions))
    print(
        "Classification Report:\n",
        classification_report(test_labels, predictions),
    )


# Location of the pickled SVM classifier used by evaluate_svm().
_SVM_MODEL_PATH = './input/mysvmclassifier/SVM.pkl'

# Unpickled classifiers keyed by file path, so each file is read only once.
_svm_model_cache = {}


def _load_svm_model(path=_SVM_MODEL_PATH):
    """Return the classifier pickled at *path*, unpickling it only on first use."""
    if path not in _svm_model_cache:
        # NOTE(security): pickle.load can execute arbitrary code; this file
        # must come from a trusted source.
        with open(path, 'rb') as model_file:
            _svm_model_cache[path] = pickle.load(model_file)
    return _svm_model_cache[path]


def evaluate_svm(text):
    """Score *text* with the pickled SVM classifier.

    The text is converted to its averaged word vector and passed to the
    classifier's ``predict_proba``. The classifier is loaded from disk once
    and reused on later calls instead of being unpickled for every text.

    Args:
        text: Raw text to score.

    Returns:
        The probability in the second column of ``predict_proba`` for this
        text (presumably the positive class -- confirm against the model's
        ``classes_`` ordering).
    """
    vector = text_to_vector(text)
    probabilities = _load_svm_model().predict_proba([vector])
    # Single sample in, so row 0; column 1 is the second class.
    return probabilities[0][1]


def accuracyScore(csv_path):
    """Print accuracy and a classification report for labels stored in a CSV.

    The ``annotation`` column is used as the true labels and the ``context``
    column as the predicted labels.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        None. The accuracy (as a percentage) and the report are printed.
    """
    frame = pd.read_csv(csv_path)
    expected, predicted = frame['annotation'], frame['context']
    report = classification_report(expected, predicted)
    accuracy = accuracy_score(expected, predicted)
    print(f"Acurácia: {accuracy*100:.2f}%")
    print(report)


def normalize_value(x, min_value, max_value, new_min=0, new_max=1):
    """Linearly rescale *x* from ``[min_value, max_value]`` to ``[new_min, new_max]``.

    Values outside the source interval are clamped to its nearest bound
    before rescaling.

    Args:
        x: Value to rescale.
        min_value: Lower bound of the source interval.
        max_value: Upper bound of the source interval.
        new_min: Lower bound of the target interval.
        new_max: Upper bound of the target interval.

    Returns:
        The rescaled value. When the source interval is degenerate
        (``min_value == max_value``) there is nothing to scale, so
        ``new_min`` is returned instead of dividing by zero.
    """
    span = max_value - min_value
    if span == 0:
        # Degenerate source interval: avoid a division by zero.
        return new_min
    clamped = max(min(x, max_value), min_value)
    return new_min + ((new_max - new_min) * (clamped - min_value)) / span


def get_word_vectors(text):
    """Look up the embedding of each whitespace-separated token of *text*.

    Tokens that are not present in the module-level ``model`` are skipped.

    Args:
        text: Text to split on whitespace.

    Returns:
        A list with one vector per known token, in order of appearance;
        empty when no token is known.
    """
    return [model[token] for token in text.split() if token in model]


def text_to_vector(text):
    """Represent *text* as the element-wise mean of its known word vectors.

    Args:
        text: Text to embed.

    Returns:
        The mean of the vectors returned by ``get_word_vectors``, or a
        300-element zero vector when the text has no known word.
    """
    vectors = get_word_vectors(text)
    if vectors:
        return np.mean(vectors, axis=0)
    # No known word: fall back to an all-zero vector.
    return np.zeros(300)


# Averaged embedding of every glossary term, computed once up front so that
# cybersecurity_context() does not have to re-embed the glossary per text.
glossary_vectors = dict(
    (term, text_to_vector(term).squeeze())
    for term in cybersecurity_words
)


def cosine_similarity(vector1, vector2):
    """Return the cosine of the angle between two vectors.

    Args:
        vector1: First vector.
        vector2: Second vector, same length as ``vector1``.

    Returns:
        The dot product divided by the product of the Euclidean norms, or
        ``0`` when either vector has zero norm.
    """
    magnitude_a = np.linalg.norm(vector1)
    magnitude_b = np.linalg.norm(vector2)

    if magnitude_a and magnitude_b:
        return np.dot(vector1, vector2) / (magnitude_a * magnitude_b)
    # A zero-length vector has no direction; report no similarity.
    return 0


def cybersecurity_context(text):
    """Score how close *text* is to the cybersecurity glossary.

    The text's averaged word vector is compared by cosine similarity with
    every precomputed glossary vector, and the mean similarity is clamped
    to ``[0, 1]``.

    Args:
        text: Text to score.

    Returns:
        The clamped mean similarity, or ``0`` when there is nothing to
        compare against.
    """
    embedding = text_to_vector(text)
    scores = [
        cosine_similarity(embedding, term_vector)
        for term_vector in glossary_vectors.values()
        if embedding is not None and term_vector is not None
    ]

    if not scores:
        return 0

    mean_score = np.mean(scores)
    # Negative averages are clamped to 0 by the [0, 1] normalization.
    rescaled = normalize_value(mean_score, 0, 1)
    return min(max(rescaled, 0), 1)


def entity_in_text(text):
    """Score *text* by the named entities spaCy finds in it.

    Each recognized entity contributes the weight assigned to its label
    (labels without a weight contribute 0); the sum is clamped to
    ``[0, 1]``.

    Args:
        text: Text to analyze with the module-level ``nlp`` pipeline.

    Returns:
        The clamped sum of entity weights.
    """
    # Per-label weights (presumably derived from label frequencies -- confirm).
    label_weights = {
        "CARDINAL": 0.3208,
        "DATE": 0.3807,
        "EVENT": 0.0002,
        "FAC": 0.0003,
        "GPE": 0.0392,
        "LANGUAGE": 0.0002,
        "LAW": 0.0004,
        "LOC": 0.0030,
        "MONEY": 0.0014,
        "NORP": 0.0254,
        "ORDINAL": 0.0097,
        "ORG": 0.1288,
        "PERCENT": 0.0003,
        "PERSON": 0.0630,
        "PRODUCT": 0.0052,
        "QUANTITY": 0.0133,
        "TIME": 0.0079,
        "WORK_OF_ART": 0.0003,
    }
    parsed = nlp(text)
    total = 0
    for entity in parsed.ents:
        total += label_weights.get(entity.label_, 0)
    clamped_total = normalize_value(total, 0, 1)
    return min(max(clamped_total, 0), 1)


def extract_sentiment():
    """Build a feature vector per validation row and train a random forest.

    Reads ``./input/validation_sentiment.csv`` and, for every row,
    concatenates the averaged word vector of the ``text`` column with the
    row's ``sentiment`` value, the entity score, the cybersecurity-context
    score and the SVM probability. The resulting vectors, together with the
    ``annotation`` column as labels, are handed to
    ``train_and_evaluate_random_forest``.

    Returns:
        None.
    """
    frame = pd.read_csv("./input/validation_sentiment.csv")
    labels = frame['annotation'].values
    feature_rows = []
    for _, row in tqdm(frame.iterrows(), total=frame.shape[0]):
        text = row['text']
        # NOTE(review): presumably a numeric sentiment score -- confirm.
        sentiment = row['sentiment']
        entity_score = entity_in_text(text)
        context_score = cybersecurity_context(text)
        svm_probability = evaluate_svm(text)
        embedding = text_to_vector(text)
        parts = [embedding, sentiment, entity_score,
                 context_score, svm_probability]
        # axis=None flattens the embedding and the scalars into one 1-D vector.
        feature_rows.append(np.concatenate(parts, axis=None))
    features = np.array(feature_rows, dtype=object)
    train_and_evaluate_random_forest(features, labels)


# Run the feature-extraction and random-forest training pipeline when this
# cell/script is executed.
extract_sentiment()