# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import spacy

# Load spaCy's small English pipeline once at module level; the entity
# counting function in this file reads it as the global `nlp`.
nlp = spacy.load("en_core_web_sm")

# Read the validation CSV and split it into two frames by the value of the
# 'annotation' column (1 vs. 0).
df = pd.read_csv('./input/validation_sentiment.csv')
df_annotation_1 = df[df['annotation'] == 1]
df_annotation_0 = df[df['annotation'] == 0]


def find_most_common_words(texts):
    """Count whitespace-separated tokens across a collection of texts.

    Tokens come from ``str.split()`` with no arguments, so they are
    case-sensitive and keep any punctuation attached to them.

    Args:
        texts: Iterable of strings (for example a pandas Series of text).
            Entries that are not ``str`` -- such as the float NaN pandas
            uses for an empty CSV cell -- are skipped instead of raising.

    Returns:
        collections.Counter mapping each token to its number of occurrences.
    """
    word_counts = Counter()
    for text in texts:
        # Non-string entries (missing values) carry no tokens to count.
        if isinstance(text, str):
            word_counts.update(text.split())
    return word_counts


# Token frequency counters for the 'text' column of each annotation class.
common_words_1 = find_most_common_words(df_annotation_1['text'])
common_words_0 = find_most_common_words(df_annotation_0['text'])


def get_sentiment(sentiments):
    """Label a collection of sentiment scores by its average.

    Args:
        sentiments: Object exposing ``.mean()`` (e.g. a pandas Series).

    Returns:
        "Negativo" when the mean is at least 0.5, otherwise "Positivo".
    """
    average = sentiments.mean()
    if average >= 0.5:
        return "Negativo"
    return "Positivo"


# Overall label ("Negativo"/"Positivo") for each class, derived from the mean
# of its 'sentiment' column.
sentiment_1 = get_sentiment(df_annotation_1['sentiment'])
sentiment_0 = get_sentiment(df_annotation_0['sentiment'])


def find_most_common_entities(texts):
    """Count the named-entity labels spaCy finds across a collection of texts.

    Args:
        texts: Iterable of strings to run through the module-level ``nlp``
            pipeline.

    Returns:
        collections.Counter mapping each entity label (``ent.label_``) to
        the number of entity spans with that label over all texts.
    """
    entity_counts = Counter()
    # nlp.pipe streams the texts through the pipeline in batches, avoiding
    # one full pipeline invocation per text.
    for doc in nlp.pipe(texts):
        entity_counts.update(ent.label_ for ent in doc.ents)
    return entity_counts


# Entity-label frequency counters for the 'text' column of each class.
entity_counts_1 = find_most_common_entities(df_annotation_1['text'])
entity_counts_0 = find_most_common_entities(df_annotation_0['text'])


def save_common_words_to_file(common_words, output_file, num_words=200):
    """Write the most frequent words to a text file, one word per line.

    Args:
        common_words: collections.Counter of word frequencies.
        output_file: Path of the file to create or overwrite (UTF-8).
        num_words: Maximum number of words to write, most frequent first.

    An empty counter produces an empty file instead of raising.
    """
    # Iterating the (word, count) pairs directly, rather than unpacking
    # zip(*pairs), also works when there are no pairs at all.
    words = [word for word, _ in common_words.most_common(num_words)]
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write('\n'.join(words))


# Persist the top words of each class (default limit of the function) to the
# output directory: class 1 -> "ameaca", class 0 -> "nao_ameaca".
save_common_words_to_file(common_words_1, './output/termos_relevantes_ameaca.txt')
save_common_words_to_file(common_words_0, './output/termos_relevantes_nao_ameaca.txt')


def calculate_sentiment_percentage(df_class):
    """Compute the share of positive and negative rows in a frame.

    A row is negative when its 'sentiment' value is >= 0.5; every other row
    (including rows whose comparison is false, such as NaN) is counted as
    positive.

    Args:
        df_class: DataFrame with a 'sentiment' column.

    Returns:
        Tuple ``(positive_percentage, negative_percentage)`` in the range
        0-100. Both are 0.0 when the frame has no rows.
    """
    total_samples = len(df_class)
    if total_samples == 0:
        # No rows: avoid dividing by zero and report empty shares.
        return 0.0, 0.0

    negative_samples = int((df_class['sentiment'] >= 0.5).sum())
    positive_samples = total_samples - negative_samples

    negative_percentage = (negative_samples / total_samples) * 100
    positive_percentage = (positive_samples / total_samples) * 100

    return positive_percentage, negative_percentage


# Positive/negative row percentages for each annotation class.
positive_percentage_1, negative_percentage_1 = calculate_sentiment_percentage(df_annotation_1)
positive_percentage_0, negative_percentage_0 = calculate_sentiment_percentage(df_annotation_0)


def calculate_entity_percentage(entity_counts, total_entities):
    """Convert entity counts into percentages of a given total.

    Args:
        entity_counts: Mapping of entity label to its count.
        total_entities: Denominator used for every label.

    Returns:
        Dict mapping each label to ``count / total_entities * 100``.
    """
    shares = {}
    for label, occurrences in entity_counts.items():
        shares[label] = (occurrences / total_entities) * 100
    return shares


def plot_common_words(common_words, annotation, num_words=10):
    """Show a bar chart of the most frequent words of one class.

    Args:
        common_words: collections.Counter of word frequencies.
        annotation: Class identifier interpolated into the chart title.
        num_words: How many of the most frequent words to plot.

    Returns:
        None. When the counter has no entries nothing is drawn.
    """
    most_common = common_words.most_common(num_words)
    if not most_common:
        # zip(*[]) cannot be unpacked into two names, and there is nothing
        # to draw for an empty counter.
        return
    words, counts = zip(*most_common)

    plt.figure(figsize=(10, 6))
    plt.bar(words, counts)
    plt.title(f'Palavras mais comuns na classe {annotation}')
    plt.xlabel('Palavras')
    plt.ylabel('Frequência')
    plt.xticks(rotation=45)
    plt.show()


def plot_common_entities(entity_counts, annotation, frequency_scale=500):
    """Show a horizontal bar chart of entity-label frequencies for one class.

    Args:
        entity_counts: collections.Counter of entity-label frequencies.
        annotation: Class identifier interpolated into the chart title.
        frequency_scale: Step between x-axis ticks, starting at 0.

    Returns:
        None. When the counter has no entries nothing is drawn.
    """
    # most_common() with no argument returns every pair, highest count first.
    sorted_entities = entity_counts.most_common()
    if not sorted_entities:
        # zip(*[]) cannot be unpacked and max() of nothing would raise.
        return
    labels, counts = zip(*sorted_entities)

    plt.figure(figsize=(10, 6))
    plt.barh(labels, counts)
    plt.title(f'Entidades mais comuns na classe {annotation}')
    plt.xlabel('Frequência')

    x_ticks = range(0, max(counts) + 1, frequency_scale)
    plt.xticks(x_ticks)

    # Put the most frequent label at the top of the chart.
    plt.gca().invert_yaxis()
    plt.show()


def plot_annotation_counts(annotation):
    """Show a bar chart and a pie chart of the class distribution.

    Args:
        annotation: pandas Series holding the class value of each row.

    Returns:
        None. Two figures are shown one after the other.
    """
    display_names = {0: "0 (Sem Ameaça)", 1: "1 (Ameaça Cibernética)"}

    # value_counts() orders by frequency, so the first bar is whichever class
    # is most common. Sort by class value and derive the labels from the
    # index so each label always matches the bar/wedge it describes.
    annotation_counts = annotation.value_counts().sort_index()
    labels = [display_names.get(value, str(value)) for value in annotation_counts.index]

    annotation_counts.plot(kind="bar")
    plt.title("Contagem de Valores na Coluna 'annotation'")
    plt.xlabel("Valor")
    plt.ylabel("Contagem")
    plt.xticks(range(len(labels)), labels)
    plt.show()

    annotation_counts.plot(kind="pie", autopct='%1.1f%%')
    plt.title("Distribuição de Valores na Coluna 'annotation'")
    plt.legend(labels, loc="upper right")
    plt.axis('equal')
    plt.show()


# Share of each entity label within its class, relative to the total number
# of entities counted for that class.
total_entities_1 = sum(entity_counts_1.values())
total_entities_0 = sum(entity_counts_0.values())
entity_percentage_1 = calculate_entity_percentage(entity_counts_1, total_entities_1)
entity_percentage_0 = calculate_entity_percentage(entity_counts_0, total_entities_0)

# --- Text report -----------------------------------------------------------
print(f'Sentimento médio da classe 1: {sentiment_1}')
print(f'Sentimento médio da classe 0: {sentiment_0}')

# Ten most frequent entity labels per class.
print(f'\nEntidades mais comuns na classe 1: {entity_counts_1.most_common(10)}')
print(f'Entidades mais comuns na classe 0: {entity_counts_0.most_common(10)}')

print(f"\nPorcentagem de Sentimento Positivo na Classe 1: {positive_percentage_1:.2f}%")
print(f"Porcentagem de Sentimento Negativo na Classe 1: {negative_percentage_1:.2f}%")

print(f"\nPorcentagem de Sentimento Positivo na Classe 0: {positive_percentage_0:.2f}%")
print(f"Porcentagem de Sentimento Negativo na Classe 0: {negative_percentage_0:.2f}%")

print("\nPorcentagem de Entidades na Classe 1:")
for entity, percentage in entity_percentage_1.items():
    print(f"{entity}: {percentage:.2f}%")

print("\nPorcentagem de Entidades na Classe 0:")
for entity, percentage in entity_percentage_0.items():
    print(f"{entity}: {percentage:.2f}%")

# --- Charts ----------------------------------------------------------------
plot_common_words(common_words_1, 1)
plot_common_words(common_words_0, 0)

plot_common_entities(entity_counts_1, 1)
plot_common_entities(entity_counts_0, 0)

# Class distribution over the whole (unsplit) dataset.
plot_annotation_counts(df['annotation'])