In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
import nltk
import matplotlib.pyplot as plt
from collections import Counter

# Make sure the NLTK stopword corpus is available locally. Only download when
# it is actually missing, so re-running the notebook does not need network
# access once the corpus has been installed.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Set (not list) so the per-word membership test in the cleaning step is O(1).
spanish_stopwords = set(stopwords.words('spanish'))

In [None]:
# Read the comments dataset from disk
file_path = 'comentarios.csv'  # Make sure to use the correct path
comentarios_data = pd.read_csv(file_path)

# Keep only the rows that have a value in 'texto'; .copy() yields an
# independent frame so later column assignments act on it directly.
comentarios_data = comentarios_data.loc[comentarios_data['texto'].notna()].copy()

# Clean and normalize text
def clean_text(text, stopword_set=None):
    """Normalize one comment for keyword matching and word-frequency counts.

    Steps: lowercase, strip every character that is not a word character,
    whitespace, or an emoji in the U+1F600-U+1F64F block, drop stopwords, and
    join the remaining words with single spaces.

    Args:
        text: The raw comment. Non-string values are converted with ``str()``;
            ``None`` and float NaN are treated as an empty comment.
        stopword_set: Optional collection of lowercase words to remove.
            Defaults to the module-level ``spanish_stopwords``.

    Returns:
        The cleaned text as a single space-separated string (possibly empty).
    """
    # Missing values would otherwise crash on .lower() or become the word 'nan'.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if stopword_set is None:
        stopword_set = spanish_stopwords

    text = str(text).lower()
    # Drop punctuation/symbols; whitespace (including newlines) is kept here so
    # that words stay separated.
    text = re.sub(r'[^\w\sáéíóúüñ\U0001F600-\U0001F64F]', '', text)
    # str.split() with no arguments splits on any run of whitespace and ignores
    # leading/trailing whitespace, so no separate collapse/strip pass is needed.
    return ' '.join(word for word in text.split() if word not in stopword_set)

# Store the cleaned version of every comment alongside the raw text
comentarios_data['texto_limpio'] = comentarios_data['texto'].map(clean_text)

In [None]:
# Define the keyword dictionary: category name -> keywords/phrases to look for
cyberbullying_categories = {
    'insulto': ['idiota', 'estúpido', 'tonto', 'imbécil', 'fea', 'gordo', 'inútil'],
    'amenaza': ['matar', 'lastimar', 'destruir', 'te voy a', 'te arrepentirás'],
    # Add the remaining categories here...
}

# 'texto_limpio' has had stopwords removed, so a multi-word keyword that
# contains a stopword (e.g. 'te' in 'te voy a' / 'te arrepentirás') can never
# occur in it. Build a second normalization of the raw text that keeps
# stopwords (lowercase, punctuation stripped, whitespace collapsed) so that
# such phrases can still be detected.
texto_con_stopwords = (
    comentarios_data['texto']
    .astype(str)
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

# Add one boolean column per category: True when any keyword occurs as a
# substring of either normalization of the comment. Checking 'texto_limpio'
# as well keeps every match the stopword-stripped text produced before.
# NOTE(review): substring matching also flags longer words that merely contain
# a keyword; confirm whether whole-word matching is wanted.
for category, keywords in cyberbullying_categories.items():
    comentarios_data[category] = [
        any(keyword in limpio or keyword in completo for keyword in keywords)
        for limpio, completo in zip(comentarios_data['texto_limpio'], texto_con_stopwords)
    ]


In [None]:
# 1. Distribution of categories: number of flagged comments per category,
#    ordered from most to least frequent
category_counts = (
    comentarios_data[list(cyberbullying_categories)]
    .sum()
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 6))
category_counts.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Distribution of Cyberbullying Categories')
ax.set_ylabel('Number of Texts')
ax.set_xlabel('Category')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

# 2. Word frequency analysis: the 20 most common words across all cleaned comments
all_words = ' '.join(comentarios_data['texto_limpio']).split()
word_freq = Counter(all_words).most_common(20)

if word_freq:
    # Split the (word, count) pairs into two parallel tuples for plotting.
    words, counts = zip(*word_freq)
    plt.figure(figsize=(10, 6))
    plt.bar(words, counts, color='orange')
    plt.title('Top 20 Most Frequent Words')
    plt.ylabel('Frequency')
    plt.xlabel('Words')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
else:
    # zip(*[]) yields nothing, so unpacking it into two names would raise
    # ValueError; skip the plot when no words survive cleaning.
    words, counts = (), ()
    print('No words left after cleaning; skipping the word-frequency plot.')
