In [None]:
!pip install -q spacy scikit-learn pandas matplotlib seaborn
!python -m spacy download es_core_news_sm


Collecting es-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.8.0/es_core_news_sm-3.8.0-py3-none-any.whl (12.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m59.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: es-core-news-sm
Successfully installed es-core-news-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import pandas as pd
import numpy as np
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score
from sklearn.pipeline import Pipeline

# Load the Spanish spaCy pipeline used by the preprocessing step
nlp = spacy.load("es_core_news_sm")

# Load the dataset
df = pd.read_csv("Corpus_estres.csv")

# Encode the target column: 'si' -> 1, 'no' -> 0.
# Series.map turns every value that is not a key of the mapping into NaN, so
# the raw values are normalised (case, surrounding whitespace) first and any
# value that still cannot be mapped is reported immediately instead of
# surfacing later as NaN labels during model fitting.
label_map = {'si': 1, 'no': 0}
df['label'] = df['Estres'].astype(str).str.strip().str.lower().map(label_map)

unmapped_labels = df.loc[df['label'].isna(), 'Estres'].unique().tolist()
if unmapped_labels:
    raise ValueError(
        f"Unexpected values in column 'Estres' (expected 'si'/'no'): {unmapped_labels}"
    )
df['label'] = df['label'].astype('int64')

# Preprocessing
def preprocess(text):
    """Reduce one transcript to a space-separated string of lemmas.

    The text is lower-cased and run through the module-level spaCy pipeline
    ``nlp``; only tokens that are purely alphabetic and are not stop words
    are kept, each replaced by its lemma.

    Args:
        text: Raw transcript. Values that are not strings (for example the
            float NaN pandas uses for an empty CSV cell, or None) and blank
            strings are treated as empty text.

    Returns:
        str: The kept lemmas joined by single spaces; "" when there is
        nothing to process.
    """
    # Guard first: calling .lower() on NaN/None would raise AttributeError.
    if not isinstance(text, str) or not text.strip():
        return ""
    doc = nlp(text.lower())
    return " ".join(t.lemma_ for t in doc if t.is_alpha and not t.is_stop)

# Store the preprocessed version of every transcript next to the raw text
df['clean_text'] = df['Transcripcion'].apply(preprocess)

# 80/20 train/test split; the fixed random_state keeps it repeatable.
# NOTE(review): the split is not stratified on the label — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)


In [None]:
def _binary_count_vectorizer(**kwargs):
    """Build a CountVectorizer with binary=True, forwarding all other keyword arguments."""
    return CountVectorizer(binary=True, **kwargs)


# Vectorizer factories keyed by the name shown in the results.
# Each value is a callable that accepts keyword arguments and returns a
# vectorizer instance.
vectorizers = {
    "TF-IDF": TfidfVectorizer,
    "Frecuencia": CountVectorizer,
    "Binaria": _binary_count_vectorizer,
}

# N-gram ranges as (min_n, max_n) pairs, keyed by the name shown in the results.
# NOTE(review): "uni+tri" is (1, 3); as an inclusive range that also covers
# bigrams, so the label is narrower than what is actually extracted.
ngram_ranges = dict([
    ("unigramas", (1, 1)),
    ("bigramas", (2, 2)),
    ("trigramas", (3, 3)),
    ("uni+bi", (1, 2)),
    ("uni+tri", (1, 3)),
])

# Models and their hyper-parameter grids.
# Each entry is (estimator, param_grid); the 'clf__' prefix addresses the
# pipeline step named 'clf'.
#
# The SVM grid is a list of two sub-grids, one per kernel: gamma has no effect
# on the linear kernel, so crossing it with kernel='linear' only produced
# duplicate fits. GridSearchCV accepts a list of dicts and explores each
# sub-grid separately. When the linear kernel wins, best_params_ therefore no
# longer carries a 'clf__gamma' entry.
models = {
    "NaiveBayes": (MultinomialNB(), {'clf__alpha': [0.1, 1.0, 10]}),
    "SVM": (
        SVC(),
        [
            {'clf__kernel': ['linear'], 'clf__C': [0.1, 1, 10]},
            {'clf__kernel': ['rbf'], 'clf__C': [0.1, 1, 10], 'clf__gamma': ['scale', 'auto']},
        ],
    ),
    "LogisticRegression": (LogisticRegression(max_iter=1000), {'clf__C': [0.1, 1, 10], 'clf__penalty': ['l2']})
}


In [None]:
from sklearn.metrics import classification_report

# Run every vectorizer x n-gram range x model combination.
# Each combination is tuned with 5-fold cross-validation on the training split
# (macro-F1), and the refit best pipeline is then scored on the test split.
results = []

# Class ids and display names are passed to the metrics explicitly so both
# classes are always reported. Without `labels`, classification_report raises
# when only one class appears, because two target_names were supplied.
class_labels = [0, 1]
class_names = ["No Estrés", "Estrés"]

for vec_name, vec_class in vectorizers.items():
    for ngram_name, ngram_range in ngram_ranges.items():
        for model_name, (clf, grid) in models.items():
            print(f"Probando {vec_name} + {ngram_name} + {model_name}")
            vec = vec_class(ngram_range=ngram_range)
            pipe = Pipeline([
                ('vec', vec),
                ('clf', clf)
            ])
            gs = GridSearchCV(pipe, param_grid=grid, cv=5, scoring='f1_macro', n_jobs=-1)
            gs.fit(X_train, y_train)

            y_pred = gs.best_estimator_.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            # zero_division=0 gives the same value as the default but without
            # a warning when a class is never predicted.
            f1 = f1_score(y_test, y_pred, labels=class_labels, average='macro', zero_division=0)

            results.append({
                'Vectorizador': vec_name,
                'N-gramas': ngram_name,
                'Modelo': model_name,
                'Accuracy': round(acc, 4),
                'F1_macro': round(f1, 4),
                # Mean cross-validated macro-F1 of the selected parameters.
                # Kept so configurations can be compared without relying on
                # the test split.
                'F1_macro_CV': round(gs.best_score_, 4),
                'Mejores parámetros': gs.best_params_
            })

            # Show the per-class report
            print("\nClassification Report:")
            print(classification_report(
                y_test,
                y_pred,
                labels=class_labels,
                target_names=class_names,
                zero_division=0,
            ))
            print("-" * 80)


Probando TF-IDF + unigramas + NaiveBayes

Classification Report:
              precision    recall  f1-score   support

   No Estrés       0.81      0.80      0.81        71
      Estrés       0.73      0.75      0.74        51

    accuracy                           0.78       122
   macro avg       0.77      0.77      0.77       122
weighted avg       0.78      0.78      0.78       122

--------------------------------------------------------------------------------
Probando TF-IDF + unigramas + SVM

Classification Report:
              precision    recall  f1-score   support

   No Estrés       0.84      0.80      0.82        71
      Estrés       0.74      0.78      0.76        51

    accuracy                           0.80       122
   macro avg       0.79      0.79      0.79       122
weighted avg       0.80      0.80      0.80       122

--------------------------------------------------------------------------------
Probando TF-IDF + unigramas + LogisticRegression

Classificat

In [None]:
# Collect the experiment records into a table ranked by test macro-F1
# (best first) with a fresh 0..n-1 index.
results_df = (
    pd.DataFrame(results)
    .sort_values(by='F1_macro', ascending=False)
    .reset_index(drop=True)
)
results_df.head(20)  # Show the top 20 rows


Unnamed: 0,Vectorizador,N-gramas,Modelo,Accuracy,F1_macro,Mejores parámetros
0,Frecuencia,unigramas,SVM,0.8525,0.8484,"{'clf__C': 1, 'clf__gamma': 'scale', 'clf__ker..."
1,Frecuencia,unigramas,LogisticRegression,0.8443,0.8412,"{'clf__C': 10, 'clf__penalty': 'l2'}"
2,Binaria,uni+bi,LogisticRegression,0.8361,0.8339,"{'clf__C': 10, 'clf__penalty': 'l2'}"
3,Binaria,unigramas,LogisticRegression,0.8361,0.8324,"{'clf__C': 10, 'clf__penalty': 'l2'}"
4,Binaria,unigramas,SVM,0.8279,0.8259,"{'clf__C': 0.1, 'clf__gamma': 'scale', 'clf__k..."
5,Binaria,uni+tri,SVM,0.8279,0.8259,"{'clf__C': 0.1, 'clf__gamma': 'scale', 'clf__k..."
6,Frecuencia,uni+bi,LogisticRegression,0.8279,0.8259,"{'clf__C': 10, 'clf__penalty': 'l2'}"
7,Binaria,uni+bi,SVM,0.8279,0.8259,"{'clf__C': 0.1, 'clf__gamma': 'scale', 'clf__k..."
8,Binaria,uni+bi,NaiveBayes,0.8197,0.8192,{'clf__alpha': 0.1}
9,Binaria,uni+tri,NaiveBayes,0.8197,0.8192,{'clf__alpha': 0.1}


In [None]:
# Save the ranked results to the working directory. The path is relative, like
# the input CSV, so the notebook also runs outside Colab, where /content does
# not exist. In Colab the default working directory is /content, so the file
# lands in the same place as before.
results_df.to_csv("resultados_experimentos.csv", index=False)
