# Los MiserAIbles
Sorany Hincapie Salazar  
Brayan Montoya Osorio

### Exploración de los datos

In [2]:
import pandas as pd

In [None]:
# Relative location of the semicolon-separated challenge dataset.
path_to_data = 'data/challenge_data-18-ago.csv'

# Read the CSV into a DataFrame and preview its first ten rows.
df = pd.read_csv(filepath_or_buffer=path_to_data, sep=';')
df.head(n=10)

In [4]:
# Report the dataset dimensions (row count first, then column count).
rows, columns = df.shape[0], df.shape[1]
print(f"Number of rows: {rows}, Number of columns: {columns}")

Number of rows: 3565, Number of columns: 3


In [5]:
 # Rebind df to a copy of itself before new columns are added to it.
 df = df.copy()

## Extracción de características con NLP

Pasos a seguir:
1. Preprocesamiento del texto

- Limpieza: eliminar caracteres especiales, números innecesarios, URLs, referencias
- Normalización: convertir a minúsculas, manejar acentos y caracteres especiales
- Tokenización: dividir el texto en palabras/tokens individuales
- Eliminación de stopwords: quitar palabras comunes sin valor semántico ("el", "la", "de", "and", "the")
- Stemming/Lemmatización: reducir palabras a su raíz o forma base

2.  Extracción de características textuales
Métodos tradicionales:

- Bag of Words (BoW): frecuencia de palabras
- TF-IDF: Term Frequency - Inverse Document Frequency
- N-gramas: combinaciones de 2-3 palabras consecutivas

3. Entrenamiento de modelos clásicos
4. Reducción de dimensionalidad
5. Re-entrenamiento


## Ejemplo de test con los pasos anteriores:

1. Tokenización:  
Dividir texto en palabras, oraciones o elementos pequeños. Ej: Convertir párrafo en lista de palabras.

In [6]:
import nltk

def tokenize_text(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``nltk.word_tokenize`` returns for ``text``.
    """
    tokens = nltk.word_tokenize(text)
    return tokens

In [None]:
# Use the abstract of the first row as the worked example for the steps below.
abstract_example = df['abstract'].iloc[0]
print(abstract_example)

In [None]:
# Tokenize the example abstract and show the resulting tokens.
tokens = tokenize_text(text=abstract_example)
print(tokens)

In [9]:
from nltk.tokenize import word_tokenize

2. Limpieza:
Eliminar caracteres especiales.

In [10]:
def clean_basic_tokens(tokens):
    """Keep only purely alphabetic tokens, lowercased.

    Tokens containing digits, punctuation or hyphens fail ``str.isalpha`` and
    are dropped entirely rather than being stripped of those characters.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list with the lowercase form of every alphabetic token, in the
        original order.
    """
    kept = []
    for raw in tokens:
        if not raw.isalpha():
            continue
        kept.append(raw.lower())
    return kept

In [None]:
# Apply the basic cleaning step to the example tokens and display the result.
cleaned_tokens = clean_basic_tokens(tokens=tokens)
print(cleaned_tokens)

3. Stemming:  
Llevar palabras a su forma raíz.

In [12]:
from nltk.stem import PorterStemmer

# Single Porter stemmer instance shared by every call to stem_words.
stemmer = PorterStemmer()


def stem_words(words):
    """Stem each word with the shared Porter stemmer.

    Args:
        words: Iterable of string tokens.

    Returns:
        A list holding ``stemmer.stem(word)`` for each input word, in order.
    """
    stems = []
    for word in words:
        stems.append(stemmer.stem(word))
    return stems

In [None]:
# Stem the cleaned example tokens and display the result.
stemmed_tokens = stem_words(words=cleaned_tokens)
print(stemmed_tokens)


4. Eliminar StopWords

In [14]:
# Filler words that are frequent in scientific abstracts; removed in addition
# to NLTK's English stopwords.
_ACADEMIC_STOPWORDS = frozenset({
    'abstract', 'paper', 'study', 'research', 'article', 'journal',
    'analysis', 'method', 'approach', 'technique', 'result', 'conclusion',
    'introduction', 'discussion', 'experimental', 'theoretical',
    'also', 'however', 'therefore', 'furthermore', 'moreover'
})

# Combined stopword set, built on first use and reused afterwards.
_STOPWORD_CACHE = None


def _load_stopwords():
    """Return NLTK's English stopwords plus the academic list, built only once."""
    global _STOPWORD_CACHE
    if _STOPWORD_CACHE is None:
        # Imported lazily so defining this function does not require the corpus.
        from nltk.corpus import stopwords
        _STOPWORD_CACHE = frozenset(stopwords.words('english')) | _ACADEMIC_STOPWORDS
    return _STOPWORD_CACHE


def remove_stopwords(tokens):
    """Drop English and academic stopwords from a token list.

    Matching is an exact, case-sensitive membership test. The academic list
    holds lowercase, unstemmed words, so tokens should be lowercased and not
    yet stemmed for those entries to match.

    The stopword set is cached after the first call; previously the NLTK
    corpus was re-read and the set rebuilt for every document.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list with every token that is not a stopword, in original order.
    """
    blocked = _load_stopwords()
    return [token for token in tokens if token not in blocked]


In [None]:
# Run stopword removal on the stemmed example tokens and display what remains.
filtered_tokens = remove_stopwords(tokens=stemmed_tokens)
print(filtered_tokens)

### Procesamiento de dataset:

In [16]:
def preprocessNPL(text):
    """Turn raw text into a list of cleaned, stopword-free, stemmed tokens.

    Pipeline: tokenize -> keep lowercase alphabetic tokens -> remove
    stopwords -> stem.

    Stopwords are removed BEFORE stemming. The stopword lists contain
    unstemmed words, and Porter stemming rewrites many of them (for example
    'study' becomes 'studi'), so filtering after stemming lets those words
    slip through.

    Args:
        text: The raw string to process.

    Returns:
        List of stemmed tokens with stopwords removed.
    """
    tokens = tokenize_text(text)
    clean_tokens = clean_basic_tokens(tokens)
    filtered_tokens = remove_stopwords(clean_tokens)
    return stem_words(filtered_tokens)


# Run the preprocessing pipeline over both text columns, storing each result
# in its own token-list column (abstract first, then title).
for token_col, text_col in (('tokens_abstract', 'abstract'), ('tokens_title', 'title')):
    df[token_col] = df[text_col].apply(preprocessNPL)

In [None]:
# Preview the frame, now including the token columns.
df.head(n=10)

In [140]:
# Separate copy that will hold the token lists joined back into strings.
dfstr = df.copy(deep=True)

In [141]:
# Join each token list into one space-separated string, the input format the
# TF-IDF vectorizers are given below.
for token_col in ('tokens_abstract', 'tokens_title'):
    dfstr[token_col] = df[token_col].apply(" ".join)

In [None]:
# Preview the string-joined token columns.
dfstr.head(n=10)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

import numpy as np
import lightgbm as lgb
from deap import base, creator, tools, algorithms
import random
import xgboost as xgb


In [276]:
# TF-IDF vectorizer for the abstract tokens: turns the token strings into a
# numeric sparse matrix, capped at 15000 vocabulary terms.
tfidf_abstract = TfidfVectorizer(max_features=15000)
X_abstract = tfidf_abstract.fit_transform(dfstr['tokens_abstract'])

# Same for the title tokens, with a smaller (adjustable) vocabulary cap.
tfidf_title = TfidfVectorizer(max_features=5000)
X_title = tfidf_title.fit_transform(dfstr['tokens_title'])

In [277]:
# Total TF-IDF weight of each abstract term across all documents.
token_mass = np.asarray(X_abstract.sum(axis=0)).ravel()

# Keep the (up to) 8000 terms with the largest total weight.
top_tokens = np.argsort(token_mass)[-8000:]
Xcarac = X_abstract[:, top_tokens]

# Column names for Xcarac: position k in this array names column k of Xcarac.
# They are passed to LightGBM as feature names during the genetic search.
feature_names = np.array(tfidf_abstract.get_feature_names_out())[top_tokens]

y = df['group']  # labels

In [278]:
# Number of candidate features, i.e. the genome length for the genetic search.
n_features = Xcarac.shape[-1]

In [279]:
# Hold out 20% of the rows, stratified on the group label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    Xcarac,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [280]:
def fitness_function(individual):
    """Score a binary feature mask by the accuracy of a LightGBM model.

    Reads the module-level ``X_train``, ``X_test``, ``y_train``, ``y_test``
    and ``feature_names``.

    NOTE(review): accuracy is measured on ``X_test``/``y_test``; if that same
    split is also used for final reporting, the selected features are tuned
    on it. Confirm whether a separate validation split is wanted.

    Args:
        individual: Sequence of 0/1 values, one per column of the training
            matrix; 1 marks a column to keep.

    Returns:
        One-element tuple with the accuracy (DEAP expects a tuple), or
        ``(0.0,)`` when no column is selected.
    """
    selected_indices = []
    for position, bit in enumerate(individual):
        if bit == 1:
            selected_indices.append(position)

    # A model cannot be trained on zero features: give the worst score.
    if not selected_indices:
        return (0.0,)

    selected_names = [feature_names[i] for i in selected_indices]

    model_params = dict(
        n_estimators=50,
        max_depth=5,
        min_child_samples=5,
        min_gain_to_split=0.0,
        n_jobs=-1,
    )
    clf = lgb.LGBMClassifier(**model_params)

    # Column-slice the sparse matrices: same rows, only the chosen features.
    # LightGBM accepts sparse input directly.
    clf.fit(X_train[:, selected_indices], y_train, feature_name=selected_names)

    return (clf.score(X_test[:, selected_indices], y_test),)

In [None]:
# DEAP setup: single-objective maximisation over binary feature masks.
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()

# Genome construction: each gene is a random 0/1, one gene per candidate feature.
toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=n_features)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Genetic operators and the fitness evaluator.
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("evaluate", fitness_function)

In [None]:
pop_size = 12
n_gen = 6

# Seed Python's RNG, which DEAP's initialisation and operators draw from, so
# the search starts from the same population on every run.
random.seed(42)

population = toolbox.population(n=pop_size)

# Without elitism the best mask can be lost between generations; the hall of
# fame keeps the best individual seen at any point in the run.
hall_of_fame = tools.HallOfFame(1)

population, logbook = algorithms.eaSimple(
    population,
    toolbox,
    cxpb=0.5,
    mutpb=0.2,
    ngen=n_gen,
    halloffame=hall_of_fame,
    verbose=True
)

# Best individual over the whole run (not only the final generation).
best_ind = hall_of_fame[0]
# Tokens whose bit is set in the best mask.
selected_features_final = [feature_names[i] for i, bit in enumerate(best_ind) if bit == 1]

print("Número de tokens seleccionados:", len(selected_features_final))
print("Tokens seleccionados:", selected_features_final)

In [None]:
# Sanity check: every token selected by the genetic search should exist in the
# abstract TF-IDF vocabulary. Uses the names actually defined in this notebook
# (tfidf_abstract, selected_features_final); the previous names were undefined.
vocab = set(tfidf_abstract.get_feature_names_out())
valid_tokens = [t for t in selected_features_final if t in vocab]
print(len(valid_tokens))
print(len(selected_features_final))

In [None]:
# Map every token in the abstract TF-IDF vocabulary to its column index.
token_to_index = {t: i for i, t in enumerate(tfidf_abstract.get_feature_names_out())}

# Column indices of the GA-selected tokens; tokens missing from the vocabulary
# are skipped rather than raising KeyError.
selected_indices = [token_to_index[t] for t in selected_features_final if t in token_to_index]

X_final = X_abstract[:, selected_indices]
print("Tamaño de la matriz final:", X_final.shape)

In [285]:
from sklearn.preprocessing import OneHotEncoder
original_columns = ['title', 'abstract']

# One-hot encode the raw text columns: each distinct title/abstract string
# becomes its own indicator column.
# NOTE(review): free-text values are likely near-unique per row, so these
# columns may add little beyond a row identifier. Confirm this is intended.
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(dfstr[original_columns])
X_original_sparse = encoder.transform(dfstr[original_columns])

In [286]:
from scipy.sparse import hstack

# Place the selected TF-IDF columns and the one-hot columns side by side.
X_total = hstack((X_final, X_original_sparse))

In [287]:
# Imported here because this cell runs before the later cell that imports it;
# without it a fresh-kernel run fails with NameError.
from sklearn.preprocessing import MultiLabelBinarizer

# Split the '|'-separated group string into a list of labels per row.
y = dfstr['group'].str.split('|')
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(y)  # each column is now one category (0/1 indicator)

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, hamming_loss
# Classification with the GA-selected tokens plus the one-hot title/abstract columns.
X_ = X_total
y_ = Y
X_train, X_test, y_train, y_test = train_test_split(
    X_, y_, test_size=0.2, random_state=42, stratify=y
)

# One LightGBM binary classifier per label.
clf = OneVsRestClassifier(
    lgb.LGBMClassifier(n_estimators=100, max_depth=7, n_jobs=-1)
)
clf.fit(X_train, y_train)
y_predlg = clf.predict(X_test)

# Multilabel metrics, computed on the predictions made above (y_predlg);
# the previously referenced name y_pred was never defined.
f1 = f1_score(y_test, y_predlg, average='micro')
hamming = hamming_loss(y_test, y_predlg)

In [None]:
# Per-label precision/recall/F1 for the LightGBM one-vs-rest model.
lgbm_report = classification_report(y_test, y_predlg)
print(lgbm_report)

Clasificador XGBoost

In [290]:
from sklearn.preprocessing import MultiLabelBinarizer

# Split each '|'-separated group string into its individual labels.
y_multilabel = [labels.split('|') for labels in dfstr['group']]

# Turn each distinct label into its own 0/1 column.
mlb = MultiLabelBinarizer()
mlb.fit(y_multilabel)
Y = mlb.transform(y_multilabel)

In [306]:
# Use the same test_size, random_state AND stratify argument as the LightGBM
# split so both splits select the same rows. This cell overwrites
# y_train/y_test, and the later ensemble combines predictions on x_test and
# X_test element-wise, so the two test sets must contain the same samples in
# the same order. An unstratified split here picks different rows.
x_train, x_test, y_train, y_test = train_test_split(
    Xcarac, Y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# One binary XGBoost classifier per label; both names refer to the same model.
xgb_multi = OneVsRestClassifier(
    xgb.XGBClassifier(objective='binary:logistic', n_estimators=10, seed=123)
)
clf_xgb = xgb_multi

clf_xgb.fit(x_train, y_train)
preds_xgb = clf_xgb.predict(x_test)

xgb_report = classification_report(y_test, preds_xgb)
print(xgb_report)

In [None]:
from sklearn.ensemble import VotingClassifier

# Per-label probabilities from each model.
# NOTE(review): the element-wise combination below is only meaningful if
# x_test and X_test contain the same rows in the same order, and y_test
# matches both. Verify against the two train_test_split calls.
probs_xgb = clf_xgb.predict_proba(x_test)
probs_lgb = clf.predict_proba(X_test)

# Micro-F1 of each model on the test set, used as its voting weight.
preds_for_weight_xgb = clf_xgb.predict(x_test)
preds_for_weight_lgb = clf.predict(X_test)
f1_xgb = f1_score(y_test, preds_for_weight_xgb, average='micro')
f1_lgb = f1_score(y_test, preds_for_weight_lgb, average='micro')

# Normalise the weights so they sum to 1.
weights = np.array([f1_xgb, f1_lgb])
weights /= weights.sum()

# Weighted soft vote; a label is predicted when the combined probability
# reaches the 0.7 threshold.
combined_probs = weights[0] * probs_xgb + weights[1] * probs_lgb
y_predcom = (combined_probs >= 0.7).astype(int)

# Multilabel metric for the combined prediction.
f1_combined = f1_score(y_test, y_predcom, average='micro')
print("F1 micro combinado:", f1_combined)

In [None]:
# Per-label metrics of the combined prediction as a table (one row per label
# or average).
report = classification_report(y_test, y_predcom, output_dict=True)
df_report = pd.DataFrame(report).T
print(df_report)

# Hamming loss: fraction of individual label assignments that are wrong.
hamming = hamming_loss(y_test, y_predcom)
print("-----------------------------")
print("Hamming loss:", hamming)