In [1]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import datetime
import pandas as pd
from catboost import Pool
from collections import Counter


In [2]:
# Cargar los datos
df = pd.read_csv("../../data/processed/train_preprocess_v1.csv")
df.shape

# Creamos un df_train sobre el que hacer las transformaciones
train_df = df.copy()

In [3]:
df.head(2)

Unnamed: 0,id,label,statement,subject,speaker,speaker_job,state_info,party_affiliation,party_affiliation_uni,party_affiliation_category_map,...,pos_info_without_stopwords,pos_freq_without_stopwords,lemma_freq_without_stopwords,tag_freq_without_stopwords,processed_subject,speaker_entities,speaker_type,speaker_job_tokens,state_info_tokens,party_affiliation_tokens
0,81f884c64a7,1,china is in the south china sea and (building)...,"china,foreign-policy,military",donald-trump,president-elect,new_york,republican,republican,political-affiliation,...,"[{'lemma': 'china', 'pos': 'PROPN', 'tag': 'NN...","Counter({'PROPN': 4, 'NOUN': 4, 'ADJ': 1, 'VER...","Counter({'china': 2, 'south': 1, 'sea': 1, 'bu...","Counter({'NNP': 4, 'NN': 3, 'JJ': 1, 'NNS': 1,...","['china', 'foreign-policy', 'military']",['donald trump'],['PERSON'],"['president', '-', 'elect']",['new_york'],['republican']
1,30c2723a188,0,with the resources it takes to execute just ov...,health-care,chris-dodd,u.s. senator,connecticut,democrat,democrat,political-affiliation,...,"[{'lemma': 'resource', 'pos': 'NOUN', 'tag': '...","Counter({'NOUN': 7, 'VERB': 4, 'PROPN': 2, 'AD...","Counter({'resource': 1, 'take': 1, 'execute': ...","Counter({'NN': 4, 'NNS': 3, 'VB': 2, 'NNP': 2,...",['health-care'],['chris dodd'],['PERSON'],"['u.s', '.', 'senator']",['connecticut'],['democrat']


## Selección de categorías

A continuación se describen brevemente las características útiles para el modelo:

### id

La variable `id` carece de poder descriptivo. Se añade a las columnas a eliminar.

In [4]:
drop_cols = ["id"]

### Statement 

La variable `statement` resulta muy útil para el modelo. En este caso, se declarará como variable de tipo texto para que CatBoost la identifique como tal y le aplique el procesamiento nativo adecuado.

In [5]:
text_features = ["statement"]

### Variables derivadas del procesado de Statement

En relación con `statement`, tenemos `statement_tokens`, que no es procesable nativamente para CatBoost. Como ya se va a representar el efecto del statement con la variable original, se elimina del modelo. Su variante sin stopwords, también se quita, y la lista de stopwords también.

In [6]:
drop_cols += ["statement_tokens", "statement_tokens_without_stopwords", "stopwords"]
drop_cols

['id', 'statement_tokens', 'statement_tokens_without_stopwords', 'stopwords']

Se podría evaluar su rendimiento en sustitución de `statement` procesada por CatBoost. Se verá más adelante. Gracias al procesamiento nativo de CatBoost de variables de tipo texto, obtiene de forma natural estadísticas derivadas de la tokenización, por lo que eliminaremos estas variables y nos quedaremos con las que tienen significado en términos de análisis linguístico. 

También están los contadores de tokens, que se dejarán en una primera iteración, pero teniendo presente el procesamiento nativo de CatBoost.

In [7]:
train_df["num_stopwords_tokens"] = train_df["num_tokens"] - train_df["num_tokens_without_stopwords"]
drop_cols += ["num_tokens_without_stopwords"]
drop_cols

['id',
 'statement_tokens',
 'statement_tokens_without_stopwords',
 'stopwords',
 'num_tokens_without_stopwords']

Se observa `num_sentences` porque a pesar de que CatBoost procesa tokens, no sabemos si obtiene información sobre el número de frases.

In [8]:
df["num_sentences"].describe()

count    8950.000000
mean        1.136983
std         0.469908
min         1.000000
25%         1.000000
50%         1.000000
75%         1.000000
max        14.000000
Name: num_sentences, dtype: float64

Debido a que incluso por debajo del percentil 75, la mayoría de los valores es 1, se decide eliminarla del modelo.

In [9]:
drop_cols += ["num_sentences"]
drop_cols

['id',
 'statement_tokens',
 'statement_tokens_without_stopwords',
 'stopwords',
 'num_tokens_without_stopwords',
 'num_sentences']

#### POS

Tenemos `pos_info` y `pos_freq`, con sus variantes `without_stopwords`. 

In [10]:
df["pos_info"].head(2)

0    [{'lemma': 'china', 'pos': 'PROPN', 'tag': 'NN...
1    [{'lemma': 'with', 'pos': 'ADP', 'tag': 'IN', ...
Name: pos_info, dtype: object

In [11]:
df["pos_freq"].head(2)

0    Counter({'PROPN': 4, 'NOUN': 4, 'DET': 3, 'AUX...
1    Counter({'NOUN': 7, 'ADP': 5, 'VERB': 4, 'DET'...
Name: pos_freq, dtype: object

`pos_info`contiene la clasificación de palabras del statement, y `pos_freq`el número de palabras de cada tipo. Se va a tratar de representar el estilo de lenguaje a través de métricas obtenidas de estas variables: 

In [12]:
import ast
from collections import Counter

# Convertir a diccionario limpio
def clean_counter(x):
    if isinstance(x, str) and x.startswith("Counter("):
        try:
            return ast.literal_eval(x.replace("Counter(", "").rstrip(")"))
        except:
            return {}
    elif isinstance(x, Counter):
        return dict(x)
    elif isinstance(x, dict):
        return x
    else:
        return {}
        

In [13]:
print(train_df.shape)
# Aplicar la conversión
train_df["pos_freq_clean"] = train_df["pos_freq"].apply(clean_counter)
train_df["pos_freq_without_stopwords_clean"] = train_df["pos_freq_without_stopwords"].apply(clean_counter)

# Expandir en columnas
pos_df = train_df["pos_freq_clean"].apply(pd.Series).fillna(0).astype(int)
pos_df_wo_stopwords = train_df["pos_freq_without_stopwords_clean"].apply(pd.Series).fillna(0).astype(int)

# Renombrar columnas
pos_df.columns = [f"pos_count_{col.upper()}" for col in pos_df.columns]
pos_df_wo_stopwords.columns = [f"pos_{col.upper()}" for col in pos_df_wo_stopwords.columns]

pos_df.head(5)

(8950, 32)


Unnamed: 0,pos_count_PROPN,pos_count_NOUN,pos_count_DET,pos_count_AUX,pos_count_ADP,pos_count_PUNCT,pos_count_CCONJ,pos_count_ADJ,pos_count_PRON,pos_count_ADV,pos_count_PART,pos_count_VERB,pos_count_NUM,pos_count_SCONJ,pos_count_SYM,pos_count_X,pos_count_INTJ,pos_count_SPACE
0,4,4,3,2,2,2,1,1,1,1,1,1,0,0,0,0,0,0
1,2,7,3,1,5,2,0,1,3,2,1,4,1,0,0,0,0,0
2,1,4,1,1,1,3,0,0,0,0,0,1,0,0,0,0,0,0
3,0,10,3,2,3,1,0,1,5,1,0,2,0,0,0,0,0,0
4,0,7,0,1,4,4,0,3,0,0,1,4,0,0,0,0,0,0


In [14]:
pos_df_wo_stopwords.head(5)

Unnamed: 0,pos_PROPN,pos_NOUN,pos_ADJ,pos_VERB,pos_ADV,pos_NUM,pos_AUX,pos_SCONJ,pos_SYM,pos_PART,pos_PRON,pos_X,pos_ADP,pos_INTJ,pos_DET,pos_CCONJ,pos_PUNCT,pos_SPACE
0,4,4,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,7,1,4,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,9,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,7,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Puesto que contienen información muy similar y la cantidad de stopwords se puede obtener a través de stopwords, se decide mantener únicamente una de ellas, la versión sin `stopwords`, por considerar que introducen menor ruido al modelo. Comprobamos los resultados de la transformación, verificando: 
- Número de columnas anteriores y actuales

In [15]:
# Comprobamos los resultados
print(train_df.shape)
print(pos_df_wo_stopwords.shape)

(8950, 34)
(8950, 18)


Antes de nada, quitamos las versiones clean creadas anteriormente, para que el dataset vuelva a tener 31 columnas

In [16]:
aux_drop_cols = ["pos_freq_clean", "pos_freq_without_stopwords_clean"]
train_df = train_df.drop(columns=aux_drop_cols)

In [17]:
train_df = pd.concat([train_df, pos_df_wo_stopwords], axis=1)
print(train_df.shape)
train_df.head(1)

(8950, 50)


Unnamed: 0,id,label,statement,subject,speaker,speaker_job,state_info,party_affiliation,party_affiliation_uni,party_affiliation_category_map,...,pos_SYM,pos_PART,pos_PRON,pos_X,pos_ADP,pos_INTJ,pos_DET,pos_CCONJ,pos_PUNCT,pos_SPACE
0,81f884c64a7,1,china is in the south china sea and (building)...,"china,foreign-policy,military",donald-trump,president-elect,new_york,republican,republican,political-affiliation,...,0,0,0,0,0,0,0,0,0,0


- Resultados: 

In [18]:
print(df["pos_freq_without_stopwords"].iloc[1])
pos_df_wo_stopwords.head(2)

Counter({'NOUN': 7, 'VERB': 4, 'PROPN': 2, 'ADV': 1, 'ADJ': 1})


Unnamed: 0,pos_PROPN,pos_NOUN,pos_ADJ,pos_VERB,pos_ADV,pos_NUM,pos_AUX,pos_SCONJ,pos_SYM,pos_PART,pos_PRON,pos_X,pos_ADP,pos_INTJ,pos_DET,pos_CCONJ,pos_PUNCT,pos_SPACE
0,4,4,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,7,1,4,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [19]:
pos_df_wo_stopwords.sum()

pos_PROPN    15079
pos_NOUN     41363
pos_ADJ       9244
pos_VERB     18197
pos_ADV       1912
pos_NUM       2244
pos_AUX        361
pos_SCONJ       36
pos_SYM       1117
pos_PART       463
pos_PRON        66
pos_X           45
pos_ADP        294
pos_INTJ        32
pos_DET         57
pos_CCONJ       16
pos_PUNCT        7
pos_SPACE      291
dtype: int64

Se observa su frecuencia por si fuera interesante reducir cardinalidad.

Eliminamos las columnas de POS:

In [20]:
drop_cols += ["pos_info", "pos_freq", "pos_freq_without_stopwords","pos_info_without_stopwords"]
drop_cols

['id',
 'statement_tokens',
 'statement_tokens_without_stopwords',
 'stopwords',
 'num_tokens_without_stopwords',
 'num_sentences',
 'pos_info',
 'pos_freq',
 'pos_freq_without_stopwords',
 'pos_info_without_stopwords']

#### Lemma y Tag

In [21]:
# Aplicar a ambas columnas
train_df["lemma_freq_clean"] = df["lemma_freq"].apply(clean_counter)
train_df["tag_freq_clean"] = df["tag_freq"].apply(clean_counter)

# Expandir lemas
lemma_df = train_df["lemma_freq_clean"].apply(pd.Series).fillna(0).astype(int)
lemma_df.columns = [f"lemma_count_{col}" for col in lemma_df.columns]

# Expandir tags
tag_df = train_df["tag_freq_clean"].apply(pd.Series).fillna(0).astype(int)
tag_df.columns = [f"tag_count_{col}" for col in tag_df.columns]

In [22]:
drop_aux_cols = ["lemma_freq_clean","tag_freq_clean"]
train_df = train_df.drop(columns=drop_aux_cols)

In [23]:
print(train_df.shape)
print(lemma_df.shape)
print(tag_df.shape)

(8950, 50)
(8950, 9357)
(8950, 50)


`lemma_freq` incluye palabras que carecen de importancia para el modelo, por lo que, debido a su cardinalidad, finalmente no se incluye. Se elimina del dataset. 

Por otro lado, `tag` incluye una clasificación de tipología de palabras más específica que POS. Se evalúa sin stopwords por si reduce significativamente la cardinalidad y resulta relevante para el modelo.

In [24]:
drop_cols += ["lemma_freq","tag_freq"]
drop_cols

['id',
 'statement_tokens',
 'statement_tokens_without_stopwords',
 'stopwords',
 'num_tokens_without_stopwords',
 'num_sentences',
 'pos_info',
 'pos_freq',
 'pos_freq_without_stopwords',
 'pos_info_without_stopwords',
 'lemma_freq',
 'tag_freq']

In [25]:
print(train_df.shape)

(8950, 50)


Dado que en lemma se incluyen `stopwords`, se observa la diferencia entre ambas para ver si lemma_freq_without_stopwords aportaría información

In [26]:
df["lemma_freq"]

0       Counter({'the': 3, 'china': 2, 'be': 1, 'in': ...
1       Counter({'the': 3, 'of': 2, 'with': 1, 'resour...
2       Counter({'the': 1, '(': 1, 'wisconsin': 1, ')'...
3       Counter({'be': 2, 'a': 2, 'say': 1, 'her': 1, ...
4       Counter({'at': 1, 'protest': 1, 'in': 1, 'wisc...
                              ...                        
8945    Counter({'do': 2, 'the': 2, 'if': 1, 'rhode': ...
8946    Counter({'health': 2, 'care': 2, 'the': 1, 'ne...
8947    Counter({'the': 2, 'health': 1, 'insurance': 1...
8948    Counter({'a': 3, '.': 2, 'no': 1, 'one': 1, 'i...
8949    Counter({'say': 1, 'the': 1, 'army': 1, 'be': ...
Name: lemma_freq, Length: 8950, dtype: object

In [27]:
df["lemma_freq_without_stopwords"]

0       Counter({'china': 2, 'south': 1, 'sea': 1, 'bu...
1       Counter({'resource': 1, 'take': 1, 'execute': ...
2       Counter({'wisconsin': 1, 'governor': 1, 'propo...
3       Counter({'say': 1, 'representation': 1, 'ex': ...
4       Counter({'protest': 1, 'wisconsin': 1, 'propos...
                              ...                        
8945    Counter({'rhode': 1, 'island': 1, 'hybrid': 1,...
8946    Counter({'health': 2, 'care': 2, 'new': 1, 'la...
8947    Counter({'health': 1, 'insurance': 1, 'plan': ...
8948    Counter({'american': 1, 'history': 1, 'move': ...
8949    Counter({'say': 1, 'army': 1, 'spend': 1, '$':...
Name: lemma_freq_without_stopwords, Length: 8950, dtype: object

Puede ser interesante, dependiendo de cuánto se reduzca la cardinalidad. Evaluamos: 

In [28]:
# Aplicar a ambas columnas
#train_df["lemma_freq_without_stopwords"] = df["lemma_freq_without_stopwords"].apply(clean_counter)
train_df["tag_freq_without_stopwords"] = df["tag_freq_without_stopwords"].apply(clean_counter)

# Expandir lemas
#lemma_df = train_df["lemma_freq_without_stopwords"].apply(pd.Series).fillna(0).astype(int)
#lemma_df.columns = [f"lemma_count_{col}" for col in lemma_df.columns]

# Expandir tags
tag_df = train_df["tag_freq_without_stopwords"].apply(pd.Series).fillna(0).astype(int)
tag_df.columns = [f"tag_count_{col}" for col in tag_df.columns]

In [29]:
train_df.shape

(8950, 50)

In [30]:
tag_df.sum()

tag_count_NNP     14526
tag_count_NN      29119
tag_count_JJ       8380
tag_count_NNS     12294
tag_count_VBN      3625
tag_count_VB       3465
tag_count_VBZ      3798
tag_count_RB       2297
tag_count_VBD      3667
tag_count_VBG      2685
tag_count_JJS       507
tag_count_CD       2244
tag_count_VBP      1272
tag_count_MD         43
tag_count_IN        318
tag_count_JJR       356
tag_count_$        1116
tag_count_RBS        27
tag_count_NNPS      555
tag_count_FW         21
tag_count_UH         32
tag_count_ADD         7
tag_count_RBR        50
tag_count_PDT        51
tag_count_RP         12
tag_count_DT         11
tag_count_LS          7
tag_count_XX         10
tag_count_CC         16
tag_count_WDT         2
tag_count_SYM         1
tag_count_,           2
tag_count_PRP$        2
tag_count_:           2
tag_count__SP       291
tag_count_``          2
tag_count_TO          1
tag_count_PRP         8
tag_count_AFX         1
tag_count_NFP         1
dtype: int64

Se reduce, pero no suficiente. No se añade. En el caso de `tag_freq_without_stopwords`, se evalua filtrar por frecuencia o por media. Tras hacer el estudio por media, se ve que no es interesante por lo que finalmente se filtra por frecuencia. 

In [31]:
tag_cols = [col for col in tag_df.columns if col.startswith("tag_count_")]

# Frecuencia total
tag_counts = tag_df[tag_cols].sum().sort_values(ascending=False)

# Umbral
threshold = 1000
tag_to_keep = tag_counts[tag_counts >= threshold].index.tolist()
tag_to_drop = tag_counts[tag_counts < threshold].index.tolist()

# Columna para los que no superan el umbral
tag_df["tag_OTHER"] = tag_df[tag_to_drop].any(axis=1).astype(int)

# Eliminar los tags raros
tag_df = tag_df.drop(columns=tag_to_drop, errors="ignore")
train_df = pd.concat([train_df, tag_df], axis=1)

In [32]:
print(tag_df.shape)
train_df.shape

(8950, 14)


(8950, 64)

In [33]:
train_df.head(2)

Unnamed: 0,id,label,statement,subject,speaker,speaker_job,state_info,party_affiliation,party_affiliation_uni,party_affiliation_category_map,...,tag_count_VBN,tag_count_VB,tag_count_VBZ,tag_count_RB,tag_count_VBD,tag_count_VBG,tag_count_CD,tag_count_VBP,tag_count_$,tag_OTHER
0,81f884c64a7,1,china is in the south china sea and (building)...,"china,foreign-policy,military",donald-trump,president-elect,new_york,republican,republican,political-affiliation,...,1,0,0,0,0,0,0,0,0,0
1,30c2723a188,0,with the resources it takes to execute just ov...,health-care,chris-dodd,u.s. senator,connecticut,democrat,democrat,political-affiliation,...,0,2,1,1,1,0,0,0,0,0


In [34]:
drop_cols += ["lemma_freq_without_stopwords","tag_freq_without_stopwords"]

### Subject

Para reflejar el efecto de subject, vamos a utilizar encoding sobre `processed_subject`, su versión tokenizada. Por tanto, descartamos `subject` y obtendremos dummies:

In [35]:
drop_cols += ["subject"]
drop_cols

['id',
 'statement_tokens',
 'statement_tokens_without_stopwords',
 'stopwords',
 'num_tokens_without_stopwords',
 'num_sentences',
 'pos_info',
 'pos_freq',
 'pos_freq_without_stopwords',
 'pos_info_without_stopwords',
 'lemma_freq',
 'tag_freq',
 'lemma_freq_without_stopwords',
 'tag_freq_without_stopwords',
 'subject']

In [36]:
import ast

# Convertir strings tipo "[tax, healthcare]" en listas reales
train_df["processed_subject"] = df["processed_subject"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else []
)

# Aplanar la lista de subjects
all_subjects = [item for sublist in train_df["processed_subject"] for item in sublist]

In [37]:
# Seleccionar los más comunes
subject_counts = Counter(all_subjects)
top_n = 100
top_subjects = [s for s, count in subject_counts.most_common(top_n)]
top_subjects

['economy',
 'health-care',
 'taxes',
 'federal-budget',
 'education',
 'jobs',
 'state-budget',
 'candidates-biography',
 'elections',
 'immigration',
 'foreign-policy',
 'crime',
 'history',
 'energy',
 'legal-issues',
 'environment',
 'guns',
 'military',
 'job-accomplishments',
 'workers',
 'terrorism',
 'abortion',
 'message-machine-2012',
 'transportation',
 'criminal-justice',
 'state-finances',
 'states',
 'public-health',
 'deficit',
 'pundits',
 'congress',
 'women',
 'message-machine',
 'corrections-and-updates',
 'children',
 'campaign-finance',
 'voting-record',
 'medicare',
 'stimulus',
 'labor',
 'income',
 'poverty',
 'government-regulation',
 'ethics',
 'religion',
 'public-safety',
 'polls',
 'government-efficiency',
 'corporations',
 'iraq',
 'social-security',
 'climate-change',
 'market-regulation',
 'city-government',
 'debt',
 'families',
 'homeland-security',
 'gays-and-lesbians',
 'financial-regulation',
 'civil-rights',
 'abc-news-week',
 'drugs',
 'trade',
 '

In [38]:
# Inicializar a 0 las columnas dummies
for subj in top_subjects:
    train_df[f"subject_{subj}"] = 0

# Asignar 1 si el subject aparece en esa fila
def add_subject_dummies(row):
    for subj in row["processed_subject"]:
        if subj in top_subjects:
            row[f"subject_{subj}"] = 1
    return row

train_df = train_df.apply(add_subject_dummies, axis=1)

  train_df[f"subject_{subj}"] = 0
  train_df[f"subject_{subj}"] = 0
  train_df[f"subject_{subj}"] = 0
  train_df[f"subject_{subj}"] = 0
  train_df[f"subject_{subj}"] = 0


In [39]:
train_df

Unnamed: 0,id,label,statement,subject,speaker,speaker_job,state_info,party_affiliation,party_affiliation_uni,party_affiliation_category_map,...,subject_oil-spill,subject_afghanistan,subject_wealth,subject_china,subject_florida,subject_sexuality,subject_water,subject_population,subject_animals,subject_pensions
0,81f884c64a7,1,china is in the south china sea and (building)...,"china,foreign-policy,military",donald-trump,president-elect,new_york,republican,republican,political-affiliation,...,0,0,0,1,0,0,0,0,0,0
1,30c2723a188,0,with the resources it takes to execute just ov...,health-care,chris-dodd,u.s. senator,connecticut,democrat,democrat,political-affiliation,...,0,0,0,0,0,0,0,0,0,0
2,6936b216e5d,0,the (wisconsin) governor has proposed tax give...,"corporations,pundits,taxes,abc-news-week",donna-brazile,political commentator,washington_dc,democrat,democrat,political-affiliation,...,0,0,0,0,0,0,0,0,0,0
3,b5cd9195738,1,says her representation of an ex-boyfriend who...,"candidates-biography,children,ethics,families,...",rebecca-bradley,non-define,non-define,none,none,other-political-groups,...,0,0,0,0,0,0,0,0,0,0
4,84f8dac7737,0,at protests in wisconsin against proposed coll...,"health-care,labor,state-budget",republican-party-wisconsin,non-define,wisconsin,republican,republican,political-affiliation,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8945,44edff2b865,1,if rhode island does a hybrid [retirement] pla...,"pensions,public-service,retirement,workers",lincoln-chafee,non-define,rhode_island,democrat,democrat,political-affiliation,...,0,0,0,0,0,0,0,0,0,1
8946,4a63b5f9c16,1,the new health care law will force seniors int...,"medicare,message-machine,retirement",dan-coats,non-define,indiana,republican,republican,political-affiliation,...,0,0,0,0,0,0,0,0,0,0
8947,7c57fa8e81c,0,the health insurance plan that (members of con...,health-care,steve-southerland,"u.s. representative, florida district 2",florida,republican,republican,political-affiliation,...,0,0,0,0,0,0,0,0,0,0
8948,2375e3cf4b7,1,no one in american history has moved from a ju...,"elections,history",newt-gingrich,"co-host on cnn's ""crossfire""",georgia,republican,republican,political-affiliation,...,0,0,0,0,0,0,0,0,0,0


Ahora los datos de entrenamiento contienen los temas en formato dummies, más reconocible para catboost

El dataset tiene 100 columnas más, ya que se han seleccionado los 100 topics más hablados. Después de hacer uso de la variable origen, podemos descartarla.

In [40]:
drop_cols += ["processed_subject"]
drop_cols

['id',
 'statement_tokens',
 'statement_tokens_without_stopwords',
 'stopwords',
 'num_tokens_without_stopwords',
 'num_sentences',
 'pos_info',
 'pos_freq',
 'pos_freq_without_stopwords',
 'pos_info_without_stopwords',
 'lemma_freq',
 'tag_freq',
 'lemma_freq_without_stopwords',
 'tag_freq_without_stopwords',
 'subject',
 'processed_subject']

### Speaker Job

Para speaker job vamos a integrar sus tokens, guardarlos en `speaker_job_text` y usarlos como texto. la variable original no se usará directamente. 

In [41]:
drop_cols += ["speaker_job","speaker_entities"]
drop_cols

['id',
 'statement_tokens',
 'statement_tokens_without_stopwords',
 'stopwords',
 'num_tokens_without_stopwords',
 'num_sentences',
 'pos_info',
 'pos_freq',
 'pos_freq_without_stopwords',
 'pos_info_without_stopwords',
 'lemma_freq',
 'tag_freq',
 'lemma_freq_without_stopwords',
 'tag_freq_without_stopwords',
 'subject',
 'processed_subject',
 'speaker_job',
 'speaker_entities']

In [42]:
import ast

# Transformamos a lista
train_df["speaker_job_tokens"] = df["speaker_job_tokens"].apply(ast.literal_eval)

train_df["speaker_job_text"] = train_df["speaker_job_tokens"].apply(lambda x: " ".join(x))
train_df.head(2)

  train_df["speaker_job_text"] = train_df["speaker_job_tokens"].apply(lambda x: " ".join(x))


Unnamed: 0,id,label,statement,subject,speaker,speaker_job,state_info,party_affiliation,party_affiliation_uni,party_affiliation_category_map,...,subject_afghanistan,subject_wealth,subject_china,subject_florida,subject_sexuality,subject_water,subject_population,subject_animals,subject_pensions,speaker_job_text
0,81f884c64a7,1,china is in the south china sea and (building)...,"china,foreign-policy,military",donald-trump,president-elect,new_york,republican,republican,political-affiliation,...,0,0,1,0,0,0,0,0,0,president - elect
1,30c2723a188,0,with the resources it takes to execute just ov...,health-care,chris-dodd,u.s. senator,connecticut,democrat,democrat,political-affiliation,...,0,0,0,0,0,0,0,0,0,u.s . senator


La lista de tokens no es procesable para CatBoost, por lo que se elimina del conjunto.

In [43]:
drop_cols += ["speaker_job_tokens"]
drop_cols

['id',
 'statement_tokens',
 'statement_tokens_without_stopwords',
 'stopwords',
 'num_tokens_without_stopwords',
 'num_sentences',
 'pos_info',
 'pos_freq',
 'pos_freq_without_stopwords',
 'pos_info_without_stopwords',
 'lemma_freq',
 'tag_freq',
 'lemma_freq_without_stopwords',
 'tag_freq_without_stopwords',
 'subject',
 'processed_subject',
 'speaker_job',
 'speaker_entities',
 'speaker_job_tokens']

Como `speaker_job_text`es una variable de tipo texto, tenemos que indicárselo a CatBoost como tal

In [44]:
text_features += ["speaker_job_text"]
text_features

['statement', 'speaker_job_text']

In [45]:
train_df.head(2)

Unnamed: 0,id,label,statement,subject,speaker,speaker_job,state_info,party_affiliation,party_affiliation_uni,party_affiliation_category_map,...,subject_afghanistan,subject_wealth,subject_china,subject_florida,subject_sexuality,subject_water,subject_population,subject_animals,subject_pensions,speaker_job_text
0,81f884c64a7,1,china is in the south china sea and (building)...,"china,foreign-policy,military",donald-trump,president-elect,new_york,republican,republican,political-affiliation,...,0,0,1,0,0,0,0,0,0,president - elect
1,30c2723a188,0,with the resources it takes to execute just ov...,health-care,chris-dodd,u.s. senator,connecticut,democrat,democrat,political-affiliation,...,0,0,0,0,0,0,0,0,0,u.s . senator


### State info

Es una variable puramente categórica, ya procesada y con interés predictivo. Se añade a la lista de features categóricas. `state_info_tokens` contiene la misma información, se descarta. 

In [46]:
cat_features = ["state_info"]
cat_features

['state_info']

In [47]:
drop_cols += ["state_info_tokens"]
drop_cols

['id',
 'statement_tokens',
 'statement_tokens_without_stopwords',
 'stopwords',
 'num_tokens_without_stopwords',
 'num_sentences',
 'pos_info',
 'pos_freq',
 'pos_freq_without_stopwords',
 'pos_info_without_stopwords',
 'lemma_freq',
 'tag_freq',
 'lemma_freq_without_stopwords',
 'tag_freq_without_stopwords',
 'subject',
 'processed_subject',
 'speaker_job',
 'speaker_entities',
 'speaker_job_tokens',
 'state_info_tokens']

In [48]:
df["state_info"].nunique()

56

Comprobamos que quizás la cardinalidad es demasiado alta. Posible reducción.

### Party_affiliation

Tenemos tres variables que hacen referencia a esta información:
- `party_affiliation`: variable original sin limpiar, se descarta.
- `party_affiliation_uni`: variable procesada, observar cardinalidad.
- `party_afiliation_category_map`: variable mapeada manualmente, comparar rendimiento con la anterior.
- `party_affiliation_tokens`: versión tokenizada, se quita

Descartamos la original:    

In [49]:
drop_cols += ["party_affiliation","party_affiliation_tokens"]
drop_cols

['id',
 'statement_tokens',
 'statement_tokens_without_stopwords',
 'stopwords',
 'num_tokens_without_stopwords',
 'num_sentences',
 'pos_info',
 'pos_freq',
 'pos_freq_without_stopwords',
 'pos_info_without_stopwords',
 'lemma_freq',
 'tag_freq',
 'lemma_freq_without_stopwords',
 'tag_freq_without_stopwords',
 'subject',
 'processed_subject',
 'speaker_job',
 'speaker_entities',
 'speaker_job_tokens',
 'state_info_tokens',
 'party_affiliation',
 'party_affiliation_tokens']

Contamos los valores únicos en `_uni`

In [50]:
df["party_affiliation_uni"].nunique()

12

Se trata de un valor aceptable para pasarlo como categórica al modelo. Se añade.

In [51]:
cat_features+=["party_affiliation_uni"]
cat_features

['state_info', 'party_affiliation_uni']

In [52]:
df["party_affiliation_category_map"].nunique()

4

Dado que la cardinalidad de `_uni` es adecuada y esta variable viene de un mapeo manual, se elimina del modelo provisionalmente.

In [53]:
drop_cols += ["party_affiliation_category_map"]
drop_cols

['id',
 'statement_tokens',
 'statement_tokens_without_stopwords',
 'stopwords',
 'num_tokens_without_stopwords',
 'num_sentences',
 'pos_info',
 'pos_freq',
 'pos_freq_without_stopwords',
 'pos_info_without_stopwords',
 'lemma_freq',
 'tag_freq',
 'lemma_freq_without_stopwords',
 'tag_freq_without_stopwords',
 'subject',
 'processed_subject',
 'speaker_job',
 'speaker_entities',
 'speaker_job_tokens',
 'state_info_tokens',
 'party_affiliation',
 'party_affiliation_tokens',
 'party_affiliation_category_map']

In [54]:
df.columns

Index(['id', 'label', 'statement', 'subject', 'speaker', 'speaker_job',
       'state_info', 'party_affiliation', 'party_affiliation_uni',
       'party_affiliation_category_map', 'statement_tokens', 'num_tokens',
       'num_sentences', 'pos_info', 'pos_freq', 'lemma_freq', 'tag_freq',
       'entities', 'stopwords', 'statement_tokens_without_stopwords',
       'num_tokens_without_stopwords', 'pos_info_without_stopwords',
       'pos_freq_without_stopwords', 'lemma_freq_without_stopwords',
       'tag_freq_without_stopwords', 'processed_subject', 'speaker_entities',
       'speaker_type', 'speaker_job_tokens', 'state_info_tokens',
       'party_affiliation_tokens'],
      dtype='object')

### Entities

In [55]:
df["entities"]

0         Entidad Tipo de Entidad                Expli...
1                         Entidad Tipo de Entidad  \\n...
2                 Entidad Tipo de Entidad             ...
3       Empty DataFrame\nColumns: [Entidad, Tipo de En...
4            Entidad Tipo de Entidad                Ex...
                              ...                        
8945            Entidad Tipo de Entidad               ...
8946                       Entidad Tipo de Entidad  \\...
8947                 Entidad Tipo de Entidad          ...
8948                  Entidad Tipo de Entidad  \\n0   ...
8949          Entidad Tipo de Entidad                 ...
Name: entities, Length: 8950, dtype: object

In [56]:
import re

def extract_entities(entity_str):
    if not isinstance(entity_str, str) or "Tipo de Entidad" not in entity_str:
        return []

    try:
        lines = entity_str.strip().split("\n")
        lines = lines[1:]
        entities = []
        for line in lines:
                matches = re.findall(r"\b\w*[A-Z]{3,}\w*\b", line)
                if matches:
                    entities+=matches
        return entities if entities else ["OTHER"]
    except:
        return []

# Aplicar al DataFrame
train_df["entity_list"] = df["entities"].apply(extract_entities)
train_df["entity_list"]

  train_df["entity_list"] = df["entities"].apply(extract_entities)


0                                [GPE]
1                [DATE, EVENT, PERSON]
2                                [ORG]
3                              [OTHER]
4                                [GPE]
                     ...              
8945                    [GPE, ORDINAL]
8946                             [ORG]
8947                        [ORG, GPE]
8948    [NORP, DATE, CARDINAL, PERSON]
8949                           [MONEY]
Name: entity_list, Length: 8950, dtype: object

In [57]:
from sklearn.preprocessing import MultiLabelBinarizer

train_df["entity_list"] = train_df["entity_list"].apply(lambda x: x if isinstance(x, list) else [])

# Binarizar
mlb = MultiLabelBinarizer()
entity_dummies = pd.DataFrame(
    mlb.fit_transform(train_df["entity_list"]),
    columns=[f"entity_{cls}" for cls in mlb.classes_],
    index=train_df.index
)

entity_counts = entity_dummies.sum().sort_values(ascending=False)
entity_dummies

Unnamed: 0,entity_CARDINAL,entity_DATE,entity_EVENT,entity_FAC,entity_GPE,entity_LANGUAGE,entity_LAW,entity_LOC,entity_MONEY,entity_NORP,entity_ORDINAL,entity_ORG,entity_OTHER,entity_PERCENT,entity_PERSON,entity_PRODUCT,entity_QUANTITY,entity_TIME,entity_WORK_OF_ART
0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8945,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0
8946,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
8947,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0
8948,1,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0


In [58]:
entity_freq_df = entity_counts.reset_index()
entity_freq_df.columns = ["entity_type", "frequency"]
entity_freq_df

Unnamed: 0,entity_type,frequency
0,entity_PERSON,2690
1,entity_GPE,2689
2,entity_DATE,1943
3,entity_CARDINAL,1869
4,entity_OTHER,1331
5,entity_ORG,1296
6,entity_NORP,1077
7,entity_MONEY,1075
8,entity_PERCENT,924
9,entity_ORDINAL,361


Reducir cardinalidad para no meter ruido. Para ello, vamos a quedarnos con 10 características (después de observar su frecuencia)

In [59]:
import pandas as pd

# Definir umbral 
threshold = 100
entities_to_keep = entity_freq_df[entity_freq_df["frequency"] >= threshold]["entity_type"].tolist()
entities_to_drop = entity_freq_df[entity_freq_df["frequency"] < threshold]["entity_type"].tolist()

# Mostrar
print("Entidades frecuentes (a conservar):")
print(entities_to_keep)
print("Entidades poco frecuentes (a eliminar):")
print(entities_to_drop)

# Crear nueva columna 'entity_OTHER' en entity_dummies
entity_dummies["entity_OTHER"] = entity_dummies[entities_to_drop].any(axis=1).astype(int)
entity_dummies

Entidades frecuentes (a conservar):
['entity_PERSON', 'entity_GPE', 'entity_DATE', 'entity_CARDINAL', 'entity_OTHER', 'entity_ORG', 'entity_NORP', 'entity_MONEY', 'entity_PERCENT', 'entity_ORDINAL']
Entidades poco frecuentes (a eliminar):
['entity_LOC', 'entity_TIME', 'entity_QUANTITY', 'entity_EVENT', 'entity_FAC', 'entity_LANGUAGE', 'entity_PRODUCT', 'entity_LAW', 'entity_WORK_OF_ART']


Unnamed: 0,entity_CARDINAL,entity_DATE,entity_EVENT,entity_FAC,entity_GPE,entity_LANGUAGE,entity_LAW,entity_LOC,entity_MONEY,entity_NORP,entity_ORDINAL,entity_ORG,entity_OTHER,entity_PERCENT,entity_PERSON,entity_PRODUCT,entity_QUANTITY,entity_TIME,entity_WORK_OF_ART
0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8945,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0
8946,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
8947,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0
8948,1,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0


In [60]:
drop_cols += ["entities","entity_list"]
drop_cols

['id',
 'statement_tokens',
 'statement_tokens_without_stopwords',
 'stopwords',
 'num_tokens_without_stopwords',
 'num_sentences',
 'pos_info',
 'pos_freq',
 'pos_freq_without_stopwords',
 'pos_info_without_stopwords',
 'lemma_freq',
 'tag_freq',
 'lemma_freq_without_stopwords',
 'tag_freq_without_stopwords',
 'subject',
 'processed_subject',
 'speaker_job',
 'speaker_entities',
 'speaker_job_tokens',
 'state_info_tokens',
 'party_affiliation',
 'party_affiliation_tokens',
 'party_affiliation_category_map',
 'entities',
 'entity_list']

### Speaker_type

In [61]:
import ast

train_df["speaker_type"] = df["speaker_type"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else (x if isinstance(x, list) else [])
)


In [62]:
train_df["speaker_type"]

0         [PERSON]
1         [PERSON]
2         [PERSON]
3         [PERSON]
4       [ORG, GPE]
           ...    
8945         [ORG]
8946      [PERSON]
8947      [PERSON]
8948      [PERSON]
8949      [PERSON]
Name: speaker_type, Length: 8950, dtype: object

In [63]:
# Binarizar
mlb = MultiLabelBinarizer()
speaker_type_dummies = pd.DataFrame(
    mlb.fit_transform(train_df["speaker_type"]),
    columns=[f"speaker_type_{cls}" for cls in mlb.classes_],
    index=train_df.index
)

speaker_type_counts = speaker_type_dummies.sum().sort_values(ascending=False)
speaker_type_counts

speaker_type_PERSON      5995
speaker_type_ORG          780
speaker_type_GPE          264
speaker_type_NORP         188
speaker_type_CARDINAL      41
speaker_type_DATE          16
speaker_type_PRODUCT       13
speaker_type_ORDINAL        7
speaker_type_FAC            7
speaker_type_EVENT          4
speaker_type_PERCENT        2
speaker_type_LAW            1
speaker_type_TIME           1
dtype: int64

In [64]:
speaker_type_freq = speaker_type_counts.reset_index()
speaker_type_freq.columns = ["entity_type", "frequency"]
speaker_type_freq

Unnamed: 0,entity_type,frequency
0,speaker_type_PERSON,5995
1,speaker_type_ORG,780
2,speaker_type_GPE,264
3,speaker_type_NORP,188
4,speaker_type_CARDINAL,41
5,speaker_type_DATE,16
6,speaker_type_PRODUCT,13
7,speaker_type_ORDINAL,7
8,speaker_type_FAC,7
9,speaker_type_EVENT,4


In [65]:
# Definir umbral 
threshold = 100
speaker_to_keep = speaker_type_freq[speaker_type_freq["frequency"] >= threshold]["entity_type"].tolist()
speaker_to_drop = speaker_type_freq[speaker_type_freq["frequency"] < threshold]["entity_type"].tolist()

# Mostrar
print("Speakers frecuentes (a conservar):")
print(speaker_to_keep)
print("speakers poco frecuentes (a eliminar):")
print(speaker_to_drop)

# Crear nueva columna 'entity_OTHER' en speaker_dummies
speaker_type_dummies["speaker_OTHER"] = speaker_type_dummies[speaker_to_drop].any(axis=1).astype(int)
speaker_type_dummies = speaker_type_dummies.drop(columns=speaker_to_drop)
speaker_type_dummies

Speakers frecuentes (a conservar):
['speaker_type_PERSON', 'speaker_type_ORG', 'speaker_type_GPE', 'speaker_type_NORP']
speakers poco frecuentes (a eliminar):
['speaker_type_CARDINAL', 'speaker_type_DATE', 'speaker_type_PRODUCT', 'speaker_type_ORDINAL', 'speaker_type_FAC', 'speaker_type_EVENT', 'speaker_type_PERCENT', 'speaker_type_LAW', 'speaker_type_TIME']


Unnamed: 0,speaker_type_GPE,speaker_type_NORP,speaker_type_ORG,speaker_type_PERSON,speaker_OTHER
0,0,0,0,1,0
1,0,0,0,1,0
2,0,0,0,1,0
3,0,0,0,1,0
4,1,0,1,0,0
...,...,...,...,...,...
8945,0,0,1,0,0
8946,0,0,0,1,0
8947,0,0,0,1,0
8948,0,0,0,1,0


In [66]:
train_df = pd.concat([train_df,speaker_type_dummies],axis=1)
drop_cols += ["speaker_type"]
drop_cols

['id',
 'statement_tokens',
 'statement_tokens_without_stopwords',
 'stopwords',
 'num_tokens_without_stopwords',
 'num_sentences',
 'pos_info',
 'pos_freq',
 'pos_freq_without_stopwords',
 'pos_info_without_stopwords',
 'lemma_freq',
 'tag_freq',
 'lemma_freq_without_stopwords',
 'tag_freq_without_stopwords',
 'subject',
 'processed_subject',
 'speaker_job',
 'speaker_entities',
 'speaker_job_tokens',
 'state_info_tokens',
 'party_affiliation',
 'party_affiliation_tokens',
 'party_affiliation_category_map',
 'entities',
 'entity_list',
 'speaker_type']

In [67]:
train_df.shape

(8950, 171)

In [68]:
train_df = train_df.drop(columns=drop_cols)
#train_df = train_df.drop(columns=["statement_tokens_without_stopwords"])

#train_df["num_stopwords_tokens"] = train_df["num_tokens"] - train_df["num_tokens_without_stopwords"]
#train_df = train_df.drop(columns=["num_tokens_without_stopwords"])
#train_df = train_df.drop(columns=["num_sentences"])
#train_df = train_df.drop(columns=["lemma_freq_without_stopwords","tag_freq_without_stopwords"])
#train_df = train_df.drop(columns=["stopwords"])
#train_df = train_df.drop(columns=["speaker_entities"])
#train_df = train_df.drop(columns=["state_info_tokens"])
#train_df = train_df.drop(columns=["party_affiliation_tokens"])
#train_df = train_df.drop(columns=["speaker"])


In [69]:
cat_features

['state_info', 'party_affiliation_uni']

In [70]:
# Clasificar columnas por tipo
import pandas as pd

# 1. Columnas numéricas
numeric_cols = train_df.select_dtypes(include=["int64", "float64"]).columns.tolist()

# 2. Columnas categóricas (tipo object o category con pocos valores únicos)
categorical_cols = train_df.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

# 3. Detectar columnas de texto: muchas categorías únicas y tipo str
text_cols = [col for col in categorical_cols if train_df[col].nunique() > 30 and train_df[col].apply(lambda x: isinstance(x, str)).all()]

# 4. Excluir columnas de texto de las categóricas
categorical_cols = [col for col in categorical_cols if col not in text_cols]


In [71]:
train_df

Unnamed: 0,label,statement,speaker,state_info,party_affiliation_uni,num_tokens,num_stopwords_tokens,pos_PROPN,pos_NOUN,pos_ADJ,...,subject_water,subject_population,subject_animals,subject_pensions,speaker_job_text,speaker_type_GPE,speaker_type_NORP,speaker_type_ORG,speaker_type_PERSON,speaker_OTHER
0,1,china is in the south china sea and (building)...,donald-trump,new_york,republican,23,13,4,4,1,...,0,0,0,0,president - elect,0,0,0,1,0
1,0,with the resources it takes to execute just ov...,chris-dodd,connecticut,democrat,32,17,2,7,1,...,0,0,0,0,u.s . senator,0,0,0,1,0
2,0,the (wisconsin) governor has proposed tax give...,donna-brazile,washington_dc,democrat,12,6,1,4,0,...,0,0,0,0,political commentator,0,0,0,1,0
3,1,says her representation of an ex-boyfriend who...,rebecca-bradley,non-define,none,28,17,0,9,1,...,0,0,0,0,non - define,0,0,0,1,0
4,0,at protests in wisconsin against proposed coll...,republican-party-wisconsin,wisconsin,republican,24,10,0,7,3,...,0,0,0,0,non - define,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8945,1,if rhode island does a hybrid [retirement] pla...,lincoln-chafee,rhode_island,democrat,22,15,2,4,1,...,0,0,0,1,non - define,0,0,1,0,0
8946,1,the new health care law will force seniors int...,dan-coats,indiana,republican,18,5,0,8,2,...,0,0,0,0,non - define,0,0,0,1,0
8947,0,the health insurance plan that (members of con...,steve-southerland,florida,republican,25,14,3,6,2,...,0,0,0,0,"u.s . representative , florida district 2",0,0,0,1,0
8948,1,no one in american history has moved from a ju...,newt-gingrich,georgia,republican,26,16,1,5,2,...,0,0,0,0,"co - host on cnn 's "" crossfire """,0,0,0,1,0


In [None]:
from catboost import CatBoostClassifier

# Supongamos que tu variable objetivo se llama 'label'
X = train_df.drop(columns=["label","speaker"])
y = train_df["label"]

# Dividir en train y test (80% - 20%)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
import numpy as np


# Modelo base
base_model = CatBoostClassifier(
    auto_class_weights='Balanced',
    verbose=100,
    random_state=42,
    cat_features=cat_features,
    text_features=text_features,
    early_stopping_rounds = 100
)

# Hiperparámetros a explorar
param_dist = {
    'depth': [4, 6],
    'learning_rate': np.linspace(0.01, 0.05, 5),
    'iterations': [800, 1000],
    'l2_leaf_reg': [5, 10, 20],
    'min_data_in_leaf': [10, 20],
    'bagging_temperature': [0.5, 1, 1.5],
    'random_strength': [0.5, 1, 1.5]
}

stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Búsqueda aleatoria
search = RandomizedSearchCV(
    estimator=base_model,
    param_distributions=param_dist,
    n_iter=20,
    scoring='f1_macro',
    cv=stratified_cv,
    verbose=3,
    n_jobs=1
)

search.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
0:	learn: 0.6923013	total: 191ms	remaining: 3m 10s
100:	learn: 0.6569140	total: 19.2s	remaining: 2m 51s
200:	learn: 0.6375651	total: 32.8s	remaining: 2m 10s
300:	learn: 0.6041446	total: 47.7s	remaining: 1m 50s
400:	learn: 0.5730324	total: 1m 1s	remaining: 1m 32s


train_pool = Pool(data=X_train, label=y_train, 
                  cat_features=cat_features, 
                  text_features=text_features)

test_pool = Pool(data=X_test, label=y_test, 
                 cat_features=cat_features, 
                 text_features=text_features)

In [None]:
from sklearn.metrics import f1_score, classification_report# Entrenar
model.fit(train_pool, eval_set=test_pool, use_best_model=True)

best_model = search.best_estimator_

# Predicción sobre el test set
y_pred = best_model.predict(X_test)

# Evaluación
from sklearn.metrics import f1_score, classification_report
print("F1 macro:", f1_score(y_test, y_pred, average="macro"))
print(classification_report(y_test, y_pred))


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from catboost import CatBoostClassifier, Pool

# Obtener importancias
importances = model.get_feature_importance()
features = X_train.columns
importance_df = pd.DataFrame({"feature": features, "importance": importances})
importance_df = importance_df.sort_values(by="importance", ascending=False).head(20)

# Graficar
plt.figure(figsize=(10, 6))
plt.barh(importance_df["feature"][::-1], importance_df["importance"][::-1])
plt.xlabel("Feature Importance")
plt.title("Top 20 Feature Importances - CatBoost")
plt.tight_layout()
plt.show()


In [None]:
# Subir a Kaggle

In [None]:
test_df = pd.read_csv("../../data/processed/test_preprocess_v1.csv")
df_test_id = test_df["id"]
test_df

In [None]:
test_df["num_stopwords_tokens"] = test_df["num_tokens"] - test_df["num_tokens_without_stopwords"]

In [None]:
test_df["pos_freq_without_stopwords_clean"] = test_df["pos_freq_without_stopwords"].apply(clean_counter)
# Expandir en columnas
pos_test_df_wo_stopwords = test_df["pos_freq_without_stopwords_clean"].apply(pd.Series).fillna(0).astype(int)
# Renombrar columnas
pos_test_df_wo_stopwords.columns = [f"pos_{col.upper()}" for col in pos_test_df_wo_stopwords.columns]

pos_test_df_wo_stopwords.columns

In [None]:
test_df = pd.concat([test_df, pos_test_df_wo_stopwords], axis=1)

In [None]:
test_df.head(1)

In [None]:
# Aplicar a ambas columnas
test_df["tag_freq_without_stopwords"] = test_df["tag_freq_without_stopwords"].apply(clean_counter)

# Expandir tags
tag_df = test_df["tag_freq_without_stopwords"].apply(pd.Series).fillna(0).astype(int)
tag_df.columns = [f"tag_count_{col}" for col in tag_df.columns]

In [None]:
# Identificar columnas a eliminar
tag_to_drop = [col for col in tag_df.columns if col not in tag_to_keep]

# Crear 'tag_OTHER' como suma de las columnas poco frecuentes
tag_df["tag_OTHER"] = tag_df[tag_to_drop].sum(axis=1)

# Eliminar columnas no deseadas
tag_df = tag_df.drop(columns=tag_to_drop)

In [None]:
test_df = pd.concat([test_df, tag_df], axis=1)

In [None]:
test_df.head(1)

In [None]:
# Convertir strings tipo "[tax, healthcare]" en listas reales
test_df["processed_subject"] = test_df["processed_subject"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else []
)

# Aplanar la lista de subjects
all_test_subjects = [item for sublist in test_df["processed_subject"] for item in sublist]

In [None]:
filtered_subjects = [s if s in top_subjects else "other" for s in all_test_subjects]

In [None]:
unique_values = list(set(filtered_subjects))
print(len(unique_values))
unique_values = list(set(all_test_subjects))
print(len(unique_values))

In [None]:
# Inicializar a 0 las columnas dummies
for subj in filtered_subjects:
    test_df[f"subject_{subj}"] = 0

test_df = test_df.apply(add_subject_dummies, axis=1)

In [None]:
test_df.head(2)

In [None]:
# Transformamos a lista
test_df["speaker_job_tokens"] = test_df["speaker_job_tokens"].apply(ast.literal_eval)

test_df["speaker_job_text"] = test_df["speaker_job_tokens"].apply(lambda x: " ".join(x))
test_df.head(2)

In [None]:
# Aplicar al DataFrame
test_df["entity_list"] = test_df["entities"].apply(extract_entities)
test_df["entity_list"]

In [None]:
test_df["entity_list"] = test_df["entity_list"].apply(lambda x: x if isinstance(x, list) else [])

# Binarizar
mlb = MultiLabelBinarizer()
test_entity_dummies = pd.DataFrame(
    mlb.fit_transform(test_df["entity_list"]),
    columns=[f"entity_{cls}" for cls in mlb.classes_],
    index=test_df.index
)

test_entity_counts = test_entity_dummies.sum().sort_values(ascending=False)
test_entity_counts

In [None]:
# Crear nueva columna 'entity_OTHER' en entity_dummies
test_drops = []
test_keep = []
for col in test_entity_dummies.columns.tolist():
    if col in entities_to_keep:
        test_keep.append(col)
    else:
        test_drops.append(col)
        
test_entity_dummies["entity_OTHER"] = test_entity_dummies[test_drops].any(axis=1).astype(int)
test_entity_dummies = test_entity_dummies.drop(columns=test_drops)
test_entity_dummies.head(2)

In [None]:
test_entity_dummies.columns

In [None]:
test_df["speaker_type"] = test_df["speaker_type"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else (x if isinstance(x, list) else [])
)

# Binarizar
mlb = MultiLabelBinarizer()
speaker_type_test_dummies = pd.DataFrame(
    mlb.fit_transform(test_df["speaker_type"]),
    columns=[f"speaker_type_{cls}" for cls in mlb.classes_],
    index=test_df.index
)

In [None]:
speaker_to_drop

In [None]:
speaker_to_keep

In [None]:
# Crear nueva columna 'entity_OTHER' en entity_dummies
test_speaker_drops = []
test_speaker_keep = []
for col in speaker_type_test_dummies.columns.tolist():
    if col in speaker_to_keep:
        test_speaker_keep.append(col)
    else:
        test_speaker_drops.append(col)
        
speaker_type_test_dummies["speaker_OTHER"] = speaker_type_test_dummies[test_speaker_drops].any(axis=1).astype(int)
speaker_type_test_dummies = speaker_type_test_dummies.drop(columns=test_speaker_drops)
test_df = pd.concat([test_df,speaker_type_test_dummies],axis=1)

In [None]:
#test_df = test_df.drop(columns=drop_cols)
#test_df = test_df.drop(columns=["speaker"])
#test_df = test_df.drop(columns=["pos_freq_without_stopwords_clean"])
test_df = test_df[X.columns]
X.columns.tolist()

In [None]:
test_df.columns.tolist()

In [None]:
test_pool = Pool(data=test_df,
                 cat_features=cat_features, 
                 text_features=text_features)

In [None]:
import datetime
y_pred_test = model.predict(test_pool)
# Guardar predicciones
current_date = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output = pd.DataFrame({
    "id": df_test_id,
    "label": y_pred_test.astype(int)
})
filename = f"../3_summision/CatBoost_NewProcessing_{current_date}.csv"
output.to_csv(filename, index=False)
print(f"Predicciones guardadas en {filename}")

In [None]:
test_df.columns.tolist()

In [None]:
train_df.columns.tolist()