# Procesado 

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import pandas as pd
import datetime

In [6]:
df = pd.read_csv("../../data/processed/train_simp_preprocess_v2.csv")
df_test = pd.read_csv("../../data/processed/test_simp_preprocess_v2.csv")


## Procesado inicial 

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8950 entries, 0 to 8949
Data columns (total 31 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   id                                  8950 non-null   object
 1   label                               8950 non-null   int64 
 2   statement                           8950 non-null   object
 3   subject                             8950 non-null   object
 4   speaker                             8950 non-null   object
 5   speaker_job                         8950 non-null   object
 6   state_info                          8950 non-null   object
 7   party_affiliation                   8950 non-null   object
 8   party_affiliation_uni               8950 non-null   object
 9   party_affiliation_category_map      8950 non-null   object
 10  statement_tokens                    8950 non-null   object
 11  num_tokens                          8950 non-null   int6

### Intento 1

In [9]:
X = df['statement']
y = df['label']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)

clf = LinearSVC()
clf.fit(X_train_tfidf, y_train)

X_test_tfidf = vectorizer.transform(X_test)

y_pred = clf.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(classification_report(y_test, y_pred))
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.6208530805687204
Classification Report:
              precision    recall  f1-score   support

           0       0.44      0.38      0.40      1014
           1       0.70      0.75      0.72      1940

    accuracy                           0.62      2954
   macro avg       0.57      0.56      0.56      2954
weighted avg       0.61      0.62      0.61      2954

Confusion Matrix:
[[ 381  633]
 [ 487 1453]]


### Intento 2

In [None]:
cols = [
    'statement_tokens_without_stopwords',
    'processed_subject',
    'party_affiliation_category_map',
]

X = df[cols]
y = df['label']

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


X = df[[
    'statement_tokens_without_stopwords',
    'processed_subject',
    'party_affiliation_category_map'
]]
y = df['label']


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)


preprocessor = ColumnTransformer(transformers=[
    ('txt1', TfidfVectorizer(), 'statement_tokens_without_stopwords'),
    ('txt2', TfidfVectorizer(), 'processed_subject'),
    ('cat',  OneHotEncoder(handle_unknown='ignore'),
             ['party_affiliation_category_map'])
])

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', LinearSVC())
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, y_pred):.4f}')
print('\nClassification Report:')
print(classification_report(y_test, y_pred))
print('\nConfusion Matrix:')
print(confusion_matrix(y_test, y_pred))


IndexError: tuple index out of range

### Intento 3

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X = df[[
    'statement_tokens_without_stopwords',
    'processed_subject',
    'party_affiliation_category_map',
    'num_tokens_without_stopwords',
    'num_sentences'
]]
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

preprocessor = ColumnTransformer(transformers=[
    ('txt1', TfidfVectorizer(), 'statement_tokens_without_stopwords'),
    ('txt2', TfidfVectorizer(), 'processed_subject'),
    ('cat',  OneHotEncoder(handle_unknown='ignore'),
             ['party_affiliation_category_map']),
    ('num', StandardScaler(),
            ['num_tokens_without_stopwords', 'num_sentences'])
])

pipeline = Pipeline([
    ('pre', preprocessor),
    ('clf', LinearSVC(max_iter=5000))
])

param_grid = {
    'pre__txt1__ngram_range': [(1,1), (1,2)],
    'pre__txt1__max_df':      [0.8, 1.0],
    'pre__txt2__ngram_range': [(1,1), (1,2)],
    'pre__txt2__max_df':      [0.8, 1.0],
    'clf__C':                [0.1, 1, 10],
    'clf__class_weight':    [None, 'balanced']
}

grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=2
)

grid.fit(X_train, y_train)

print(" Mejores parámetros:", grid.best_params_)
print(" Mejor F1-macro CV:", grid.best_score_)

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

print(f"\nAccuracy (test): {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report (test):")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix (test):")
print(confusion_matrix(y_test, y_pred))


Fitting 5 folds for each of 96 candidates, totalling 480 fits
▶︎ Mejores parámetros: {'clf__C': 0.1, 'clf__class_weight': 'balanced', 'pre__txt1__max_df': 0.8, 'pre__txt1__ngram_range': (1, 2), 'pre__txt2__max_df': 0.8, 'pre__txt2__ngram_range': (1, 2)}
▶︎ Mejor F1-macro CV: 0.5745775498838119

Accuracy (test): 0.5992

Classification Report (test):
              precision    recall  f1-score   support

           0       0.44      0.53      0.48      1041
           1       0.71      0.64      0.67      1913

    accuracy                           0.60      2954
   macro avg       0.58      0.58      0.58      2954
weighted avg       0.62      0.60      0.61      2954


Confusion Matrix (test):
[[ 547  494]
 [ 690 1223]]


In [None]:
import datetime
import pandas as pd

cols_submission = [
    'statement_tokens_without_stopwords',
    'processed_subject',
    'party_affiliation_category_map',
    'num_tokens_without_stopwords',
    'num_sentences'
]

X_test_raw = df_test[cols_submission]

y_pred = best_model.predict(X_test_raw)

submission = pd.DataFrame({
    'id':    df_test['id'],
    'label': y_pred
})

filename = f"svc_submission_{datetime.datetime.now().strftime('%Y%m%d')}.csv"
submission.to_csv(filename, index=False, columns=['id', 'label'])

print(f" Submission generada correctamente: '{filename}'")
print(submission.head())


### Intento 4

In [None]:
from sentence_transformers import SentenceTransformer
path = r"C:\modelos\all-MiniLM-L6-v2"
bert = SentenceTransformer(path)


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import pandas as pd
import numpy as np
import datetime
import spacy

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from scipy.stats import uniform

from sentence_transformers import SentenceTransformer


for col in ['statement_tokens_without_stopwords','processed_subject','party_affiliation_category_map']:
    df[col].fillna('', inplace=True)
    df_test[col].fillna('', inplace=True)

for col in ['num_tokens_without_stopwords','num_sentences']:
    df[col].fillna(0, inplace=True)
    df_test[col].fillna(0, inplace=True)


nlp = spacy.load('en_core_web_sm', disable=['parser','ner','tok2vec','tagger'])

class StyleFeaturesExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): 
        return self
    def transform(self, X):
        rows = []
        for doc in nlp.pipe(X, batch_size=32):
            rows.append([
                sum(1 for t in doc if t.text.lower() in {'i','me','my','mine','we','us','our','ours'}),
                sum(1 for t in doc if t.text.lower() in {'they','them','their','theirs','he','she','him','her'}),
                sum(1 for t in doc if t.dep_ == 'neg'),
                sum(1 for t in doc if t.text.lower() in {'maybe','perhaps','possibly','could','might'}),
                sum(1 for t in doc if t.text.lower() in {'but','however','although','though','yet'}),
                len(doc) / max(1, len(list(doc.sents)))
            ])
        return np.array(rows)


model_path = r"C:\modelos\all-MiniLM-L6-v2" 
bert = SentenceTransformer(model_path)

bert_transformer = FunctionTransformer(
    func=lambda X: bert.encode(X.tolist(), batch_size=32, show_progress_bar=False),
    validate=False
)

cols = [
    'statement_tokens_without_stopwords',
    'processed_subject',
    'party_affiliation_category_map',
    'num_tokens_without_stopwords',
    'num_sentences'
]

X = df[cols]
y = df['label']
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
preprocessor = ColumnTransformer([
    ('bert', bert_transformer, 'statement_tokens_without_stopwords'),          
    ('tfidf_subj', TfidfVectorizer(), 'processed_subject'),                   
    ('cat_party', OneHotEncoder(handle_unknown='ignore'), ['party_affiliation_category_map']),  
    ('num_feats', StandardScaler(), ['num_tokens_without_stopwords', 'num_sentences']),         
    ('style', StyleFeaturesExtractor(), 'statement_tokens_without_stopwords')                    
])

pipeline = Pipeline([
    ('preproc', preprocessor),
    ('clf', LinearSVC(max_iter=5000))
])

param_dist = {
    'preproc__tfidf_subj__max_df':      uniform(0.7,0.3),
    'preproc__tfidf_subj__ngram_range': [(1,1), (1,2)],
    'clf__C':                           uniform(0.01, 10),
    'clf__class_weight':                [None, 'balanced']
}

search = RandomizedSearchCV(
    pipeline, param_distributions=param_dist,
    n_iter=20, cv=5, scoring='f1_macro',
    n_jobs=-1, verbose=2, random_state=42
)

search.fit(X_train, y_train)

print("Mejores parámetros:", search.best_params_)
print("Mejor F1 macro CV:", search.best_score_)

y_pred = search.predict(X_valid)
print("Accuracy:", accuracy_score(y_valid, y_pred))
print(classification_report(y_valid, y_pred))
print("Matriz de confusión:\n", confusion_matrix(y_valid, y_pred))
try:
    print("ROC AUC:", roc_auc_score(y_valid, search.decision_function(X_valid)))
except:
    print("ROC AUC no disponible para este clasificador.")

test_pred = search.predict(df_test[cols])
submission = pd.DataFrame({'id': df_test['id'], 'label': test_pred})
filename = f"svc_final_submission_{datetime.datetime.now():%Y%m%d}.csv"
submission.to_csv(filename, index=False)
print(f"Submission guardada en: {filename}")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test[col].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a co

Fitting 5 folds for each of 20 candidates, totalling 100 fits


ValueError: 
All the 100 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\anaeg\Desktop\LBBYs_CH2\spacy_env\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\anaeg\Desktop\LBBYs_CH2\spacy_env\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anaeg\Desktop\LBBYs_CH2\spacy_env\Lib\site-packages\sklearn\pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anaeg\Desktop\LBBYs_CH2\spacy_env\Lib\site-packages\sklearn\pipeline.py", line 588, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anaeg\Desktop\LBBYs_CH2\spacy_env\Lib\site-packages\joblib\memory.py", line 326, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anaeg\Desktop\LBBYs_CH2\spacy_env\Lib\site-packages\sklearn\pipeline.py", line 1551, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anaeg\Desktop\LBBYs_CH2\spacy_env\Lib\site-packages\sklearn\utils\_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anaeg\Desktop\LBBYs_CH2\spacy_env\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anaeg\Desktop\LBBYs_CH2\spacy_env\Lib\site-packages\sklearn\compose\_column_transformer.py", line 1001, in fit_transform
    result = self._call_func_on_transformers(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anaeg\Desktop\LBBYs_CH2\spacy_env\Lib\site-packages\sklearn\compose\_column_transformer.py", line 910, in _call_func_on_transformers
    return Parallel(n_jobs=self.n_jobs)(jobs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anaeg\Desktop\LBBYs_CH2\spacy_env\Lib\site-packages\sklearn\utils\parallel.py", line 77, in __call__
    return super().__call__(iterable_with_config)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anaeg\Desktop\LBBYs_CH2\spacy_env\Lib\site-packages\joblib\parallel.py", line 1985, in __call__
    return output if self.return_generator else list(output)
                                                ^^^^^^^^^^^^
  File "c:\Users\anaeg\Desktop\LBBYs_CH2\spacy_env\Lib\site-packages\joblib\parallel.py", line 1913, in _get_sequential_output
    res = func(*args, **kwargs)
          ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anaeg\Desktop\LBBYs_CH2\spacy_env\Lib\site-packages\sklearn\utils\parallel.py", line 139, in __call__
    return self.function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anaeg\Desktop\LBBYs_CH2\spacy_env\Lib\site-packages\sklearn\pipeline.py", line 1551, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anaeg\Desktop\LBBYs_CH2\spacy_env\Lib\site-packages\sklearn\utils\_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anaeg\Desktop\LBBYs_CH2\spacy_env\Lib\site-packages\sklearn\base.py", line 921, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\anaeg\Desktop\LBBYs_CH2\spacy_env\Lib\site-packages\sklearn\utils\_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\anaeg\AppData\Local\Temp\ipykernel_13236\2367161636.py", line 44, in transform
  File "spacy\tokens\doc.pyx", line 926, in sents
ValueError: [E030] Sentence boundaries unset. You can add the 'sentencizer' component to the pipeline with: `nlp.add_pipe('sentencizer')`. Alternatively, add the dependency parser or sentence recognizer, or set sentence boundaries by setting `doc[i].is_sent_start`.


## **Conclusión**

Aunque el experimento híbrido con SBERT y rasgos de estilo (intento 4) alcanzó el F1-macro más alto (~0,58) y mejoró notablemente el equilibrio entre precisión y cobertura de la clase minoritaria, la variabilidad en la validación y los problemas de integración de librerías impidieron establecerlo como claro vencedor; por lo tanto, el que mejor metrcias finales ha dado es el primero. El clásico pipeline de TF-IDF sobre el texto crudo y LinearSVC obtuvo una accuracy de test del 62,09 % y un F1-macro ≈ 0,56 (precisión/recall 0 → 0,44/0,38; 1 → 0,70/0,75). Aunque sencillo, demostró ser muy competitivo frente a configuraciones intermedias que sólo añadían metadatos o contadores numéricos.