# Procesado 

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

In [2]:
# Read the preprocessed training CSV into the working DataFrame used by every cell below.
TRAIN_CSV_PATH = "../../data/processed/train_preprocess_v1.csv"
df = pd.read_csv(TRAIN_CSV_PATH)

## Procesado inicial 

In [6]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8950 entries, 0 to 8949
Data columns (total 31 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   id                                  8950 non-null   object
 1   label                               8950 non-null   int64 
 2   statement                           8950 non-null   object
 3   subject                             8950 non-null   object
 4   speaker                             8950 non-null   object
 5   speaker_job                         8950 non-null   object
 6   state_info                          8950 non-null   object
 7   party_affiliation                   8950 non-null   object
 8   party_affiliation_uni               8950 non-null   object
 9   party_affiliation_category_map      8950 non-null   object
 10  statement_tokens                    8950 non-null   object
 11  num_tokens                          8950 non-null   int6

Selección de las categorías a modelar

In [7]:
# Features are the raw statement text; the target is the `label` column.
X, y = df['statement'], df['label']

In [8]:
# Baseline: TF-IDF over the statement text + linear SVM, evaluated on a 33% hold-out.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# A single TfidfVectorizer replaces the earlier CountVectorizer + TfidfTransformer
# pair: both produce the same TF-IDF matrix, and the first result was being
# overwritten without ever being used. The vectorizer is fitted on the training
# text only so that no vocabulary/IDF statistics leak from the test split.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)

# Seed the solver as well as the split so the reported metrics are repeatable
# after a kernel restart.
clf = LinearSVC(random_state=42)
clf.fit(X_train_tfidf, y_train)

# Reuse the fitted vocabulary/IDF weights on the held-out text (transform, not fit).
X_test_tfidf = vectorizer.transform(X_test)
y_pred = clf.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(classification_report(y_test, y_pred))
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))



Accuracy: 0.6208530805687204
Classification Report:
              precision    recall  f1-score   support

           0       0.44      0.38      0.40      1014
           1       0.70      0.75      0.72      1940

    accuracy                           0.62      2954
   macro avg       0.57      0.56      0.56      2954
weighted avg       0.61      0.62      0.61      2954

Confusion Matrix:
[[ 381  633]
 [ 487 1453]]


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.svm import LinearSVC


# Keep the whole frame as the feature table: the ColumnTransformer below only
# reads the columns it is explicitly given, so the other columns (including
# `label`) are not passed on to the classifier.
X = df.copy()
y = df['label']

# Stratified hold-out so train and test keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

text_col = 'statement'
cat_cols  = ['subject', 'party_affiliation_uni', 'speaker_type']
# Declared for reference only: this pipeline does not consume the numeric columns.
num_cols  = ['num_tokens', 'num_sentences', 'num_tokens_without_stopwords']

# Preprocessing: TF-IDF on the statement text plus one-hot encoded categoricals.
txt_vect = TfidfVectorizer(
            ngram_range=(1,2),
            min_df=2, max_df=0.9,
            sublinear_tf=True,
            stop_words='english')

preprocess = ColumnTransformer([
        ('text', txt_vect, text_col),
        ('cat',  OneHotEncoder(handle_unknown='ignore'), cat_cols)])

# class_weight='balanced' compensates for the class imbalance; random_state
# pins the solver's internal shuffling so CV and test scores are repeatable.
pipe_svc = ImbPipeline(steps=[
        ('prep', preprocess),
        ('clf',  LinearSVC(class_weight='balanced', random_state=42))
])

# Search over the n-gram span of the text vectorizer and the SVM regularisation strength.
param_grid_svc = {
    'prep__text__ngram_range': [(1,2), (1,3)],
    'clf__C': [0.5, 1, 2]
}

grid_svc = GridSearchCV(pipe_svc,
                        param_grid_svc,
                        scoring='f1_macro',
                        cv=5,
                        n_jobs=-1,
                        verbose=2)
grid_svc.fit(X_train, y_train)

print("\n Mejor combinación:", grid_svc.best_params_)
print("Mejor F1‑macro CV:", round(grid_svc.best_score_, 4))

# Evaluation on the held-out test split, using the refitted best estimator.
y_pred = grid_svc.predict(X_test)

print("\n=========  MÉTRICAS EN TEST  =========")
print("Accuracy :", round(accuracy_score(y_test, y_pred), 4))
print("\nClassification Report:\n",
      classification_report(y_test, y_pred, digits=4))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END .........clf__C=0.5, prep__text__ngram_range=(1, 2); total time=   0.4s
[CV] END .........clf__C=0.5, prep__text__ngram_range=(1, 2); total time=   0.4s
[CV] END .........clf__C=0.5, prep__text__ngram_range=(1, 2); total time=   0.5s
[CV] END .........clf__C=0.5, prep__text__ngram_range=(1, 2); total time=   0.5s
[CV] END .........clf__C=0.5, prep__text__ngram_range=(1, 2); total time=   0.5s
[CV] END .........clf__C=0.5, prep__text__ngram_range=(1, 3); total time=   0.5s
[CV] END .........clf__C=0.5, prep__text__ngram_range=(1, 3); total time=   0.6s
[CV] END .........clf__C=0.5, prep__text__ngram_range=(1, 3); total time=   0.4s
[CV] END .........clf__C=0.5, prep__text__ngram_range=(1, 3); total time=   0.4s
[CV] END .........clf__C=0.5, prep__text__ngram_range=(1, 3); total time=   0.5s
[CV] END ...........clf__C=1, prep__text__ngram_range=(1, 2); total time=   0.3s
[CV] END ...........clf__C=1, prep__text__ngram_r

In [8]:
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

# ---------- data ----------
# The full frame is used as the feature table; the ColumnTransformer below
# only reads the columns it is explicitly given.
X = df.copy()
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42, stratify=y)

text_col = 'statement'
cat_cols = ['subject', 'party_affiliation_uni', 'speaker_type']
num_cols = ['num_tokens', 'num_sentences', 'num_tokens_without_stopwords']

# ---------- preprocessing ----------
txt_vect = TfidfVectorizer(
            ngram_range=(1,3),          # unigrams, bigrams and trigrams
            min_df=2, max_df=0.9,
            sublinear_tf=True,
            stop_words='english')

preprocess = ColumnTransformer([
    # Text branch: TF-IDF followed by chi-squared feature selection.
    # NOTE(review): k=40000 may exceed the vocabulary that survives min_df=2 on
    # this corpus; depending on the scikit-learn version that either warns and
    # keeps every feature or raises. Confirm the fitted vocabulary size.
    ('text', Pipeline([
        ('vect', txt_vect),
        ('chi',  SelectKBest(chi2, k=40000))  # original author's note: ~40-60k works well
    ]), text_col),
    # Categories seen fewer than 10 times are grouped as infrequent.
    ('cat',  OneHotEncoder(min_frequency=10, handle_unknown='ignore'), cat_cols),
    ('num',  StandardScaler(), num_cols)
])

# ---------- model ----------
# random_state pins the solver's internal shuffling so that CV and test
# scores are repeatable after a kernel restart.
pipe = Pipeline([
    ('prep', preprocess),
    ('clf',  LinearSVC(class_weight='balanced',
                       max_iter=10000,
                       tol=1e-4,
                       random_state=42))
])

# Search over the SVM regularisation strength and the n-gram span of the vectorizer.
param_grid = {
    'clf__C': [0.5, 1, 2, 5, 10, 20],
    'prep__text__vect__ngram_range': [(1,2), (1,3)]
}

grid = GridSearchCV(pipe,
                    param_grid,
                    scoring='f1_macro',
                    cv=5,
                    n_jobs=-1,
                    verbose=2)

grid.fit(X_train, y_train)

print("Mejor F1_macro CV:", round(grid.best_score_,4))
print("Mejor set:", grid.best_params_)

# ---------- evaluation on the held-out test split ----------
y_pred = grid.predict(X_test)
print("Accuracy :", round(accuracy_score(y_test, y_pred),4))
print(classification_report(y_test, y_pred, digits=4))


Fitting 5 folds for each of 12 candidates, totalling 60 fits




[CV] END ...clf__C=0.5, prep__text__vect__ngram_range=(1, 2); total time=   2.0s




[CV] END ...clf__C=0.5, prep__text__vect__ngram_range=(1, 2); total time=   7.7s
[CV] END ...clf__C=0.5, prep__text__vect__ngram_range=(1, 2); total time=   7.8s




[CV] END ...clf__C=0.5, prep__text__vect__ngram_range=(1, 2); total time=   8.5s




[CV] END ...clf__C=0.5, prep__text__vect__ngram_range=(1, 2); total time=   7.6s




[CV] END ...clf__C=0.5, prep__text__vect__ngram_range=(1, 3); total time=   2.8s




[CV] END ...clf__C=0.5, prep__text__vect__ngram_range=(1, 3); total time=   8.1s




[CV] END ...clf__C=0.5, prep__text__vect__ngram_range=(1, 3); total time=   8.3s




[CV] END ...clf__C=0.5, prep__text__vect__ngram_range=(1, 3); total time=   8.2s




[CV] END ...clf__C=0.5, prep__text__vect__ngram_range=(1, 3); total time=   8.4s




[CV] END .....clf__C=1, prep__text__vect__ngram_range=(1, 2); total time=   4.0s




[CV] END .....clf__C=1, prep__text__vect__ngram_range=(1, 2); total time=   7.9s




[CV] END .....clf__C=1, prep__text__vect__ngram_range=(1, 2); total time=   7.4s




[CV] END .....clf__C=1, prep__text__vect__ngram_range=(1, 2); total time=   7.1s




[CV] END .....clf__C=1, prep__text__vect__ngram_range=(1, 2); total time=   6.8s




[CV] END .....clf__C=1, prep__text__vect__ngram_range=(1, 3); total time=   2.9s




[CV] END .....clf__C=1, prep__text__vect__ngram_range=(1, 3); total time=   7.3s




[CV] END .....clf__C=1, prep__text__vect__ngram_range=(1, 3); total time=   7.6s




[CV] END .....clf__C=1, prep__text__vect__ngram_range=(1, 3); total time=   7.3s




[CV] END .....clf__C=1, prep__text__vect__ngram_range=(1, 3); total time=   7.6s




[CV] END .....clf__C=2, prep__text__vect__ngram_range=(1, 2); total time=   3.4s




[CV] END .....clf__C=2, prep__text__vect__ngram_range=(1, 2); total time=   6.7s




[CV] END .....clf__C=2, prep__text__vect__ngram_range=(1, 2); total time=   6.6s




[CV] END .....clf__C=2, prep__text__vect__ngram_range=(1, 2); total time=   6.7s




[CV] END .....clf__C=2, prep__text__vect__ngram_range=(1, 2); total time=   7.1s




[CV] END .....clf__C=2, prep__text__vect__ngram_range=(1, 3); total time=   3.7s




[CV] END .....clf__C=2, prep__text__vect__ngram_range=(1, 3); total time=   7.0s




[CV] END .....clf__C=2, prep__text__vect__ngram_range=(1, 3); total time=   7.6s




[CV] END .....clf__C=2, prep__text__vect__ngram_range=(1, 3); total time=   7.1s




[CV] END .....clf__C=2, prep__text__vect__ngram_range=(1, 3); total time=   7.1s




[CV] END .....clf__C=5, prep__text__vect__ngram_range=(1, 2); total time=   5.6s




[CV] END .....clf__C=5, prep__text__vect__ngram_range=(1, 2); total time=   6.0s




[CV] END .....clf__C=5, prep__text__vect__ngram_range=(1, 2); total time=   6.1s




[CV] END .....clf__C=5, prep__text__vect__ngram_range=(1, 2); total time=   6.3s




[CV] END .....clf__C=5, prep__text__vect__ngram_range=(1, 2); total time=   5.9s




[CV] END .....clf__C=5, prep__text__vect__ngram_range=(1, 3); total time=   4.6s




[CV] END .....clf__C=5, prep__text__vect__ngram_range=(1, 3); total time=   6.9s




[CV] END .....clf__C=5, prep__text__vect__ngram_range=(1, 3); total time=   7.8s




[CV] END .....clf__C=5, prep__text__vect__ngram_range=(1, 3); total time=   8.4s




[CV] END .....clf__C=5, prep__text__vect__ngram_range=(1, 3); total time=  10.4s




[CV] END ....clf__C=10, prep__text__vect__ngram_range=(1, 2); total time=   8.3s




[CV] END ....clf__C=10, prep__text__vect__ngram_range=(1, 2); total time=   7.7s




[CV] END ....clf__C=10, prep__text__vect__ngram_range=(1, 2); total time=   7.0s




[CV] END ....clf__C=10, prep__text__vect__ngram_range=(1, 2); total time=   6.5s




[CV] END ....clf__C=10, prep__text__vect__ngram_range=(1, 2); total time=   6.0s
[CV] END ....clf__C=10, prep__text__vect__ngram_range=(1, 3); total time=   4.1s




[CV] END ....clf__C=10, prep__text__vect__ngram_range=(1, 3); total time=   6.0s




[CV] END ....clf__C=10, prep__text__vect__ngram_range=(1, 3); total time=   6.4s




[CV] END ....clf__C=10, prep__text__vect__ngram_range=(1, 3); total time=   6.2s




[CV] END ....clf__C=10, prep__text__vect__ngram_range=(1, 3); total time=   6.6s




[CV] END ....clf__C=20, prep__text__vect__ngram_range=(1, 2); total time=   4.3s




[CV] END ....clf__C=20, prep__text__vect__ngram_range=(1, 2); total time=   5.5s




[CV] END ....clf__C=20, prep__text__vect__ngram_range=(1, 2); total time=   5.8s




[CV] END ....clf__C=20, prep__text__vect__ngram_range=(1, 2); total time=   6.0s
[CV] END ....clf__C=20, prep__text__vect__ngram_range=(1, 2); total time=   5.4s




[CV] END ....clf__C=20, prep__text__vect__ngram_range=(1, 3); total time=   5.9s




[CV] END ....clf__C=20, prep__text__vect__ngram_range=(1, 3); total time=   6.1s




[CV] END ....clf__C=20, prep__text__vect__ngram_range=(1, 3); total time=   5.7s
[CV] END ....clf__C=20, prep__text__vect__ngram_range=(1, 3); total time=   5.9s




[CV] END ....clf__C=20, prep__text__vect__ngram_range=(1, 3); total time=   3.9s




Mejor F1_macro CV: 0.5741
Mejor set: {'clf__C': 0.5, 'prep__text__vect__ngram_range': (1, 3)}
Accuracy : 0.5961
              precision    recall  f1-score   support

           0     0.4361    0.4986    0.4653      1041
           1     0.7041    0.6492    0.6756      1913

    accuracy                         0.5961      2954
   macro avg     0.5701    0.5739    0.5704      2954
weighted avg     0.6097    0.5961    0.6014      2954



