In [9]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Library imports
import ast
import os
import re

import numpy as np
import pandas as pd
from scipy.sparse import vstack, csr_matrix
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier,XGBRegressor
# from imblearn.over_sampling import RandomOverSampler

In [6]:
# Load the training and testing sets straight from the course repository.
# Each zip holds a single CSV; its first column becomes the DataFrame index.
data_training = pd.read_csv(
    'https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTraining.zip',
    encoding='UTF-8',
    index_col=0,
)
data_testing = pd.read_csv(
    'https://github.com/albahnsen/MIAD_ML_and_NLP/raw/main/datasets/dataTesting.zip',
    encoding='UTF-8',
    index_col=0,
)

In [7]:
# Text preprocessing

# Small hand-picked English stopword list dropped from every plot.
_BASIC_STOPWORDS = frozenset({
    'the', 'and', 'in', 'to', 'of', 'a', 'is', 'that', 'with', 'as', 'for', 'on', 'at', 'by', 'an',
})
# Matches every character that is neither a lowercase ASCII letter nor whitespace.
_NON_ALPHA_RE = re.compile(r'[^a-z\s]')


def preprocess_text_manual(text):
    """Normalise a plot description for bag-of-words style vectorisation.

    Steps: lowercase, delete every character outside ``a-z`` and whitespace,
    split on whitespace, drop a short list of English stopwords and re-join
    the remaining tokens with single spaces.

    Note that removed characters are deleted, not replaced by a space, so
    ``"well-known"`` becomes ``"wellknown"``; digits and non-ASCII letters are
    dropped as well.

    Args:
        text: The raw text. ``None`` and float NaN (pandas' marker for a
            missing value) are treated as empty text; any other non-string
            value is converted with ``str()``.

    Returns:
        The cleaned text as a single space-separated string (possibly empty).
    """
    # Missing values would otherwise fail on `.lower()`; NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    letters_only = _NON_ALPHA_RE.sub('', text.lower())
    return ' '.join(
        token for token in letters_only.split() if token not in _BASIC_STOPWORDS
    )

# Apply the text preprocessing to both data sets
data_training['clean_plot'] = data_training['plot'].apply(preprocess_text_manual)
data_testing['clean_plot'] = data_testing['plot'].apply(preprocess_text_manual)


def _parse_genres(raw):
    """Parse a string holding a Python list literal into a real list.

    `ast.literal_eval` only accepts literals, so unlike `eval` it cannot run
    code embedded in the downloaded file. Values that are not strings (for
    example lists produced by a previous run of this cell) are returned
    unchanged, which makes the cell safe to re-run.
    """
    return ast.literal_eval(raw) if isinstance(raw, str) else raw


# Convert the genres to a binary indicator matrix (one column per genre)
mlb = MultiLabelBinarizer()
data_training['genres'] = data_training['genres'].apply(_parse_genres)
genres_binary = mlb.fit_transform(data_training['genres'])

In [8]:
# TF-IDF features built from the cleaned plots, limited to at most 5000 vocabulary terms.
# The vocabulary and IDF weights are learned on the training plots and then
# reused, unchanged, to encode the testing plots.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(data_training['clean_plot'])
X_test = vectorizer.transform(data_testing['clean_plot'])

# Hold out 33% of the training rows as a validation split, using train_test_split
# with a fixed seed so the partition is the same on every run.
# NOTE(review): the vectorizer above is fitted before this split, so the
# validation rows also contribute to the vocabulary and IDF weights.
(X_train_split, X_val_split,
 y_train_split, y_val_split) = train_test_split(
    X_train,
    genres_binary,
    test_size=0.33,
    random_state=42,
)

<h1>Regresión Logística</h1>

In [9]:
# Fit one independent logistic-regression model per genre column
log_reg = LogisticRegression(max_iter=1000)
clf = MultiOutputClassifier(log_reg, n_jobs=-1)
clf.fit(X_train_split, y_train_split)

# predict_proba yields one 2-D array per genre; keep column 1 of each and
# place them side by side so the result is (n_samples, n_genres)
per_genre_proba = clf.predict_proba(X_val_split)
y_val_pred = np.column_stack([proba[:, 1] for proba in per_genre_proba])

# Macro-averaged ROC AUC over the genre columns of the validation split
mauc = roc_auc_score(y_val_split, y_val_pred, average='macro')
print(f'MCAUC: {mauc}')

MCAUC: 0.8751628393523943


<h1>XGBClassifier</h1>

In [1]:
# Train one XGBoost classifier per genre.
# A classifier (not a regressor) is needed inside MultiOutputClassifier: the
# wrapper collects each fitted estimator's `classes_` and exposes
# predict_proba, neither of which XGBRegressor provides.
# The variable names `xgb_reg` / `reg` are kept so other cells that refer to
# them keep working.
xgb_reg = XGBClassifier(objective='binary:logistic', n_estimators=10, max_depth=40, learning_rate=0.1)
reg = MultiOutputClassifier(xgb_reg, n_jobs=-1)
reg.fit(X_train_split, y_train_split)

# Predict on the validation split: keep the positive-class probability of
# every genre and arrange the result as (n_samples, n_genres)
y_val_pred = reg.predict_proba(X_val_split)
y_val_pred = np.array([pred[:, 1] for pred in y_val_pred]).T

# Macro-averaged ROC AUC over the genre columns
mauc = roc_auc_score(y_val_split, y_val_pred, average='macro')
print(f'MCAUC: {mauc}')

NameError: name 'XGBClassifier' is not defined

In [None]:
# Predict on the test set with the logistic-regression model (`clf`)
y_test_pred = clf.predict_proba(X_test)
# Keep the positive-class probability of each genre -> (n_samples, n_genres)
y_test_pred = np.array([pred[:, 1] for pred in y_test_pred]).T

# Column headers of the submission file
cols = ['p_Action', 'p_Adventure', 'p_Animation', 'p_Biography', 'p_Comedy', 'p_Crime', 'p_Documentary', 'p_Drama', 'p_Family',
        'p_Fantasy', 'p_Film-Noir', 'p_History', 'p_Horror', 'p_Music', 'p_Musical', 'p_Mystery', 'p_News', 'p_Romance',
        'p_Sci-Fi', 'p_Short', 'p_Sport', 'p_Thriller', 'p_War', 'p_Western']

# The model was trained on the indicator matrix produced by `mlb`, so its
# probability columns follow mlb.classes_. Refuse to write a file whose
# hard-coded headers would not line up with that order.
expected_cols = [f'p_{genre}' for genre in mlb.classes_]
if cols != expected_cols:
    raise ValueError(
        'Submission columns do not match the label order learned by mlb: '
        f'expected {expected_cols}, got {cols}'
    )

# Save the predictions in the format required by the Kaggle competition.
# NOTE(review): the file name says "RF" although the predictions come from the
# logistic-regression model; left unchanged in case something depends on it.
submission = pd.DataFrame(y_test_pred, index=data_testing.index, columns=cols)
submission.to_csv('pred_genres_text_RF.csv', index_label='ID')