In [108]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    mean_squared_error,       # Para MSE y RMSE
    mean_absolute_error,      # Para MAE
    r2_score,                 # Para R²
)

import numpy as np
import pandas as pd
import re
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import unicodedata
from joblib import load
from joblib import dump
from transformers import CleanText, TokenizerText 


In [111]:
# Read the product dataset from a CSV file in the working directory.
df = pd.read_csv('datos_productos.csv')

In [113]:
# Load the previously saved preprocessor object with joblib.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files from a trusted source.
# NOTE(review): presumably the CleanText / TokenizerText imports above are needed so the
# pickle can resolve its custom classes — confirm before removing them.
preprocessor = load('preprocessor.pkl') 

In [115]:
# Display the loaded frame as the cell output.
df

Unnamed: 0,Price,Star_Rating,Reviews,Product_Description,Tipo
0,6.95,4.4,477,"L'Oréal Men Expert Champú sólido para hombres,...",champu
1,11.99,4.2,455,Störtebekker® Premium Champú sólido Sándalo (1...,champu
2,14.99,5.0,12,Champú Sólido sin Parabenos ni Sulfatos-Vegano...,champu
3,11.99,4.4,64,RAW REVIVAL - Champú solido Romero | Anticaspa...,champu
4,11.99,4.4,24,Störtebekker® Premium Champú sólido Tortuga - ...,champu
...,...,...,...,...,...
390,21.50,5.0,2,Exfoliante Exfoliante de Bambú de Phyt's Men 1...,exfoliante
391,42.00,4.2,131,Paula’s Choice RESIST Antiedad 2% BHA Exfolian...,exfoliante
392,20.50,4.1,194,Himalaya Herbals Exfoliante de albaricoque 50 ...,exfoliante
393,70.52,4.3,649,Vasanti – Rejuvenecedor facial enzimático Brig...,exfoliante


In [117]:
from joblib import load

# Load the previously saved preprocessor.
# NOTE(review): `load` is already imported in the first cell and `preprocessor` was already
# loaded from the same file in an earlier cell; this cell repeats both.
preprocessor = load('preprocessor.pkl')

def Featuring(df, preprocessor, review_threshold=60):
    """Split the products by review count and transform each group's features.

    Rows whose 'Reviews' value is below ``review_threshold`` form the "bajo"
    (low) group; rows whose value is greater than or equal to it form the
    "alto" (high) group. For each group the columns 'Tipo',
    'Product_Description' and 'Price' are passed through
    ``preprocessor.transform`` and 'Star_Rating' is returned as the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Tipo', 'Product_Description', 'Price',
        'Reviews' and 'Star_Rating'.
    preprocessor : object
        Transformer exposing ``transform``. Only ``transform`` is called here
        (never ``fit``), so it must already be fitted.
    review_threshold : int, default 60
        Review-count cut-off separating the two groups. The default keeps the
        original hard-coded behaviour.

    Returns
    -------
    tuple
        ``(X_bajo_preprocessed, y_bajo, X_alto_preprocessed, y_alto)``. The
        targets keep the row index of ``df``; the type of the transformed
        features is whatever ``preprocessor.transform`` returns.

    Raises
    ------
    ValueError
        If any of the required columns is missing from ``df``.
    """
    # Make sure every column used below is present.
    required_columns = ['Tipo', 'Product_Description', 'Price', 'Reviews', 'Star_Rating']
    if not all(col in df.columns for col in required_columns):
        raise ValueError(f"El DataFrame debe contener las siguientes columnas: {required_columns}")

    feature_columns = ['Tipo', 'Product_Description', 'Price']

    # Two explicit comparisons (rather than a negated mask) so that rows with a
    # missing review count end up in neither group, as in the original code.
    df_menos = df[df['Reviews'] < review_threshold]
    df_mas = df[df['Reviews'] >= review_threshold]

    X_bajo = df_menos[feature_columns]
    y_bajo = df_menos['Star_Rating']

    X_alto = df_mas[feature_columns]
    y_alto = df_mas['Star_Rating']

    # Apply the loaded preprocessor to each subset independently.
    X_bajo_preprocessed = preprocessor.transform(X_bajo)
    X_alto_preprocessed = preprocessor.transform(X_alto)

    return X_bajo_preprocessed, y_bajo, X_alto_preprocessed, y_alto

In [119]:
# Split df into the low-review (< 60) and high-review (>= 60) groups and transform their features.
X_bajo_preprocessed, y_bajo, X_alto_preprocessed, y_alto = Featuring(df,preprocessor)

In [121]:
# Build one modelling table per group: the densified feature matrix side by side
# with its target. The target index is reset so it lines up with the fresh
# 0..n-1 index of the feature frame.
data_bajo = pd.concat(
    objs=[
        pd.DataFrame(X_bajo_preprocessed.toarray()),
        y_bajo.reset_index(drop=True),
    ],
    axis="columns",
)
data_alto = pd.concat(
    objs=[
        pd.DataFrame(X_alto_preprocessed.toarray()),
        y_alto.reset_index(drop=True),
    ],
    axis="columns",
)

# Comparación de modelos para data_bajo

In [92]:

# Import only the PyCaret entry points this cell uses instead of `import *`,
# so it is clear where `setup` and `compare_models` come from and no other
# names are silently added to the notebook namespace.
from pycaret.regression import compare_models, setup

# Initialise the regression experiment on the low-review table.
exp = setup(data=data_bajo,
            target='Star_Rating',
            session_id=123)

# Train and score PyCaret's candidate regressors; the best-ranked one is returned.
best_model = compare_models()

print(best_model)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Star_Rating
2,Target type,Regression
3,Original data shape,"(223, 69)"
4,Transformed data shape,"(223, 69)"
5,Transformed train set shape,"(156, 69)"
6,Transformed test set shape,"(67, 69)"
7,Numeric features,68
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
br,Bayesian Ridge,0.5281,0.5705,0.7317,0.0443,0.1724,0.1861,0.01
ada,AdaBoost Regressor,0.5308,0.5996,0.75,-0.0028,0.1748,0.1828,0.022
llar,Lasso Least Angle Regression,0.5471,0.6237,0.7645,-0.0338,0.1796,0.1955,0.01
dummy,Dummy Regressor,0.5471,0.6237,0.7645,-0.0338,0.1796,0.1955,0.01
en,Elastic Net,0.5471,0.6237,0.7645,-0.0338,0.1796,0.1955,0.012
lasso,Lasso Regression,0.5471,0.6237,0.7645,-0.0338,0.1796,0.1955,0.384
omp,Orthogonal Matching Pursuit,0.5663,0.6074,0.755,-0.0382,0.1754,0.1923,0.011
ridge,Ridge Regression,0.5684,0.6064,0.7571,-0.0451,0.1757,0.1927,0.011
lightgbm,Light Gradient Boosting Machine,0.5574,0.6084,0.7533,-0.0642,0.175,0.1894,0.023
rf,Random Forest Regressor,0.5513,0.6292,0.7753,-0.1248,0.181,0.187,0.038


BayesianRidge()


# Comparación de modelos para data_alto

In [94]:

# Import only the PyCaret entry points this cell uses instead of `import *`,
# so it is clear where `setup` and `compare_models` come from and no other
# names are silently added to the notebook namespace.
from pycaret.regression import compare_models, setup

# Initialise the regression experiment on the high-review table.
# NOTE: this rebinds `exp` and `best_model` from the previous comparison cell.
exp = setup(data=data_alto,
            target='Star_Rating',
            session_id=123)

# Train and score PyCaret's candidate regressors; the best-ranked one is returned.
best_model = compare_models()

print(best_model)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Star_Rating
2,Target type,Regression
3,Original data shape,"(172, 69)"
4,Transformed data shape,"(172, 69)"
5,Transformed train set shape,"(120, 69)"
6,Transformed test set shape,"(52, 69)"
7,Numeric features,68
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,0.1969,0.0647,0.2517,-0.1547,0.0483,0.0471,0.032
br,Bayesian Ridge,0.2043,0.0663,0.2529,-0.1614,0.0486,0.0488,0.01
ridge,Ridge Regression,0.2049,0.0671,0.256,-0.2109,0.0492,0.0489,0.01
et,Extra Trees Regressor,0.1933,0.0654,0.2525,-0.2182,0.0484,0.0462,0.031
llar,Lasso Least Angle Regression,0.2159,0.0715,0.2612,-0.2274,0.0502,0.0516,0.011
dummy,Dummy Regressor,0.2159,0.0715,0.2612,-0.2274,0.0502,0.0516,0.011
en,Elastic Net,0.2159,0.0715,0.2612,-0.2274,0.0502,0.0516,0.011
lasso,Lasso Regression,0.2159,0.0715,0.2612,-0.2274,0.0502,0.0516,0.011
omp,Orthogonal Matching Pursuit,0.1986,0.0696,0.2597,-0.2794,0.05,0.0475,0.01
ada,AdaBoost Regressor,0.2098,0.0692,0.2606,-0.3074,0.0501,0.0503,0.022


RandomForestRegressor(n_jobs=-1, random_state=123)


# Model Bajo

In [96]:

# # Crear DataFrames para data_bajo y data_alto
# data_bajo = pd.concat([pd.DataFrame(X_bajo_preprocessed.toarray()), y_bajo.reset_index(drop=True)], axis=1)
# data_alto = pd.concat([pd.DataFrame(X_alto_preprocessed.toarray()), y_alto.reset_index(drop=True)], axis=1)

# # Separar características (X) y objetivo (y) para ambos conjuntos
# X_bajo = data_bajo.iloc[:, :-1]  # Todas las columnas excepto la última
# y_bajo = data_bajo.iloc[:, -1]  # Última columna (Star_Rating)

# X_alto = data_alto.iloc[:, :-1]
# y_alto = data_alto.iloc[:, -1]


# # Dividir los datos en entrenamiento y prueba
# X_train_bajo, X_test_bajo, y_train_bajo, y_test_bajo = train_test_split(X_bajo, y_bajo, test_size=0.2, random_state=42)
# X_train_alto, X_test_alto, y_train_alto, y_test_alto = train_test_split(X_alto, y_alto, test_size=0.2, random_state=42)

# # -------------------------
# # Modelo para data_bajo: Bayesian Ridge
# # -------------------------

# # Definir el modelo base
# bayesian_ridge = BayesianRidge()

# # Parámetros para ajustar
# param_grid_bayesian = {
#     'alpha_1': [1e-6, 1e-5, 1e-4],
#     'alpha_2': [1e-6, 1e-5, 1e-4],
#     'lambda_1': [1e-6, 1e-5, 1e-4],
#     'lambda_2': [1e-6, 1e-5, 1e-4],
#     'n_iter': [100, 300, 500]
# }

# # Usar GridSearchCV para ajustar hiperparámetros
# grid_search_bajo = GridSearchCV(bayesian_ridge, param_grid_bayesian, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
# grid_search_bajo.fit(X_train_bajo, y_train_bajo)

# # Mejor modelo
# best_bayesian_ridge = grid_search_bajo.best_estimator_

# # Evaluar en el conjunto de prueba
# y_pred_bajo = best_bayesian_ridge.predict(X_test_bajo)

# mse_bajo = mean_squared_error(y_test_bajo, y_pred_bajo)
# rmse_bajo = mse_bajo ** 0.5  # Raíz cuadrada del MSE
# mae_bajo = mean_absolute_error(y_test_bajo, y_pred_bajo)
# r2_bajo = r2_score(y_test_bajo, y_pred_bajo)

# # Mostrar métricas
# print(f"Error Cuadrático Medio (MSE): {mse_bajo}")
# print(f"Raíz del Error Cuadrático Medio (RMSE): {rmse_bajo}")
# print(f"Error Absoluto Medio (MAE): {mae_bajo}")
# print(f"Coeficiente de Determinación (R²): {r2_bajo}")

# # Entrenar el modelo final con todos los datos
# best_params_bajo = grid_search_bajo.best_params_
# final_model_bajo = BayesianRidge(**best_params_bajo)
# final_model_bajo.fit(X_bajo, y_bajo)

# # Guardar el modelo entrenado
# dump(final_model_bajo, 'modelo_bajo.pkl')
# print("Modelo bajo entrenado con todos los datos y guardado como modelo_bajo_final.pkl")



In [124]:

# Rebuild the feature matrix and target from the low-review table and create
# the train/test split here. Previously these variables were only defined in a
# commented-out cell, so this cell failed with NameError on a fresh kernel.
X_bajo = data_bajo.drop(columns='Star_Rating')
y_bajo = data_bajo['Star_Rating']

# Hold out 20% of the rows for the final evaluation (fixed seed for repeatability).
X_train_bajo, X_test_bajo, y_train_bajo, y_test_bajo = train_test_split(
    X_bajo, y_bajo, test_size=0.2, random_state=42
)

# NOTE(review): the comparison cell above ranked Bayesian Ridge first for this
# subset, yet a random forest is tuned here — confirm this is intentional.

# Base estimator
random_forest = RandomForestRegressor(random_state=42)

# Hyper-parameter grid to search
param_grid_random_forest = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Exhaustive 5-fold grid search on the training split, scored by negative MSE
grid_search_bajo = GridSearchCV(random_forest, param_grid_random_forest, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_bajo.fit(X_train_bajo, y_train_bajo)

# Best estimator found by the search
best_random_forest = grid_search_bajo.best_estimator_

# Evaluate on the held-out test split
y_pred_bajo = best_random_forest.predict(X_test_bajo)

mse_bajo = mean_squared_error(y_test_bajo, y_pred_bajo)
rmse_bajo = mse_bajo ** 0.5  # square root of the MSE
mae_bajo = mean_absolute_error(y_test_bajo, y_pred_bajo)
r2_bajo = r2_score(y_test_bajo, y_pred_bajo)

# Report the metrics
print(f"Error Cuadrático Medio (MSE): {mse_bajo}")
print(f"Raíz del Error Cuadrático Medio (RMSE): {rmse_bajo}")
print(f"Error Absoluto Medio (MAE): {mae_bajo}")
print(f"Coeficiente de Determinación (R²): {r2_bajo}")

# Refit a fresh model with the best parameters on all low-review rows
best_params_bajo = grid_search_bajo.best_params_
final_model_bajo = RandomForestRegressor(random_state=42, **best_params_bajo)
final_model_bajo.fit(X_bajo, y_bajo)

# Persist the trained model
dump(final_model_bajo, 'modelo_bajo.pkl')
print("Modelo bajo entrenado con todos los datos y guardado como modelo_bajo.pkl")

Error Cuadrático Medio (MSE): 0.8519854062500924
Raíz del Error Cuadrático Medio (RMSE): 0.9230305554260338
Error Absoluto Medio (MAE): 0.6697511587142893
Coeficiente de Determinación (R²): -0.02157128422848653
Modelo bajo entrenado con todos los datos y guardado como modelo_bajo.pkl


# Model Alto

In [126]:

# Rebuild the feature matrix and target from the high-review table and create
# the train/test split here. Previously these variables were only defined in a
# commented-out cell, so this cell failed with NameError on a fresh kernel.
X_alto = data_alto.drop(columns='Star_Rating')
y_alto = data_alto['Star_Rating']

# Hold out 20% of the rows for the final evaluation (fixed seed for repeatability).
X_train_alto, X_test_alto, y_train_alto, y_test_alto = train_test_split(
    X_alto, y_alto, test_size=0.2, random_state=42
)

# Base estimator
random_forest = RandomForestRegressor(random_state=42)

# Hyper-parameter grid to search
param_grid_random_forest = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Exhaustive 5-fold grid search on the training split, scored by negative MSE
grid_search_alto = GridSearchCV(random_forest, param_grid_random_forest, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_alto.fit(X_train_alto, y_train_alto)

# Best estimator found by the search
best_random_forest = grid_search_alto.best_estimator_

# Evaluate on the held-out test split
y_pred_alto = best_random_forest.predict(X_test_alto)

mse_alto = mean_squared_error(y_test_alto, y_pred_alto)
rmse_alto = mse_alto ** 0.5  # square root of the MSE
mae_alto = mean_absolute_error(y_test_alto, y_pred_alto)
r2_alto = r2_score(y_test_alto, y_pred_alto)

# Report the metrics
print(f"Error Cuadrático Medio (MSE): {mse_alto}")
print(f"Raíz del Error Cuadrático Medio (RMSE): {rmse_alto}")
print(f"Error Absoluto Medio (MAE): {mae_alto}")
print(f"Coeficiente de Determinación (R²): {r2_alto}")

# Refit a fresh model with the best parameters on all high-review rows
best_params_alto = grid_search_alto.best_params_
final_model_alto = RandomForestRegressor(random_state=42, **best_params_alto)
final_model_alto.fit(X_alto, y_alto)

# Persist the trained model
dump(final_model_alto, 'modelo_alto.pkl')
print("Modelo alto entrenado con todos los datos y guardado como modelo_alto.pkl")

Error Cuadrático Medio (MSE): 0.0474217383084859
Raíz del Error Cuadrático Medio (RMSE): 0.21776532852703137
Error Absoluto Medio (MAE): 0.17356273103242018
Coeficiente de Determinación (R²): 0.15638063566809146
Modelo alto entrenado con todos los datos y guardado como modelo_alto_final.pkl
