In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# RandomForest (train raw)

In [5]:
# Cargar datos de entrenamiento
df_train = pd.read_csv("../data/train_reviews_merged_all_mota.csv")
df_train.sample(5)

Unnamed: 0,review_stars,review_useful,review_funny,review_cool,review_date,buss_postal_code,buss_latitude,buss_longitude,buss_stars,buss_review_count,...,users_compliment_more,users_compliment_profile,users_compliment_cute,users_compliment_list,users_compliment_note,users_compliment_plain,users_compliment_cool,users_compliment_funny,users_compliment_writer,users_compliment_photos
52135,2.0,-0.391093,-0.150312,-0.251644,73,869,-0.387363,-2.027087,4.0,5.731823,...,-0.026385,-0.014603,-0.013422,-0.007925,-0.028437,-0.032377,-0.038728,-0.038728,-0.042674,-0.016644
235414,1.0,-0.391093,-0.150312,-0.251644,96,754,-1.146669,-0.048588,4.0,1.201035,...,0.023091,-0.014603,-0.013422,-0.007925,0.002294,-0.032377,-0.038728,-0.038728,-0.042674,-0.016644
312903,5.0,-0.391093,-0.150312,-0.251644,139,23,0.544074,0.958906,3.5,-0.232659,...,-0.026385,-0.014603,-0.013422,-0.007925,-0.028437,-0.026934,-0.038728,-0.038728,-0.042674,-0.016644
244084,5.0,-0.062341,0.300774,0.255613,78,248,0.545515,0.946288,4.0,0.17131,...,-0.026385,-0.014603,-0.013422,-0.007925,-0.028437,-0.032377,-0.038728,-0.038728,-0.042674,-0.016644
561512,5.0,0.595161,-0.150312,0.255613,120,402,-1.459937,0.454874,4.0,2.15155,...,-0.026385,0.027257,-0.013422,-0.007925,-0.018193,-0.032377,-0.038728,-0.038728,-0.042674,-0.009922


In [6]:
df_train.columns

Index(['review_stars', 'review_useful', 'review_funny', 'review_cool',
       'review_date', 'buss_postal_code', 'buss_latitude', 'buss_longitude',
       'buss_stars', 'buss_review_count', 'buss_state_AB', 'buss_state_AZ',
       'buss_state_CA', 'buss_state_CO', 'buss_state_DE', 'buss_state_FL',
       'buss_state_ID', 'buss_state_IL', 'buss_state_IN', 'buss_state_LA',
       'buss_state_MO', 'buss_state_NJ', 'buss_state_NV', 'buss_state_PA',
       'buss_state_TN', 'buss_state_VI', 'buss_state_WA', 'Beauty & Spas',
       'Automotive', 'Food & Dining', 'Health & Medical', 'Education',
       'Entertainment', 'users_review_count', 'users_yelping_since',
       'users_useful', 'users_funny', 'users_cool', 'users_elite',
       'users_friends', 'users_fans', 'users_average_stars',
       'users_compliment_hot', 'users_compliment_more',
       'users_compliment_profile', 'users_compliment_cute',
       'users_compliment_list', 'users_compliment_note',
       'users_compliment_plain', 'u

In [10]:
# Combinar las características numéricas con el embedding del texto
X_train = df_train.drop(columns=['review_stars'])
y_train = df_train['review_stars']

# Dividir el conjunto de entrenamiento en entrenamiento y validación
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Entrenar el modelo RandomForestRegressor
model = RandomForestRegressor(n_estimators=200, warm_start=True, random_state=42, verbose=1, n_jobs=-1)
model.fit(X_train_final, y_train_final)

# Predecir en el conjunto de validación para evaluar
y_pred_val = model.predict(X_val)
print("MAE en validación:", mean_absolute_error(y_val, y_pred_val))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 10.6min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 11.3min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    3.9s


MAE en validación: 0.740231092040809


[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    4.2s finished


In [11]:
# Ahora, procesar el conjunto de test (sin ajustar de nuevo el vectorizador ni el SVD)
df_test = pd.read_csv('../data/test_reviews_merged_all_mota.csv')
df_test.sample(5)

Unnamed: 0,review_id,review_useful,review_funny,review_cool,review_date,buss_postal_code,buss_latitude,buss_longitude,buss_stars,buss_review_count,...,users_compliment_more,users_compliment_profile,users_compliment_cute,users_compliment_list,users_compliment_note,users_compliment_plain,users_compliment_cool,users_compliment_funny,users_compliment_writer,users_compliment_photos
62384,RjdH6HhwlEJOBXoYdukWaA,-0.391093,-0.150312,0.255613,113,268,0.567665,0.914187,4.0,0.115863,...,-0.026385,-0.014603,-0.013422,-0.007925,-0.028437,-0.032377,-0.038728,-0.038728,-0.042674,-0.016644
97691,HWQtdYfbC0GqL97yUmfuVw,0.923912,-0.150312,-0.251644,55,807,-0.752753,-1.438323,2.0,-0.296027,...,-0.026385,-0.014603,-0.013422,-0.007925,-0.028437,-0.032377,-0.038728,-0.038728,-0.042674,-0.016644
282086,6YCYI78yl9pmMdBeDaoKbA,-0.062341,-0.150312,-0.251644,50,444,-1.477082,0.435619,4.5,0.868354,...,-0.026385,-0.014603,-0.013422,-0.007925,-0.028437,-0.032377,-0.038728,-0.038728,-0.042674,-0.016644
224519,MBADgyQdFTTdHkn_w2X-Gw,-0.391093,-0.150312,-0.251644,106,845,0.475272,-2.03341,3.5,2.468388,...,-0.026385,-0.014603,-0.013422,-0.007925,-0.028437,-0.026934,-0.038728,-0.038728,-0.042674,-0.016644
324759,jdlUCMiWQ9YitBGUGpDUwQ,1.910166,0.300774,0.255613,153,754,-1.143767,-0.047962,3.5,5.121909,...,-0.026385,0.027257,0.150927,-0.007925,0.002294,0.016609,-0.003426,-0.003426,0.102956,0.010244


In [12]:
X_test = df_test.drop(columns=['review_id'])

# Realizar las predicciones en el conjunto de test
predictions = model.predict(X_test)

# Crear el DataFrame con el formato requerido: review_id, stars
df_output = pd.DataFrame({
    'review_id': df_test['review_id'],
    'stars': predictions.round(0)  
})

# Exportar el CSV con las predicciones
df_output.to_csv('../out/random_forest_review_predictions.csv', index=False)
"Predicciones exportadas a 'test_review_predictions.csv'"

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.7s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    8.4s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    9.1s finished


"Predicciones exportadas a 'test_review_predictions.csv'"