In [None]:
import pandas as pd
import joblib
import os
import tarfile
from scipy.stats import randint
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Local-testing alternative (kept for reference, not executed): instead of the
# container-local input directory, the same four CSVs can be read directly
# from the processed-data S3 bucket, e.g.
#   bucket_name = 'mpsgroupstack-processeddatabucket4e25d3b7-j2zcvjx9ql0b'
#   X_train = pd.read_csv(f's3://{bucket_name}/processed/X_train.csv')
#   X_test  = pd.read_csv(f's3://{bucket_name}/processed/X_test.csv')
#   y_train = pd.read_csv(f's3://{bucket_name}/processed/y_train.csv').squeeze("columns")
#   y_test  = pd.read_csv(f's3://{bucket_name}/processed/y_test.csv').squeeze("columns")

# Directory the input CSVs are read from, and directory the model artifacts
# are written to by the final cell.
input_dir = "/opt/ml/processing/input"
model_dir = "/opt/ml/model"

# Feature matrices stay as DataFrames.
X_train = pd.read_csv(f"{input_dir}/X_train.csv")
X_test = pd.read_csv(f"{input_dir}/X_test.csv")

# Targets: collapse only the column axis. A bare .squeeze() would also collapse
# the row axis, turning a one-row file into a scalar (or a one-row,
# multi-column file into a Series indexed by column name).
y_train = pd.read_csv(f"{input_dir}/y_train.csv").squeeze("columns")
y_test = pd.read_csv(f"{input_dir}/y_test.csv").squeeze("columns")

# Fail fast on mismatched feature/target files instead of deep inside fitting
# or scoring.
assert len(X_train) == len(y_train), "X_train and y_train row counts differ"
assert len(X_test) == len(y_test), "X_test and y_test row counts differ"

print(y_train.shape, y_test.shape)


(16512,) (4128,)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1.272587,-1.372811,0.34849,0.222569,0.211228,0.768276,0.322906,-0.326196,0.0,0.0,0.0,0.0,1.0
1,0.709162,-0.876696,1.618118,0.340293,0.593094,-0.098901,0.672027,-0.035843,0.0,0.0,0.0,0.0,1.0
2,-0.447603,-0.460146,-1.95271,-0.342597,-0.495226,-0.449818,-0.430461,0.144701,0.0,0.0,0.0,0.0,1.0
3,1.232698,-1.382172,0.586545,-0.56149,-0.409306,-0.007434,-0.380587,-1.017864,0.0,0.0,0.0,0.0,1.0
4,-0.108551,0.532084,1.142008,-0.119565,-0.256559,-0.485877,-0.314962,-0.171488,0.0,1.0,0.0,0.0,0.0


In [24]:
# Sampling distributions handed to the randomized hyper-parameter search.
# Each entry is a frozen scipy `randint(low, high)` distribution, which draws
# integers from the half-open interval [low, high).
param_grid = dict(
    n_estimators=randint(20, 100),
    max_depth=randint(5, 12),
    min_samples_split=randint(2, 50),
)

In [25]:
# Base regressor; the fixed random_state seeds the forest.
model = RandomForestRegressor(random_state=42, n_jobs=-1)

# Randomized search over `param_grid`: 50 sampled candidates, each scored by
# R^2 under 10-fold cross-validation, with a fixed seed for the sampling.
# NOTE(review): both the forest and the search request n_jobs=-1 -- confirm
# this nesting does not oversubscribe the CPUs on the target instance.
randomS = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=50,
    cv=10,
    scoring='r2',
    n_jobs=-1,
    random_state=42,
)
randomS.fit(X_train, y_train)

# Report the winning hyper-parameter combination.
print("Best parameters:", randomS.best_params_, sep="\n")

Best parameters:
{'max_depth': 11, 'min_samples_split': 9, 'n_estimators': 97}


In [26]:
# Pull the best estimator out of the finished search and score it on the
# training set (an in-sample fit measure, not a generalization estimate).
best_model = randomS.best_estimator_
y_pred_train = best_model.predict(X_train)

mse = mean_squared_error(y_true=y_train, y_pred=y_pred_train)
r2 = r2_score(y_true=y_train, y_pred=y_pred_train)

print(f'Mean Squared Error: {mse}', f'R^2 Score: {r2}', sep='\n')

Mean Squared Error: 1674005042.5748951
R^2 Score: 0.8747731652258117


In [27]:
# Score the selected estimator on the test split. Note that `mse` and `r2`
# are reassigned here, replacing any values previously stored under those names.
y_pred_test = best_model.predict(X_test)

mse = mean_squared_error(y_true=y_test, y_pred=y_pred_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred_test)

print(f'Mean Squared Error: {mse}', f'R^2 Score: {r2}', sep='\n')

Mean Squared Error: 2721477737.5559387
R^2 Score: 0.7923184454055507


In [None]:


# Persist the selected estimator and package it as a gzipped tarball.

# joblib.dump does not create missing parent directories, so make sure the
# output directory exists before writing into it.
os.makedirs(model_dir, exist_ok=True)

# Serialize the fitted estimator as model.joblib.
model_path = os.path.join(model_dir, "model.joblib")
joblib.dump(best_model, model_path)

assert os.path.exists(model_path), "Model file not found before archiving!"

# Wrap the serialized model in model.tar.gz; `arcname` stores it at the
# archive root as "model.joblib" rather than under its absolute path.
# NOTE(review): the archive is written into model_dir next to model.joblib --
# if the hosting platform also archives that directory itself, confirm the
# nested tarball is intended.
with tarfile.open(os.path.join(model_dir, "model.tar.gz"), mode="w:gz") as archive:
    archive.add(model_path, arcname="model.joblib")

print("Model packaged correctly.")
