In [77]:
import pandas as pd
from scipy.stats import randint
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
)
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np  
import os

In [None]:
# Dataset location used for the deployed (non-local) run.
input_path = "/opt/ml/processing/input/data.csv"

# Local-testing alternative: read the same CSV straight from S3 instead.
# Kept as real comments so no throwaway string literal is evaluated.
#   bucket_name = 'mpsgroupstack-rawdatabucket57f26c03-e7evfziygnoz'
#   file_key = 'data.csv'
#   s3_uri = f's3://{bucket_name}/{file_key}'
#   df = pd.read_csv(s3_uri)

# Load the raw dataset into a DataFrame.
df = pd.read_csv(input_path)

# Preview the first rows to confirm the file loaded correctly.
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [69]:
# Summarize column dtypes and non-null counts to spot missing values
# (the recorded output shows total_bedrooms with 20433 of 20640 non-null).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [70]:
# Pull out the regression target first, then keep every other column as a predictor.
y = df["median_house_value"]
X = df.drop(columns=[y.name])
print(X.shape, y.shape)

(20640, 9) (20640,)


In [71]:
# The single categorical predictor is named explicitly; the numeric predictors
# are discovered from the int64/float64 columns of X.
cat_features = ['ocean_proximity']
numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns
num_features = list(numeric_columns)
print("Numéricas:", num_features)
print("Categóricas:", cat_features)

Numéricas: ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']
Categóricas: ['ocean_proximity']


In [72]:
# Numeric branch: fill missing values with the column median, then standardize.
num_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent category
# (optional safeguard for NaNs in the categorical column), then one-hot encode,
# ignoring categories not seen during fitting.
cat_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [73]:
# Send the numeric and categorical column groups through their own branches.
column_branches = [
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [74]:
# Wrap the column preprocessing in a pipeline with a single 'preprocessor' step.
processing_pipe = Pipeline(steps=[('preprocessor', preprocessor)])

In [75]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(X_train.shape, X_test.shape)

(16512, 9) (4128, 9)


In [None]:
# Fit the preprocessing steps on the training split only and transform it in
# the same call.
X_train_processed = processing_pipe.fit_transform(X_train)
# Apply the already-fitted steps to the test split (no refitting), so the test
# rows do not influence the imputation, scaling or encoding parameters.
X_test_processed = processing_pipe.transform(X_test)


array([[ 1.27258656, -1.3728112 ,  0.34849025, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.70916212, -0.87669601,  1.61811813, ...,  0.        ,
         0.        ,  1.        ],
       [-0.44760309, -0.46014647, -1.95271028, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ...,  0.        ,
         0.        ,  0.        ],
       [-1.18553953,  0.90651045, -1.07984112, ...,  0.        ,
         0.        ,  0.        ],
       [-1.41489815,  0.99543676,  1.85617335, ...,  0.        ,
         1.        ,  0.        ]])

In [None]:
# Disabled local-testing variant of the export step: writes the processed
# splits to a relative "processedData" directory.
#
#   output_dir = "processedData"
#   os.makedirs(output_dir, exist_ok=True)
#
#   pd.DataFrame(X_train_processed).to_csv(f"{output_dir}/X_train.csv", index=False)
#   pd.DataFrame(X_test_processed).to_csv(f"{output_dir}/X_test.csv", index=False)
#   y_train.to_csv(f"{output_dir}/y_train.csv", index=False)
#   y_test.to_csv(f"{output_dir}/y_test.csv", index=False)

In [None]:
# Output directory for the deployed run (the original note marks it as the
# location used for S3).
output_dir = "/opt/ml/processing/output"

# Make sure the destination exists before writing; a missing directory would
# otherwise make the first to_csv call fail. exist_ok avoids an error when the
# directory is already there.
os.makedirs(output_dir, exist_ok=True)

# The processed feature matrices are wrapped in DataFrames, so their CSV
# headers are positional integers; the targets keep their Series name.
# index=False leaves the row index out of every file.
pd.DataFrame(X_train_processed).to_csv(f"{output_dir}/X_train.csv", index=False)
pd.DataFrame(X_test_processed).to_csv(f"{output_dir}/X_test.csv", index=False)
y_train.to_csv(f"{output_dir}/y_train.csv", index=False)
y_test.to_csv(f"{output_dir}/y_test.csv", index=False)

print("Preprocessing completed and files saved.")
