In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import json

In [2]:
df=pd.read_csv('train.csv')

In [3]:
cols_to_drop = [
        'Id', 'MSSubClass', 'Alley', 'MiscFeature', 'PoolQC', 'BsmtQual', 
        'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 
        'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 
        'BsmtFullBath', 'BsmtHalfBath', 
        'Heating', 'HeatingQC', 
        'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 
        '3SsnPorch', 'ScreenPorch', 
        'SaleType', 'SaleCondition', 'Fence', 'Electrical', 'FireplaceQu', 
        'Fireplaces', 'PoolArea', 'CentralAir', 'MoSold', 'PavedDrive', 'Condition1', 'Condition2',
        'HouseStyle', 'RoofStyle', 'RoofMatl', 'Functional', 'GarageFinish', 'GarageCond', 'GarageYrBlt'
        'Exterior2nd', 'MasVnrType', 'YearRemodAdd', 'MasVnrArea', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
        'TotRmsAbvGrd', 'GarageYrBlt', 'GarageArea', 'YrSold'
    ]


df = df.drop(columns=cols_to_drop, errors='ignore')

In [4]:
df = df[~(df[['FullBath', 'BedroomAbvGr', 'KitchenAbvGr']] == 0).any(axis=1)]

In [5]:
moda_del_barrio = df.groupby('Neighborhood')['LotFrontage'].agg(lambda x: x.mode()[0])

## Reemplazo los valores nulos por el valor de moda del barrio
df = df.copy()
df['LotFrontage'] = df['LotFrontage'].fillna(df['Neighborhood'].map(moda_del_barrio))

print(df['LotFrontage'].isnull().sum())

0


In [6]:
df.columns
print(len(df.columns))

30


In [7]:
cols_num = [col for col, dtype in df.dtypes.items() if dtype == 'float64' or dtype == 'int64']

df_num = df[cols_num].copy()

cols_ordinales = [
    'Utilities', 'ExterQual',  'KitchenQual', 'GarageQual', 
    'Neighborhood', 'ExterCond'
]

df_ord = df[cols_ordinales].copy() 

utilities_orden = {
    'ELO': 1,       # Electricity only
    'NoSeWa': 2,    # Electricity and Gas Only
    'NoSewr': 3,    # Electricity, Gas, and Water (Septic Tank)
    'AllPub': 4     # All public Utilities (E,G,W,& S)
}

df_ord['Utilities'] = df_ord['Utilities'].astype("category")
df_ord['Utilities'] = df_ord['Utilities'].cat.set_categories(utilities_orden, ordered=True)
df_ord['Utilities'] = df_ord['Utilities'].cat.codes

exterqual_orden = {
    'Po': 1,  # Poor
    'Fa': 2,  # Fair
    'TA': 3,  # Average/Typical
    'Gd': 4,  # Good
    'Ex': 5   # Excellent
}

df_ord['ExterQual'] = df_ord['ExterQual'].astype("category")
df_ord['ExterQual'] = df_ord['ExterQual'].cat.set_categories(exterqual_orden, ordered=True)
df_ord['ExterQual'] = df_ord['ExterQual'].cat.codes



kitchenqual_orden = {
    'Po': 1,  # Poor
    'Fa': 2,  # Fair
    'TA': 3,  # Typical/Average
    'Gd': 4,  # Good
    'Ex': 5   # Excellent
}

df_ord['KitchenQual'] = df_ord['KitchenQual'].astype("category")
df_ord['KitchenQual'] = df_ord['KitchenQual'].cat.set_categories(kitchenqual_orden, ordered=True)
df_ord['KitchenQual'] = df_ord['KitchenQual'].cat.codes






garagequal_orden = {
    'NoGarage': 0,  # No Garage
    'Po': 1,        # Poor
    'Fa': 2,        # Fair
    'TA': 3,        # Typical/Average
    'Gd': 4,        # Good
    'Ex': 5         # Excellent
}

df_ord['GarageQual'] = df_ord['GarageQual'].astype("category")
df_ord['GarageQual'] = df_ord['GarageQual'].cat.set_categories(garagequal_orden, ordered=True)
df_ord['GarageQual'] = df_ord['GarageQual'].cat.codes


extercond_orden = {
    'Po': 1,  # Poor
    'Fa': 2,  # Fair
    'TA': 3,  # Average/Typical
    'Gd': 4,  # Good
    'Ex': 5   # Excellent
}

df_ord['ExterCond'] = df_ord['ExterCond'].astype("category")
df_ord['ExterCond'] = df_ord['ExterCond'].cat.set_categories(extercond_orden, ordered=True)
df_ord['ExterCond'] = df_ord['ExterCond'].cat.codes

neighborhood_orden = {
    'Green': 1,
    'Yellow': 2,
    'Orange': 3
}

df_ord['Neighborhood'] = df_ord['Neighborhood'].astype("category")
df_ord['Neighborhood'] = df_ord['Neighborhood'].cat.set_categories(neighborhood_orden, ordered=True)
df_ord['Neighborhood'] = df_ord['Neighborhood'].cat.codes



cols_nominales = [
        'MSZoning', 'Street', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 
        'BldgType', 'Exterior1st', 'Foundation', 
         'GarageType'
    ]

df_nomi = df[cols_nominales].copy()

df_nomi = pd.get_dummies(df_nomi, drop_first=True)

In [8]:
df_tree_ready = pd.concat([df_num, df_ord, df_nomi], axis=1 )

In [9]:
print(df_tree_ready.isnull().sum()[df_tree_ready.isnull().sum() > 0])

Series([], dtype: int64)


In [10]:
X = df_tree_ready.drop(columns=['SalePrice'])
y = df_tree_ready['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
output_folder = "processed_data/rf"

X_train.to_pickle(f"{output_folder}/Xrf_train.pkl")
X_test.to_pickle(f"{output_folder}/Xrf_test.pkl")

# Export train and test target variables
y_train.to_pickle(f"{output_folder}/yrf_train.pkl")
y_test.to_pickle(f"{output_folder}/yrf_test.pkl")

In [12]:
feature_names = list(X_train.columns)
print()

# Save to JSON
with open("xg_features.json", "w") as f:
    json.dump(feature_names, f)




In [13]:
print(feature_names)

['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'GrLivArea', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'GarageCars', 'MiscVal', 'Utilities', 'ExterQual', 'KitchenQual', 'GarageQual', 'Neighborhood', 'ExterCond', 'MSZoning_FV', 'MSZoning_RH', 'MSZoning_RL', 'MSZoning_RM', 'Street_Pave', 'LotShape_IR2', 'LotShape_IR3', 'LotShape_Reg', 'LandContour_HLS', 'LandContour_Low', 'LandContour_Lvl', 'LotConfig_CulDSac', 'LotConfig_FR2', 'LotConfig_FR3', 'LotConfig_Inside', 'LandSlope_Mod', 'LandSlope_Sev', 'BldgType_2fmCon', 'BldgType_Duplex', 'BldgType_Twnhs', 'BldgType_TwnhsE', 'Exterior1st_AsphShn', 'Exterior1st_BrkComm', 'Exterior1st_BrkFace', 'Exterior1st_CBlock', 'Exterior1st_CemntBd', 'Exterior1st_HdBoard', 'Exterior1st_ImStucc', 'Exterior1st_MetalSd', 'Exterior1st_Plywood', 'Exterior1st_Stone', 'Exterior1st_Stucco', 'Exterior1st_VinylSd', 'Exterior1st_Wd Sdng', 'Exterior1st_WdShing', 'Foundation_CBlock', 'Foundation_PConc', 'Foundation_Slab', 'Foundatio

In [14]:
print(list(X_test.columns))

['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'GrLivArea', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'GarageCars', 'MiscVal', 'Utilities', 'ExterQual', 'KitchenQual', 'GarageQual', 'Neighborhood', 'ExterCond', 'MSZoning_FV', 'MSZoning_RH', 'MSZoning_RL', 'MSZoning_RM', 'Street_Pave', 'LotShape_IR2', 'LotShape_IR3', 'LotShape_Reg', 'LandContour_HLS', 'LandContour_Low', 'LandContour_Lvl', 'LotConfig_CulDSac', 'LotConfig_FR2', 'LotConfig_FR3', 'LotConfig_Inside', 'LandSlope_Mod', 'LandSlope_Sev', 'BldgType_2fmCon', 'BldgType_Duplex', 'BldgType_Twnhs', 'BldgType_TwnhsE', 'Exterior1st_AsphShn', 'Exterior1st_BrkComm', 'Exterior1st_BrkFace', 'Exterior1st_CBlock', 'Exterior1st_CemntBd', 'Exterior1st_HdBoard', 'Exterior1st_ImStucc', 'Exterior1st_MetalSd', 'Exterior1st_Plywood', 'Exterior1st_Stone', 'Exterior1st_Stucco', 'Exterior1st_VinylSd', 'Exterior1st_Wd Sdng', 'Exterior1st_WdShing', 'Foundation_CBlock', 'Foundation_PConc', 'Foundation_Slab', 'Foundatio