In [1]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
# NOTE(review): this binds the top-level matplotlib package to `plt`; the usual
# plotting alias is `import matplotlib.pyplot as plt` -- confirm which was intended.
import matplotlib as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw training table; the path is relative to the notebook's working directory.
ruta_train = 'dataset/train.csv'
df = pd.read_csv(ruta_train)

In [3]:
# Columns removed from the working frame before any imputation or encoding.
cols_to_drop = [
    'Id', 'Alley', 'MiscFeature', 'PoolQC',
    # basement-related columns
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
    'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'BsmtFullBath', 'BsmtHalfBath',
    # heating
    'Heating', 'HeatingQC',
    # deck / porch areas
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
    '3SsnPorch', 'ScreenPorch',
    # remaining columns
    'SaleType', 'SaleCondition', 'Fence', 'Electrical', 'FireplaceQu', 'Fireplaces', 'PoolArea', 'CentralAir',
]

# Only drop the names that are actually present, so a missing column is not an error.
cols_presentes = df.columns.intersection(cols_to_drop)
df = df.drop(columns=cols_presentes)

In [4]:
# Numeric columns whose missing values are replaced by 0.
cols_null_a_cero = ['MasVnrArea', 'GarageYrBlt']

df[cols_null_a_cero] = df[cols_null_a_cero].fillna(0)

# Report the columns that still contain missing values, with their counts.
nulos_por_columna = df.isnull().sum()
print(nulos_por_columna[nulos_por_columna > 0])

LotFrontage     259
MasVnrType      872
GarageType       81
GarageFinish     81
GarageQual       81
GarageCond       81
dtype: int64


In [5]:
def _moda_o_nan(valores):
    """Return the first (smallest) mode of `valores`, or NaN if it has no non-null entries.

    `Series.mode()` ignores NaN and returns an empty Series when nothing is left,
    so indexing its first element directly would raise for an all-missing group.
    """
    modas = valores.mode()
    return modas.iloc[0] if not modas.empty else np.nan


# Most frequent LotFrontage within each neighbourhood.
moda_del_barrio = df.groupby('Neighborhood')['LotFrontage'].agg(_moda_o_nan)

# A neighbourhood with no observed LotFrontage falls back to the overall mode,
# so the imputation below cannot leave gaps for that neighbourhood.
moda_del_barrio = moda_del_barrio.fillna(_moda_o_nan(df['LotFrontage']))

# Replace the missing values with the mode of the row's neighbourhood.
df = df.copy()
df['LotFrontage'] = df['LotFrontage'].fillna(df['Neighborhood'].map(moda_del_barrio))

# Number of LotFrontage values still missing after imputation.
print(df['LotFrontage'].isnull().sum())

0


In [6]:
# Number of columns left in the working frame.
print(df.shape[1])

51


In [8]:
# Numeric features: every column stored as float64 or int64, in the frame's column order.
df_num = df.select_dtypes(include=['float64', 'int64']).copy()
cols_num = df_num.columns.tolist()

cols_ordinales = [
    'Utilities', 'ExterQual', 'KitchenQual', 'Functional', 'GarageFinish', 'GarageQual', 'GarageCond',
    'Neighborhood', 'ExterCond'
]

df_ord = df[cols_ordinales].copy()


def codificar_ordinal(serie, orden, relleno=None):
    """Encode a categorical Series with the integer ranks given in `orden`.

    Parameters
    ----------
    serie : pd.Series
        Raw categorical values.
    orden : dict
        Maps each category label to its integer rank.
    relleno : str, optional
        Label substituted for missing values before mapping.

    Returns
    -------
    pd.Series
        int8 ranks taken from `orden`. Values that are not keys of `orden`
        (including NaN left after the optional fill) are encoded as -1 and
        reported through a warning instead of passing silently.
    """
    valores = serie if relleno is None else serie.fillna(relleno)
    codigos = valores.map(orden)
    sin_mapear = codigos.isna()
    if sin_mapear.any():
        ejemplos = sorted(valores[sin_mapear].dropna().astype(str).unique())[:5]
        warnings.warn(
            f"{serie.name}: {int(sin_mapear.sum())} value(s) not covered by the ordinal "
            f"mapping (examples: {ejemplos}); encoded as -1.",
            stacklevel=2,
        )
    return codigos.fillna(-1).astype('int8')


utilities_orden = {
    'ELO': 1,       # Electricity only
    'NoSeWa': 2,    # Electricity and Gas Only
    'NoSewr': 3,    # Electricity, Gas, and Water (Septic Tank)
    'AllPub': 4     # All public Utilities (E,G,W,& S)
}

# Five-level quality scale shared by the quality/condition columns.
_escala_calidad = {
    'Po': 1,  # Poor
    'Fa': 2,  # Fair
    'TA': 3,  # Typical/Average
    'Gd': 4,  # Good
    'Ex': 5   # Excellent
}

exterqual_orden = dict(_escala_calidad)
kitchenqual_orden = dict(_escala_calidad)
extercond_orden = dict(_escala_calidad)

functional_orden = {
    'Sal': 1,  # Salvage only
    'Sev': 2,  # Severely Damaged
    'Maj2': 3, # Major Deductions 2
    'Maj1': 4, # Major Deductions 1
    'Mod': 5,  # Moderate Deductions
    'Min2': 6, # Minor Deductions 2
    'Min1': 7, # Minor Deductions 1
    'Typ': 8   # Typical Functionality
}

garagefinish_orden = {
    'NoGarage': 0,   # No Garage
    'Unf': 1,        # Unfinished
    'RFn': 2,        # Rough Finished
    'Fin': 3         # Finished
}

# Same quality scale, with an explicit level 0 for houses without a garage.
garagequal_orden = {'NoGarage': 0, **_escala_calidad}
garagecond_orden = {'NoGarage': 0, **_escala_calidad}

# NOTE(review): these keys look like tier labels rather than raw neighbourhood
# names. Unless 'Neighborhood' already holds 'Green'/'Yellow'/'Orange', every row
# is unmapped and the column becomes a constant -1 (a warning is emitted below).
# Presumably a neighbourhood -> tier assignment step is missing -- confirm.
neighborhood_orden = {
    'Green': 1,
    'Yellow': 2,
    'Orange': 3
}

# column -> (rank mapping, label used for missing values).
# The garage columns still contain NaN at this point; those rows are assumed to be
# houses without a garage, matching the 'NoGarage' level the mappings declare.
especificaciones_ordinales = {
    'Utilities': (utilities_orden, None),
    'ExterQual': (exterqual_orden, None),
    'KitchenQual': (kitchenqual_orden, None),
    'Functional': (functional_orden, None),
    'GarageFinish': (garagefinish_orden, 'NoGarage'),
    'GarageQual': (garagequal_orden, 'NoGarage'),
    'GarageCond': (garagecond_orden, 'NoGarage'),
    'ExterCond': (extercond_orden, None),
    'Neighborhood': (neighborhood_orden, None),
}

# Encode with the ranks written in the dicts. The previous category-code approach
# produced 0-based positions and turned missing garage values into -1.
for columna, (orden, relleno) in especificaciones_ordinales.items():
    df_ord[columna] = codificar_ordinal(df_ord[columna], orden, relleno)



# Unordered categorical features to one-hot encode. Each name must appear once:
# a repeated name makes df[cols_nominales] carry the column twice, and get_dummies
# then emits duplicate indicator columns.
cols_nominales = [
    'MSZoning', 'Street', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation',
    'GarageType', 'PavedDrive'
]
assert len(cols_nominales) == len(set(cols_nominales)), "duplicate names in cols_nominales"

# get_dummies leaves NaN rows as all zeros, which with drop_first=True is the same
# encoding as the dropped baseline level. Give missing values their own label so
# they stay distinguishable.
df_nomi = df[cols_nominales].fillna('Missing')

df_nomi = pd.get_dummies(df_nomi, drop_first=True)

In [9]:
# Assemble the model-ready frame: numeric, ordinal and one-hot blocks side by side.
bloques = (df_num, df_ord, df_nomi)
df_tree_ready = pd.concat(bloques, axis='columns')

In [10]:
# Sanity check: list any column of the assembled frame that still has missing values.
nulos_restantes = df_tree_ready.isnull().sum()
print(nulos_restantes[nulos_restantes > 0])

Series([], dtype: int64)


In [11]:
# Separate the target from the features.
objetivo = 'SalePrice'
y = df_tree_ready[objetivo]
X = df_tree_ready.drop(columns=[objetivo])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Destination folder for the pickled train/test split.
output_folder = "dataset/processed_data/rf"

# to_pickle does not create missing directories; make sure the folder exists so a
# fresh checkout can run the notebook end to end.
Path(output_folder).mkdir(parents=True, exist_ok=True)

# Export train and test feature matrices
X_train.to_pickle(f"{output_folder}/Xrf_train.pkl")
X_test.to_pickle(f"{output_folder}/Xrf_test.pkl")

# Export train and test target variables
y_train.to_pickle(f"{output_folder}/yrf_train.pkl")
y_test.to_pickle(f"{output_folder}/yrf_test.pkl")