In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib
import json

In [2]:
df=pd.read_csv('train.csv')

In [3]:
cols_to_drop = [
        'Id', 'MSSubClass', 'Alley', 'MiscFeature', 'PoolQC', 'BsmtQual', 
        'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 
        'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 
        'BsmtFullBath', 'BsmtHalfBath', 
        'Heating', 'HeatingQC', 
        'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 
        '3SsnPorch', 'ScreenPorch', 
        'SaleType', 'SaleCondition', 'Fence', 'Electrical', 'FireplaceQu', 
        'Fireplaces', 'PoolArea', 'CentralAir', 'MoSold', 'PavedDrive', 'Condition1', 'Condition2',
        'HouseStyle', 'RoofStyle', 'RoofMatl', 'Functional', 'GarageFinish', 'GarageCond', 'GarageYrBlt'
        'Exterior2nd', 'MasVnrType', 'YearRemodAdd', 'MasVnrArea', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
        'TotRmsAbvGrd', 'GarageYrBlt', 'GarageArea', 'YrSold', 'KitchenAbvGr'
    ]


df = df.drop(columns=cols_to_drop, errors='ignore')

In [4]:
df = df[~(df[['FullBath', 'BedroomAbvGr']] == 0).any(axis=1)]

In [5]:
moda_del_barrio = df.groupby('Neighborhood')['LotFrontage'].agg(lambda x: x.mode()[0])

## Reemplazo los valores nulos por el valor de moda del barrio
df = df.copy()
df['LotFrontage'] = df['LotFrontage'].fillna(df['Neighborhood'].map(moda_del_barrio))

print(df['LotFrontage'].isnull().sum())

0


In [6]:
df.columns
print(len(df.columns))

29


In [7]:
cols_num = [col for col, dtype in df.dtypes.items() if dtype == 'float64' or dtype == 'int64']

df_num = df[cols_num].copy()

cols_ordinales = [
    'Utilities', 'ExterQual',  'KitchenQual', 'GarageQual', 
    'Neighborhood', 'ExterCond'
]

df_ord = df[cols_ordinales].copy() 

utilities_orden = {
    'ELO': 1,       # Electricity only
    'NoSeWa': 2,    # Electricity and Gas Only
    'NoSewr': 3,    # Electricity, Gas, and Water (Septic Tank)
    'AllPub': 4     # All public Utilities (E,G,W,& S)
}

df_ord['Utilities'] = df_ord['Utilities'].astype("category")
df_ord['Utilities'] = df_ord['Utilities'].cat.set_categories(utilities_orden, ordered=True)
df_ord['Utilities'] = df_ord['Utilities'].cat.codes

exterqual_orden = {
    'Po': 1,  # Poor
    'Fa': 2,  # Fair
    'TA': 3,  # Average/Typical
    'Gd': 4,  # Good
    'Ex': 5   # Excellent
}

df_ord['ExterQual'] = df_ord['ExterQual'].astype("category")
df_ord['ExterQual'] = df_ord['ExterQual'].cat.set_categories(exterqual_orden, ordered=True)
df_ord['ExterQual'] = df_ord['ExterQual'].cat.codes


kitchenqual_orden = {
    'Po': 1,  # Poor
    'Fa': 2,  # Fair
    'TA': 3,  # Typical/Average
    'Gd': 4,  # Good
    'Ex': 5   # Excellent
}

df_ord['KitchenQual'] = df_ord['KitchenQual'].astype("category")
df_ord['KitchenQual'] = df_ord['KitchenQual'].cat.set_categories(kitchenqual_orden, ordered=True)
df_ord['KitchenQual'] = df_ord['KitchenQual'].cat.codes






garagequal_orden = {
    'NoGarage': 0,  # No Garage
    'Po': 1,        # Poor
    'Fa': 2,        # Fair
    'TA': 3,        # Typical/Average
    'Gd': 4,        # Good
    'Ex': 5         # Excellent
}

df_ord['GarageQual'] = df_ord['GarageQual'].astype("category")
df_ord['GarageQual'] = df_ord['GarageQual'].cat.set_categories(garagequal_orden, ordered=True)
df_ord['GarageQual'] = df_ord['GarageQual'].cat.codes



extercond_orden = {
    'Po': 1,  # Poor
    'Fa': 2,  # Fair
    'TA': 3,  # Average/Typical
    'Gd': 4,  # Good
    'Ex': 5   # Excellent
}

df_ord['ExterCond'] = df_ord['ExterCond'].astype("category")
df_ord['ExterCond'] = df_ord['ExterCond'].cat.set_categories(extercond_orden, ordered=True)
df_ord['ExterCond'] = df_ord['ExterCond'].cat.codes

neighborhood_orden = {
    'Green': 1,
    'Yellow': 2,
    'Orange': 3
}

df_ord['Neighborhood'] = df_ord['Neighborhood'].astype("category")
df_ord['Neighborhood'] = df_ord['Neighborhood'].cat.set_categories(neighborhood_orden, ordered=True)
df_ord['Neighborhood'] = df_ord['Neighborhood'].cat.codes



cols_nominales = [
        'MSZoning', 'Street', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 
        'BldgType', 'Exterior1st', 'Foundation', 
         'GarageType'
    ]

df_nomi = df[cols_nominales].copy()

df_nomi = pd.get_dummies(df_nomi, drop_first=True)

In [8]:
df_concat = pd.concat([df_num, df_nomi, df_ord], axis=1)

In [9]:
predictores_elegidos = ['LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'GrLivArea',
       'GarageCars', 'Exterior1st_BrkFace', 'KitchenQual']

In [10]:
X = df_concat.drop(columns=['SalePrice'])
y = df_concat['SalePrice']

X = X[predictores_elegidos]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
print(list(X_train.columns))

['LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'GrLivArea', 'GarageCars', 'Exterior1st_BrkFace', 'KitchenQual']


In [13]:
## Para los modelos lineales debemos normalizar y escalar los datos numericos 

cols_para_sesgamiento = X_train[['LotArea', 'GrLivArea']]




sesgamiento = cols_para_sesgamiento.skew().sort_values(ascending=False)

cols_sesgadas = sesgamiento[abs(sesgamiento) > 0.5].index

print(cols_sesgadas)

Index(['LotArea', 'GrLivArea'], dtype='object')


In [14]:
X_train[cols_sesgadas] = np.log1p(X_train[cols_sesgadas])
X_test[cols_sesgadas] = np.log1p(X_test[cols_sesgadas])

In [15]:
y_train = np.log1p(y_train)
y_test = np.log1p(y_test)

In [16]:
scaler = StandardScaler()

cols_para_escalar = ['LotArea', 'OverallQual', 'YearBuilt', 'GrLivArea', 'GarageCars', 'KitchenQual', 'OverallCond']

X_train[cols_para_escalar] = scaler.fit_transform(X_train[cols_para_escalar])
X_test[cols_para_escalar] = scaler.transform(X_test[cols_para_escalar])

joblib.dump(scaler, 'linearScaler.pkl')

['linearScaler.pkl']

In [17]:
output_folder = "processed_data/linear"

X_train.to_pickle(f"{output_folder}/Xlm_train.pkl")
X_test.to_pickle(f"{output_folder}/Xlm_test.pkl")

# Export train and test target variables
y_train.to_pickle(f"{output_folder}/ylm_train.pkl")
y_test.to_pickle(f"{output_folder}/ylm_test.pkl")

In [18]:
feature_names = list(X_train.columns)
print()

# Save to JSON
with open("en_features.json", "w") as f:
    json.dump(feature_names, f)




In [19]:
print(list(X_train.columns))

['LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'GrLivArea', 'GarageCars', 'Exterior1st_BrkFace', 'KitchenQual']
