# Feature Engineering

## Importando bibliotecas

In [62]:
import pathlib
import pickle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

## Importando os dados do notebook limpo

In [63]:
DATA_DIR = pathlib.Path.cwd().parent / 'data'
print(DATA_DIR)

/Users/luccahiratsuca/Github/Insper/4 Semestre/Machine Learning/ames-project/data


In [64]:
clean_data_path = DATA_DIR / 'processed' / 'ames_clean.pkl'

In [65]:
df = pd.read_pickle(clean_data_path)
df.head()

Unnamed: 0,MS.SubClass,MS.Zoning,Lot.Frontage,Lot.Area,Lot.Shape,Land.Contour,Lot.Config,Land.Slope,Neighborhood,Bldg.Type,...,Sale.Type,Sale.Condition,SalePrice,Condition,HasShed,HasAlley,Exterior,Garage.Age,Remod.Age,House.Age
0,20,RL,141.0,31770.0,IR1,Lvl,Corner,Gtl,NAmes,1Fam,...,GroupedWD,Normal,5.332438,Norm,False,False,BrkFace,50.0,50.0,50.0
1,20,RH,80.0,11622.0,Reg,Lvl,Inside,Gtl,NAmes,1Fam,...,GroupedWD,Normal,5.021189,Roads,False,False,VinylSd,49.0,49.0,49.0
2,20,RL,81.0,14267.0,IR1,Lvl,Corner,Gtl,NAmes,1Fam,...,GroupedWD,Normal,5.235528,Norm,False,False,Wd Sdng,52.0,52.0,52.0
3,20,RL,93.0,11160.0,Reg,Lvl,Corner,Gtl,NAmes,1Fam,...,GroupedWD,Normal,5.38739,Norm,False,False,BrkFace,42.0,42.0,42.0
4,60,RL,74.0,13830.0,IR1,Lvl,Inside,Gtl,Gilbert,1Fam,...,GroupedWD,Normal,5.278525,Norm,False,False,VinylSd,13.0,12.0,13.0


In [66]:
df.shape

(2877, 70)

## Encoding das variáveis

In [67]:
categorical_columns = []
ordinal_columns = []
for col in df.select_dtypes('category').columns:
    if df[col].cat.ordered:
        ordinal_columns.append(col)
    else:
        categorical_columns.append(col)

In [68]:
categorical_columns

['MS.SubClass',
 'MS.Zoning',
 'Land.Contour',
 'Lot.Config',
 'Neighborhood',
 'Bldg.Type',
 'House.Style',
 'Roof.Style',
 'Mas.Vnr.Type',
 'Foundation',
 'Bsmt.Qual',
 'Bsmt.Cond',
 'Bsmt.Exposure',
 'BsmtFin.Type.1',
 'BsmtFin.Type.2',
 'Central.Air',
 'Garage.Type',
 'Garage.Finish',
 'Sale.Type',
 'Sale.Condition',
 'Condition',
 'Exterior']

In [69]:
ordinal_columns

['Lot.Shape',
 'Land.Slope',
 'Overall.Qual',
 'Overall.Cond',
 'Exter.Qual',
 'Exter.Cond',
 'Heating.QC',
 'Electrical',
 'Kitchen.Qual',
 'Functional',
 'Paved.Drive',
 'Fence']

### Encoding das variáveis ordinais

In [70]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
df[ordinal_columns] = ordinal_encoder.fit_transform(df[ordinal_columns])
df.head()

Unnamed: 0,MS.SubClass,MS.Zoning,Lot.Frontage,Lot.Area,Lot.Shape,Land.Contour,Lot.Config,Land.Slope,Neighborhood,Bldg.Type,...,Sale.Type,Sale.Condition,SalePrice,Condition,HasShed,HasAlley,Exterior,Garage.Age,Remod.Age,House.Age
0,20,RL,141.0,31770.0,0.0,Lvl,Corner,0.0,NAmes,1Fam,...,GroupedWD,Normal,5.332438,Norm,False,False,BrkFace,50.0,50.0,50.0
1,20,RH,80.0,11622.0,3.0,Lvl,Inside,0.0,NAmes,1Fam,...,GroupedWD,Normal,5.021189,Roads,False,False,VinylSd,49.0,49.0,49.0
2,20,RL,81.0,14267.0,0.0,Lvl,Corner,0.0,NAmes,1Fam,...,GroupedWD,Normal,5.235528,Norm,False,False,Wd Sdng,52.0,52.0,52.0
3,20,RL,93.0,11160.0,3.0,Lvl,Corner,0.0,NAmes,1Fam,...,GroupedWD,Normal,5.38739,Norm,False,False,BrkFace,42.0,42.0,42.0
4,60,RL,74.0,13830.0,0.0,Lvl,Inside,0.0,Gilbert,1Fam,...,GroupedWD,Normal,5.278525,Norm,False,False,VinylSd,13.0,12.0,13.0


Confirmando a transformação das variáveis ordinais:

In [71]:
df[ordinal_columns].info()

<class 'pandas.core.frame.DataFrame'>
Index: 2877 entries, 0 to 2929
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Lot.Shape     2877 non-null   float64
 1   Land.Slope    2877 non-null   float64
 2   Overall.Qual  2877 non-null   float64
 3   Overall.Cond  2877 non-null   float64
 4   Exter.Qual    2877 non-null   float64
 5   Exter.Cond    2877 non-null   float64
 6   Heating.QC    2877 non-null   float64
 7   Electrical    2877 non-null   float64
 8   Kitchen.Qual  2877 non-null   float64
 9   Functional    2877 non-null   float64
 10  Paved.Drive   2877 non-null   float64
 11  Fence         2877 non-null   float64
dtypes: float64(12)
memory usage: 292.2 KB


### Encoding das variáveis categóricas

In [72]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

Primeiro, precisamos converter o tipo das variáveis categóricas para string. Caso contrário, recebemos a seguinte mensagem ao tentar aplicar o OneHotEncoder:

```python
TypeError: Encoders require their input to be uniformly strings or numbers. Got ['float', 'str']
```

Assim:

In [73]:
df[categorical_columns] = df[categorical_columns].astype(str)

Agora, podemos aplicar o OneHotEncoder:

In [74]:
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(drop='first'), categorical_columns),
    ],
    remainder='passthrough', # Isso faz com que as colunas que não foram transformadas sejam mantidas
)

array_transformed = preprocessor.fit_transform(df)

new_categorical_columns = preprocessor.named_transformers_['onehot'].get_feature_names_out()
new_columns = new_categorical_columns.tolist() + [col for col in df.columns if col not in categorical_columns]

df = pd.DataFrame(array_transformed, columns=new_columns)
new_columns

['MS.SubClass_160',
 'MS.SubClass_190',
 'MS.SubClass_20',
 'MS.SubClass_30',
 'MS.SubClass_50',
 'MS.SubClass_60',
 'MS.SubClass_70',
 'MS.SubClass_80',
 'MS.SubClass_85',
 'MS.SubClass_90',
 'MS.SubClass_Other',
 'MS.Zoning_RH',
 'MS.Zoning_RL',
 'MS.Zoning_RM',
 'Land.Contour_HLS',
 'Land.Contour_Low',
 'Land.Contour_Lvl',
 'Lot.Config_CulDSac',
 'Lot.Config_FR2',
 'Lot.Config_FR3',
 'Lot.Config_Inside',
 'Neighborhood_BrDale',
 'Neighborhood_BrkSide',
 'Neighborhood_ClearCr',
 'Neighborhood_CollgCr',
 'Neighborhood_Crawfor',
 'Neighborhood_Edwards',
 'Neighborhood_Gilbert',
 'Neighborhood_IDOTRR',
 'Neighborhood_MeadowV',
 'Neighborhood_Mitchel',
 'Neighborhood_NAmes',
 'Neighborhood_NPkVill',
 'Neighborhood_NWAmes',
 'Neighborhood_NoRidge',
 'Neighborhood_NridgHt',
 'Neighborhood_OldTown',
 'Neighborhood_SWISU',
 'Neighborhood_Sawyer',
 'Neighborhood_SawyerW',
 'Neighborhood_Somerst',
 'Neighborhood_StoneBr',
 'Neighborhood_Timber',
 'Neighborhood_Veenker',
 'Bldg.Type_2fmCon',
 '

Verificando a transformação das variáveis categóricas:

In [76]:
for cat in categorical_columns:
    dummies = []
    for col in df.columns:
        if col.startswith(cat + "_"):
            dummies.append(f'"{col}"')
    dummies_str = ', '.join(dummies)
    print(f'From column "{cat}" we made {dummies_str}\n')

From column "MS.SubClass" we made "MS.SubClass_160", "MS.SubClass_190", "MS.SubClass_20", "MS.SubClass_30", "MS.SubClass_50", "MS.SubClass_60", "MS.SubClass_70", "MS.SubClass_80", "MS.SubClass_85", "MS.SubClass_90", "MS.SubClass_Other"

From column "MS.Zoning" we made "MS.Zoning_RH", "MS.Zoning_RL", "MS.Zoning_RM"

From column "Land.Contour" we made "Land.Contour_HLS", "Land.Contour_Low", "Land.Contour_Lvl"

From column "Lot.Config" we made "Lot.Config_CulDSac", "Lot.Config_FR2", "Lot.Config_FR3", "Lot.Config_Inside"

From column "Neighborhood" we made "Neighborhood_BrDale", "Neighborhood_BrkSide", "Neighborhood_ClearCr", "Neighborhood_CollgCr", "Neighborhood_Crawfor", "Neighborhood_Edwards", "Neighborhood_Gilbert", "Neighborhood_IDOTRR", "Neighborhood_MeadowV", "Neighborhood_Mitchel", "Neighborhood_NAmes", "Neighborhood_NPkVill", "Neighborhood_NWAmes", "Neighborhood_NoRidge", "Neighborhood_NridgHt", "Neighborhood_OldTown", "Neighborhood_SWISU", "Neighborhood_Sawyer", "Neighborhood_Saw

In [77]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2877 entries, 0 to 2876
Columns: 165 entries, MS.SubClass_160 to House.Age
dtypes: object(165)
memory usage: 3.6+ MB
