# In [None]:
import pandas as pd
# Read the Ames housing table from the CSV file (path is relative to the
# current working directory).
ames_housing = pd.read_csv("../datasets/ames_housing_no_missing.csv")

# Separate the feature columns from the label column.
target_name = "SalePrice"
data = ames_housing.drop(columns=target_name)
sale_price = ames_housing[target_name]
# Turn the price into a binary label: 1 when it is above 200,000, else 0.
target = (sale_price > 200_000).astype(int)

ames_housing.head()

data.head()

target.head()

### Questions

# Preview the feature table and print a summary of its columns.
data.head()

data.info()

# Keep only the columns whose dtype is not ``object`` and summarize them.
numerical_data = data.select_dtypes(exclude=[object])

numerical_data.info()

# Column names that the models below use as their numerical inputs.
numerical_features = [
    "LotFrontage",
    "LotArea",
    "MasVnrArea",
    "BsmtFinSF1",
    "BsmtFinSF2",
    "BsmtUnfSF",
    "TotalBsmtSF",
    "1stFlrSF",
    "2ndFlrSF",
    "LowQualFinSF",
    "GrLivArea",
    "BedroomAbvGr",
    "KitchenAbvGr",
    "TotRmsAbvGrd",
    "Fireplaces",
    "GarageCars",
    "GarageArea",
    "WoodDeckSF",
    "OpenPorchSF",
    "EnclosedPorch",
    "3SsnPorch",
    "ScreenPorch",
    "PoolArea",
    "MiscVal",
]

### Model implementation

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Restrict the feature table to the columns listed in numerical_features.
data_numerical = data[numerical_features]

data_numerical.head()

# Scale the columns with StandardScaler, then fit a logistic regression.
model = make_pipeline(StandardScaler(), LogisticRegression())

# Cross-validate the pipeline with cv=10 on the numerical columns only.
cv = cross_validate(model, data_numerical, target, cv=10)

# Report the mean and standard deviation of the per-fold test scores.
test_scores = cv["test_score"]
print(f"{test_scores.mean():.3f} ± {test_scores.std():.3f}")

### Model implementation using all features

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder

# Every column that is not listed in numerical_features is handled as a
# categorical column.
data_categorical = data.drop(columns=numerical_features)
data_categorical

# Route each group of columns to its own transformer: one-hot encoding for the
# categorical group, standard scaling for the numerical group.
categorical_columns = data_categorical.columns.values
numerical_columns = data_numerical.columns.values
preprocessor = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore"), categorical_columns),
    (StandardScaler(), numerical_columns),
)

# Same classifier as before, now fed by the combined preprocessor and allowed
# up to 500 solver iterations.
model = make_pipeline(preprocessor, LogisticRegression(max_iter=500))

model

# Cross-validate on the full feature table with cv=10.
cv = cross_validate(model, data, target, cv=10)

# Report the mean and standard deviation of the per-fold test scores.
test_scores = cv["test_score"]
print(f"{test_scores.mean():.3f} ± {test_scores.std():.3f}")