# Data exploration

In [1]:
import pandas as pd
# Read the Ames housing table from the CSV stored next to the notebooks.
ames_housing = pd.read_csv("../datasets/ames_housing_no_missing.csv")
target_name = "SalePrice"

# Features: every column except the sale price.
data = ames_housing.drop(columns=target_name)

# Turn the regression target into a binary one:
# 1 when the sale price is strictly above 200,000, otherwise 0.
target = ames_housing[target_name].gt(200_000).astype(int)

In [2]:
# Preview the first five rows of the feature table.
data.head(n=5)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,Grvl,Reg,Lvl,AllPub,Inside,...,0,0,Gd,MnPrv,Shed,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,Grvl,Reg,Lvl,AllPub,FR2,...,0,0,Gd,MnPrv,Shed,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,Grvl,IR1,Lvl,AllPub,Inside,...,0,0,Gd,MnPrv,Shed,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,Grvl,IR1,Lvl,AllPub,Corner,...,0,0,Gd,MnPrv,Shed,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,Grvl,IR1,Lvl,AllPub,FR2,...,0,0,Gd,MnPrv,Shed,0,12,2008,WD,Normal


In [3]:
# Count how many feature columns there are per dtype, most frequent first.
data.dtypes.value_counts(sort=True, ascending=False)

object     43
int64      33
float64     3
dtype: int64

# Numerical features only

In [4]:
# Hand-picked columns that are treated as numerical features.
# The order is kept as-is because it fixes the column order of `data[numerical_features]`.
numerical_features = [
    "LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1",
    "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF",
    "2ndFlrSF", "LowQualFinSF", "GrLivArea", "BedroomAbvGr",
    "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces", "GarageCars",
    "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch",
    "3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal",
]

In [18]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate

# Restrict the feature table to the hand-picked numerical columns.
data_numeric = data[numerical_features]

# Standardise every feature, then fit a logistic regression
# (the solver is allowed up to 500 iterations).
model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=500))

# 10-fold cross-validation of the numerical-only model.
cv_result_numerical = cross_validate(model, data_numeric, target, cv=10)

# Backward-compatible alias: later cells read these numerical-only results
# under the (misleading) name `cv_result_categorical`.
cv_result_categorical = cv_result_numerical
cv_result_numerical

{'fit_time': array([0.02083182, 0.0199523 , 0.05039549, 0.01588416, 0.01397157,
        0.01379466, 0.01295853, 0.01694322, 0.01096439, 0.00995994]),
 'score_time': array([0.00398636, 0.00199461, 0.0019927 , 0.00199437, 0.00099707,
        0.00099492, 0.00199199, 0.00099635, 0.00100374, 0.00100327]),
 'test_score': array([0.9109589 , 0.89041096, 0.9109589 , 0.88356164, 0.90410959,
        0.88356164, 0.88356164, 0.87671233, 0.89726027, 0.87671233])}

In [19]:
# Summarise the per-fold test scores of the numerical-only model
# (stored under the name `cv_result_categorical`) as mean +/- standard deviation.
scores = cv_result_categorical["test_score"]
print(
    f"The mean cross-validation accuracy is {scores.mean():.3f} +/- {scores.std():.3f}"
)

The mean cross-validation accuracy is 0.892 +/- 0.013


# Numerical and categorical features together

In [12]:
# Every column that was not listed as numerical is handled as categorical,
# regardless of its dtype; the original column order is preserved.
categorical_features = [
    column for column in data.columns if column not in numerical_features
]

In [20]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate

# Numerical columns are standardised.
numerical_preprocessor = StandardScaler()

# Categorical columns are one-hot encoded into a dense array; categories that
# were not seen during fit are ignored instead of raising an error.
# The keyword selecting dense output was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old spelling was removed in 1.4,
# so try the current name first and fall back for older installations.
try:
    categorical_preprocessor = OneHotEncoder(
        handle_unknown="ignore", sparse_output=False
    )
except TypeError:
    categorical_preprocessor = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Route each group of columns to its own preprocessor.
preprocessor = ColumnTransformer([
    ('numerical', numerical_preprocessor, numerical_features),
    ('categorical', categorical_preprocessor, categorical_features)
])

# Preprocessing followed by a logistic regression
# (the solver is allowed up to 500 iterations).
model = make_pipeline(preprocessor, LogisticRegression(max_iter=500))

# 10-fold cross-validation of the model that sees all the columns.
cv_result_both = cross_validate(model, data, target, cv=10)
cv_result_both

{'fit_time': array([0.19107294, 0.16220713, 0.13254285, 0.1232934 , 0.13721108,
        0.12709761, 0.15838528, 0.13206244, 0.14783669, 0.16663885]),
 'score_time': array([0.01221132, 0.00897336, 0.00997162, 0.00996542, 0.01097393,
        0.00997305, 0.01051784, 0.00997162, 0.0119617 , 0.01061869]),
 'test_score': array([0.95890411, 0.90410959, 0.89041096, 0.92465753, 0.9109589 ,
        0.93835616, 0.90410959, 0.91780822, 0.92465753, 0.89726027])}

In [21]:
# Summarise the per-fold test scores of the numerical + categorical model
# as mean +/- standard deviation.
scores = cv_result_both["test_score"]
print(
    f"The mean cross-validation accuracy is {scores.mean():.3f} +/- {scores.std():.3f}"
)

The mean cross-validation accuracy is 0.917 +/- 0.019


# Comparing models

In [24]:
# Per-fold test scores of the numerical-only model; despite the variable
# name, these results were computed from `data_numeric`.
cv_result_categorical["test_score"]

array([0.9109589 , 0.89041096, 0.9109589 , 0.88356164, 0.90410959,
       0.88356164, 0.88356164, 0.87671233, 0.89726027, 0.87671233])

In [23]:
# Per-fold test scores of the model that uses both numerical and
# categorical columns.
cv_result_both["test_score"]

array([0.95890411, 0.90410959, 0.89041096, 0.92465753, 0.9109589 ,
       0.93835616, 0.90410959, 0.91780822, 0.92465753, 0.89726027])

In [27]:
# Number of folds (out of the 10 used above) in which the numerical-only
# scores (kept in `cv_result_categorical`) are strictly below the scores of
# the model using all columns.
(cv_result_categorical["test_score"] < cv_result_both["test_score"]).sum()

9