In [2]:
import pandas as pd

In [3]:
ames_housing = pd.read_csv('data/house_prices.csv', na_values="?")
ames_housing.drop(columns="Id", inplace=True)

target_name = "SalePrice"
data, target = ames_housing.drop(columns=target_name), ames_housing[target_name]
target = (target > 200_000).astype(int)

In [4]:
data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [6]:
from sklearn.compose import make_column_selector as selector

In [7]:
numerical_columns_selector = selector(dtype_exclude=object)
numerical_columns = numerical_columns_selector(data)
print(f"There are {len(numerical_columns)} numerical features.")

There are 36 numerical features.


In [8]:
numerical_columns

['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [9]:
data[["YearBuilt"]]

Unnamed: 0,YearBuilt
0,2003
1,1976
2,2001
3,1915
4,2000
...,...
1455,1999
1456,1978
1457,1941
1458,1950


## Q5

In [10]:
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.compose import ColumnTransformer

In [11]:
numerical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer([
    ('standard-scaler', numerical_preprocessor, numerical_columns)
])

model = make_pipeline(preprocessor, LogisticRegression())

In [12]:
cv_results = cross_validate(model, data, target, cv=5)
cv_results

{'fit_time': array([0.03507829, 0.03499317, 0.02942586, 0.02516913, 0.02655649]),
 'score_time': array([0.00271845, 0.00282931, 0.00298762, 0.00283599, 0.00300407]),
 'test_score': array([0.9109589 , 0.91780822, 0.94178082, 0.90068493, 0.92808219])}

In [13]:
scores = cv_results["test_score"]
print("The mean cross-validation accuracy is: "
      f"{scores.mean():.3f} +/- {scores.std():.3f}")

The mean cross-validation accuracy is: 0.920 +/- 0.014


## Explanation of Q5

In [14]:
data_numerical = data[numerical_columns]

model = make_pipeline(StandardScaler(), SimpleImputer(), LogisticRegression())
cv_results_num = cross_validate(model, data_numerical, target)
cv_results_num["test_score"].mean()

0.9198630136986301

# Q6

In [15]:
from sklearn.preprocessing import OneHotEncoder

In [16]:
scaler_imputer_transformer = make_pipeline(StandardScaler(), SimpleImputer(strategy="most_frequent"))
categorical_columns = selector(dtype_include=object)
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
preprocessor = ColumnTransformer([
    ('one-hot-encoder', categorical_preprocessor, categorical_columns),
    ('standard-scaler', scaler_imputer_transformer, numerical_columns)
])

model = make_pipeline(preprocessor, LogisticRegression())

In [17]:
cv_results_num = cross_validate(model, data, target)
cv_results_num["test_score"].mean()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.9246575342465754

## Explanation of Q6

In [18]:
from sklearn.compose import make_column_transformer

In [19]:
categorical_columns = data.columns.difference(numerical_columns)

categorical_processor = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)
numerical_processor = make_pipeline(StandardScaler(), SimpleImputer())

preprocessor = make_column_transformer(
    (categorical_processor, categorical_columns),
    (numerical_processor, numerical_columns),
)
model = make_pipeline(preprocessor, LogisticRegression(max_iter=1000))
cv_results_all = cross_validate(model, data, target, error_score="raise")
cv_results_all["test_score"].mean()

0.923972602739726

In [20]:
cv_results_all["test_score"].mean() - cv_results_num["test_score"].mean()

-0.00068493150684934

In [21]:
3 * cv_results_all["test_score"].std()

0.030059052407522862