# Import modules

In [1]:
import pandas as pd
import numpy as np

# preprocessing
from sklearn.impute          import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing   import StandardScaler, OneHotEncoder, OrdinalEncoder

# estimators
from sklearn.ensemble        import RandomForestClassifier
from sklearn.linear_model    import LogisticRegression
from sklearn.neighbors       import KNeighborsClassifier
from sklearn.svm             import SVC

# pipeline
from sklearn.compose         import ColumnTransformer
from sklearn.pipeline        import Pipeline

# metrics
from sklearn.metrics         import accuracy_score, confusion_matrix, precision_score, precision_recall_curve, \
                                    recall_score, f1_score, fbeta_score, precision_recall_fscore_support, average_precision_score

from json import dumps
# import plotly.express as px

In [2]:
from sklearn import set_config
set_config(transform_output='pandas')

In [3]:
%pdb on

Automatic pdb calling has been turned ON


# Global settings

In [4]:
RANDOM_STATE = 123

# Load data

In [5]:
df = pd.read_csv('../housing-classification-iter6.csv')

In [6]:
df.columns = df.columns.str.strip()
scol = sorted(df.columns)
df = df[scol]

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   1stFlrSF       1460 non-null   int64  
 1   2ndFlrSF       1460 non-null   int64  
 2   3SsnPorch      1460 non-null   int64  
 3   Alley          91 non-null     object 
 4   BedroomAbvGr   1460 non-null   int64  
 5   BldgType       1460 non-null   object 
 6   BsmtCond       1423 non-null   object 
 7   BsmtExposure   1422 non-null   object 
 8   BsmtFinSF1     1460 non-null   int64  
 9   BsmtFinSF2     1460 non-null   int64  
 10  BsmtFinType1   1423 non-null   object 
 11  BsmtFinType2   1422 non-null   object 
 12  BsmtFullBath   1460 non-null   int64  
 13  BsmtHalfBath   1460 non-null   int64  
 14  BsmtQual       1423 non-null   object 
 15  BsmtUnfSF      1460 non-null   int64  
 16  CentralAir     1460 non-null   object 
 17  Condition1     1460 non-null   object 
 18  Conditio

In [8]:
# drop nan columns
df = df.drop(columns=['Id', 'Alley', 'PoolQC', 'Fence', 'MiscFeature'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 76 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   1stFlrSF       1460 non-null   int64  
 1   2ndFlrSF       1460 non-null   int64  
 2   3SsnPorch      1460 non-null   int64  
 3   BedroomAbvGr   1460 non-null   int64  
 4   BldgType       1460 non-null   object 
 5   BsmtCond       1423 non-null   object 
 6   BsmtExposure   1422 non-null   object 
 7   BsmtFinSF1     1460 non-null   int64  
 8   BsmtFinSF2     1460 non-null   int64  
 9   BsmtFinType1   1423 non-null   object 
 10  BsmtFinType2   1422 non-null   object 
 11  BsmtFullBath   1460 non-null   int64  
 12  BsmtHalfBath   1460 non-null   int64  
 13  BsmtQual       1423 non-null   object 
 14  BsmtUnfSF      1460 non-null   int64  
 15  CentralAir     1460 non-null   object 
 16  Condition1     1460 non-null   object 
 17  Condition2     1460 non-null   object 
 18  Electric

In [9]:
X = df.copy()
y = X.pop('Expensive')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

## Clean and prepare data

In [10]:
X_train

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,...,SaleType,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
318,1372,1274,0,4,1Fam,TA,Gd,987,0,GLQ,...,WD,0,Pave,9,1347,AllPub,340,1993,1993,2009
580,1429,0,0,3,1Fam,TA,No,594,219,BLQ,...,WD,0,Pave,7,1144,AllPub,216,1960,1987,2007
961,1542,1330,0,4,1Fam,Gd,No,896,0,ALQ,...,WD,0,Pave,11,1330,AllPub,550,1977,1995,2008
78,1768,0,0,4,Duplex,TA,No,0,0,Unf,...,WD,0,Pave,8,1768,AllPub,0,1968,1968,2010
5,796,566,320,1,1Fam,TA,No,732,0,GLQ,...,WD,0,Pave,5,796,AllPub,40,1993,1995,2009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1041,800,832,0,4,1Fam,TA,No,400,64,GLQ,...,WD,0,Pave,7,800,AllPub,0,1966,2000,2008
1122,960,0,0,3,1Fam,TA,No,0,0,Unf,...,COD,160,Pave,5,672,AllPub,64,1956,1956,2009
1346,2156,0,290,3,1Fam,TA,No,297,68,BLQ,...,WD,0,Pave,9,1568,AllPub,0,1968,2003,2006
1406,768,0,0,2,1Fam,TA,Av,656,0,GLQ,...,WD,0,Pave,5,768,AllPub,58,1972,2007,2009


In [11]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1168 entries, 318 to 1389
Data columns (total 75 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   1stFlrSF       1168 non-null   int64  
 1   2ndFlrSF       1168 non-null   int64  
 2   3SsnPorch      1168 non-null   int64  
 3   BedroomAbvGr   1168 non-null   int64  
 4   BldgType       1168 non-null   object 
 5   BsmtCond       1139 non-null   object 
 6   BsmtExposure   1138 non-null   object 
 7   BsmtFinSF1     1168 non-null   int64  
 8   BsmtFinSF2     1168 non-null   int64  
 9   BsmtFinType1   1139 non-null   object 
 10  BsmtFinType2   1139 non-null   object 
 11  BsmtFullBath   1168 non-null   int64  
 12  BsmtHalfBath   1168 non-null   int64  
 13  BsmtQual       1139 non-null   object 
 14  BsmtUnfSF      1168 non-null   int64  
 15  CentralAir     1168 non-null   object 
 16  Condition1     1168 non-null   object 
 17  Condition2     1168 non-null   object 
 18  Electr

In [12]:
X_train.isna().sum()

1stFlrSF        0
2ndFlrSF        0
3SsnPorch       0
BedroomAbvGr    0
BldgType        0
               ..
Utilities       0
WoodDeckSF      0
YearBuilt       0
YearRemodAdd    0
YrSold          0
Length: 75, dtype: int64

In [13]:
# categorical features

# cat_idx = list(range(9, 26))
# more_cats = list(range(50, 75))
# cat_idx = cat_idx + more_cats
# X_train_cat = X_train.iloc[:, cat_idx]
X_train_cat_cols = [
    'BldgType',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'BsmtQual',
    'CentralAir',
    'Condition1',
    'Condition2',
    'Electrical',
    'ExterCond',
    'ExterQual',
    'Exterior1st',
    'Exterior2nd',
    'FireplaceQu',
    'Foundation',
    'Functional',
    'GarageCond',
    'GarageFinish',
    'GarageQual',
    'GarageType',
    'Heating',
    'HeatingQC',
    'HouseStyle',
    'KitchenQual',
    'LandContour',
    'LandSlope',
    'LotConfig',
    'LotShape',
    'MSSubClass',
    'MSZoning',
    'MasVnrType',
    'Neighborhood',
    'OverallCond',
    'OverallQual',
    'PavedDrive',
    'RoofMatl',
    'RoofStyle',
    'SaleCondition',
    'SaleType',
    'Street',
    'Utilities',
]

X_train_cat = X_train[X_train_cat_cols]

In [14]:
# numerical features

X_train_num_cols = list(set(X_train.columns) - set(X_train_cat_cols))
X_train_num = X_train[X_train_num_cols]

# Pipeline

## categorical pipeline

In [15]:
X_train_cat_ord  = [
    'ExterQual',
    'ExterCond',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'KitchenQual',
    'FireplaceQu',
    'OverallQual',
    'OverallCond',
    'GarageQual',
    'GarageCond'
    # 'PoolQC',
    # 'Fence'
]
X_train_cat_nonord  = list(set(X_train_cat_cols) - set(X_train_cat_ord))

In [16]:
ord_categories = [
    ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    ['No', 'Mn', 'Av', 'Gd'],
    ['Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
    ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    list(range(1, 11)),
    list(range(1, 11)),
    ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    ['Po', 'Fa', 'TA', 'Gd', 'Ex']
    # ['Fa', 'TA', 'Gd', 'Ex'],
    # ['MvWw', 'GdWo', 'MnPrv', 'GdPrv']
]
unknown_value  = -1

In [17]:
encoders = ColumnTransformer(
    transformers=[
        ('ordinal', OrdinalEncoder(categories=ord_categories, handle_unknown='use_encoded_value', unknown_value=unknown_value), X_train_cat_ord),
        ('nonordinal', OneHotEncoder(handle_unknown='ignore', sparse_output=False), X_train_cat_nonord)
    ]
)

In [18]:
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='N_A')),
    ('encoders', encoders)
])

## numerical pipeline

In [19]:
num_pipe = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())
])

## estimators

In [56]:
estimators = {
    'KNN': [KNeighborsClassifier(), {
        'estimator__n_neighbors': range(3, 9),
        'estimator__weights': ['uniform', 'distance'],
        'estimator__leaf_size': range(5, 15)
    }],
    # 'LR': [LogisticRegression(random_state=RANDOM_STATE), {
    #     'estimator__tol': np.arange(5e-6, 5e-5, 1e-6)
    # }],
    # 'SVC': [SVC(random_state=RANDOM_STATE), {
    #     'estimator__gamma': ['scale', 'auto']
    # }],
    # 'RF': [RandomForestClassifier(random_state=RANDOM_STATE), {
    #     'estimator__n_estimators': range(90, 100, 1),
    #     'estimator__criterion': ['gini', 'entropy', 'log_loss']
    # }]
}

## full pipeline

In [57]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat_pipe', cat_pipe, X_train_cat_cols),
        ('num_pipe', num_pipe, X_train_num_cols),
    ]
)

In [58]:
def create_pipeline_and_cvparams(preprocessor, estimator):

    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('estimator', estimator[1][0])
    ])

    cvparams = estimator[1][1]

    return pipe, cvparams

In [59]:
# nord_stream, params_est = create_pipeline_and_cvparams(preprocessor, estimator=estimators['KNN'])

In [60]:
# nord_stream.fit(X_train, y_train)

In [61]:
# y_pred = nord_stream.predict(X_test)

# Cross validation

In [62]:
params_pre = {
    "preprocessor__num_pipe__imputer__strategy": ["mean", "median"],
    "preprocessor__num_pipe__scaler__with_mean": [True, False],
    "preprocessor__num_pipe__scaler__with_std": [True, False]
}

# Model evaluation

In [63]:
def evaluate_model(y_true, y_pred, beta=1):

    output = {}

    output['conf_matrix'] = confusion_matrix(y_true=y_true, y_pred=y_pred)
    output['acc_score']   = accuracy_score(  y_true=y_true, y_pred=y_pred)
    output['prec_score']  = precision_score( y_true=y_true, y_pred=y_pred)
    output['f1_score']    = f1_score(        y_true=y_true, y_pred=y_pred)
    output['fbeta_score'] = fbeta_score(     y_true=y_true, y_pred=y_pred, beta=beta)

    # support                      = precision_recall_fscore_support(y_true=y_true, y_pred=y_pred, beta=beta)
    # precision, recall, threshold = precision_recall_curve(y_true=y_true, y_score=y_score)
    # ave_score                    = average_precision_score(y_true=y_true, y_score=y_score)

    return output

# Create and run model

In [64]:
def run_model(xtrain, ytrain, xtest, ytest, params_pre, preprocessor, estimator, cv):

    pipe, params_est = create_pipeline_and_cvparams(preprocessor, estimator=estimator)

    if params_est:
        params = params_pre | params_est
    else:
        params = params_pre

    grid = GridSearchCV(pipe, params, cv=cv, verbose=1)
    grid.fit(xtrain, ytrain)

    grid_score       = grid.best_score_
    grid_best_params = grid.best_params_
    best_estimator   = grid.best_estimator_

    acc_score = accuracy_score(y_true=ytest, y_pred=grid.predict(xtest))

    # estimator_evaluation = evaluate_model(ytest, best_estimator.predict(xtest))

    return best_estimator, grid_best_params, grid_score, acc_score #, estimator_evaluation

In [65]:
total = {}
for estimator in estimators.items():
    model, best_params, grid_score, acc_score = run_model(
        xtrain=X_train,
        ytrain=y_train,
        xtest=X_test,
        ytest=y_test,
        params_pre=params_pre,
        preprocessor=preprocessor,
        estimator=estimator,
        cv=3
    )

    total[estimator[0]] = {
        'model': model,
        'best_params': best_params,
        'grid_score': grid_score,
        'acc_score': acc_score
        # 'eval_results': eval_results
    }

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fda8aee85e0>
Traceback (most recent call last):
  File "/home/hristo/miniconda3/envs/wbs_bootcamp/lib/python3.11/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/hristo/miniconda3/envs/wbs_bootcamp/lib/python3.11/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/hristo/miniconda3/envs/wbs_bootcamp/lib/python3.11/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
                   ^^^^^^^^^^^^^^^^^^
  File "/home/hristo/miniconda3/envs/wbs_bootcamp/lib/python3.11/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
 

Fitting 3 folds for each of 960 candidates, totalling 2880 fits


Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fda8ae8e160>
Traceback (most recent call last):
  File "/home/hristo/miniconda3/envs/wbs_bootcamp/lib/python3.11/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/hristo/miniconda3/envs/wbs_bootcamp/lib/python3.11/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/hristo/miniconda3/envs/wbs_bootcamp/lib/python3.11/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
                   ^^^^^^^^^^^^^^^^^^
  File "/home/hristo/miniconda3/envs/wbs_bootcamp/lib/python3.11/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
 

In [66]:
for key in total.keys():
    print(key)
    print('='*50)
    for i, j in total[key].items():
        if (i != 'model'):
            print(i)
            print(j)
            print()

KNN
best_params
{'estimator__leaf_size': 5, 'estimator__n_neighbors': 7, 'estimator__weights': 'uniform', 'preprocessor__num_pipe__imputer__strategy': 'mean', 'preprocessor__num_pipe__scaler__with_mean': True, 'preprocessor__num_pipe__scaler__with_std': True}

grid_score
0.9356252975159355

acc_score
0.9691780821917808



In [32]:
model = total['KNN']['model']
model

In [33]:
print(model)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat_pipe',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='N_A',
                                                                                 strategy='constant')),
                                                                  ('encoders',
                                                                   ColumnTransformer(transformers=[('ordinal',
                                                                                                    OrdinalEncoder(categories=[['Po',
                                                                                                                                'Fa',
                                                                                                                                'TA',
                                

# Final testing

## Load test data

In [34]:
test = pd.read_csv('test.csv')

In [35]:
test.columns = test.columns.str.strip()
tscol = sorted(test.columns)
test = test[tscol]
test

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,SaleType,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,896,0,0,,2,1Fam,TA,No,468.0,144.0,...,WD,120,Pave,5,882.0,AllPub,140,1961,1961,2010
1,1329,0,0,,3,1Fam,TA,No,923.0,0.0,...,WD,0,Pave,6,1329.0,AllPub,393,1958,1958,2010
2,928,701,0,,3,1Fam,TA,No,791.0,0.0,...,WD,0,Pave,6,928.0,AllPub,212,1997,1998,2010
3,926,678,0,,3,1Fam,TA,No,602.0,0.0,...,WD,0,Pave,7,926.0,AllPub,360,1998,1998,2010
4,1280,0,0,,2,TwnhsE,TA,No,263.0,0.0,...,WD,144,Pave,5,1280.0,AllPub,0,1992,1992,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,546,546,0,,3,Twnhs,TA,No,0.0,0.0,...,WD,0,Pave,5,546.0,AllPub,0,1970,1970,2006
1455,546,546,0,,3,TwnhsE,TA,No,252.0,0.0,...,WD,0,Pave,6,546.0,AllPub,0,1970,1970,2006
1456,1224,0,0,,4,1Fam,TA,No,1224.0,0.0,...,WD,0,Pave,7,1224.0,AllPub,474,1960,1996,2006
1457,970,0,0,,3,1Fam,TA,Av,337.0,0.0,...,WD,0,Pave,6,912.0,AllPub,80,1992,1992,2006


In [36]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   1stFlrSF       1459 non-null   int64  
 1   2ndFlrSF       1459 non-null   int64  
 2   3SsnPorch      1459 non-null   int64  
 3   Alley          107 non-null    object 
 4   BedroomAbvGr   1459 non-null   int64  
 5   BldgType       1459 non-null   object 
 6   BsmtCond       1414 non-null   object 
 7   BsmtExposure   1415 non-null   object 
 8   BsmtFinSF1     1458 non-null   float64
 9   BsmtFinSF2     1458 non-null   float64
 10  BsmtFinType1   1417 non-null   object 
 11  BsmtFinType2   1417 non-null   object 
 12  BsmtFullBath   1457 non-null   float64
 13  BsmtHalfBath   1457 non-null   float64
 14  BsmtQual       1415 non-null   object 
 15  BsmtUnfSF      1458 non-null   float64
 16  CentralAir     1459 non-null   object 
 17  Condition1     1459 non-null   object 
 18  Conditio

In [37]:
test = test.drop(columns=['Id.1', 'Alley', 'PoolQC', 'Fence', 'MiscFeature'])

In [38]:
test_id = test.pop('Id')

In [39]:
X_test_final = test.copy()

## Train model an full dataset

In [40]:
X_train = X.copy()
y_train = y.copy()

In [41]:
model.fit(X_train, y_train)

In [42]:
result = model.predict(X_test_final)

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fda8b208cc0>
Traceback (most recent call last):
  File "/home/hristo/miniconda3/envs/wbs_bootcamp/lib/python3.11/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/hristo/miniconda3/envs/wbs_bootcamp/lib/python3.11/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/hristo/miniconda3/envs/wbs_bootcamp/lib/python3.11/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
                   ^^^^^^^^^^^^^^^^^^
  File "/home/hristo/miniconda3/envs/wbs_bootcamp/lib/python3.11/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
 

In [49]:
len(result)

1459

In [50]:
len(test_id)

1459

In [51]:
final = pd.DataFrame({
    'Id': test_id,
    'Expensive': result
})
final

Unnamed: 0,Id,Expensive
0,1461,0
1,1462,0
2,1463,0
3,1464,0
4,1465,0
...,...,...
1454,2915,0
1455,2916,0
1456,2917,0
1457,2918,0


In [53]:
final.to_csv('winner.csv', index=False)