## Libraries and Data

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
    f1_score,
    balanced_accuracy_score,
    cohen_kappa_score
)

from sklearn import set_config
set_config(transform_output='pandas')

# Random key for the rest of notebook
rk = 314

In [None]:
path1 = "/Housing Project/competition/labeled_data.csv"

data = pd.read_csv(path1)
data = data.set_index('Id')

path2 = "/Housing Project/competition/unlabeled_data.csv"

unlabeled_data = pd.read_csv(path2)
unlabeled_data = unlabeled_data.set_index('Id')

#this column acts as categorical data, but stored as integer
data['MSSubClass'] = data['MSSubClass'].astype(str)
unlabeled_data['MSSubClass'] = unlabeled_data['MSSubClass'].astype(str)


In [24]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LotArea        1460 non-null   int64  
 1   LotFrontage    1201 non-null   float64
 2   TotalBsmtSF    1460 non-null   int64  
 3   BedroomAbvGr   1460 non-null   int64  
 4   Fireplaces     1460 non-null   int64  
 5   PoolArea       1460 non-null   int64  
 6   GarageCars     1460 non-null   int64  
 7   WoodDeckSF     1460 non-null   int64  
 8   ScreenPorch    1460 non-null   int64  
 9   Expensive      1460 non-null   int64  
 10  MSZoning       1460 non-null   object 
 11  Condition1     1460 non-null   object 
 12  Heating        1460 non-null   object 
 13  Street         1460 non-null   object 
 14  CentralAir     1460 non-null   object 
 15  Foundation     1460 non-null   object 
 16  ExterQual      1460 non-null   object 
 17  ExterCond      1460 non-null   object 
 18  BsmtQual     

In [25]:
# feature engineering labeled

data['Qual_x_GrLivArea'] = data['OverallQual'] * data['GrLivArea']
data['Qual_x_Bsmt'] = data['OverallQual'] * data['TotalBsmtSF']

data['Total_SF'] = data['TotalBsmtSF'] + data['1stFlrSF'] + data['2ndFlrSF']

data['Total_Bathrooms'] = (data['FullBath'] + (0.5 * data['HalfBath']) +
                           data['BsmtFullBath'] + (0.5 * data['BsmtHalfBath']))
                           
data['House_Age'] = data['YrSold'] - data['YearBuilt']
data['Remod_Age'] = data['YrSold'] - data['YearRemodAdd']

# feature engineering unlabeled

unlabeled_data['Qual_x_GrLivArea'] = unlabeled_data['OverallQual'] * unlabeled_data['GrLivArea']
unlabeled_data['Qual_x_Bsmt'] = unlabeled_data['OverallQual'] * unlabeled_data['TotalBsmtSF']

unlabeled_data['Total_SF'] = unlabeled_data['TotalBsmtSF'] + unlabeled_data['1stFlrSF'] + unlabeled_data['2ndFlrSF']

unlabeled_data['Total_Bathrooms'] = (unlabeled_data['FullBath'] + (0.5 * unlabeled_data['HalfBath']) +
                           unlabeled_data['BsmtFullBath'] + (0.5 * unlabeled_data['BsmtHalfBath']))
                           
unlabeled_data['House_Age'] = unlabeled_data['YrSold'] - unlabeled_data['YearBuilt']
unlabeled_data['Remod_Age'] = unlabeled_data['YrSold'] - unlabeled_data['YearRemodAdd']

In [None]:

# Feature and Target Separation

X = data
y = X.pop('Expensive')

# Train-Test Split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=rk)

## Preprocessing

In [None]:

# Preprocessing Pipelines

# Identify numerical and categorical features
num_features = X.select_dtypes(include="number").columns
cat_features = X.select_dtypes(exclude="number").columns

# Numerical pipeline: Impute missing values using the mean.
num_pipe = make_pipeline(SimpleImputer(strategy='mean'))

# Categorical preprocessing:
# - Impute missing categorical values with a constant 'N_A'
cat_imputer = make_pipeline(SimpleImputer(strategy='constant', fill_value='N_A'))

# For ordinal encoding, specify the feature and its explicit order.

LotShape = ['N_A', 'IR3', 'IR2', 'IR1', 'Reg']
Utilities = ['N_A', 'ELO', 'NoSeWa', 'NoSewr', 'AllPub']
LandSlope = ['N_A', 'Sev', 'Mod', 'Gtl']
ExterQual = ['N_A', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
ExterCond = ['N_A', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
BsmtQual = ['N_A', 'NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
BsmtCond = ['N_A', 'NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
BsmtExposure = ['N_A', 'NA', 'No', 'Mn', 'Av', 'Gd']
BsmtFinType1 = ['N_A', 'NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
BsmtFinType2 = ['N_A', 'NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
HeatingQC = ['N_A', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
KitchenQual = ['N_A', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
FireplaceQu = ['N_A', 'NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
GarageFinish = ['N_A', 'NA', 'Unf', 'RFn', 'Fin']
GarageQual = ['N_A', 'NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
GarageCond = ['N_A', 'NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
PavedDrive = ['N_A', 'N', 'P', 'Y']
PoolQC = ['N_A', 'NA', 'Fa', 'TA', 'Gd', 'Ex']
Street = ['N_A', 'Grvl', 'Pave']
Alley = ['N_A', 'NA', 'Grvl', 'Pave']
CentralAir = ['N_A', 'N', 'Y']

#careful with this columns
Functional = ['N_A', 'Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ']


ordinal_features = [LotShape, Utilities, LandSlope, ExterQual, ExterCond, BsmtQual, BsmtCond, BsmtExposure, 
                   BsmtFinType1, BsmtFinType2, HeatingQC, KitchenQual, Functional, 
                   FireplaceQu, GarageFinish, GarageQual, GarageCond, PavedDrive,
                   PoolQC, Street, Alley, CentralAir]

ordinal_features_col_names = ['LotShape', 'Utilities', 'LandSlope', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 
                   'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'Functional', 
                   'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
                   'PoolQC', 'Street', 'Alley', 'CentralAir']


ord_encoder = OrdinalEncoder(categories=ordinal_features)

# Identify nominal categorical features (those that will be one-hot encoded)
nominal_features = list(set(cat_features)-set(ordinal_features_col_names))
oh_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Combine ordinal and nominal encoders using a column transformer.
cat_encoder = make_column_transformer(
    (ord_encoder, ordinal_features_col_names),
    (oh_encoder, nominal_features)
)

# Create a categorical pipeline: imputation followed by encoding.
cat_pipe = make_pipeline(cat_imputer, cat_encoder)


preprocessor = make_column_transformer(
    (num_pipe, num_features),
    (cat_pipe, cat_features)
)

preprocessor

0,1,2
,transformers,"[('pipeline-1', ...), ('pipeline-2', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,steps,"[('simpleimputer', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'N_A'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('ordinalencoder', ...), ('onehotencoder', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,"[['N_A', 'IR3', ...], ['N_A', 'ELO', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [28]:
std_scaler = StandardScaler()

scaled_preprocessor = make_pipeline(preprocessor, std_scaler)

scaled_preprocessor


0,1,2
,steps,"[('columntransformer', ...), ('standardscaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('pipeline-1', ...), ('pipeline-2', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,steps,"[('simpleimputer', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'N_A'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('ordinalencoder', ...), ('onehotencoder', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,"[['N_A', 'IR3', ...], ['N_A', 'ELO', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [14]:
scaled_preprocessor.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('pipeline-1',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer())]),
                                    Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
          'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'OverallQual',
          'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
          'BsmtFinSF2', 'BsmtUnfSF', '1stFlrSF', '2ndFlrSF',...
          'MSSubClass', 'Alley', 'LotShape', 'LandContour', 'Utilities',
          'LotConfig', 'LandSlope', 'Neighborhood', 'Condition2', 'BldgType',
          'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
          'MasVnrType', 'BsmtFinType2', 'HeatingQC', 'Electrical', 'Functional',
          'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
          'PoolQC', 'Fen

## Decision Tree Classifier

In [15]:
# Define a function to perform a grid search, which helps to avoid duplicating code for different models
def run_grid_search(model, param_grid, X_train, y_train, preprocessor, cv=5, verbose=1):
    # Create a pipeline that first applies the data preprocessing steps, then fits the model
    pipe = make_pipeline(preprocessor, model)

    # GridSearchCV will test all possible combinations of parameters defined in 'param_grid'
    grid_search = GridSearchCV(pipe, param_grid, cv=cv, verbose=verbose)

    # Fit the model on the training data with the various parameter combinations
    grid_search.fit(X_train, y_train)

    # Return the trained GridSearchCV object which holds the best parameters and model
    return grid_search

# Define a dictionary of hyperparameters to tune for the decision tree model
dt_param_grid = {
    "pipeline__columntransformer__pipeline-1__simpleimputer__strategy": ["mean", "median"],
    "decisiontreeclassifier__max_depth": range(2, 20, 2),
    "decisiontreeclassifier__min_samples_leaf": range(3, 20, 2)
}

# Run the grid search for the DecisionTreeClassifier using the specified parameters
dt_search = run_grid_search(
    DecisionTreeClassifier(random_state=rk),
    dt_param_grid,
    X_train,
    y_train,
    scaled_preprocessor
)

# Display the process
dt_search

Fitting 5 folds for each of 162 candidates, totalling 810 fits


0,1,2
,estimator,Pipeline(step..._state=314))])
,param_grid,"{'decisiontreeclassifier__max_depth': range(2, 20, 2), 'decisiontreeclassifier__min_samples_leaf': range(3, 20, 2), 'pipeline__columntransf...simpleimputer__strategy': ['mean', 'median']}"
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,steps,"[('columntransformer', ...), ('standardscaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('pipeline-1', ...), ('pipeline-2', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,steps,"[('simpleimputer', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'N_A'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('ordinalencoder', ...), ('onehotencoder', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,"[['N_A', 'IR3', ...], ['N_A', 'ELO', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,8
,min_samples_split,2
,min_samples_leaf,7
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,314
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [16]:
dt_search.best_params_

{'decisiontreeclassifier__max_depth': 8,
 'decisiontreeclassifier__min_samples_leaf': 7,
 'pipeline__columntransformer__pipeline-1__simpleimputer__strategy': 'median'}

In [None]:
# Function to get the scores for our models
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    scores = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Specificity": recall_score(y_test, y_pred, pos_label=0),
        "F1 Score": f1_score(y_test, y_pred),
        "Balanced Accuracy": balanced_accuracy_score(y_test, y_pred),
        "Cohen's Kappa": cohen_kappa_score(y_test, y_pred)
    }
    return scores

# Create an empty DataFrame to store model evaluation results
model_scores_df = pd.DataFrame(columns=[
    "Model", "Accuracy", "Recall", "Precision",
    "Specificity", "F1 Score", "Balanced Accuracy", "Cohen's Kappa"
])

# Evaluate the Decision Tree model
dt_scores = evaluate_model(dt_search, X_test, y_test)
dt_scores["Model"] = "Decision Tree"

# Convert the dictionary to a Series matching the DataFrame columns, then assign as a new row
model_scores_df.loc[len(model_scores_df)] = pd.Series(dt_scores, index=model_scores_df.columns)

# Display the DataFrame
model_scores_df

Unnamed: 0,Model,Accuracy,Recall,Precision,Specificity,F1 Score,Balanced Accuracy,Cohen's Kappa
0,Decision Tree,0.958904,0.878049,0.837209,0.972112,0.857143,0.92508,0.833159


## KNN

In [18]:
from sklearn.neighbors import KNeighborsClassifier

# Define the hyperparameter grid to be searched by the grid search
knn_param_grid = {
    "kneighborsclassifier__n_neighbors": range(1, 11),
    "kneighborsclassifier__weights": ["uniform", "distance"]
}

# Run a grid search to find the optimal combination of hyperparameters
knn_search = run_grid_search(
    KNeighborsClassifier(),
    knn_param_grid,
    X_train,
    y_train,
    scaled_preprocessor
)

# Display the grid search results
knn_search

Fitting 5 folds for each of 20 candidates, totalling 100 fits


0,1,2
,estimator,Pipeline(step...lassifier())])
,param_grid,"{'kneighborsclassifier__n_neighbors': range(1, 11), 'kneighborsclassifier__weights': ['uniform', 'distance']}"
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,steps,"[('columntransformer', ...), ('standardscaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('pipeline-1', ...), ('pipeline-2', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,steps,"[('simpleimputer', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'N_A'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('ordinalencoder', ...), ('onehotencoder', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,"[['N_A', 'IR3', ...], ['N_A', 'ELO', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_neighbors,7
,weights,'distance'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [19]:
# Evaluate the K-Nearest Neighbours (KNN) model using the testing dataset and obtain performance metrics
knn_scores = evaluate_model(knn_search, X_test, y_test)

# Label the metrics to indicate they belong to the KNN model
knn_scores["Model"] = "KNN"

# Append the KNN metrics as a new row to the existing DataFrame of model scores
model_scores_df.loc[len(model_scores_df)] = pd.Series(knn_scores, index=model_scores_df.columns)

# Display the updated DataFrame containing all model performance metrics
model_scores_df

Unnamed: 0,Model,Accuracy,Recall,Precision,Specificity,F1 Score,Balanced Accuracy,Cohen's Kappa
0,Decision Tree,0.958904,0.878049,0.837209,0.972112,0.857143,0.92508,0.833159
1,KNN,0.938356,0.609756,0.925926,0.992032,0.735294,0.800894,0.702075


In [20]:
# Select the best trained model (pipeline) from your grid search
best_model = dt_search.best_estimator_

# Reset index so 'Id' is a column for prediction
unlabeled_data_reset = unlabeled_data.reset_index()

# Use the pipeline to preprocess the new data and make predictions
unlabeled_data['Expensive'] = best_model.predict(unlabeled_data_reset)

unlabeled_data['Expensive'].to_csv('./submission.csv')

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Define a new parameter grid for the Random Forest
rf_param_grid = {
    "randomforestclassifier__n_estimators": [100, 200, 300],
    "randomforestclassifier__max_depth": [10, 20, 30],
    "randomforestclassifier__min_samples_leaf": [3, 5, 7]
}

# Run the same grid search function with the new model and grid
rf_search = run_grid_search(
    RandomForestClassifier(random_state=rk),
    rf_param_grid,
    X_train, y_train, scaled_preprocessor
)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [33]:
from xgboost import XGBClassifier

xgb_param_grid = {
    "xgbclassifier__n_estimators": [100, 200, 300],
    "xgbclassifier__learning_rate": [0.01, 0.1, 0.3],
    "xgbclassifier__max_depth": [3, 5, 7]
}

xgb_search = run_grid_search(
    XGBClassifier(random_state=rk, eval_metric='logloss', use_label_encoder=False),
    xgb_param_grid,
    X_train, y_train, scaled_preprocessor
)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


XGBooster did better job than other models. We will use it to predict the unlabeled data.

In [34]:
# Reset index so 'Id' is a column for prediction
unlabeled_data_reset = unlabeled_data.reset_index()

# Use the pipeline to preprocess the new data and make predictions
unlabeled_data['Expensive'] = xgb_search.predict(unlabeled_data_reset)

unlabeled_data['Expensive'].to_csv('./submission.csv')