# Load libraries

In [1]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.1-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.22.3-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading xgboost-2.1.1-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_nccl_cu12-2.22.3-py3-none-manylinux2014_x86_64.whl (190.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.22.3 xgboost-2.1.1


In [2]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting sqlalchemy>=1.3.0 (from optuna)
  Downloading SQLAlchemy-2.0.32-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Collecting greenlet!=0.4.17 (from sqlalchemy>=1.3.0->optuna)
  Downloading greenlet-3.0.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.13.2-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23

In [3]:
# standard
import pandas as pd
import numpy as np
import random

# ML libraries
import tensorflow as tf
from keras import layers
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import optuna
import optuna.visualization as vis
from functools import partial

# plots
import seaborn as sns
sns.set_theme()
import matplotlib.pyplot as plt

# filter warnings
import warnings
warnings.filterwarnings('ignore')

random.seed(42)

# Load Data

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
def load_mlr_data():
  # Reading in 2016 property information
  df_properties = pd.read_csv('/content/drive/MyDrive/properties_2016.csv')

  # Reading in 2016 and 2017 transaction data and the sample submission file
  df_2016 = pd.read_csv('/content/drive/MyDrive/train_2016_v2.csv', parse_dates=["transactiondate"])
  df_2017 = pd.read_csv('/content/drive/MyDrive/train_2017.csv', parse_dates=["transactiondate"])
  sample = pd.read_csv('/content/drive/MyDrive/sample_submission.csv')

  # Merging the 2016 and 2017 transaction data
  df_logs = pd.concat([df_2016, df_2017])

  # Merging the new transaction data with it's associated properties
  df_train = pd.merge(df_logs, df_properties, on='parcelid', how='inner')

  return df_properties, sample, df_logs, df_train

def load_xgb_data():
  # Read the data
  train = pd.read_csv('./data/train_2016_v2.csv')
  prop = pd.read_csv('./data/properties_2016.csv')
  sample = pd.read_csv('./data/sample_submission.csv')
  return train, prop, sample

def load_nn_data():
  # Read the data
  train = pd.read_csv("./data/train_2016_v2.csv", parse_dates=["transactiondate"])
  prop = pd.read_csv('./data/properties_2016.csv')
  sample = pd.read_csv('./data/sample_submission.csv')
  return train, prop, sample

def load_combo_data():
  # Read and clean dataframe
  df_properties = pd.read_csv('./data/properties_2017.csv')

  # Get logerrors of transactions and merge
  df_2016 = pd.read_csv('./data/train_2016_v2.csv', parse_dates=["transactiondate"])
  df_2017 = pd.read_csv('./data/train_2017.csv', parse_dates=["transactiondate"])
  sample = pd.read_csv('/content/drive/MyDrive/sample_submission.csv')
  df_logs = pd.concat([df_2016, df_2017])

  # Inner join transactions with the dataset of all properties
  df_all = pd.merge(df_logs, df_properties, on='parcelid', how='inner')

  return df_all, df_properties, sample



# Preprocess Data

processing functions

In [22]:
def process_mlr_data(df_train):
  # Dropping all rows where there is no longitude or latitidue data
  df_final = df_train[~df_train.regionidcounty.isnull()]

  # Want to use taxvaluedollarcnt, and since only 1 value is missing, will just drop that row
  df_final = df_final.drop(df_final[df_final.taxvaluedollarcnt.isnull()].index)

  # Getting column names of all columns w/o any missing values, and dropping parcelid
  selected = df_final.columns[df_final.apply(lambda c: c.isnull().sum() == 0)]

  # Getting selected columns
  data = df_final[selected]

  # Changing transactiondate to a datetime type
  data['transactiondate'] = pd.to_datetime(data['transactiondate'])

  # Extracting year and month based on the transaction date and setting them as
  # separate variables
  data['year'] = data['transactiondate'].dt.year

  # Setting train data to be all 2016 transactions
  X_train = data[data['year'] != 2017]
  y_train = X_train['logerror']

  # Dropping logerror (outcome variable), transactiondate (represented by month and day),
  # year and assessment year (since all observations in this subset have the same year value)
  X_train = X_train.drop(['parcelid', 'logerror', 'transactiondate', 'year', 'assessmentyear', 'fips'], axis=1)

  X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

  return X_train, X_val, y_train, y_val, data

def process_xgb_data(train, prop):

  # Convert float64 columns to float32 to reduce memory usage
  for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)

  # Merge training data with property data
  df_train = train.merge(prop, how='left', on='parcelid')

  # Prepare features (X) and target variables (y) for training
  # Drop unnecessary columns
  x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 'propertycountylandusecode'], axis=1)
  y_train = df_train['logerror'].values

  # Store column names for later use
  train_columns = x_train.columns

  # Convert object (string) columns to boolean
  for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)

  # Create training and validation dataset
  split = 80000
  x_train, y_train, x_valid, y_valid = x_train[:split], y_train[:split], x_train[split:], y_train[split:]

  return x_train, y_train, x_valid, y_valid, train_columns


def process_nn_data(train, prop, sample):

  # Fit Label Encoder on properties
  for c in prop.columns:
    prop[c]=prop[c].fillna(-1)
    if prop[c].dtype == 'object':
      lbl = LabelEncoder()
      lbl.fit(list(prop[c].values))
      prop[c] = lbl.transform(list(prop[c].values))

  #Create df_train and x_train y_train from that
  df_train = train.merge(prop, how='left', on='parcelid')
  df_train["transactiondate"] = pd.to_datetime(df_train["transactiondate"])
  df_train["transactiondate_year"] = df_train["transactiondate"].dt.year
  df_train["transactiondate_month"] = df_train["transactiondate"].dt.month
  df_train['transactiondate_quarter'] = df_train['transactiondate'].dt.quarter
  df_train["transactiondate"] = df_train["transactiondate"].dt.day

  select_qtr4 = df_train["transactiondate_quarter"] == 4

  # Fill NA/NaN values using suitable method
  df_train.fillna(-1.0)

  print('Create x_train and y_train from df_train' )
  x_train_all = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 'propertycountylandusecode','fireplacecnt', 'fireplaceflag'], axis=1)
  y_train_all = df_train["logerror"]
  y_train = y_train_all[~select_qtr4]
  x_train = x_train_all[~select_qtr4]
  x_valid = x_train_all[select_qtr4]
  y_valid = y_train_all[select_qtr4]

  y_mean = np.mean(y_train)
  train_columns = x_train.columns

  for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)

  # Create df_test and test set
  sample['parcelid'] = sample['ParcelId']
  df_test = sample.merge(prop, on='parcelid', how='left')
  df_test["transactiondate"] = pd.to_datetime(df_train["transactiondate"])
  df_test["transactiondate_year"] = df_test["transactiondate"].dt.year
  df_test["transactiondate_month"] = df_test["transactiondate"].dt.month
  df_test['transactiondate_quarter'] = df_test['transactiondate'].dt.quarter
  df_test["transactiondate"] = df_test["transactiondate"].dt.day
  x_test = df_test[train_columns]

  for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)

  return x_train, y_train, x_valid, y_valid, x_test

def process_combo_data(df_all):
  # Remove some rows missing important data
  df = df_all.dropna(subset=['regionidcounty','landtaxvaluedollarcnt', 'taxamount',
                              'regionidzip', 'structuretaxvaluedollarcnt'])

  # Get month, year, weekday
  df.transactiondate = pd.to_datetime(df.transactiondate)
  df['transactionmonth'] =  df['transactiondate'].dt.strftime('%Y%m')
  df['month'] = df.transactionmonth.str[4:]
  df['year'] = df.transactionmonth.str[:-2]
  df['weekday'] = df.transactiondate.dt.day_of_week

  # Select only certain features from full dataset
  X = df[['bedroomcnt','roomcnt','bathroomcnt','taxamount','landtaxvaluedollarcnt','taxvaluedollarcnt','structuretaxvaluedollarcnt',
          'latitude', 'longitude', 'month', 'year', 'weekday',
            'lotsizesquarefeet', 'calculatedfinishedsquarefeet', 'yearbuilt',
            ]]
  Y = df.logerror

  X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=1234)

  return X_train, X_val, Y_train, Y_val, X, Y

# Build Models

In [7]:
def build_mlr_model(num_features, learning_rate):
  """Build a TF linear regression model using Keras.

  Args:
    num_features: The number of input features.
    learning_rate: The desired learning rate for SGD.

  Returns:
    model: A tf.keras model (graph).
  """
  # This is not strictly necessary, but each time you build a model, TF adds
  # new nodes (rather than overwriting), so the colab session can end up
  # storing lots of copies of the graph when you only care about the most
  # recent. Also, as there is some randomness built into training with SGD,
  # setting a random seed ensures that results are the same on each identical
  # training run.
  tf.keras.backend.clear_session()
  tf.random.set_seed(42)

  # Build a model using keras.Sequential. While this is intended for neural
  # networks (which may have multiple layers), we want just a single layer for
  # linear regression.
  model = tf.keras.Sequential()
  model.add(tf.keras.layers.Dense(
      units=1,        # output dim
      input_shape=[num_features],  # input dim
      use_bias=True,               # use a bias (intercept) param
      kernel_initializer=tf.ones_initializer,  # initialize params to 1
      bias_initializer=tf.ones_initializer,    # initialize bias to 1
  ))

  # We need to choose an optimizer
  optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate)

  # Finally, compile the model. This finalizes the graph for training.
  # We specify the loss and the optimizer above
  model.compile(
        optimizer=optimizer,
        loss='mae'
  )

  return model

def build_nn_model(learning_rate, x_train):

  tf.keras.backend.clear_session()
  tf.random.set_seed(42)

  len_x=int(x_train.shape[1])

  model = tf.keras.Sequential()
  model.add(tf.keras.layers.Dense(units = 400 , kernel_initializer = 'normal', activation = 'relu', input_dim = len_x))
  model.add(tf.keras.layers.Dropout(.36))
  model.add(tf.keras.layers.Dense(units = 160 , kernel_initializer = 'normal', activation = 'relu'))
  model.add(tf.keras.layers.BatchNormalization())
  model.add(tf.keras.layers.Dropout(.6))
  model.add(tf.keras.layers.Dense(units = 64 , kernel_initializer = 'normal', activation = 'relu'))
  model.add(tf.keras.layers.BatchNormalization())
  model.add(tf.keras.layers.Dropout(.48))
  model.add(tf.keras.layers.Dense(units = 28, kernel_initializer = 'normal', activation = 'relu'))
  model.add(tf.keras.layers.BatchNormalization())
  model.add(tf.keras.layers.Dropout(.48))
  model.add(tf.keras.layers.Dense(1, kernel_initializer='normal'))

  optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate)

  model.compile(optimizer=optimizer, loss='mae')

  return model

def build_combo_model(lr, resolution_in_degrees):
    """Use Keras functional API to create neural network model"""

    tf.keras.backend.clear_session()
    tf.random.set_seed(1234)
    random.seed(42)
    mask_value = -999


    bedroomcnt = layers.Input(shape=(1,), dtype=tf.float32, name='bedroomcnt')
    roomcnt = layers.Input(shape=(1,), dtype=tf.float32, name='roomcnt')
    bathroomcnt = layers.Input(shape=(1,), dtype=tf.float32, name='bathroomcnt')
    taxamount = layers.Input(shape=(1,), dtype=tf.float32, name='taxamount')
    landtaxvaluedollarcnt = layers.Input(shape=(1,), dtype=tf.float32, name='landtaxvaluedollarcnt')
    taxvaluedollarcnt = layers.Input(shape=(1,), dtype=tf.float32, name='taxvaluedollarcnt')
    structuretaxvaluedollarcnt = layers.Input(shape=(1,), dtype=tf.float32, name='structuretaxvaluedollarcnt')
    latitude = layers.Input(shape=(1,), dtype=tf.float32, name='latitude')
    longitude = layers.Input(shape=(1,), dtype=tf.float32, name='longitude')
    month = layers.Input(shape=(1,), dtype=tf.string, name='month')
    year = layers.Input(shape=(1,), dtype=tf.string, name='year')
    weekday = layers.Input(shape=(1,), dtype=tf.int64, name='weekday')


    lotsizesquarefeet = layers.Input(shape=(1,), dtype=tf.float32, name='lotsizesquarefeet')
    lotsizemask = layers.Masking(mask_value=mask_value)(lotsizesquarefeet)

    calculatedfinishedsquarefeet = layers.Input(shape=(1,), dtype=tf.float32, name='calculatedfinishedsquarefeet')
    finishedsqftmask = layers.Masking(mask_value=mask_value)(calculatedfinishedsquarefeet)

    yearbuilt = layers.Input(shape=(1,), dtype=tf.float32, name='yearbuilt')
    yearblt = layers.Masking(mask_value=mask_value)(yearbuilt)

    bedroomcnt_masked = layers.Masking(mask_value=mask_value)(bedroomcnt)
    roomcnt_masked = layers.Masking(mask_value=mask_value)(roomcnt)
    bathroomcnt_masked = layers.Masking(mask_value=mask_value)(bathroomcnt)
    taxamount_masked = layers.Masking(mask_value=mask_value)(taxamount)
    landtaxvaluedollarcnt_masked = layers.Masking(mask_value=mask_value)(landtaxvaluedollarcnt)
    taxvaluedollarcnt_masked = layers.Masking(mask_value=mask_value)(taxvaluedollarcnt)
    structuretaxvaluedollarcnt_masked = layers.Masking(mask_value=mask_value)(structuretaxvaluedollarcnt)

    # One hot encode month, year and weekday
    month_id = tf.keras.layers.StringLookup(
      vocabulary=['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'],
      output_mode='one_hot')(month)

    year_id = tf.keras.layers.StringLookup(
      vocabulary=['2016', '2017'],
      output_mode='one_hot')(year)

    weekday_id = tf.keras.layers.IntegerLookup(
      vocabulary=[0,1,2,3,4,5,6],
      output_mode='one_hot')(weekday)


    # Create a list of numbers representing the bucket boundaries for latitude.
    latitude_boundaries = list(np.arange(-3, 3 + resolution_in_degrees, resolution_in_degrees))

    # Create a Discretization layer to separate the latitude data into buckets.
    latitude_discretized = tf.keras.layers.Discretization(
        bin_boundaries=latitude_boundaries,
        name='discretization_latitude')(latitude)

    # Create a list of numbers representing the bucket boundaries for longitude.
    longitude_boundaries = list(np.arange(-3, 3 + resolution_in_degrees, resolution_in_degrees))

    # Create a Discretization layer to separate the longitude data into buckets.
    longitude_discretized = tf.keras.layers.Discretization(
        bin_boundaries=longitude_boundaries,
        name='discretization_longitude')(longitude)

    # Cross the latitude and longitude features into a single one-hot vector.
    feature_cross = tf.keras.layers.HashedCrossing(
        num_bins=len(latitude_boundaries) * len(longitude_boundaries),
        output_mode='one_hot',
        name='cross_latitude_longitude')([latitude_discretized, longitude_discretized])

    features = layers.Concatenate()([
                    bedroomcnt_masked,
                    roomcnt_masked,
                    bathroomcnt_masked,
                    taxamount_masked,
                    landtaxvaluedollarcnt_masked,
                    taxvaluedollarcnt_masked,
                    structuretaxvaluedollarcnt_masked,
                    feature_cross,
                    month_id,
                    year_id,
                    weekday_id,
                    lotsizemask,
                    finishedsqftmask,
                    yearblt,
    ])

    x = layers.Dense(units=600, kernel_initializer='normal', activation='relu')(features)
    x = layers.Dropout(0.36)(x)
    x = layers.Dense(units=200, kernel_initializer='normal', activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.6)(x)
    x = layers.Dense(1, kernel_initializer='normal')(x)
    x = layers.Dense(1, kernel_initializer='normal')(x)

    logerror = tf.keras.layers.Dense(
        units=1, activation='linear', name='logerror')(x)

    model = tf.keras.Model(inputs=[
        bedroomcnt,
        roomcnt,
        bathroomcnt,
        taxamount,
        landtaxvaluedollarcnt,
        taxvaluedollarcnt,
        structuretaxvaluedollarcnt,
        latitude,
        longitude,
        month,
        year,
        weekday,
        lotsizesquarefeet,
        calculatedfinishedsquarefeet,
        yearbuilt,
    ], outputs=logerror)

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss='mae',
        metrics=['mae'])

    return model

XGBoost Train and hyperparameter tune

In [8]:
# Define the objective function
def objective(trial, d_train, d_valid):
  params = {
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'eta': trial.suggest_float('eta', 0.01, 0.1),
    'max_depth': trial.suggest_int('max_depth', 1, 9),
    'subsample': trial.suggest_float('subsample', 0.6, 1.0),
    'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
    'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
    'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
    'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True)
  }

  watchlist = [(d_train, 'train'), (d_valid, 'valid')]

  # Train the model
  model = xgb.train(params, d_train, num_boost_round=1000, evals=watchlist,
                    early_stopping_rounds=100, verbose_eval=False)

  # Return the best validation MAE
  val_mae = model.best_score
  return val_mae

def train_tune_xgb_model(d_train, d_valid):

  # Create a study object and optimize the objective function
  objective_partial = partial(objective, d_train=d_train, d_valid=d_valid)
  study = optuna.create_study(direction='minimize')
  study.optimize(objective_partial, n_trials=20, timeout=600)

  # Print the best parameters
  print(f"Best Parameters: {study.best_params}")

  return study

# Predict on Test Data

In [28]:
def generate_mlr_prediction(test_mlr_model, df_properties, sample, df_logs, df_train, X_train, X_val, y_train, y_val, data):
  # Using sample submission to make test data for predictions for Kaggle submission
  X_test = data[data['parcelid'].isin(sample['ParcelId'])]
  sample['parcelid'] = sample['ParcelId']
  X_test = sample.merge(df_properties, on='parcelid', how='inner')

  # Getting number of rows with missing data
  num_missing = X_test[X_test[X_train.columns].isnull().any(axis=1)].shape[0]

  # Dropping rows with missing data
  X_test = X_test[X_train.columns].dropna()


  # Scaling the train and validation data
  scaler = StandardScaler()
  X_train_scaled = scaler.fit_transform(X_train)
  X_val_scaled = scaler.transform(X_val)

  # Creating predications based on the training and validation datasets
  train_preds = test_mlr_model.predict(X_train_scaled)
  val_preds = test_mlr_model.predict(X_val_scaled)

  # Printing out training and validation dataset mean absolute errors
  print("Train MAE:", mean_absolute_error(y_train, train_preds))
  print("Validation MAE:", mean_absolute_error(y_val, val_preds))


  # Scaling test data and making predictions
  X_test_scaled = scaler.transform(X_test)
  test_preds = test_mlr_model.predict(X_test_scaled)

  # Getting average of predictions
  average_preds = np.mean(test_preds)

  # Printing out mean absolute error between actual test data and test predictions
  y_test = df_logs['logerror']
  print("Test MAE:", mean_absolute_error(y_test, test_preds[len(test_preds)-len(y_test):]))

  return num_missing, average_preds, test_preds


def generate_xgb_prediction(clf, prop, sample, train_columns, combo=False, preds=None):
  # Merge sample data with property data
  sample['parcelid'] = sample['ParcelId']
  df_test = sample.merge(prop, on='parcelid', how = 'left')

  if combo:
     x_test = preds.reshape(-1, 1)
  else:
    # Convert object (string) columns to boolean
    x_test = df_test[train_columns]
    for c in x_test.dtypes[x_test.dtypes == object].index.values:
        x_test[c] = (x_test[c] == True)

  # Generate prediction
  d_test = xgb.DMatrix(x_test)
  p_test = clf.predict(d_test)

  return p_test

# Save Submission

In [10]:
# Create submission file
def save_submission(p_test):
  sub = pd.read_csv('./data/sample_submission.csv')
  for c in sub.columns[sub.columns != 'ParcelId']:
      sub[c] = p_test
  print('Save submission')
  sub.to_csv('submission.csv', index=False, float_format='%.4f')

# Run ML Pipeline

MLR pipeline

In [11]:
## Load Data ##
df_properties, sample, df_logs, df_train = load_mlr_data()

## MLR data processing ##
X_train, X_val, y_train, y_val, data = process_mlr_data(df_train)

# Standardizing all features in X_train, X_val, and X_test
X_train_std = (X_train-X_train.mean())/X_train.std()
X_val_std = (X_val-X_train.mean())/X_train.std()

# Standardizing Y_train, Y_val, and Y_test
y_train_std = (y_train-y_train.mean())/y_train.std()
y_val_std = (y_val-y_train.mean())/y_train.std()

## Build and Train Model ##
# Build and compile test_model
test_mlr_model = build_mlr_model(num_features=X_train.shape[1],learning_rate=0.0007)

# Fit test model
test_num_epochs=10
test_train_tf = test_mlr_model.fit(x=X_train_std, y=y_train_std, epochs=test_num_epochs, verbose=0,
                         validation_data=(X_val_std, y_val_std))

num_missing, average_preds, test_preds = generate_mlr_prediction(test_mlr_model, df_properties, sample, df_logs, df_train, X_train, X_val, y_train, y_val, data)

# Replacing missing predictions with average of test data predictions
for i in range(num_missing):
  test_preds = np.append(test_preds, average_preds)
save_submission(test_preds)

Train MAE: 0.0829004389358951
Validation MAE: 0.08181866026138399
Test MAE: 0.08669028368660582
Save submission


XGBoost pipeline

In [12]:
## Load Data ##
train, prop, sample = load_xgb_data()

## XGBoost processing ##
# Build XGBoost DMatrix objects for efficient processing
x_train, y_train, x_valid, y_valid, train_columns = process_xgb_data(train, prop)
d_train = xgb.DMatrix(x_train, label = y_train)
d_valid = xgb.DMatrix(x_valid, label = y_valid)

study = train_tune_xgb_model(d_train, d_valid)

# Train Model on Best Parameters
params = study.best_params
#params = {'eta': 0.022382530987582482, 'max_depth': 5, 'subsample': 0.6561243067188417, 'colsample_bytree': 0.6775779485416246, 'n_estimators': 640, 'lambda': 0.9942582014915469, 'alpha': 7.97377143057426e-08}
params['objective'] = 'reg:linear'
params['eval_metric'] = 'mae'

watchlist = [(d_train, 'train'), (d_valid, 'valid')]
clf = xgb.train(params, d_train, 10000, watchlist,
                early_stopping_rounds=100, verbose_eval=10)

p_test = generate_xgb_prediction(clf, prop, sample, train_columns)

save_submission(p_test)

[I 2024-08-10 13:11:59,520] A new study created in memory with name: no-name-6adf6e82-ab7c-4e40-b584-2ca8eaac4b59
[I 2024-08-10 13:12:08,602] Trial 0 finished with value: 0.06637668898584063 and parameters: {'eta': 0.03877340538171232, 'max_depth': 7, 'subsample': 0.9741607487326167, 'colsample_bytree': 0.9218486166222452, 'n_estimators': 442, 'lambda': 4.20478398047491e-06, 'alpha': 0.3514029185524345}. Best is trial 0 with value: 0.06637668898584063.
[I 2024-08-10 13:12:09,560] Trial 1 finished with value: 0.06642860410064277 and parameters: {'eta': 0.09759346292852776, 'max_depth': 7, 'subsample': 0.6052011079634776, 'colsample_bytree': 0.8935568028026359, 'n_estimators': 583, 'lambda': 0.25871024030461914, 'alpha': 2.653869253030209e-07}. Best is trial 0 with value: 0.06637668898584063.
[I 2024-08-10 13:12:10,196] Trial 2 finished with value: 0.06634855066432896 and parameters: {'eta': 0.09929497931413704, 'max_depth': 1, 'subsample': 0.929715333155764, 'colsample_bytree': 0.752826

Best Parameters: {'eta': 0.07213953058740072, 'max_depth': 3, 'subsample': 0.7429087840592212, 'colsample_bytree': 0.6039564532131543, 'n_estimators': 675, 'lambda': 0.015620163446276153, 'alpha': 0.0015834428302488246}
[0]	train-mae:0.06856	valid-mae:0.06648
[10]	train-mae:0.06822	valid-mae:0.06628
[20]	train-mae:0.06811	valid-mae:0.06628
[30]	train-mae:0.06801	valid-mae:0.06629
[40]	train-mae:0.06795	valid-mae:0.06630
[50]	train-mae:0.06793	valid-mae:0.06634
[60]	train-mae:0.06792	valid-mae:0.06641
[70]	train-mae:0.06790	valid-mae:0.06649
[80]	train-mae:0.06787	valid-mae:0.06652
[90]	train-mae:0.06783	valid-mae:0.06654
[100]	train-mae:0.06783	valid-mae:0.06658
[110]	train-mae:0.06780	valid-mae:0.06659
[117]	train-mae:0.06780	valid-mae:0.06662
Save submission


Neural Network (NN) pipeline

In [17]:
## Load Data ##
train, prop, sample = load_nn_data()

## NN processing ##
x_train, y_train, x_valid, y_valid, x_test = process_nn_data(train, prop, sample)

imputer = SimpleImputer()
imputer.fit(x_train.iloc[:, :])
x_train = imputer.transform(x_train.iloc[:, :])
imputer.fit(x_valid.iloc[:, :])
x_valid = imputer.transform(x_valid.iloc[:, :])
imputer.fit(x_test.iloc[:, :])
x_test = imputer.transform(x_test.iloc[:, :])

sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
x_valid = sc.fit_transform(x_valid)
x_val = np.array(x_valid)
y_val = np.array(y_valid)

## Build and Train Model ##
# Build and compile test_model
nn = build_nn_model(learning_rate=1e-3, x_train=x_train)
# Fit test model
nn.fit(np.array(x_train), np.array(y_train), batch_size = 32, epochs = 100, verbose=2,
       validation_data=(x_val,y_val))

y_pred_ann = nn.predict(x_test).flatten()

save_submission(y_pred_ann)


Create x_train and y_train from df_train
Epoch 1/100
2555/2555 - 11s - loss: 0.0722 - val_loss: 0.0677 - 11s/epoch - 4ms/step
Epoch 2/100
2555/2555 - 9s - loss: 0.0691 - val_loss: 0.0657 - 9s/epoch - 3ms/step
Epoch 3/100
2555/2555 - 9s - loss: 0.0685 - val_loss: 0.0658 - 9s/epoch - 3ms/step
Epoch 4/100
2555/2555 - 9s - loss: 0.0683 - val_loss: 0.0659 - 9s/epoch - 3ms/step
Epoch 5/100
2555/2555 - 8s - loss: 0.0682 - val_loss: 0.0656 - 8s/epoch - 3ms/step
Epoch 6/100
2555/2555 - 8s - loss: 0.0682 - val_loss: 0.0656 - 8s/epoch - 3ms/step
Epoch 7/100
2555/2555 - 9s - loss: 0.0681 - val_loss: 0.0658 - 9s/epoch - 4ms/step
Epoch 8/100
2555/2555 - 9s - loss: 0.0681 - val_loss: 0.0658 - 9s/epoch - 3ms/step
Epoch 9/100
2555/2555 - 9s - loss: 0.0681 - val_loss: 0.0656 - 9s/epoch - 3ms/step
Epoch 10/100
2555/2555 - 9s - loss: 0.0681 - val_loss: 0.0655 - 9s/epoch - 3ms/step
Epoch 11/100
2555/2555 - 8s - loss: 0.0680 - val_loss: 0.0654 - 8s/epoch - 3ms/step
Epoch 12/100
2555/2555 - 8s - loss: 0.0680

Combo pipeline

In [18]:
# THIS IS THE FUNCTION TO PREDICT MAE
def get_loss(y_true, y_pred):
    return tf.keras.losses.MAE(y_true, y_pred).numpy()

In [29]:
#df_all, df_properties, sample = load_combo_data()
df_properties, sample, df_logs, df_all = load_mlr_data()

X_train, X_val, Y_train, Y_val, X, Y = process_combo_data(df_all)

# Applying standardization to inputs

numeric_columns = ['bedroomcnt', 'roomcnt', 'bathroomcnt', 'taxamount',
      'landtaxvaluedollarcnt', 'taxvaluedollarcnt',
#      'structuretaxvaluedollarcnt', 'latitude', 'longitude', 'LXXRNSA',
      'structuretaxvaluedollarcnt', 'latitude', 'longitude',
        'lotsizesquarefeet', 'calculatedfinishedsquarefeet', 'yearbuilt',
        ]

# Standardize numeric columns
sc_x = StandardScaler()
X_train_std = X_train.copy()
X_val_std = X_val.copy()

X_train_std[numeric_columns] = sc_x.fit(X_train[numeric_columns]).transform(X_train[numeric_columns])
X_val_std[numeric_columns] = sc_x.fit(X_train[numeric_columns]).transform(X_val[numeric_columns])

# Applying standardization to outputs
Y_train_std = (Y_train - Y_train.mean())/Y_train.std()
Y_val_std = (Y_val - Y_train.mean())/Y_train.std()

# Mask missing data in last three columns - does this help? Apparently a tiny bit
mask_value = -999
X_train_std = X_train_std.fillna(mask_value)
X_val_std = X_val_std.fillna(mask_value)

## Build NN model ##
model = build_combo_model(lr=0.001, resolution_in_degrees=0.239)
random.seed(42)
tf.random.set_seed(1234)

# Defining train_x and val_x to avoid multiple re-definitions
train_x = {
        'bedroomcnt': X_train_std[['bedroomcnt']],
        'roomcnt': X_train_std[['roomcnt']],
        'bathroomcnt': X_train_std[['bathroomcnt']],
        'taxamount': X_train_std[['taxamount']],
        'landtaxvaluedollarcnt': X_train_std[['landtaxvaluedollarcnt']],
        'taxvaluedollarcnt': X_train_std[['taxvaluedollarcnt']],
        'structuretaxvaluedollarcnt': X_train_std[['structuretaxvaluedollarcnt']],
        'latitude': X_train_std[['latitude']],
        'longitude': X_train_std[['longitude']],
#        'lxxrnsa': X_train_std[['LXXRNSA']],
        'year': X_train_std[['year']],
        'month': X_train_std[['month']],
        'weekday': X_train_std[['weekday']],
        'lotsizesquarefeet': X_train_std[['lotsizesquarefeet']],
        'calculatedfinishedsquarefeet': X_train_std[['calculatedfinishedsquarefeet']],
        'yearbuilt': X_train_std[['yearbuilt']],

    }

val_x = {
        'bedroomcnt': X_val_std[['bedroomcnt']],
        'roomcnt': X_val_std[['roomcnt']],
        'bathroomcnt': X_val_std[['bathroomcnt']],
        'taxamount': X_val_std[['taxamount']],
        'landtaxvaluedollarcnt': X_val_std[['landtaxvaluedollarcnt']],
        'taxvaluedollarcnt': X_val_std[['taxvaluedollarcnt']],
        'structuretaxvaluedollarcnt': X_val_std[['structuretaxvaluedollarcnt']],
        'latitude': X_val_std[['latitude']],
        'longitude': X_val_std[['longitude']],
#        'lxxrnsa': X_val_std[['LXXRNSA']],
        'year': X_val_std[['year']],
        'month': X_val_std[['month']],
        'weekday': X_val_std[['weekday']],
        'lotsizesquarefeet': X_val_std[['lotsizesquarefeet']],
        'calculatedfinishedsquarefeet': X_val_std[['calculatedfinishedsquarefeet']],
        'yearbuilt': X_val_std[['yearbuilt']],
        }

# Fit model
history = model.fit(
    x=train_x,
    y=Y_train_std,
    epochs=10,
    batch_size=2000,
    validation_data=(val_x,
        Y_val_std
    )
)

## predict ##
val_preds = model.predict(val_x)

# Convert to regular scale from scaled-standardized scale and print loss
val_preds= (val_preds[:,0]*Y_train.std()) + Y_train.mean()
print(get_loss(y_pred=val_preds, y_true=Y_val))

## Send to XGBoost model ##
# Get train and validation predictions from neural network as inputs for XGBoost

train_preds = model.predict(train_x)
val_preds = model.predict(val_x)

# Convert to orginal scale from standardized scale
train_preds = (train_preds[:,0]*Y_train.std()) + Y_train.mean()
val_preds = (val_preds[:,0]*Y_train.std()) + Y_train.mean()

# Create XGBoost matrices
#d_train = xgb.DMatrix(train_preds, label = Y_train)
#d_valid = xgb.DMatrix(val_preds, label = Y_val)
d_train = xgb.DMatrix(train_preds.reshape(-1, 1), label=Y_train)
d_valid = xgb.DMatrix(val_preds.reshape(-1, 1), label=Y_val)

# Train XGBoost model
study = train_tune_xgb_model(d_train, d_valid)

# Train Model on Best Parameters
params = study.best_params
params['objective'] = 'reg:linear'
params['eval_metric'] = 'mae'

watchlist = [(d_train, 'train'), (d_valid, 'valid')]
clf = xgb.train(params, d_train, 10000, watchlist,
                early_stopping_rounds=100, verbose_eval=10)


# Get final loss, it's near-identical
# xgb_val = (clf.predict(d_valid)*Y_train.std()) + Y_train.mean()
# get_loss(y_pred=xgb_val, y_true=Y_val)

# Run on test data
prop = pd.read_csv('./data/properties_2017.csv')
sample['parcelid'] = sample['ParcelId']
df_test = sample.merge(prop, on='parcelid', how = 'left')
train_columns =  ['bedroomcnt', 'roomcnt', 'bathroomcnt', 'taxamount',
       'landtaxvaluedollarcnt', 'taxvaluedollarcnt',
       'structuretaxvaluedollarcnt', 'latitude', 'longitude',
        'lotsizesquarefeet', 'calculatedfinishedsquarefeet', 'yearbuilt',
         ]
x_test = df_test[train_columns]

# Set the transaction date dependent columns to constants
x_test['month'] = "12"
x_test['year'] = "2016"
x_test['weekday'] = 4

X_test_std = x_test.copy()

# Scale-standardize features
sc_x = StandardScaler()
sc_x.fit(X_train[numeric_columns])

# Transform the test data using the same fitted scaler
X_test_std[numeric_columns] = sc_x.transform(X_test_std[numeric_columns])

# For latitude and longitude, we can't mask, so fill NAs with zeros
X_test_std.longitude = X_test_std.longitude.fillna(0)
X_test_std.latitude = X_test_std.latitude.fillna(0)

# Mask missing data in columns
mask_value = -999
X_test_std = X_test_std.fillna(mask_value)

# Generate neural network predictions for all parcel IDs
preds = model.predict({
        'bedroomcnt': X_test_std[['bedroomcnt']],
        'roomcnt': X_test_std[['roomcnt']],
        'bathroomcnt': X_test_std[['bathroomcnt']],
        'taxamount': X_test_std[['taxamount']],
        'landtaxvaluedollarcnt': X_test_std[['landtaxvaluedollarcnt']],
        'taxvaluedollarcnt': X_test_std[['taxvaluedollarcnt']],
        'structuretaxvaluedollarcnt': X_test_std[['structuretaxvaluedollarcnt']],
        'latitude': X_test_std[['latitude']],
        'longitude': X_test_std[['longitude']],
#        'lxxrnsa': X_test_std[['LXXRNSA']],
        'year': X_test_std[['year']],
        'month': X_test_std[['month']],
        'weekday': X_test_std[['weekday']],
        'lotsizesquarefeet': X_test_std[['lotsizesquarefeet']],
        'calculatedfinishedsquarefeet': X_test_std[['calculatedfinishedsquarefeet']],
        'yearbuilt': X_test_std[['yearbuilt']],
    })

# Convert to regular scale
preds = (preds[:,0]*Y_train.std()) + Y_train.mean()
save_submission(preds) # For neural network

p_test = generate_xgb_prediction(clf, prop, sample, train_columns, combo=True, preds=preds) # For XGBoost
save_submission(p_test)

# for c in sample.columns[sample.columns != 'ParcelId']:
#     sample[c] = ((preds[:,0]*Y_train.std()) + Y_train.mean())

# # Save neural network predictions to csv
# sample.to_csv('nn1.csv', index=False, float_format='%.4f')

# Create XGBoost predictions from neural network predictions
# d_test_1 = xgb.DMatrix(preds)
# xg_preds_1 = (clf.predict(d_test_1)*Y_train.std()) + Y_train.mean()

# save_submission(xg_preds_1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.068317585


[I 2024-08-10 16:03:09,648] A new study created in memory with name: no-name-47605e8a-deb7-49f3-b4ae-90126e285bda
[I 2024-08-10 16:03:13,568] Trial 0 finished with value: 0.0688076078868838 and parameters: {'eta': 0.03097143971974744, 'max_depth': 3, 'subsample': 0.633851409615319, 'colsample_bytree': 0.6291088019484112, 'n_estimators': 205, 'lambda': 2.2517212117520144e-05, 'alpha': 4.322385747236755e-06}. Best is trial 0 with value: 0.0688076078868838.
[I 2024-08-10 16:03:14,248] Trial 1 finished with value: 0.06882311512858151 and parameters: {'eta': 0.09998270414855857, 'max_depth': 3, 'subsample': 0.9992048917025254, 'colsample_bytree': 0.7996873941915905, 'n_estimators': 542, 'lambda': 8.913067553369856e-05, 'alpha': 6.644566801313804e-05}. Best is trial 0 with value: 0.0688076078868838.
[I 2024-08-10 16:03:15,787] Trial 2 finished with value: 0.06888970069663063 and parameters: {'eta': 0.03497941274452101, 'max_depth': 9, 'subsample': 0.8931708835676501, 'colsample_bytree': 0.94

Best Parameters: {'eta': 0.07207650337879785, 'max_depth': 2, 'subsample': 0.8709468113329673, 'colsample_bytree': 0.675516894136017, 'n_estimators': 308, 'lambda': 0.15303371335005056, 'alpha': 3.0284959625819058e-05}
[0]	train-mae:0.06885	valid-mae:0.06925
[10]	train-mae:0.06790	valid-mae:0.06880
[20]	train-mae:0.06775	valid-mae:0.06894
[30]	train-mae:0.06778	valid-mae:0.06910
[40]	train-mae:0.06784	valid-mae:0.06923
[50]	train-mae:0.06787	valid-mae:0.06931
[60]	train-mae:0.06789	valid-mae:0.06934
[70]	train-mae:0.06790	valid-mae:0.06937
[80]	train-mae:0.06791	valid-mae:0.06938
[90]	train-mae:0.06792	valid-mae:0.06939
[100]	train-mae:0.06792	valid-mae:0.06940
[109]	train-mae:0.06793	valid-mae:0.06941
Save submission
Save submission


Save models

In [30]:
test_mlr_model.save('/content/drive/MyDrive/mlr.keras')
nn.save('/content/drive/MyDrive/nn.keras')

Load models

In [31]:
mlr_model = tf.keras.models.load_model('/content/drive/MyDrive/mlr.keras')
nn_model = tf.keras.models.load_model('/content/drive/MyDrive/nn.keras')

# Visualizations

In [32]:
def plot_hyperparameter_optimization(study):

  # Optimization history plot
  opt_history_fig = vis.plot_optimization_history(study)
  opt_history_fig.show()

  # Parameter importance plot
  param_importance_fig = vis.plot_param_importances(study)
  param_importance_fig.show()

  # Slice plot
  slice_fig = vis.plot_slice(study)
  slice_fig.show()

  # Slice plot
  slice_fig = vis.plot_slice(study)
  slice_fig.show()

plot_hyperparameter_optimization(study)