# Load libraries

In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.13.2-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.0/233.0 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [3]:
# standard
import pandas as pd
import numpy as np
import random

# ML libraries
import tensorflow as tf
from keras import layers
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import optuna
import optuna.visualization as vis

# plots
import seaborn as sns
sns.set_theme()
import matplotlib.pyplot as plt

# filter warnings
import warnings
warnings.filterwarnings('ignore')

random.seed(42)

# Load Data

In [12]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
# Load data
"""
def load_combo_data():
  df_properties = pd.read_csv('./data/properties_2017.csv')

  # Get logerrors of transactions and merge
  df_2016 = pd.read_csv('./data/train_2016_v2.csv', parse_dates=["transactiondate"])
  df_2017 = pd.read_csv('./data/train_2017.csv', parse_dates=["transactiondate"])
  df_logs = pd.concat([df_2016, df_2017])

  # Inner join transactions with the dataset of all properties
  df_all = pd.merge(df_logs, df_properties, on='parcelid', how='inner')

  return df_all

df_all = load_combo_data()
"""

def load_mlr_data():
  # Reading in 2016 property information
  df_properties = pd.read_csv('/content/drive/MyDrive/properties_2016.csv')

  # Reading in 2016 and 2017 transaction data and the sample submission file
  df_2016 = pd.read_csv('/content/drive/MyDrive/train_2016_v2.csv', parse_dates=["transactiondate"])
  df_2017 = pd.read_csv('/content/drive/MyDrive/train_2017.csv', parse_dates=["transactiondate"])
  sample = pd.read_csv('/content/drive/MyDrive/sample_submission.csv')

  # Merging the 2016 and 2017 transaction data
  df_logs = pd.concat([df_2016, df_2017])

  # Merging the new transaction data with it's associated properties
  df_train = pd.merge(df_logs, df_properties, on='parcelid', how='inner')

  return df_properties, sample, df_logs, df_train


def load_xgb_data():
  # Read the data
  train = pd.read_csv('./data/train_2016_v2.csv')
  prop = pd.read_csv('./data/properties_2016.csv')
  sample = pd.read_csv('./data/sample_submission.csv')
  return train, prop, sample

def load_nn_data():
  # Read the data
  train = pd.read_csv("./data/train_2016_v2.csv", parse_dates=["transactiondate"])
  prop = pd.read_csv('./data/properties_2016.csv')
  sample = pd.read_csv('./data/sample_submission.csv')
  return train, prop, sample

# Preprocess Data

processing functions

In [5]:
def process_mlr_data(df_train):
  # Dropping all rows where there is no longitude or latitidue data
  df_final = df_train[~df_train.regionidcounty.isnull()]

  # Want to use taxvaluedollarcnt, and since only 1 value is missing, will just drop that row
  df_final = df_final.drop(df_final[df_final.taxvaluedollarcnt.isnull()].index)

  # Getting column names of all columns w/o any missing values, and dropping parcelid
  selected = df_final.columns[df_final.apply(lambda c: c.isnull().sum() == 0)]

  # Getting selected columns
  data = df_final[selected]

  # Changing transactiondate to a datetime type
  data['transactiondate'] = pd.to_datetime(data['transactiondate'])

  # Extracting year and month based on the transaction date and setting them as
  # separate variables
  data['year'] = data['transactiondate'].dt.year

  # Setting train data to be all 2016 transactions
  X_train = data[data['year'] != 2017]
  y_train = X_train['logerror']

  # Dropping logerror (outcome variable), transactiondate (represented by month and day),
  # year and assessment year (since all observations in this subset have the same year value)
  X_train = X_train.drop(['parcelid', 'logerror', 'transactiondate', 'year', 'assessmentyear', 'fips'], axis=1)

  X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

  return X_train, X_val, y_train, y_val, data

def process_xgb_data(train, prop):

  # Convert float64 columns to float32 to reduce memory usage
  for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)

  # Merge training data with property data
  df_train = train.merge(prop, how='left', on='parcelid')

  # Prepare features (X) and target variables (y) for training
  # Drop unnecessary columns
  x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 'propertycountylandusecode'], axis=1)
  y_train = df_train['logerror'].values

  # Store column names for later use
  train_columns = x_train.columns

  # Convert object (string) columns to boolean
  for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)

  # Create training and validation dataset
  split = 80000
  x_train, y_train, x_valid, y_valid = x_train[:split], y_train[:split], x_train[split:], y_train[split:]

  return x_train, y_train, x_valid, y_valid, train_columns


def process_nn_data(train, prop, sample):

  # Fit Label Encoder on properties
  for c in prop.columns:
    prop[c]=prop[c].fillna(-1)
    if prop[c].dtype == 'object':
      lbl = LabelEncoder()
      lbl.fit(list(prop[c].values))
      prop[c] = lbl.transform(list(prop[c].values))

  #Create df_train and x_train y_train from that
  df_train = train.merge(prop, how='left', on='parcelid')
  df_train["transactiondate"] = pd.to_datetime(df_train["transactiondate"])
  df_train["transactiondate_year"] = df_train["transactiondate"].dt.year
  df_train["transactiondate_month"] = df_train["transactiondate"].dt.month
  df_train['transactiondate_quarter'] = df_train['transactiondate'].dt.quarter
  df_train["transactiondate"] = df_train["transactiondate"].dt.day

  select_qtr4 = df_train["transactiondate_quarter"] == 4

  # Fill NA/NaN values using suitable method
  df_train.fillna(-1.0)

  print('Create x_train and y_train from df_train' )
  x_train_all = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 'propertycountylandusecode','fireplacecnt', 'fireplaceflag'], axis=1)
  y_train_all = df_train["logerror"]
  y_train = y_train_all[~select_qtr4]
  x_train = x_train_all[~select_qtr4]
  x_valid = x_train_all[select_qtr4]
  y_valid = y_train_all[select_qtr4]

  y_mean = np.mean(y_train)
  train_columns = x_train.columns

  for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)

  # Create df_test and test set
  sample['parcelid'] = sample['ParcelId']
  df_test = sample.merge(prop, on='parcelid', how='left')
  df_test["transactiondate"] = pd.to_datetime(df_train["transactiondate"])
  df_test["transactiondate_year"] = df_test["transactiondate"].dt.year
  df_test["transactiondate_month"] = df_test["transactiondate"].dt.month
  df_test['transactiondate_quarter'] = df_test['transactiondate'].dt.quarter
  df_test["transactiondate"] = df_test["transactiondate"].dt.day
  x_test = df_test[train_columns]

  for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)

  return x_train, y_train, x_valid, y_valid, x_test

# Build Models

In [13]:
def build_mlr_model(num_features, learning_rate):
  """Build a TF linear regression model using Keras.

  Args:
    num_features: The number of input features.
    learning_rate: The desired learning rate for SGD.

  Returns:
    model: A tf.keras model (graph).
  """
  # This is not strictly necessary, but each time you build a model, TF adds
  # new nodes (rather than overwriting), so the colab session can end up
  # storing lots of copies of the graph when you only care about the most
  # recent. Also, as there is some randomness built into training with SGD,
  # setting a random seed ensures that results are the same on each identical
  # training run.
  tf.keras.backend.clear_session()
  tf.random.set_seed(42)

  # Build a model using keras.Sequential. While this is intended for neural
  # networks (which may have multiple layers), we want just a single layer for
  # linear regression.
  model = tf.keras.Sequential()
  model.add(tf.keras.layers.Dense(
      units=1,        # output dim
      input_shape=[num_features],  # input dim
      use_bias=True,               # use a bias (intercept) param
      kernel_initializer=tf.ones_initializer,  # initialize params to 1
      bias_initializer=tf.ones_initializer,    # initialize bias to 1
  ))

  # We need to choose an optimizer
  optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate)

  # Finally, compile the model. This finalizes the graph for training.
  # We specify the loss and the optimizer above
  model.compile(
        optimizer=optimizer,
        loss='mae'
  )

  return model

def build_nn_model(learning_rate):

  tf.keras.backend.clear_session()
  tf.random.set_seed(42)

  len_x=int(x_train.shape[1])

  model = tf.keras.Sequential()
  model.add(tf.keras.layers.Dense(units = 400 , kernel_initializer = 'normal', activation = 'relu', input_dim = len_x))
  model.add(tf.keras.layers.Dropout(.36))
  model.add(tf.keras.layers.Dense(units = 160 , kernel_initializer = 'normal', activation = 'relu'))
  model.add(tf.keras.layers.BatchNormalization())
  model.add(tf.keras.layers.Dropout(.6))
  model.add(tf.keras.layers.Dense(units = 64 , kernel_initializer = 'normal', activation = 'relu'))
  model.add(tf.keras.layers.BatchNormalization())
  model.add(tf.keras.layers.Dropout(.48))
  model.add(tf.keras.layers.Dense(units = 28, kernel_initializer = 'normal', activation = 'relu'))
  model.add(tf.keras.layers.BatchNormalization())
  model.add(tf.keras.layers.Dropout(.48))
  model.add(tf.keras.layers.Dense(1, kernel_initializer='normal'))

  optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate)

  model.compile(optimizer=optimizer, loss='mae')

  return model

XGBoost Train and hyperparameter tune

In [30]:
# Define the objective function
def objective(trial):
  params = {
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'eta': trial.suggest_float('eta', 0.01, 0.1),
    'max_depth': trial.suggest_int('max_depth', 1, 9),
    'subsample': trial.suggest_float('subsample', 0.6, 1.0),
    'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
    'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
    'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
    'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True)
  }

  # Train the model
  model = xgb.train(params, d_train, num_boost_round=1000, evals=watchlist,
                    early_stopping_rounds=100, verbose_eval=False)

  # Return the best validation MAE
  val_mae = model.best_score
  return val_mae

def train_tune_xgb_model(d_train, d_valid):
  watchlist = [(d_train, 'train'), (d_valid, 'valid')]

  # Create a study object and optimize the objective function
  study = optuna.create_study(direction='minimize')
  study.optimize(objective, n_trials=100, timeout=600)

  # Print the best parameters
  print(f"Best Parameters: {study.best_params}")

  return study

# Predict on Test Data

In [7]:
def generate_mlr_prediction(test_mlr_model, data, df_properties, df_logs):
  # Using sample submission to make test data for predictions for Kaggle submission
  X_test = data[data['parcelid'].isin(sample['ParcelId'])]
  sample['parcelid'] = sample['ParcelId']
  X_test = sample.merge(df_properties, on='parcelid', how='inner')

  # Getting number of rows with missing data
  num_missing = X_test[X_test[X_train.columns].isnull().any(axis=1)].shape[0]

  # Dropping rows with missing data
  X_test = X_test[X_train.columns].dropna()


  # Scaling the train and validation data
  scaler = StandardScaler()
  X_train_scaled = scaler.fit_transform(X_train)
  X_val_scaled = scaler.transform(X_val)

  # Creating predications based on the training and validation datasets
  train_preds = test_mlr_model.predict(X_train_scaled)
  val_preds = test_mlr_model.predict(X_val_scaled)

  # Printing out training and validation dataset mean absolute errors
  print("Train MAE:", mean_absolute_error(y_train, train_preds))
  print("Validation MAE:", mean_absolute_error(y_val, val_preds))


  # Scaling test data and making predictions
  X_test_scaled = scaler.transform(X_test)
  test_preds = test_mlr_model.predict(X_test_scaled)

  # Getting average of predictions
  average_preds = np.mean(test_preds)

  # Printing out mean absolute error between actual test data and test predictions
  y_test = df_logs['logerror']
  print("Test MAE:", mean_absolute_error(y_test, test_preds[len(test_preds)-len(y_test):]))

  return num_missing, average_preds, test_preds


def generate_xgb_prediction(clf):
  # Merge sample data with property data
  sample['parcelid'] = sample['ParcelId']
  df_test = sample.merge(prop, on='parcelid', how = 'left')

  # Convert object (string) columns to boolean
  x_test = df_test[train_columns]
  for c in x_test.dtypes[x_test.dtypes == object].index.values:
      x_test[c] = (x_test[c] == True)

  # Generate prediction
  d_test = xgb.DMatrix(x_test)
  p_test = clf.predict(d_test)

  return p_test

# Save Submission

In [8]:
# Create submission file
def save_submission(p_test):
  sub = pd.read_csv('./data/sample_submission.csv')
  for c in sub.columns[sub.columns != 'ParcelId']:
      sub[c] = p_test
  print('Save submission')
  sub.to_csv('submission.csv', index=False, float_format='%.4f')

# Run ML Pipeline

MLR pipeline

In [9]:
## Load Data ##
df_properties, sample, df_logs, df_train = load_mlr_data()

## MLR data processing ##
X_train, X_val, y_train, y_val, data = process_mlr_data(df_train)

# Standardizing all features in X_train, X_val, and X_test
X_train_std = (X_train-X_train.mean())/X_train.std()
X_val_std = (X_val-X_train.mean())/X_train.std()

# Standardizing Y_train, Y_val, and Y_test
y_train_std = (y_train-y_train.mean())/y_train.std()
y_val_std = (y_val-y_train.mean())/y_train.std()

## Build and Train Model ##
# Build and compile test_model
test_mlr_model = build_mlr_model(num_features=X_train.shape[1],learning_rate=0.0007)

# Fit test model
test_num_epochs=10
test_train_tf = test_mlr_model.fit(x=X_train_std, y=y_train_std, epochs=test_num_epochs, verbose=0,
                         validation_data=(X_val_std, y_val_std))

num_missing, average_preds, test_preds = generate_mlr_prediction(test_mlr_model, data, df_properties, df_logs)

# Replacing missing predictions with average of test data predictions
for i in range(num_missing):
  test_preds = np.append(test_preds, average_preds)
save_submission(test_preds)

[1m2257/2257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step
[1m565/565[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Train MAE: 0.0835997740585624
Validation MAE: 0.08256123866667166
[1m91958/91958[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 2ms/step
Test MAE: 0.08963253848735737
Save submission


XGBoost pipeline

In [34]:
## Load Data ##
train, prop, sample = load_xgb_data()

## XGBoost processing ##
# Build XGBoost DMatrix objects for efficient processing
x_train, y_train, x_valid, y_valid, train_columns = process_xgb_data(train, prop)
d_train = xgb.DMatrix(x_train, label = y_train)
d_valid = xgb.DMatrix(x_valid, label = y_valid)

study = train_tune_xgb_model(d_train, d_valid)

# Train Model on Best Parameters
params = study.best_params
#params = {'eta': 0.022382530987582482, 'max_depth': 5, 'subsample': 0.6561243067188417, 'colsample_bytree': 0.6775779485416246, 'n_estimators': 640, 'lambda': 0.9942582014915469, 'alpha': 7.97377143057426e-08}
params['objective'] = 'reg:linear'
params['eval_metric'] = 'mae'

watchlist = [(d_train, 'train'), (d_valid, 'valid')]
clf = xgb.train(params, d_train, 10000, watchlist,
                early_stopping_rounds=100, verbose_eval=10)

p_test = generate_xgb_prediction(clf)

save_submission(p_test)

[I 2024-08-09 23:36:20,663] A new study created in memory with name: no-name-d332fd8e-20db-4b32-a7f2-b5caae1322ce
[I 2024-08-09 23:36:29,644] Trial 0 finished with value: 0.06649880749335481 and parameters: {'eta': 0.08958739412238047, 'max_depth': 9, 'subsample': 0.631332413844385, 'colsample_bytree': 0.8095935253111474, 'n_estimators': 655, 'lambda': 2.768022240867756e-06, 'alpha': 0.0011388160439047951}. Best is trial 0 with value: 0.06649880749335481.
[I 2024-08-09 23:36:33,952] Trial 1 finished with value: 0.06635356492578186 and parameters: {'eta': 0.02074735193447349, 'max_depth': 6, 'subsample': 0.8146660501968624, 'colsample_bytree': 0.9066917820045082, 'n_estimators': 435, 'lambda': 0.007989180396699571, 'alpha': 0.03200032517135309}. Best is trial 1 with value: 0.06635356492578186.
[I 2024-08-09 23:36:45,156] Trial 2 finished with value: 0.06641389597114403 and parameters: {'eta': 0.0969617112196372, 'max_depth': 6, 'subsample': 0.8103170007649859, 'colsample_bytree': 0.9140

Best Parameters: {'eta': 0.0778153518495892, 'max_depth': 4, 'subsample': 0.9486358030978457, 'colsample_bytree': 0.7941565301616168, 'n_estimators': 983, 'lambda': 1.9219191586723523e-07, 'alpha': 1.814147294571489e-05}
[0]	train-mae:0.06853	valid-mae:0.06648
[10]	train-mae:0.06805	valid-mae:0.06625
[20]	train-mae:0.06789	valid-mae:0.06637
[30]	train-mae:0.06779	valid-mae:0.06640
[40]	train-mae:0.06772	valid-mae:0.06651
[50]	train-mae:0.06766	valid-mae:0.06660
[60]	train-mae:0.06763	valid-mae:0.06672
[70]	train-mae:0.06757	valid-mae:0.06676
[80]	train-mae:0.06752	valid-mae:0.06679
[90]	train-mae:0.06743	valid-mae:0.06682
[100]	train-mae:0.06737	valid-mae:0.06687
[110]	train-mae:0.06732	valid-mae:0.06699
[113]	train-mae:0.06729	valid-mae:0.06701
Save submission


Neural Network (NN) pipeline

In [14]:
## Load Data ##
train, prop, sample = load_nn_data()

## NN processing ##
x_train, y_train, x_valid, y_valid, x_test = process_nn_data(train, prop, sample)

imputer = SimpleImputer()
imputer.fit(x_train.iloc[:, :])
x_train = imputer.transform(x_train.iloc[:, :])
imputer.fit(x_valid.iloc[:, :])
x_valid = imputer.transform(x_valid.iloc[:, :])
imputer.fit(x_test.iloc[:, :])
x_test = imputer.transform(x_test.iloc[:, :])

sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
x_valid = sc.fit_transform(x_valid)
x_val = np.array(x_valid)
y_val = np.array(y_valid)

## Build and Train Model ##
# Build and compile test_model
nn = build_nn_model(learning_rate=1e-3)
# Fit test model
nn.fit(np.array(x_train), np.array(y_train), batch_size = 32, epochs = 100, verbose=2,
       validation_data=(x_val,y_val))

y_pred_ann = nn.predict(x_test).flatten()

save_submission(y_pred_ann)


Create x_train and y_train from df_train
Epoch 1/100
2555/2555 - 22s - 8ms/step - loss: 0.0727 - val_loss: 0.0666
Epoch 2/100
2555/2555 - 23s - 9ms/step - loss: 0.0691 - val_loss: 0.0666
Epoch 3/100
2555/2555 - 16s - 6ms/step - loss: 0.0686 - val_loss: 0.0667
Epoch 4/100
2555/2555 - 18s - 7ms/step - loss: 0.0683 - val_loss: 0.0663
Epoch 5/100
2555/2555 - 19s - 7ms/step - loss: 0.0682 - val_loss: 0.0663
Epoch 6/100
2555/2555 - 16s - 6ms/step - loss: 0.0682 - val_loss: 0.0662
Epoch 7/100
2555/2555 - 18s - 7ms/step - loss: 0.0682 - val_loss: 0.0663
Epoch 8/100
2555/2555 - 22s - 9ms/step - loss: 0.0681 - val_loss: 0.0664
Epoch 9/100
2555/2555 - 19s - 8ms/step - loss: 0.0681 - val_loss: 0.0666
Epoch 10/100
2555/2555 - 14s - 6ms/step - loss: 0.0680 - val_loss: 0.0666
Epoch 11/100
2555/2555 - 20s - 8ms/step - loss: 0.0680 - val_loss: 0.0669
Epoch 12/100
2555/2555 - 19s - 8ms/step - loss: 0.0680 - val_loss: 0.0666
Epoch 13/100
2555/2555 - 20s - 8ms/step - loss: 0.0680 - val_loss: 0.0668
Epoch 

Save models

In [15]:
test_mlr_model.save('/content/drive/MyDrive/mlr.keras')
nn.save('/content/drive/MyDrive/nn.keras')

Load models

In [16]:
mlr_model = tf.keras.models.load_model('/content/drive/MyDrive/mlr.keras')
nn_model = tf.keras.models.load_model('/content/drive/MyDrive/nn.keras')

# Visualizations

In [35]:
def plot_hyperparameter_optimization(study):

  # Optimization history plot
  opt_history_fig = vis.plot_optimization_history(study)
  opt_history_fig.show()

  # Parameter importance plot
  param_importance_fig = vis.plot_param_importances(study)
  param_importance_fig.show()

  # Slice plot
  slice_fig = vis.plot_slice(study)
  slice_fig.show()

  # Slice plot
  slice_fig = vis.plot_slice(study)
  slice_fig.show()

plot_hyperparameter_optimization(study)