# Phase 3: Submitting to Kaggle

Kaggle does not publish the true sale prices for the test set, so the only way for us to measure the strength of our model on it is to upload our test predictions to Kaggle and read back the score.

## Setting up Kaggle

If you haven't set up authentication with Kaggle yet (you can test this by running the cell below), follow these steps:

1. Go to the Account tab of your [Kaggle profile](https://www.kaggle.com/settings/account)
2. Select 'Create New Token' (which will download a file `kaggle.json`)
3. If you are on a UNIX-based OS, place this at `~/.kaggle/kaggle.json`
    - For Windows, place this at `C:\Users\<Windows-username>\.kaggle\kaggle.json`

In [1]:
# Load variables from a local .env file into the process environment.
# NOTE(review): this runs before the kaggle import below — presumably so that
# any Kaggle credentials kept in .env are visible when the client
# authenticates; confirm before reordering these lines.
from dotenv import load_dotenv
load_dotenv()

from kaggle.api.kaggle_api_extended import KaggleApi

# Create the Kaggle API client and authenticate it. Per the setup notes above,
# running this cell doubles as the check that authentication is configured.
api = KaggleApi()
api.authenticate()

# Competition slug passed to the Kaggle API calls later in this notebook.
competition = "house-prices-advanced-regression-techniques"

## Generate Predictions for Test Set

Finally, we can use the fitted pipeline to generate predictions for the test set, which can then be uploaded to Kaggle.

In [2]:
# `ames_notebooks` is never referenced by name and is kept as the first import.
# NOTE(review): presumably imported for its side effects (e.g. making the
# `app` package importable from the notebooks directory) — confirm before
# moving or removing it.
import ames_notebooks
from app.data_ingestion.read_data import DataReader
from app.inference.predict import AmesPredictor
from app.pipelines.preprocessing import get_fitted_pipelines

print("Loading data...")
reader = DataReader()
train_data, test_data = reader.load_train_test()
print("Test shape:", test_data.shape)

# Only the training split is handed to get_fitted_pipelines; it returns the
# feature preprocessor and the target transformer used below.
feature_preprocessor, target_transformer = get_fitted_pipelines(train_data)

# Name of the model the predictor loads; reused later in the notebook when the
# Kaggle score is attached to the model.
model_name = "xgboost-optimized"
predictor = AmesPredictor(feature_engineer=feature_preprocessor, model_name=model_name)

# Display the loaded model as the cell output to confirm what was fetched.
predictor.model

[32m2025-11-23 22:32:46.132[0m | [34m[1mDEBUG   [0m | [36mapp.config.settings[0m:[36m<module>[0m:[36m29[0m - [34m[1mloaded settings: {
    "DATA_DIRECTORY": "data",
    "RAW_DATA_DIRECTORY": "data/raw",
    "PROCESSED_DATA_DIRECTORY": "data/processed",
    "KAGGLE_COMPETITION": "house-prices-advanced-regression-techniques",
    "KAGGLE_DOWNLOAD_PATH": "data/house-prices-advanced-regression-techniques.zip",
    "PROD_MODEL_NAME": "prod",
    "LOG_LEVEL": "INFO",
    "LOG_FILE": "logs/app.log",
    "MLFLOW_EXPERIMENT_NAME": "ames-housing-pricing-experiment",
    "MLFLOW_TRACKING_URI": "http://127.0.0.1:8500"
}[0m


Loading data...
Test shape: (1459, 79)


[32m2025-11-23 22:32:47.973[0m | [1mINFO    [0m | [36mapp.inference.predict[0m:[36m__init__[0m:[36m45[0m - [1mmlflow tracking uri set to http://127.0.0.1:8500[0m


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  self.get_booster().load_model(fname)
[32m2025-11-23 22:32:55.375[0m | [1mINFO    [0m | [36mapp.inference.predict[0m:[36mget_model[0m:[36m24[0m - [1mloaded model with id m-95ba7c9b35d9439aa02704c9b335895d[0m


mlflow.pyfunc.loaded_model:
  artifact_path: s3://mlflow-artifacts/mlruns/7/models/m-95ba7c9b35d9439aa02704c9b335895d/artifacts
  flavor: mlflow.xgboost
  run_id: 0eee8885b7204cc88b3aa162e2f954b3

In [3]:
import pandas as pd

# Predict on the test set. The target transformer's inverse_transform is passed
# along so the predictor can map its output back through it
# (presumably to the original SalePrice scale — see the values displayed below).
y_test = predictor.predict(
    test_data,
    target_transform=target_transformer.inverse_transform,
)

# Two-column layout for the Kaggle submission: the test-set index as Id and
# the predicted SalePrice.
submission = pd.DataFrame({"Id": test_data.index, "SalePrice": y_test})

submission



Unnamed: 0,Id,SalePrice
0,1461,119361.054688
1,1462,159678.296875
2,1463,181127.531250
3,1464,193152.750000
4,1465,182362.140625
...,...,...
1454,2915,90742.960938
1455,2916,89349.484375
1456,2917,176669.671875
1457,2918,130614.984375


In [4]:
import app.pipelines.kaggle_utils as ku

# Hand the predictions to the project's Kaggle helper. Per this cell's log
# output, the helper writes a timestamped CSV under ../submissions and then
# submits it to Kaggle — so every run of this cell creates a new submission.
response = ku.submit_to_kaggle(submission)
# Display the API response (status message and the submission's ref).
response

[32m2025-11-23 22:32:55.490[0m | [34m[1mDEBUG   [0m | [36mapp.pipelines.kaggle_utils[0m:[36msubmit_to_kaggle[0m:[36m38[0m - [34m[1mKaggle submission file saved to ../submissions/submission_11-23-25_22:32:55.csv[0m
[32m2025-11-23 22:32:55.490[0m | [34m[1mDEBUG   [0m | [36mapp.pipelines.kaggle_utils[0m:[36msubmit_to_kaggle[0m:[36m42[0m - [34m[1mSubmitting submission 11-23-25_22:32:55 to Kaggle[0m
100%|██████████| 21.2k/21.2k [00:00<00:00, 50.2kB/s]


{"message": "Successfully submitted to House Prices - Advanced Regression Techniques", "ref": 48431750}

In [5]:
# Look up the score Kaggle reports for the submission made above, using the
# ref from its response.
ku.get_kaggle_submission_score(response.ref)

'0.12915'

In [18]:
from app.pipelines.training import submit_to_kaggle_and_tag

# One-call variant of the submit-and-score steps. Judging by this cell's log
# output and return value, it saves and submits the predictions again and
# returns a (model name, <version?>, score) tuple.
# NOTE(review): presumably it also records the score as a tag on the registered
# model — confirm. Be aware that running it after the helper cell above
# produces an additional Kaggle submission.
submit_to_kaggle_and_tag(submission, model_name)

[32m2025-11-23 22:38:12.005[0m | [34m[1mDEBUG   [0m | [36mapp.pipelines.kaggle_utils[0m:[36msubmit_to_kaggle[0m:[36m38[0m - [34m[1mKaggle submission file saved to ../submissions/submission_11-23-25_22:38:11.csv[0m
[32m2025-11-23 22:38:12.006[0m | [34m[1mDEBUG   [0m | [36mapp.pipelines.kaggle_utils[0m:[36msubmit_to_kaggle[0m:[36m42[0m - [34m[1mSubmitting submission 11-23-25_22:38:11 to Kaggle[0m
100%|██████████| 21.2k/21.2k [00:00<00:00, 35.8kB/s]


('xgboost-optimized', '4', '0.12915')

In [None]:
import os

from datetime import datetime

# Timestamp shared by the file name and the Kaggle submission message, e.g.
# "11-23-25_14:07:42". Built only from strftime directives that Python
# documents; it is equivalent to the C-library shorthand "%D_%T" with '/'
# replaced by '-', but does not depend on the platform's strftime supporting
# that shorthand.
# NOTE(review): the ':' characters are not valid in Windows file names; switch
# to a colon-free time format if this cell must run on Windows.
now = datetime.now().strftime("%m-%d-%y_%H:%M:%S")

# save submission file
os.makedirs('../submissions', exist_ok=True)
submission_filename = f"submission_{now}.csv"
submission_path = f"../submissions/{submission_filename}"
# index=False: only the Id and SalePrice columns are written to the CSV.
submission.to_csv(submission_path, index=False)
print(f"Submission file saved to {submission_path}")

print("\nFirst few predictions:")
print(submission.head())

Submission file saved to ../submissions/submission_11-23-25_14:07:42.csv

First few predictions:
     Id      SalePrice
0  1461  119361.054688
1  1462  159678.296875
2  1463  181127.531250
3  1464  193152.750000
4  1465  182362.140625


In [8]:
from time import sleep

# Upload the CSV saved above; the submission message carries the same
# timestamp as the file name.
message = f"submission {now}"
response = api.competition_submit(submission_path, message, competition)

# Pause briefly so the new submission has time to show up before it is
# queried in the following cell.
sleep(3)

response

100%|██████████| 21.2k/21.2k [00:00<00:00, 49.3kB/s]


{"message": "Successfully submitted to House Prices - Advanced Regression Techniques", "ref": 48421259}

In [9]:
# Despite its name, `leaderboard` holds the submissions returned by
# competition_submissions for this competition.
leaderboard = api.competition_submissions(competition)

# Pick out the submission we just made by its ref.
# NOTE: this rebinds `submission` from the predictions DataFrame to the Kaggle
# submission record; later cells read `submission.public_score`, so the name is
# kept. Re-run the prediction cell before saving or submitting again.
submission = next((s for s in leaderboard if s.ref == response.ref), None)
if submission is None:
    # Fail with an explicit message instead of a bare IndexError.
    raise LookupError(
        f"submission {response.ref} is not in the list returned by Kaggle; "
        "wait a few seconds and re-run this cell"
    )

# Every other submission, newest first.
other_submissions = sorted(
    (s for s in leaderboard if s.ref != response.ref),
    key=lambda s: s.date,
    reverse=True,
)

# NOTE(review): presumably public_score is empty/None while Kaggle is still
# scoring — confirm. Guarding here gives a clearer error than float() would.
if submission.public_score in (None, ""):
    raise ValueError(
        f"submission {response.ref} has no public score yet; "
        "wait a few seconds and re-run this cell"
    )
score = float(submission.public_score)
print(f"submission returned score of {score}")

print("\nLast 5 submissions:")
for s in other_submissions[:5]:
    print(f"\tSCORE: {s.public_score}")
    print(f"\tref: {s.ref}")
    print(f"\tdate: {s.date}")
    print(f"\tfile name: {s.file_name}")
    print(f"\tsubmitted by {s.submitted_by}\n")

submission returned score of 0.12915

Last 5 submissions:
	SCORE: 0.12921
	ref: 48333960
	date: 2025-11-20 14:18:56.977000
	file name: submission_11-20-25_091830.csv
	submitted by nicbolton

	SCORE: 0.12412
	ref: 48057103
	date: 2025-11-10 18:53:51
	file name: submission_11-10-25_135328.csv
	submitted by nicbolton

	SCORE: 0.12412
	ref: 48057094
	date: 2025-11-10 18:53:30.163000
	file name: submission_11-10-25_135328.csv
	submitted by nicbolton

	SCORE: 0.12623
	ref: 47994766
	date: 2025-11-08 20:01:21
	file name: submission_11-08-25_200120.csv
	submitted by nicbolton

	SCORE: 0.12977
	ref: 47991291
	date: 2025-11-08 17:02:32.513000
	file name: submission_11-08-25_170232.csv
	submitted by nicbolton



In [11]:
# At this point `submission` is the Kaggle submission record selected in the
# previous cell (not the predictions DataFrame); show its public score.
submission.public_score

'0.12915'

In [17]:
import mlflow

# NOTE(review): this rebinds the notebook-wide `model_name`. The predictions
# earlier in this notebook were generated with "xgboost-optimized" — confirm
# that 'xgboost-baseline' is really the model that should receive this score.
model_name = 'xgboost-baseline'

# Look up the registered model by exact name.
# NOTE(review): no tracking URI is set in this cell; it appears to rely on the
# one configured when the predictor was created earlier — confirm.
search = mlflow.search_registered_models(filter_string=f"name = '{model_name}'")
if not search:
    # Explicit error instead of a bare IndexError on search[0].
    raise LookupError(f"no registered model named '{model_name}' found in MLflow")

result = search[0]
if not result.latest_versions:
    raise LookupError(f"registered model '{model_name}' has no versions to tag")

# Read the first entry of latest_versions once and reuse it.
latest = result.latest_versions[0]
version = latest.version
source = latest.source

# Load the model from that version's source; `model` stays available for
# later inspection.
model = mlflow.pyfunc.load_model(source)

# Attach the Kaggle public score to the model version under the 'kaggle' tag.
# `submission` must be the Kaggle submission record from the lookup cell above.
mlflow.set_model_version_tag(
    name=model_name,
    version=version,
    key='kaggle',
    value=submission.public_score
)

Downloading artifacts:   0%|          | 0/8 [00:00<?, ?it/s]

  self.get_booster().load_model(fname)
