In [None]:
!pip install lightgbm xgboost catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

df = pd.read_csv("merged_clean.csv")
X = df.drop(["logerror"], axis=1)
y = df["logerror"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
import pandas as pd
import numpy as np

np.random.seed(42)
data = {
    'logerror': np.random.rand(100) * 0.1 - 0.05,  
    'feature1': np.random.rand(100) * 100,
    'feature2': np.random.randint(0, 10, 100),
    'feature3': np.random.rand(100) * 50,
    'hashottuborspa': np.random.choice([0, 1], 100) 
}
dummy_df = pd.DataFrame(data)

dummy_df.to_csv('merged_clean.csv', index=False)
print("Dummy 'merged_clean.csv' created successfully.")

Dummy 'merged_clean.csv' created successfully.


In [9]:
X = X.drop(columns=['hashottuborspa'], errors='ignore')

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train.dtypes[X_train.dtypes == 'object']

In [10]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV

params = {
    "num_leaves": [31, 63, 127],
    "learning_rate": [0.05, 0.03, 0.01],
    "n_estimators": [500, 1000, 1500],
    "min_child_samples": [10, 20, 30]
}

model = LGBMRegressor(objective="regression", random_state=42, device='gpu')

grid = GridSearchCV(model, params, scoring="neg_mean_absolute_error",
                    cv=3, verbose=1, n_jobs=-1)

grid.fit(X_train, y_train)

print("Best MAE:", -grid.best_score_)
print("Best Params:", grid.best_params_)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 68
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 4
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 64 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 4 dense feature groups (0.00 MB) transferred to GPU in 0.000215 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score -0.003190
Best MAE: 0.02556682569735416
Best Params: {'learning_rate': 0.05, 'min_child_samples': 30, 'n_estimators': 500, 'num_leaves': 31}


In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

params = {
    "n_estimators": [500, 800, 1200],
    "learning_rate": [0.05, 0.03, 0.01],
    "max_depth": [4, 6, 8],
    "subsample": [0.7, 0.9, 1.0],
    "colsample_bytree": [0.7, 0.9, 1.0]
}

model = XGBRegressor(
    objective="reg:squarederror",
    random_state=42,
    tree_method="hist" 
)

grid = GridSearchCV(
    model,
    params,
    scoring="neg_mean_absolute_error",
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid.fit(X_train, y_train)

print("Best MAE for XGBoost:", -grid.best_score_)
print("Best Params for XGBoost:", grid.best_params_)
pred_xgb = grid.best_estimator_.predict(X_val)

Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best MAE for XGBoost: 0.026496266160348055
Best Params for XGBoost: {'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': 8, 'n_estimators': 500, 'subsample': 0.9}


In [14]:
train17 = pd.read_csv("train_2017.csv")

In [18]:
merged17 = train17.merge(df_clean, on="parcelid", how="left")

In [None]:
train16 = merged16   
train17 = merged17   

train_full = pd.concat([train16, train17], ignore_index=True)

X_full = train_full.drop(["logerror", "parcelid"], axis=1)
y_full = train_full["logerror"]

In [None]:
X_full = X_full.drop(columns=['transactiondate'], errors='ignore')
binary_flag_columns = ['hashottuborspa', 'fireplaceflag', 'taxdelinquencyflag']
for col in binary_flag_columns:
    if col in X_full.columns:
        X_full[col] = X_full[col].astype(str).str.contains('Y|1', na=False, regex=True).astype(int)

high_cardinality_cols = ['propertycountylandusecode', 'propertyzoningdesc']
for col in high_cardinality_cols:
    if col in X_full.columns:
        X_full = X_full.drop(columns=[col])

for col in X_full.select_dtypes(include=['number']).columns:
    X_full[col] = X_full[col].fillna(0)
object_cols_after_fix = X_full.select_dtypes(include='object').columns
if not object_cols_after_fix.empty:
    print(f"Warning: The following object columns still exist after preprocessing and might cause issues: {object_cols_after_fix.tolist()}")
else:
    print("X_full preprocessing complete. All object columns handled.")

print("Final X_full dtypes after preprocessing:")
print(X_full.dtypes)

X_full preprocessing complete. All object columns handled.
Final X_full dtypes after preprocessing:
airconditioningtypeid           float64
architecturalstyletypeid        float64
basementsqft                    float64
bathroomcnt                     float64
bedroomcnt                      float64
buildingclasstypeid             float64
buildingqualitytypeid           float64
calculatedbathnbr               float64
decktypeid                      float64
finishedfloor1squarefeet        float64
calculatedfinishedsquarefeet    float64
finishedsquarefeet12            float64
finishedsquarefeet13            float64
finishedsquarefeet15            float64
finishedsquarefeet50            float64
finishedsquarefeet6             float64
fips                            float64
fireplacecnt                    float64
fullbathcnt                     float64
garagecarcnt                    float64
garagetotalsqft                 float64
hashottuborspa                    int64
heatingorsystemtypei

In [22]:
train16 = pd.read_csv("train_2016_v2.csv")
print("train16 loaded successfully.")

train16 loaded successfully.


In [23]:
df_clean_2016 = pd.read_csv("properties_2016.csv")
print("df_clean_2016 loaded successfully.")

  df_clean_2016 = pd.read_csv("properties_2016.csv")


df_clean_2016 loaded successfully.


In [24]:
merged16 = train16.merge(df_clean_2016, on="parcelid", how="left")
print("merged16 created successfully.")

merged16 created successfully.


In [25]:
best_lgb_params = {
    "num_leaves": 31,
    "learning_rate": 0.05,
    "n_estimators": 500,
    "min_child_samples": 30
}

In [26]:
best_xgb_params = {
    "max_depth": 8,
    "learning_rate": 0.01,
    "n_estimators": 500,
    "subsample": 0.9,
    "colsample_bytree": 0.7
}

In [33]:
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

model_lgb = LGBMRegressor(
    objective="regression",
    random_state=42,
    **best_lgb_params
)

model_xgb = XGBRegressor(
    objective="reg:squarederror",
    random_state=42,
    **best_xgb_params
)

model_lgb.fit(X_full, y_full)
model_xgb.fit(X_full, y_full)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.070085 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5308
[LightGBM] [Info] Number of data points in the train set: 167888, number of used features: 50
[LightGBM] [Info] Start training from score 0.013906


In [35]:
sub = pd.read_csv("sample_submission.csv")
test = sub[['ParcelId']]
test = test.rename(columns={'ParcelId': 'parcelid'})

test = test.merge(df_clean, on="parcelid", how="left")

X_test = test.drop(["parcelid"], axis=1)

In [36]:
pred_lgb = model_lgb.predict(X_test)
pred_xgb = model_xgb.predict(X_test)

ValueError: pandas dtypes must be int, float or bool.
Fields with bad pandas dtypes: hashottuborspa: object, propertycountylandusecode: object, propertyzoningdesc: object, fireplaceflag: object, taxdelinquencyflag: object

In [None]:
# Preprocessing X_test to handle object dtypes, mirroring steps for X_full

# Drop 'transactiondate' if it exists in X_test
X_test = X_test.drop(columns=['transactiondate'], errors='ignore')

# Convert binary flag columns from object to numeric (0 or 1)
binary_flag_columns = ['hashottuborspa', 'fireplaceflag', 'taxdelinquencyflag']
for col in binary_flag_columns:
    if col in X_test.columns:
        # Ensure the column exists and then apply the conversion
        # Use .astype(str) to handle potential NaN values correctly before .str.contains
        X_test[col] = X_test[col].astype(str).str.contains('Y|1', na=False, regex=True).astype(int)

# Drop potentially high cardinality or hard-to-encode categorical columns
high_cardinality_cols = ['propertycountylandusecode', 'propertyzoningdesc']
for col in high_cardinality_cols:
    if col in X_test.columns:
        X_test = X_test.drop(columns=[col])

# Fill any remaining NaNs in numerical columns
for col in X_test.select_dtypes(include=['number']).columns:
    X_test[col] = X_test[col].fillna(0)

# Verify that there are no remaining object columns
object_cols_after_fix_test = X_test.select_dtypes(include='object').columns
if not object_cols_after_fix_test.empty:
    print(f"Warning: The following object columns still exist in X_test after preprocessing: {object_cols_after_fix_test.tolist()}")
else:
    print("X_test preprocessing complete. All object columns handled.")

print("Final X_test dtypes after preprocessing:")
print(X_test.dtypes)

X_test preprocessing complete. All object columns handled.
Final X_test dtypes after preprocessing:
airconditioningtypeid           float64
architecturalstyletypeid        float64
basementsqft                    float64
bathroomcnt                     float64
bedroomcnt                      float64
buildingclasstypeid             float64
buildingqualitytypeid           float64
calculatedbathnbr               float64
decktypeid                      float64
finishedfloor1squarefeet        float64
calculatedfinishedsquarefeet    float64
finishedsquarefeet12            float64
finishedsquarefeet13            float64
finishedsquarefeet15            float64
finishedsquarefeet50            float64
finishedsquarefeet6             float64
fips                            float64
fireplacecnt                    float64
fullbathcnt                     float64
garagecarcnt                    float64
garagetotalsqft                 float64
hashottuborspa                    int64
heatingorsystemtypei

In [40]:
pred_lgb = model_lgb.predict(X_test)
pred_xgb = model_xgb.predict(X_test)
print("Predictions generated successfully for LGBM and XGBoost.")

Predictions generated successfully for LGBM and XGBoost.


In [42]:
final_pred = 0.6 * pred_lgb + 0.4 * pred_xgb

In [43]:
sub["201610"] = final_pred
sub["201611"] = final_pred
sub["201612"] = final_pred
sub["201710"] = final_pred
sub["201711"] = final_pred
sub["201712"] = final_pred

sub.to_csv("submission.csv", index=False)
print("Saved submission.csv")

Saved submission.csv


In [44]:
import pandas as pd

submission_df = pd.read_csv('submission.csv')

print(f"Number of rows in submission.csv: {submission_df.shape[0]}")
print("First 5 rows of submission.csv:")
display(submission_df.head())

Number of rows in submission.csv: 2985217
First 5 rows of submission.csv:


Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,0.212996,0.212996,0.212996,0.212996,0.212996,0.212996
1,10759547,0.096288,0.096288,0.096288,0.096288,0.096288,0.096288
2,10843547,0.248386,0.248386,0.248386,0.248386,0.248386,0.248386
3,10859147,0.092475,0.092475,0.092475,0.092475,0.092475,0.092475
4,10879947,0.074219,0.074219,0.074219,0.074219,0.074219,0.074219
