In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import Lasso, Ridge

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
import os
import cv2

# --- training table ---------------------------------------------------------
# Read the CSV, remove exact duplicate rows in place, then parse the 'date'
# column once so the date-derived features below can reuse it.
train = pd.read_csv(r"C:\Users\bhuvi\OneDrive\cdc_project\train_with_images.csv")
train.drop_duplicates(inplace=True)
temp_date = pd.to_datetime(train["date"])

# --- test table -------------------------------------------------------------
# Same three steps for the test CSV.
test = pd.read_csv(r"C:\Users\bhuvi\OneDrive\cdc_project\test_with_images.csv")
test.drop_duplicates(inplace=True)
temp_date_t = pd.to_datetime(test["date"])

# feature engineering-----------------------------------------------------

# Effective sale year: a sale in months 7-12 is counted in the following year.
sale_year = temp_date.dt.year + temp_date.dt.month.gt(6).astype(int)
sale_year_t = temp_date_t.dt.year + temp_date_t.dt.month.gt(6).astype(int)

# Years between the effective sale year and the recorded renovation year.
age_reno = sale_year - train['yr_renovated']
age_reno_t = sale_year_t - test['yr_renovated']

# Upper caps are quantiles of the TRAINING columns only; the same values are
# applied to the test table so both are clipped consistently.
living_cap = train['sqft_living'].quantile(0.98)
lot_cap = train['sqft_lot'].quantile(0.987)
price_cap = train['price'].quantile(0.983)

# Apply the identical per-table transformations to train and test.
for frame, sold_in, reno_age in (
    (train, sale_year, age_reno),
    (test, sale_year_t, age_reno_t),
):
    # Building age at the effective sale year.
    frame['age'] = sold_in - frame['yr_built']

    # renovated = 1 when the renovation age is at most half of the building
    # age, else 0.
    # NOTE(review): the factor in code is 0.50; an earlier comment on this
    # step described a 25% threshold -- confirm which one is intended.
    frame['renovated'] = (reno_age.values <= 0.50 * frame['age'].values).astype(int)

    # Binary flag: any non-zero basement area counts as having a basement.
    frame['has_basement'] = np.where(frame['sqft_basement'] != 0, 1, 0)

    # Clip the room counts at fixed upper bounds.
    frame['bedrooms'] = frame['bedrooms'].clip(upper=10)
    frame['bathrooms'] = frame['bathrooms'].clip(upper=6)

    # Cap living area at the training quantile.
    frame['sqft_living'] = frame['sqft_living'].clip(upper=living_cap)

    # Cap lot size at the training quantile, then keep only its log1p.
    frame['sqft_lot_log'] = np.log1p(frame['sqft_lot'].clip(upper=lot_cap))

    # Remove the raw columns that were consumed above or are not kept.
    frame.drop(
        columns=[
            'date',
            'yr_renovated',
            'sqft_basement',
            'zipcode',
            'sqft_above',
            'yr_built',
            'sqft_lot',
        ],
        inplace=True,
    )

# Target (train only): cap the price at the training quantile and add its
# log1p alongside the capped raw price.
train['price'] = train['price'].clip(upper=price_cap)
train['price_log'] = np.log1p(train['price'])
# ----------------------------
# 1. IMAGE PATH & VALIDATION
# ----------------------------
BASE_DIR = r"C:\Users\bhuvi\OneDrive\cdc_project"
TRAIN_IMG_DIR, TEST_IMG_DIR = (
    os.path.join(BASE_DIR, split_name) for split_name in ("train", "test")
)

# Fix image paths: keep only the file name of each stored path and re-root it
# under the image directory that belongs to its table.
for frame, img_dir in ((train, TRAIN_IMG_DIR), (test, TEST_IMG_DIR)):
    frame['image_path'] = frame['image_path'].map(
        lambda stored, root=img_dir: os.path.join(root, os.path.basename(stored))
    )

def is_valid_image(path):
    """Return True if *path* names an existing file that OpenCV can decode.

    Args:
        path: Candidate image location. Anything that is not a ``str``
            (for example ``None`` or a NaN from a DataFrame) is rejected.

    Returns:
        bool: ``True`` when the path exists and ``cv2.imread`` yields an
        image; ``False`` when the path is not a string, does not exist,
        cannot be decoded, or the read raises an error.
    """
    if not isinstance(path, str) or not os.path.exists(path):
        return False
    try:
        img = cv2.imread(path)
    except Exception:
        # Best-effort check: a failing read marks the image as invalid.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long validation pass can be interrupted.
        return False
    # cv2.imread signals an unreadable file by returning None.
    return img is not None

# Keep only the rows whose image passes is_valid_image; .copy() makes each
# filtered table an independent DataFrame.
train_has_image = train['image_path'].apply(is_valid_image)
train = train[train_has_image].copy()
test_has_image = test['image_path'].apply(is_valid_image)
test = test[test_has_image].copy()

# Report what survived the image check.
print(f"Verified Train size: {train.shape}")
print(f"Verified Test size: {test.shape}")

print(train.columns)

# Write both processed tables next to the raw data, without the index column.
for frame, out_name in ((train, "train_processed.csv"), (test, "test_processed.csv")):
    frame.to_csv(os.path.join(BASE_DIR, out_name), index=False)

print("processing done")

  train['price'] = train['price'].clip(upper=price_cap)


Verified Train size: (16209, 20)
Verified Test size: (5404, 18)
Index(['id', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'lat', 'long',
       'sqft_living15', 'sqft_lot15', 'image_path', 'age', 'renovated',
       'has_basement', 'sqft_lot_log', 'price_log'],
      dtype='object')
processing done
