# Tabular Data

In [14]:
import numpy as np
import pandas as pd
from pathlib import Path

In [None]:
ROOT_DIR=Path(__file__).resolve().parents[1]
INP_DIR = ROOT_DIR/"data"/"raw"
OUT_DIR = ROOT_DIR/"data"/"processed"

In [3]:
df=pd.read_csv(INP_DIR/'train(1).csv',usecols=[0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,19,20])

In [5]:
df_test=pd.read_csv(INP_DIR/'test2.csv',usecols=[0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,18,19])


In [6]:
CURRENT_YEAR = 2015

def process_renovation(df, current_year=CURRENT_YEAR):
    df = df.copy()

    
    df["is_renovated"] = (df["yr_renovated"] > 0).astype(int)

    
    df["years_since_renovation"] = np.where(
        df["yr_renovated"] > 0,
        current_year - df["yr_renovated"],
        0
    )

    
    df["house_age"] = current_year - df["yr_built"]

    # Drop raw year columns
    df.drop(columns=["yr_renovated", "yr_built"], inplace=True)

    return df

In [7]:
df = process_renovation(df)
df_test = process_renovation(df_test)

In [8]:
TARGET1 = "price"
TARGET2="id"
X_train = df.drop(columns=[TARGET1,TARGET2])
y_train = np.log1p(df[TARGET1])

X_test = df_test.copy()

In [10]:
zip_price_mean = df.groupby("zipcode")["price"].mean()
global_price_mean = df["price"].mean()

X_train["zipcode_price_mean"] = df["zipcode"].map(zip_price_mean)
X_test["zipcode_price_mean"] = df_test["zipcode"].map(zip_price_mean)

X_train["zipcode_price_mean"] = X_train["zipcode_price_mean"].fillna(global_price_mean)
X_test["zipcode_price_mean"] = X_test["zipcode_price_mean"].fillna(global_price_mean)



df["price_per_sqft"] = df["price"] / df["sqft_living"]

zip_ppsqft_mean = df.groupby("zipcode")["price_per_sqft"].mean()
global_ppsqft_mean = df["price_per_sqft"].mean()

X_train["zipcode_price_per_sqft"] = df["zipcode"].map(zip_ppsqft_mean)
X_test["zipcode_price_per_sqft"] = df_test["zipcode"].map(zip_ppsqft_mean)

X_train["zipcode_price_per_sqft"] = X_train["zipcode_price_per_sqft"].fillna(global_ppsqft_mean)
X_test["zipcode_price_per_sqft"] = X_test["zipcode_price_per_sqft"].fillna(global_ppsqft_mean)

df.drop(columns=["price_per_sqft"], inplace=True)


X_train.drop(columns=["zipcode"], inplace=True, errors="ignore")
X_test.drop(columns=["zipcode"], inplace=True, errors="ignore")

In [11]:
X_test = X_test[X_train.columns]

In [39]:
X_train.to_csv(OUT_DIR/'X_train.csv' , index=False)

In [40]:
X_test.to_csv(OUT_DIR/'X_test.csv', index=False)

# Extracting Image embeddings using Pretrained CNN

In [68]:
import os
import torch
from PIL import Image
from tqdm import tqdm
import torchvision.transforms as transforms
import torchvision.models as models
from pathlib import Path

In [69]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

In [None]:
TRAIN_IMG_DIR = ROOT_DIR / "data" / "images" / "train"
TEST_IMG_DIR  = ROOT_DIR / "data" / "images" / "test"

OUT_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = models.resnet50(pretrained=True)

model = torch.nn.Sequential(*list(model.children())[:-1])

for p in model.parameters():
    p.requires_grad_(False)

model = model.to(device)
model.eval()

In [None]:
features = []
ids = []

with torch.no_grad():
    for pid in tqdm(df["id"]):
        img_path = TRAIN_IMG_DIR / f"{pid}.jpeg"

        if not img_path.exists():
            continue   # skip missing images

        img = Image.open(img_path).convert("RGB")
        img = transform(img).unsqueeze(0).to(device)

        feat = model(img)                    
        feat = feat.view(-1).cpu().numpy()   

        features.append(feat)
        ids.append(pid)

In [None]:
if len(features) == 0:
    raise RuntimeError("No image embeddings were generated.")
features = np.vstack(features)   

df_img = pd.DataFrame(features)

df_img.insert(0, "id", ids)

In [None]:
df_img.to_csv(OUT_DIR / "image_embeddings_resnet50.csv", index=False)

In [None]:
embeddings = []
ids1 = []

with torch.no_grad():
    for pid in tqdm(df_test["id"]):
        img_path = TEST_IMG_DIR / f"{pid}.jpeg"

        if not img_path.exists():
            continue   # skip missing images

        img = Image.open(img_path).convert("RGB")
        img = transform(img).unsqueeze(0).to(device)

        feat = model(img)                    
        feat = feat.view(-1).cpu().numpy()   

        embeddings.append(feat)
        ids1.append(pid)

In [None]:
if len(embeddings) == 0:
    raise RuntimeError("No image embeddings were generated.")
embeddings_array = np.vstack(embeddings)   

df_test_img = pd.DataFrame(embeddings_array)

df_test_img.insert(0, "id", ids1)

In [None]:
df_test_img.to_csv(OUT_DIR / "test_image_embeddings_resnet50.csv", index=False)

# Image Data

In [None]:
df_img = pd.read_csv(OUT_DIR/"image_embeddings_resnet50.csv")

In [42]:
df_train = df.merge(df_img, on="id", how="inner")

In [54]:
df_train["zipcode_price_mean"] = df_train["zipcode"].map(zip_price_mean)
df_train["zipcode_price_per_sqft"] = df_train["zipcode"].map(zip_ppsqft_mean)
df_train["zipcode_price_mean"].fillna(df_train["price"].mean(), inplace=True)
df_train["zipcode_price_per_sqft"].fillna(df_train["zipcode_price_per_sqft"].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train["zipcode_price_mean"].fillna(df_train["price"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train["zipcode_price_per_sqft"].fillna(df_train["zipcode_price_per_sqft"].mean(), inplace=True)


In [66]:
df_train.to_csv(OUT_DIR/'df_train.csv', index = False)