# Load Data

In [1]:
from pathlib import Path
import sys

# Go up one folder from the current notebook directory
sys.path.append(str(Path().resolve().parent))

from src.data_access import load_housing_data

In [2]:
# Load the train and test frames through the project helper imported from src.data_access.
train, test = load_housing_data()

In [3]:
# Preview the first five rows of the training frame.
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Preview the first five rows of the test frame (it has no SalePrice column).
test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


# EDA

- 1460 instances, 81 columns: 80 input columns (including `Id`) plus the `SalePrice` target
- data types: mostly int64 and object (text, since loaded from csv), and some float64

In [5]:
# Print the non-null count and dtype of every training column.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [12]:
# Names of the training columns holding at least one missing value,
# listed in the frame's own column order.
cols_with_null = train.columns[train.isnull().any()].tolist()
cols_with_null

['LotFrontage',
 'Alley',
 'MasVnrType',
 'MasVnrArea',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

# Rare category threshold for collapsing levels (optional)
RARE_CAT_THRESHOLD = 0.01  # 1%


In [13]:
def quick_audit(df, target="SalePrice"):
    """Build a one-row-per-column summary of ``df`` for a first-pass audit.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    target : str, default "SalePrice"
        Column the numeric columns are correlated against. The correlation
        column is only added when ``target`` is a *numeric* column of ``df``;
        a missing or non-numeric target is skipped instead of raising.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of ``df`` and sorted by ``nunique``
        (descending), with the columns:

        * ``nunique``   - number of distinct non-null values
        * ``na_rate``   - share of missing entries
        * ``zero_rate`` - share of entries that compare equal to 0
        * ``corr_num``  - correlation with ``target`` (only when available;
          NaN for columns that are not numeric)
    """
    distinct = df.nunique().rename("nunique")
    na_rate = df.isna().mean().rename("na_rate")
    zero_rate = (df == 0).mean(numeric_only=True).rename("zero_rate")
    out = pd.concat([distinct, na_rate, zero_rate], axis=1)

    # Checking membership in the numeric subset (not in df as a whole) avoids
    # a KeyError when the target column exists but holds non-numeric data.
    numeric = df.select_dtypes("number")
    if target in numeric.columns:
        # Assignment aligns on the index, so non-numeric rows receive NaN.
        out["corr_num"] = numeric.corr()[target]

    return out.sort_values(["nunique"], ascending=False)

In [17]:
def plot_target_by_category(df, col, target='SalePrice'):
    """Show a bar chart of the mean of ``target`` for each level of ``col``.

    Every bar is annotated with ``n=<count>``, the number of non-null
    ``target`` values behind that mean, so that averages resting on only a
    handful of rows are easy to spot. Rows whose ``col`` value is missing are
    not plotted, because ``groupby`` drops NaN keys by default.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both ``col`` and ``target``.
    col : str
        Grouping column; one bar is drawn per distinct value.
    target : str, default 'SalePrice'
        Numeric column that is averaged within each group.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # Only the statistics that are actually drawn are aggregated.
    stats = df.groupby(col)[target].agg(target_mean='mean', count='count')

    # Explicit figure/axes instead of relying on pyplot's "current figure".
    fig, ax = plt.subplots(figsize=(6, 4))
    stats['target_mean'].plot(kind='bar', ax=ax)

    # Bars sit at integer x positions 0..k-1, in the same order as `stats`.
    for pos, (height, n) in enumerate(zip(stats['target_mean'], stats['count'])):
        ax.text(pos, height, f'n={n}', ha='center', va='bottom', fontsize=8, rotation=0)

    ax.set_title(f'{col} vs {target} (mean)')
    ax.set_ylabel(target)
    fig.tight_layout()
    plt.show()

In [18]:
# Columns whose lower-cased name contains one of the quality/condition keywords.
# NOTE(review): `train_eda` is not created by any cell visible above, and the
# recorded output of this cell is a NameError - confirm where it is defined.
qual_cond_cols = [
    name
    for name in train_eda.columns
    if any(keyword in name.lower() for keyword in ('qual', 'qc', 'cond', 'condition'))
]

NameError: name 'train_eda' is not defined

In [None]:
# Let's look at the remaining object columns: the audit rows whose label is
# in neither `qual_cond_cols` nor `nominal_cols`.
obj_audit.loc[
    ~(
        obj_audit.index.isin(qual_cond_cols)
        | obj_audit.index.isin(nominal_cols)
    )
]


In [None]:
import numpy as np

# Numeric columns with more than 20 distinct values (high-cardinality
# threshold) whose absolute skewness exceeds 1 (skewness threshold).
# NOTE(review): `df` is not assigned in any cell visible here, while the
# neighbouring cells work on `train_eda` - confirm which frame is intended.
high_card_skewed = [
    name
    for name, values in df.select_dtypes(include='number').items()
    if values.nunique() > 20 and abs(values.skew()) > 1
]

high_card_skewed


In [None]:
# Number of frame columns that appear in neither the nominal nor the ordinal list.
remaining = len(train_eda.columns) - train_eda.columns.isin(nominal_cols + ordinal_cols).sum()

# Consistency check: True only when the number of frame columns found in the
# two lists equals the combined length of the lists.
len(nominal_cols) + len(ordinal_cols) + remaining == len(train_eda.columns)


In [None]:
# Columns of `num_train_eda` with fewer than 50 distinct values, then a view of just those columns.
low_card_cols = num_train_eda.columns[num_train_eda.nunique() < 50]
num_train_eda[low_card_cols]


In [None]:
import pandas as pd

# Map each group label to the list of columns assigned to it.
categories = {
    label: list(members)
    for label, members in (
        ("Nominal", nominal_cols),
        ("Ordinal", ordinal_cols),
        ("Year", year_cols),
        ("Binary", binary_cols),
        ("Binary + Log", binary_and_log_cols),
        ("Log", log_cols),
        ("Remaining", remaining_cols),
    )
}

# One table column per group; wrapping each list in a Series lets groups of
# different lengths sit side by side (shorter ones are padded with NaN).
cat_df = pd.DataFrame({label: pd.Series(members) for label, members in categories.items()})

# Append the size of every group as a final "Count" row.
counts = dict(zip(categories, map(len, categories.values())))
cat_df.loc["Count"] = counts

cat_df


In [None]:
def value_counts_summary(df, cols):
    """Summarise the most frequent value of each column listed in ``cols``.

    Missing values are counted as a category of their own
    (``value_counts(dropna=False)``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to summarise.
    cols : iterable
        Labels of the columns of ``df`` to include.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``cols``, sorted by ``top_category_pct``
        (descending), with the columns ``column``, ``n_categories``,
        ``top_category`` and ``top_category_pct`` (share of rows taken by the
        top category, rounded to 3 decimals). An empty ``cols`` gives an empty
        frame with these four columns; a column without any rows is reported
        with ``n_categories`` 0, no top category and a NaN share.
    """
    fields = ["column", "n_categories", "top_category", "top_category_pct"]
    records = []
    for col in cols:
        shares = df[col].value_counts(dropna=False, normalize=True)
        # value_counts sorts by frequency, so position 0 is the top category;
        # a zero-row column has no position 0 at all.
        has_rows = not shares.empty
        records.append({
            "column": col,
            "n_categories": shares.shape[0],
            "top_category": shares.index[0] if has_rows else None,
            "top_category_pct": round(shares.iloc[0], 3) if has_rows else float("nan"),
        })
    # Passing `columns=` keeps the sort key present even when `records` is empty.
    return pd.DataFrame(records, columns=fields).sort_values("top_category_pct", ascending=False)

# Summarise the nominal group: the frame goes first, then the list of columns.
nominal_summary = value_counts_summary(df=train_eda, cols=categories["Nominal"])
display(nominal_summary)
