In [11]:
import pandas as pd
import numpy as np
from scipy import stats

In [3]:
# Load the cleaned training data from a path relative to the notebook's working directory.
df = pd.read_csv('../data/processed/train_cleaned.csv')

In [4]:
# Inspect column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 75 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Id                  1460 non-null   int64  
 1   MSSubClass          1460 non-null   int64  
 2   MSZoning            1460 non-null   object 
 3   Street              1460 non-null   object 
 4   LotShape            1460 non-null   object 
 5   LandContour         1460 non-null   object 
 6   Utilities           1460 non-null   object 
 7   LotConfig           1460 non-null   object 
 8   LandSlope           1460 non-null   object 
 9   Neighborhood        1460 non-null   object 
 10  Condition1          1460 non-null   object 
 11  Condition2          1460 non-null   object 
 12  BldgType            1460 non-null   object 
 13  HouseStyle          1460 non-null   object 
 14  OverallQual         1460 non-null   int64  
 15  OverallCond         1460 non-null   int64  
 16  YearBu

In [12]:
# Separate the regression target from the predictor columns.
y = df['SalePrice']
X = df.drop(columns=['SalePrice'])

In [13]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition reproducible across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Dimensions of the training predictors (rows, columns).
X_train.shape

(1168, 74)

In [15]:
# Length of the training target; should match the row count of X_train.
y_train.shape

(1168,)

In [16]:
# Dimensions of the held-out predictors (rows, columns).
X_test.shape

(292, 74)

In [17]:
# Length of the held-out target; should match the row count of X_test.
y_test.shape

(292,)

In [18]:
# Partition the predictor columns by dtype: numeric columns vs. object (string) columns.
# NOTE(review): the numeric list displayed below includes 'Id', which looks like a
# row identifier rather than a real feature; confirm whether it should be dropped.
numerical_features = list(X_train.select_dtypes(include=[np.number]).columns)
categorical_features = list(X_train.select_dtypes(include=['object']).columns)

In [19]:
# Display the numeric column names selected from X_train.
numerical_features

['Id',
 'MSSubClass',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'MiscVal',
 'MoSold',
 'YrSold',
 'GarageYrBlt_NoNull',
 'MasVnrArea_NoNull',
 'LotFrontage_capped',
 'LotArea_capped',
 'TotalBsmtSF_capped',
 'GrLivArea_capped',
 'GarageArea_capped',
 '1stFlrSF_capped']

In [20]:
# Display the object-dtype column names selected from X_train.
categorical_features

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'SaleType',
 'SaleCondition',
 'GarageType_NoNull']

In [None]:

def anova_feature_scores(df, categorical_features, target):
    """Score categorical features against a numeric target with one-way ANOVA.

    For each column in ``categorical_features`` the rows of ``df`` are grouped
    by that column's value and ``scipy.stats.f_oneway`` is run on the
    per-group ``target`` values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the categorical columns and the target column.
    categorical_features : iterable of str
        Names of the columns to score.
    target : str
        Name of the target column in ``df``.

    Returns
    -------
    pandas.DataFrame
        Indexed by feature name, with float columns ``f_stat`` and ``pval``,
        sorted by ``f_stat`` in descending order. A feature that is not scored
        (fewer than two groups, or any group with fewer than two rows) gets
        NaN in both columns and therefore sorts last.
    """
    result_columns = ["f_stat", "pval"]
    scores = {}
    for col in categorical_features:
        groups = [group[target].values for _, group in df.groupby(col)]
        # Skip features with fewer than 2 groups or with any single-row group.
        if len(groups) < 2 or any(len(g) < 2 for g in groups):
            # Store the same (f_stat, pval) shape as scored features so the
            # result frame always has both columns, even if nothing is scored.
            scores[col] = (np.nan, np.nan)
            continue
        f_stat, pval = stats.f_oneway(*groups)
        scores[col] = (float(f_stat), float(pval))
    # Explicit columns keep the schema stable for empty or all-skipped input.
    result = pd.DataFrame.from_dict(
        scores, orient="index", columns=result_columns, dtype=float
    )
    return result.sort_values("f_stat", ascending=False)

# Usage: name the target column explicitly, and score on the training rows only
# so the held-out rows do not influence feature ranking.
target = 'SalePrice'
train_df = df.loc[X_train.index]  # train_test_split keeps df's row labels on X_train
anova_scores = anova_feature_scores(train_df, categorical_features, target)
anova_scores
