In [2]:
import pandas as pd
import numpy as np
from scipy import stats

In [3]:
df = pd.read_csv('../data/processed/train_cleaned.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 75 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Id                  1460 non-null   int64  
 1   MSSubClass          1460 non-null   int64  
 2   MSZoning            1460 non-null   object 
 3   Street              1460 non-null   object 
 4   LotShape            1460 non-null   object 
 5   LandContour         1460 non-null   object 
 6   Utilities           1460 non-null   object 
 7   LotConfig           1460 non-null   object 
 8   LandSlope           1460 non-null   object 
 9   Neighborhood        1460 non-null   object 
 10  Condition1          1460 non-null   object 
 11  Condition2          1460 non-null   object 
 12  BldgType            1460 non-null   object 
 13  HouseStyle          1460 non-null   object 
 14  OverallQual         1460 non-null   int64  
 15  OverallCond         1460 non-null   int64  
 16  YearBu

# Train-Test split

In [5]:
X = df.drop(columns=['SalePrice'])
y = df['SalePrice']

In [6]:
# train-test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [7]:
X_train.shape

(1168, 74)

In [8]:
y_train.shape

(1168,)

In [9]:
X_test.shape

(292, 74)

In [10]:
y_test.shape

(292,)

In [11]:
numerical_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

In [12]:
numerical_features

['Id',
 'MSSubClass',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'MiscVal',
 'MoSold',
 'YrSold',
 'GarageYrBlt_NoNull',
 'MasVnrArea_NoNull',
 'LotFrontage_capped',
 'LotArea_capped',
 'TotalBsmtSF_capped',
 'GrLivArea_capped',
 'GarageArea_capped',
 '1stFlrSF_capped']

In [13]:
categorical_features

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'SaleType',
 'SaleCondition',
 'GarageType_NoNull']

# ANOVA test

In [14]:
import pandas as pd
from scipy.stats import f_oneway
from sklearn.preprocessing import LabelEncoder

def anova_categorical_features(X_train, y_train, categorical_features):
    """
    Perform ANOVA on categorical features only
    """
    results = {}
    
    for feature in categorical_features:
        # Encode categorical feature to numerical
        le = LabelEncoder()
        encoded_feature = le.fit_transform(X_train[feature].astype(str))
        
        # Group encoded values by target classes
        groups = []
        unique_classes = pd.unique(y_train)
        
        for class_val in unique_classes:
            mask = y_train == class_val
            group_data = encoded_feature[mask]
            groups.append(group_data)
        
        # Perform ANOVA
        f_stat, p_value = f_oneway(*groups)
        results[feature] = {'F-statistic': f_stat, 'p-value': p_value}
        
        print(f"{feature}: F={f_stat:.4f}, p={p_value:.4f}")
    
    return results

# Usage:


In [15]:
results = anova_categorical_features(X_train, y_train, categorical_features)

MSZoning: F=1.7452, p=0.0000
Street: F=1.5896, p=0.0000
LotShape: F=1.3772, p=0.0001
LandContour: F=1.1403, p=0.0564
Utilities: F=0.5223, p=1.0000
LotConfig: F=0.9271, p=0.8194
LandSlope: F=1.3218, p=0.0004
Neighborhood: F=1.4119, p=0.0000
Condition1: F=0.6955, p=1.0000
Condition2: F=0.7471, p=0.9998
BldgType: F=1.0629, p=0.2305
HouseStyle: F=1.1890, p=0.0183
RoofStyle: F=1.1323, p=0.0668
RoofMatl: F=0.9057, p=0.8838
Exterior1st: F=0.9602, p=0.6878
Exterior2nd: F=1.0442, p=0.3005
ExterQual: F=3.2048, p=0.0000
ExterCond: F=0.9551, p=0.7101
Foundation: F=1.3672, p=0.0001
BsmtQual: F=2.1929, p=0.0000
BsmtCond: F=1.0872, p=0.1564
BsmtExposure: F=1.4402, p=0.0000
BsmtFinType1: F=2.1929, p=0.0000
BsmtFinType2: F=0.7921, p=0.9975
Heating: F=1.4108, p=0.0000
HeatingQC: F=1.3030, p=0.0007
CentralAir: F=1.5494, p=0.0000
Electrical: F=1.0447, p=0.2985
KitchenQual: F=2.8033, p=0.0000
Functional: F=0.7851, p=0.9982
FireplaceQu: F=1.6510, p=0.0000
GarageFinish: F=1.6866, p=0.0000
GarageQual: F=1.078