In [93]:
import pandas as pd
import numpy as np
from scipy import stats

In [94]:
df = pd.read_csv('../data/interim/train_cleaned.csv')

In [95]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 75 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Id                  1460 non-null   int64  
 1   MSSubClass          1460 non-null   int64  
 2   MSZoning            1460 non-null   object 
 3   Street              1460 non-null   object 
 4   LotShape            1460 non-null   object 
 5   LandContour         1460 non-null   object 
 6   Utilities           1460 non-null   object 
 7   LotConfig           1460 non-null   object 
 8   LandSlope           1460 non-null   object 
 9   Neighborhood        1460 non-null   object 
 10  Condition1          1460 non-null   object 
 11  Condition2          1460 non-null   object 
 12  BldgType            1460 non-null   object 
 13  HouseStyle          1460 non-null   object 
 14  OverallQual         1460 non-null   int64  
 15  OverallCond         1460 non-null   int64  
 16  YearBu

# Train-Test split

In [96]:
X = df.drop(columns=['SalePrice'])
y = df['SalePrice']

In [97]:
# train-test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [98]:
X_train.shape

(1168, 74)

In [99]:
y_train.shape

(1168,)

In [100]:
X_test.shape

(292, 74)

In [101]:
y_test.shape

(292,)

In [102]:
numerical_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

In [103]:
numerical_features

['Id',
 'MSSubClass',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'MiscVal',
 'MoSold',
 'YrSold',
 'GarageYrBlt_NoNull',
 'MasVnrArea_NoNull',
 'LotFrontage_capped',
 'LotArea_capped',
 'TotalBsmtSF_capped',
 'GrLivArea_capped',
 'GarageArea_capped',
 '1stFlrSF_capped']

In [104]:
categorical_features

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'SaleType',
 'SaleCondition',
 'GarageType_NoNull']

# ANOVA test

In [105]:
import pandas as pd
from scipy.stats import f_oneway
from sklearn.preprocessing import LabelEncoder

def anova_categorical_features(X_train, y_train, categorical_features):
    """
    Perform ANOVA on categorical features only
    """
    results = {}
    
    for feature in categorical_features:
        # Encode categorical feature to numerical
        le = LabelEncoder()
        encoded_feature = le.fit_transform(X_train[feature].astype(str))
        
        # Group encoded values by target classes
        groups = []
        unique_classes = pd.unique(y_train)
        
        for class_val in unique_classes:
            mask = y_train == class_val
            group_data = encoded_feature[mask]
            groups.append(group_data)
        
        # Perform ANOVA
        f_stat, p_value = f_oneway(*groups)
        results[feature] = {'F-statistic': f_stat, 'p-value': p_value}
        
        print(f"{feature}: F={f_stat:.4f}, p={p_value:.4f}")
    
    return results

# Usage:


In [106]:
results = anova_categorical_features(X_train, y_train, categorical_features)

MSZoning: F=1.7452, p=0.0000
Street: F=1.5896, p=0.0000
LotShape: F=1.3772, p=0.0001
LandContour: F=1.1403, p=0.0564
Utilities: F=0.5223, p=1.0000
LotConfig: F=0.9271, p=0.8194
LandSlope: F=1.3218, p=0.0004
Neighborhood: F=1.4119, p=0.0000
Condition1: F=0.6955, p=1.0000
Condition2: F=0.7471, p=0.9998
BldgType: F=1.0629, p=0.2305
HouseStyle: F=1.1890, p=0.0183
RoofStyle: F=1.1323, p=0.0668
RoofMatl: F=0.9057, p=0.8838
Exterior1st: F=0.9602, p=0.6878
Exterior2nd: F=1.0442, p=0.3005
ExterQual: F=3.2048, p=0.0000
ExterCond: F=0.9551, p=0.7101
Foundation: F=1.3672, p=0.0001
BsmtQual: F=2.1929, p=0.0000
BsmtCond: F=1.0872, p=0.1564
BsmtExposure: F=1.4402, p=0.0000
BsmtFinType1: F=2.1929, p=0.0000
BsmtFinType2: F=0.7921, p=0.9975
Heating: F=1.4108, p=0.0000
HeatingQC: F=1.3030, p=0.0007
CentralAir: F=1.5494, p=0.0000
Electrical: F=1.0447, p=0.2985
KitchenQual: F=2.8033, p=0.0000
Functional: F=0.7851, p=0.9982
FireplaceQu: F=1.6510, p=0.0000
GarageFinish: F=1.6866, p=0.0000
GarageQual: F=1.078

In [107]:
# Filter features with p-value < 0.05
significant_features_cat = [feature for feature, res in results.items() if res['p-value'] < 0.05]

In [108]:
significant_features_cat

['MSZoning',
 'Street',
 'LotShape',
 'LandSlope',
 'Neighborhood',
 'HouseStyle',
 'ExterQual',
 'Foundation',
 'BsmtQual',
 'BsmtExposure',
 'BsmtFinType1',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'KitchenQual',
 'FireplaceQu',
 'GarageFinish',
 'GarageType_NoNull']

In [109]:
X_train.shape

(1168, 74)

In [110]:
# drop non-significant features from X_train and X_test
X_train = X_train[significant_features_cat + numerical_features]
X_test = X_test[significant_features_cat + numerical_features]

In [111]:
X_train.shape

(1168, 54)

In [112]:
X_train.head()

Unnamed: 0,MSZoning,Street,LotShape,LandSlope,Neighborhood,HouseStyle,ExterQual,Foundation,BsmtQual,BsmtExposure,...,MoSold,YrSold,GarageYrBlt_NoNull,MasVnrArea_NoNull,LotFrontage_capped,LotArea_capped,TotalBsmtSF_capped,GrLivArea_capped,GarageArea_capped,1stFlrSF_capped
254,RL,Pave,Reg,Gtl,NAmes,1Story,TA,CBlock,TA,No,...,6,2010,1957.0,0.0,70.0,8400.0,1314.0,1314.0,294.0,1314.0
1066,RL,Pave,IR1,Gtl,Gilbert,2Story,Gd,PConc,Gd,No,...,5,2009,1993.0,0.0,59.0,7837.0,799.0,1571.0,380.0,799.0
638,RL,Pave,Reg,Gtl,Edwards,1Story,TA,CBlock,Fa,No,...,5,2008,2005.0,0.0,67.0,8777.0,796.0,796.0,0.0,796.0
799,RL,Pave,Reg,Gtl,SWISU,1.5Fin,TA,BrkTil,Gd,No,...,6,2007,1939.0,252.0,60.0,7200.0,731.0,1768.0,240.0,981.0
380,RL,Pave,Reg,Gtl,SWISU,1.5Fin,TA,BrkTil,TA,No,...,5,2010,1924.0,0.0,50.0,5000.0,1026.0,1691.0,308.0,1026.0


# pearsonr, spearmanr

In [113]:
numerical_features

['Id',
 'MSSubClass',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'MiscVal',
 'MoSold',
 'YrSold',
 'GarageYrBlt_NoNull',
 'MasVnrArea_NoNull',
 'LotFrontage_capped',
 'LotArea_capped',
 'TotalBsmtSF_capped',
 'GrLivArea_capped',
 'GarageArea_capped',
 '1stFlrSF_capped']

In [114]:
y

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64

In [115]:
from scipy.stats import pearsonr, spearmanr
import pandas as pd

results = []

for col in numerical_features:
    # Drop NaN safely for this column + target
    valid_data = X_train[[col]].join(y).dropna()
    x = valid_data[col]
    target = valid_data[y.name] # Ensure target is a Series

    # Pearson correlation (linear)
    pearson_corr, pearson_p = pearsonr(x, target)

    # Spearman correlation (rank-based, for non-linear monotonic)
    spearman_corr, spearman_p = spearmanr(x, target)

    results.append({
        'Feature': col,
        'Pearson Correlation': pearson_corr,
        'Pearson P-value': pearson_p,
        'Spearman Correlation': spearman_corr,
        'Spearman P-value': spearman_p
    })

# Convert to DataFrame
corr_df = pd.DataFrame(results)

# Add flags for significance
corr_df['Pearson Significant (p<0.05)'] = corr_df['Pearson P-value'] < 0.05
corr_df['Spearman Significant (p<0.05)'] = corr_df['Spearman P-value'] < 0.05

# Sort by absolute correlation strength
corr_df = corr_df.sort_values(by='Pearson Correlation', key=lambda x: abs(x), ascending=False)

corr_df


Unnamed: 0,Feature,Pearson Correlation,Pearson P-value,Spearman Correlation,Spearman P-value,Pearson Significant (p<0.05),Spearman Significant (p<0.05)
2,OverallQual,0.785555,2.5961399999999997e-245,0.801016,4.393571e-262,True,True
33,GrLivArea_capped,0.705882,7.824090000000001e-177,0.723462,7.481305e-190,True,True
19,GarageCars,0.640991,3.931616e-136,0.686763,9.94478e-164,True,True
32,TotalBsmtSF_capped,0.629145,9.721332e-130,0.595039,8.524088e-113,True,True
34,GarageArea_capped,0.628319,2.6492470000000002e-129,0.638711,7.03142e-135,True,True
35,1stFlrSF_capped,0.606657,2.454636e-118,0.566285,5.183359e-100,True,True
13,FullBath,0.552546,2.457699e-94,0.625567,7.325603000000001e-128,True,True
17,TotRmsAbvGrd,0.520388,4.890384e-82,0.520834,3.3715730000000003e-82,True,True
4,YearBuilt,0.516501,1.225755e-80,0.643216,2.302134e-137,True,True
5,YearRemodAdd,0.508593,7.567350999999999e-78,0.562948,1.311313e-98,True,True


In [116]:
# Optional: show only significant features
significant_features_num = corr_df[corr_df['Pearson P-value'] < 0.05]['Feature'].tolist()
print("\n✅ Significant numerical features correlated with target:", significant_features_num)


✅ Significant numerical features correlated with target: ['OverallQual', 'GrLivArea_capped', 'GarageCars', 'TotalBsmtSF_capped', 'GarageArea_capped', '1stFlrSF_capped', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea_NoNull', 'Fireplaces', 'LotArea_capped', 'GarageYrBlt_NoNull', 'LotFrontage_capped', 'BsmtFinSF1', 'WoodDeckSF', '2ndFlrSF', 'OpenPorchSF', 'HalfBath', 'BsmtFullBath', 'BsmtUnfSF', 'BedroomAbvGr', 'EnclosedPorch', 'KitchenAbvGr', 'ScreenPorch', 'MSSubClass', 'OverallCond']


In [117]:
X_train.shape

(1168, 54)

In [118]:
# drop non-significant features from X_train and X_test
X_train = X_train[significant_features_num + significant_features_cat]
X_test = X_test[significant_features_num + significant_features_cat]

In [119]:
X_train.shape

(1168, 46)

In [120]:
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns

In [121]:
numerical_features.shape

(28,)

In [122]:
from sklearn.ensemble import RandomForestRegressor

# dropping categorical feature from training for feature importance analysis
X_train_num = X_train[numerical_features]

# Train model with both features
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_num, y_train)

# Get feature importances
importances = pd.DataFrame({
    'feature': X_train_num.columns,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

print(importances)

               feature  importance
0          OverallQual    0.563385
1     GrLivArea_capped    0.123067
3   TotalBsmtSF_capped    0.037274
17            2ndFlrSF    0.037130
15          BsmtFinSF1    0.035071
5      1stFlrSF_capped    0.028981
12      LotArea_capped    0.021456
4    GarageArea_capped    0.019566
8            YearBuilt    0.018723
14  LotFrontage_capped    0.015906
2           GarageCars    0.014178
18         OpenPorchSF    0.010663
9         YearRemodAdd    0.010593
7         TotRmsAbvGrd    0.008067
13  GarageYrBlt_NoNull    0.007541
21           BsmtUnfSF    0.006938
16          WoodDeckSF    0.006308
6             FullBath    0.006095
27         OverallCond    0.005690
10   MasVnrArea_NoNull    0.005257
11          Fireplaces    0.003856
26          MSSubClass    0.003795
25         ScreenPorch    0.003612
22        BedroomAbvGr    0.002063
19            HalfBath    0.001376
23       EnclosedPorch    0.001281
24        KitchenAbvGr    0.001067
20        BsmtFullBa

In [123]:
# Calculate cumulative importance
importances['cumulative'] = importances['importance'].cumsum()

# Select features that together explain up to 95% of total importance
threshold = 0.95
keep_features = importances[importances['cumulative'] <= threshold]['feature'].tolist()

# Include the next feature that pushes cumulative importance over 95%
if importances.loc[len(keep_features), 'cumulative'] <= 1.0:
    keep_features = importances.loc[:len(keep_features), 'feature'].tolist()

# Create reduced training set
X_train_reduced = X_train_num[keep_features]

In [124]:
#print which features are kept or dropped
print(f"Selected {len(keep_features)} features out of {len(importances)}")
print("Kept features:\n", keep_features)
print("\nDropped features:\n", [f for f in importances['feature'] if f not in keep_features])

Selected 10 features out of 28
Kept features:
 ['OverallQual', 'GrLivArea_capped', 'TotalBsmtSF_capped', '2ndFlrSF', 'BsmtFinSF1', '1stFlrSF_capped', 'LotArea_capped', 'GarageArea_capped', 'YearBuilt', 'LotFrontage_capped']

Dropped features:
 ['GarageCars', 'OpenPorchSF', 'YearRemodAdd', 'TotRmsAbvGrd', 'GarageYrBlt_NoNull', 'BsmtUnfSF', 'WoodDeckSF', 'FullBath', 'OverallCond', 'MasVnrArea_NoNull', 'Fireplaces', 'MSSubClass', 'ScreenPorch', 'BedroomAbvGr', 'HalfBath', 'EnclosedPorch', 'KitchenAbvGr', 'BsmtFullBath']


In [125]:
# reducing test set accordingly
X_test_num = X_test[numerical_features]
X_test_reduced = X_test_num[keep_features]

In [126]:
X_train.shape , X_train_reduced.shape, X_test.shape , X_test_reduced.shape

((1168, 46), (1168, 10), (292, 46), (292, 10))

In [127]:
# dropping numerical features from X_train and X_test
X_train = X_train.drop(columns=numerical_features)
X_test = X_test.drop(columns=numerical_features)

In [128]:
# concatenate reduced numerical features back to X_train and X_test
X_train = pd.concat([X_train, X_train_reduced], axis=1)
X_test = pd.concat([X_test, X_test_reduced], axis=1)

In [129]:
X_train.shape , X_train_reduced.shape, X_test.shape , X_test_reduced.shape

((1168, 28), (1168, 10), (292, 28), (292, 10))

In [132]:
X_train.to_csv("../data/interim/train_selected.csv", index=False)
X_test.to_csv("../data/interim/test_selected.csv", index=False)

In [133]:
y_train.to_csv("../data/interim/y_train.csv", index=False)
y_test.to_csv("../data/interim/y_test.csv", index=False)