In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Lasso
import numpy as np

from sklearn.datasets import fetch_openml
from sklearn.utils import shuffle

In [None]:
# # Load the Ames Housing dataset
# url = '/....../....../AmesHousing.csv'

# data = pd.read_csv(url)

# # Separate features and target variable
# X = data.drop("SalePrice", axis=1)
# y = data["SalePrice"]

In [11]:

def load_ames_housing():
    df = fetch_openml(name="house_prices", as_frame=True)
    X = df.data
    y = df.target

    # features = [
    #     "YrSold",
    #     "HeatingQC",
    #     "Street",
    #     "YearRemodAdd",
    #     "Heating",
    #     "MasVnrType",
    #     "BsmtUnfSF",
    #     "Foundation",
    #     "MasVnrArea",
    #     "MSSubClass",
    #     "ExterQual",
    #     "Condition2",
    #     "GarageCars",
    #     "GarageType",
    #     "OverallQual",
    #     "TotalBsmtSF",
    #     "BsmtFinSF1",
    #     "HouseStyle",
    #     "MiscFeature",
    #     "MoSold",
    # ]

    # X = X.loc[:, features]
    X, y = shuffle(X, y, random_state=0)

    # X = X.iloc[:600]
    # y = y.iloc[:600]
    return X, np.log(y)


X, y = load_ames_housing()

In [12]:


# Identify numerical and categorical columns
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include=['object']).columns

# Preprocessing for numerical data: impute missing values and scale
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data: impute missing values and one-hot encode
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
    ])

In [18]:
# Apply preprocessing
X_processed = preprocessor.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Initialize Lasso regression with a regularization parameter
lasso = Lasso(alpha=0.005)  # You can adjust alpha to control sparsity

# Fit the model
lasso.fit(X_train, y_train)

# Get the coefficients
lasso_coefficients = lasso.coef_

# Get the feature names
feature_names = preprocessor.get_feature_names_out()

# Create a DataFrame to view the coefficients
coeff_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': lasso_coefficients
})

In [20]:
coeff_df

Unnamed: 0,Feature,Coefficient
0,num__Id,-0.000000
1,num__MSSubClass,-0.008983
2,num__LotFrontage,0.008050
3,num__LotArea,0.017173
4,num__OverallQual,0.105016
...,...,...
284,cat__SaleCondition_AdjLand,0.000000
285,cat__SaleCondition_Alloca,-0.000000
286,cat__SaleCondition_Family,-0.000000
287,cat__SaleCondition_Normal,0.000000


In [24]:
coeff_df[coeff_df['Coefficient'] != 0].sort_values(by='Coefficient', ascending=False).tail()

Unnamed: 0,Feature,Coefficient
36,num__YrSold,-0.002234
194,cat__BsmtExposure_No,-0.003874
22,num__KitchenAbvGr,-0.008153
1,num__MSSubClass,-0.008983
41,cat__MSZoning_RM,-0.03258


In [22]:
# Filter out the non-zero coefficients
significant_coefficients = coeff_df[coeff_df['Coefficient'] != 0].sort_values(by='Coefficient', ascending=False)



# Display the significant coefficients
print(significant_coefficients)

                   Feature  Coefficient
16          num__GrLivArea     0.115432
4         num__OverallQual     0.105016
6           num__YearBuilt     0.069690
5         num__OverallCond     0.050778
12        num__TotalBsmtSF     0.042625
26         num__GarageCars     0.033157
24         num__Fireplaces     0.026620
9          num__BsmtFinSF1     0.025342
7        num__YearRemodAdd     0.023665
213      cat__HeatingQC_Ex     0.020183
3             num__LotArea     0.017173
27         num__GarageArea     0.017007
17       num__BsmtFullBath     0.016763
179  cat__Foundation_PConc     0.016569
32        num__ScreenPorch     0.012545
40        cat__MSZoning_RL     0.011399
28         num__WoodDeckSF     0.010354
91    cat__Condition1_Norm     0.009644
2         num__LotFrontage     0.008050
23       num__TotRmsAbvGrd     0.008050
20           num__HalfBath     0.007113
13           num__1stFlrSF     0.006665
19           num__FullBath     0.004616
36             num__YrSold    -0.002234
