In [1]:
from sklearn.datasets import fetch_openml
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error

In [2]:
# Fetch the Boston housing data from OpenML (dataset name 'boston', version 1)
# with pandas-backed results; `.frame` is the DataFrame used for the rest of
# the notebook (its columns include the MEDV target).
boston = fetch_openml(
    name='boston',
    version=1,
    as_frame=True,
)
df = boston.frame

# Bare last expression: rich-display the frame as the cell output.
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48,22.0


In [3]:
# Coerce every column to a numeric dtype; values that cannot be parsed become
# NaN (errors='coerce'), and any row containing a NaN is then discarded.
df = (
    df
    .apply(pd.to_numeric, errors='coerce')
    .dropna()
)

In [7]:
# Target: the MEDV column (home prices).
y = df["MEDV"]

# Features: every remaining column.
X = df.drop(columns="MEDV")

In [9]:
# Degree-2 polynomial expansion of the single RM feature.
# The one-column selection is passed straight to the transformer, so X_poly
# is only ever bound to the transformed output (constant term, RM, RM^2).
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X[['RM']])

In [12]:
# Wrap the transformed values in a DataFrame labelled with the transformer's
# generated feature names, then display it as the cell output.
X_poly_df = pd.DataFrame(
    data=X_poly,
    columns=poly.get_feature_names_out(input_features=['RM']),
)
X_poly_df

Unnamed: 0,1,RM,RM^2
0,1.0,6.575,43.230625
1,1.0,6.421,41.229241
2,1.0,7.185,51.624225
3,1.0,6.998,48.972004
4,1.0,7.147,51.079609
...,...,...,...
501,1.0,6.593,43.467649
502,1.0,6.120,37.454400
503,1.0,6.976,48.664576
504,1.0,6.794,46.158436


In [16]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
    X_poly,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a linear regression on the polynomial features
# (fit() returns the estimator, so construction and fitting are chained).
poly_reg = LinearRegression().fit(X_train_p, y_train_p)

# Predict on the held-out rows and score with mean squared error.
y_pred_p = poly_reg.predict(X_test_p)
mse_p = mean_squared_error(y_test_p, y_pred_p)
print("Polynomial Regression MSE:", mse_p)

# Report the fitted parameters.
print("Coefficients:", poly_reg.coef_)
print("Intercept:", poly_reg.intercept_)

Polynomial Regression MSE: 35.369773737317885
Coefficients: [  0.         -19.19134842   2.19434514]
Intercept: 55.37179190740248


In [17]:
# Degree-2 polynomial expansion over two features: RM and LSTAT.
# NOTE: this cell rebinds `poly` and `X_poly_df`, which the single-feature
# cells earlier in the notebook also assign.
X_sub = X[['RM', 'LSTAT']]
poly = PolynomialFeatures(degree=2)
X_poly_array = poly.fit_transform(X_sub)

# Label the expanded columns with the transformer's generated names.
X_poly_df = pd.DataFrame(
    data=X_poly_array,
    columns=poly.get_feature_names_out(input_features=['RM', 'LSTAT']),
)

# Preview only the first rows.
X_poly_df.head()

Unnamed: 0,1,RM,LSTAT,RM^2,RM LSTAT,LSTAT^2
0,1.0,6.575,4.98,43.230625,32.7435,24.8004
1,1.0,6.421,9.14,41.229241,58.68794,83.5396
2,1.0,7.185,4.03,51.624225,28.95555,16.2409
3,1.0,6.998,2.94,48.972004,20.57412,8.6436
4,1.0,7.147,5.33,51.079609,38.09351,28.4089
