In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

In [2]:
lr = LinearRegression()


ames = pd.read_csv("/Users/ben/Documents/GitHub/DSML/Data/AmesHousing.csv")
X = ames[["Gr Liv Area", "TotRms AbvGrd"]]
y = ames["SalePrice"]



X_train, X_test, y_train, y_test = train_test_split(X, y)

X_train_s = (X_train - X_train.mean())/X_train.std()

lr_fitted = lr.fit(X_train_s, y_train)
lr_fitted.coef_

array([ 70400.38817805, -17195.91035361])

In [3]:
X_test_s = (X_test - X_train.mean())/X_train.std()
y_preds = lr_fitted.predict(X_test_s)

r2_score(y_test, y_preds)

0.48248792220574144

In [5]:
ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)


lr_pipeline = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

lr_pipeline

ct.fit_transform(X_train)

Unnamed: 0,dummify__Bldg Type_1Fam,dummify__Bldg Type_2fmCon,dummify__Bldg Type_Duplex,dummify__Bldg Type_Twnhs,dummify__Bldg Type_TwnhsE,standardize__Gr Liv Area,standardize__TotRms AbvGrd
2380,1.0,0.0,0.0,0.0,0.0,2.150127,1.606173
2641,1.0,0.0,0.0,0.0,0.0,-0.146929,0.342290
310,1.0,0.0,0.0,0.0,0.0,-0.428161,-0.289652
926,1.0,0.0,0.0,0.0,0.0,-0.768393,-0.289652
2055,1.0,0.0,0.0,0.0,0.0,-1.197124,-0.921593
...,...,...,...,...,...,...,...
836,1.0,0.0,0.0,0.0,0.0,-1.008325,-0.921593
2131,1.0,0.0,0.0,0.0,0.0,0.667267,0.342290
856,1.0,0.0,0.0,0.0,0.0,-1.256124,-0.289652
710,1.0,0.0,0.0,0.0,0.0,-1.212857,-1.553535


In [6]:
X = ames.drop("SalePrice", axis = 1)
y = ames["SalePrice"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

lr_fitted = lr_pipeline.fit(X_train, y_train)

ct_fitted = ct.fit(X_train)

ct_fitted.transform(X_train)

Unnamed: 0,dummify__Bldg Type_1Fam,dummify__Bldg Type_2fmCon,dummify__Bldg Type_Duplex,dummify__Bldg Type_Twnhs,dummify__Bldg Type_TwnhsE,standardize__Gr Liv Area,standardize__TotRms AbvGrd
2172,1.0,0.0,0.0,0.0,0.0,-0.633076,-0.923745
1968,1.0,0.0,0.0,0.0,0.0,-1.128183,0.349846
859,1.0,0.0,0.0,0.0,0.0,-1.188609,-0.286949
2780,1.0,0.0,0.0,0.0,0.0,0.072550,-0.286949
1880,1.0,0.0,0.0,0.0,0.0,-0.159410,-0.286949
...,...,...,...,...,...,...,...
1767,1.0,0.0,0.0,0.0,0.0,5.481692,2.260232
2919,0.0,0.0,0.0,1.0,0.0,-0.802659,-0.923745
2746,1.0,0.0,0.0,0.0,0.0,-0.401116,-0.923745
98,0.0,0.0,0.0,1.0,0.0,-0.521969,-1.560540


In [7]:
ct_inter = ColumnTransformer(
  [
    ("interaction", PolynomialFeatures(interaction_only = True), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
).set_output(transform = "pandas")

ct_inter.fit_transform(X_train)

Unnamed: 0,interaction__1,interaction__Gr Liv Area,interaction__TotRms AbvGrd,interaction__Gr Liv Area TotRms AbvGrd
2172,1.0,1179.0,5.0,5895.0
1968,1.0,925.0,7.0,6475.0
859,1.0,894.0,6.0,5364.0
2780,1.0,1541.0,6.0,9246.0
1880,1.0,1422.0,6.0,8532.0
...,...,...,...,...
1767,1.0,4316.0,10.0,43160.0
2919,1.0,1092.0,5.0,5460.0
2746,1.0,1298.0,5.0,6490.0
98,1.0,1236.0,4.0,4944.0


In [8]:
ct_dummies = ColumnTransformer(
  [("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"])],
  remainder = "passthrough"
).set_output(transform = "pandas")

ct_inter = ColumnTransformer(
  [
    ("interaction", PolynomialFeatures(interaction_only = True), ["remainder__TotRms AbvGrd", "dummify__Bldg Type_1Fam"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")

X_train_dummified = ct_dummies.fit_transform(X_train)
X_train_dummified

Unnamed: 0,dummify__Bldg Type_1Fam,dummify__Bldg Type_2fmCon,dummify__Bldg Type_Duplex,dummify__Bldg Type_Twnhs,dummify__Bldg Type_TwnhsE,remainder__Order,remainder__PID,remainder__MS SubClass,remainder__MS Zoning,remainder__Lot Frontage,...,remainder__Screen Porch,remainder__Pool Area,remainder__Pool QC,remainder__Fence,remainder__Misc Feature,remainder__Misc Val,remainder__Mo Sold,remainder__Yr Sold,remainder__Sale Type,remainder__Sale Condition
2172,1.0,0.0,0.0,0.0,0.0,2173,908127070,20,RL,80.0,...,0,0,,GdPrv,,0,6,2007,WD,Normal
1968,1.0,0.0,0.0,0.0,0.0,1969,535457050,20,RL,70.0,...,0,0,,,,0,7,2007,WD,Normal
859,1.0,0.0,0.0,0.0,0.0,860,907227080,20,RL,60.0,...,0,0,,,,0,9,2009,WD,Normal
2780,1.0,0.0,0.0,0.0,0.0,2781,907196020,20,RL,118.0,...,0,0,,,,0,1,2006,WD,Normal
1880,1.0,0.0,0.0,0.0,0.0,1881,534252110,20,RL,,...,0,0,,GdWo,Shed,600,8,2007,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1767,1.0,0.0,0.0,0.0,0.0,1768,528351010,60,RL,104.0,...,0,0,,,,0,1,2007,WD,Normal
2919,0.0,0.0,0.0,1.0,0.0,2920,923228260,160,RM,21.0,...,0,0,,,,0,6,2006,WD,Normal
2746,1.0,0.0,0.0,0.0,0.0,2747,906202040,20,RL,,...,0,0,,,,0,5,2006,WD,Normal
98,0.0,0.0,0.0,1.0,0.0,99,533212060,160,FV,24.0,...,0,0,,,,0,5,2010,WD,Normal


In [9]:
ct_inter.fit_transform(X_train_dummified)

Unnamed: 0,interaction__1,interaction__remainder__TotRms AbvGrd,interaction__dummify__Bldg Type_1Fam,interaction__remainder__TotRms AbvGrd dummify__Bldg Type_1Fam
2172,1.0,5.0,1.0,5.0
1968,1.0,7.0,1.0,7.0
859,1.0,6.0,1.0,6.0
2780,1.0,6.0,1.0,6.0
1880,1.0,6.0,1.0,6.0
...,...,...,...,...
1767,1.0,10.0,1.0,10.0
2919,1.0,5.0,0.0,0.0
2746,1.0,5.0,1.0,5.0
98,1.0,4.0,0.0,0.0


In [10]:
from sklearn.model_selection import cross_val_score

X = ames.drop("SalePrice", axis = 1)
y = ames["SalePrice"]


ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)

lr_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")


scores = cross_val_score(lr_pipeline_1, X, y, cv=5, scoring='r2')
scores
scores.mean()

0.5332506599039009

In [11]:
X = ames[["Gr Liv Area", "TotRms AbvGrd", "Bldg Type"]]
y = ames["SalePrice"]


In [16]:
from sklearn.model_selection import GridSearchCV

ct_poly = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("polynomial", PolynomialFeatures(), ["Gr Liv Area"])
  ],
  remainder = "passthrough"
)

ct_poly_2 = ColumnTransformer(
    [("polynomial", PolynomialFeatures(), ["remainder__TotRms AbvGrd"])],
    remainder = "drop"
)

lr_pipeline_poly = Pipeline(
  [("preprocessing", ct_poly),
   ("poly2", ct_poly_2),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

degrees = {'preprocessing__polynomial__degree': list(range(1, 11)), "poly2__polynomial__degree": list(range(1, 11))}

gscv = GridSearchCV(lr_pipeline_poly, degrees, cv = 5, scoring='r2')

In [17]:
gscv_fitted = gscv.fit(X, y)

gscv_fitted.cv_results_

{'mean_fit_time': array([0.01546488, 0.01472688, 0.01457648, 0.01441002, 0.01508183,
        0.01520982, 0.01472898, 0.01491523, 0.01529684, 0.0162405 ,
        0.01538777, 0.01479073, 0.01456318, 0.01442442, 0.01425872,
        0.01457825, 0.01469359, 0.01478009, 0.0146071 , 0.01472087,
        0.01414638, 0.01441736, 0.01481819, 0.0149332 , 0.014746  ,
        0.01544881, 0.01588783, 0.01516623, 0.01469345, 0.0154799 ,
        0.01438608, 0.01450176, 0.01554313, 0.01532807, 0.01487269,
        0.01617303, 0.01540904, 0.01505876, 0.01487684, 0.0165256 ,
        0.01559596, 0.0146894 , 0.01533322, 0.01471887, 0.01448779,
        0.01448002, 0.0156589 , 0.01470175, 0.0145884 , 0.01467085,
        0.01454067, 0.01482201, 0.01478233, 0.01471586, 0.01549664,
        0.0147624 , 0.01599078, 0.01522222, 0.0152535 , 0.01511502,
        0.01489248, 0.01444321, 0.01482415, 0.014853  , 0.01499848,
        0.01498246, 0.01511621, 0.01481709, 0.01491075, 0.01552911,
        0.01488848, 0.01495419,

In [18]:
gscv_fitted.cv_results_['mean_test_score']

array([ 2.30878254e-01,  2.30878254e-01,  2.30878254e-01,  2.30878254e-01,
        2.30878254e-01,  2.30878254e-01,  2.30878254e-01,  2.30878254e-01,
        2.30878254e-01,  2.30878254e-01,  2.26764296e-01,  2.26764296e-01,
        2.26764296e-01,  2.26764296e-01,  2.26764296e-01,  2.26764296e-01,
        2.26764296e-01,  2.26764296e-01,  2.26764296e-01,  2.26764296e-01,
        2.35220364e-01,  2.35220364e-01,  2.35220364e-01,  2.35220364e-01,
        2.35220364e-01,  2.35220364e-01,  2.35220364e-01,  2.35220364e-01,
        2.35220364e-01,  2.35220364e-01,  2.33667686e-01,  2.33667686e-01,
        2.33667686e-01,  2.33667686e-01,  2.33667686e-01,  2.33667686e-01,
        2.33667686e-01,  2.33667686e-01,  2.33667686e-01,  2.33667686e-01,
        2.21095352e-01,  2.21095352e-01,  2.21095352e-01,  2.21095352e-01,
        2.21095352e-01,  2.21095352e-01,  2.21095352e-01,  2.21095352e-01,
        2.21095352e-01,  2.21095352e-01,  1.36030155e-01,  1.36030155e-01,
        1.36030155e-01,  

In [27]:
degrees = []
for i in range(1, 11):
    for j in range(1, 11):
        degrees.append((i, j))

pd.DataFrame(data = {"degrees": degrees, "scores": gscv_fitted.cv_results_['mean_test_score']})