In [15]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [16]:
ames = pd.read_csv("/Users/AustinLiu/Desktop/GSB544-ComputingandMachineLearning/In-Class-Data/AmesHousing.csv")
ames.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [17]:
# Get rid of columns with mostly NaN values
good_cols = ames.isna().sum() < 100
ames = ames.loc[:,good_cols]

# Drop other NAs
ames = ames.dropna()
ames

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Area,Street,Lot Shape,Land Contour,Utilities,Lot Config,...,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,31770,Pave,IR1,Lvl,AllPub,Corner,...,0,0,0,0,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,11622,Pave,Reg,Lvl,AllPub,Inside,...,0,0,120,0,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,14267,Pave,IR1,Lvl,AllPub,Corner,...,0,0,0,0,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,11160,Pave,Reg,Lvl,AllPub,Corner,...,0,0,0,0,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,13830,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,3,2010,WD,Normal,189900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,2926,923275080,80,RL,7937,Pave,IR1,Lvl,AllPub,CulDSac,...,0,0,0,0,0,3,2006,WD,Normal,142500
2926,2927,923276100,20,RL,8885,Pave,IR1,Low,AllPub,Inside,...,0,0,0,0,0,6,2006,WD,Normal,131000
2927,2928,923400125,85,RL,10441,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,700,7,2006,WD,Normal,132000
2928,2929,924100070,20,RL,10010,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,4,2006,WD,Normal,170000


# Part 1

In [18]:
X = ames.drop('SalePrice', axis=1)
y = ames['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [19]:
# Pipeline 1
lr = LinearRegression()
std_s = StandardScaler()

ct = ColumnTransformer(
    [("standardize", std_s, ["Gr Liv Area", "TotRms AbvGrd"])],
    remainder = "drop" 
)

lr_pipeline = Pipeline(
  [("standardize", ct),
  ("linear_regression", lr)]
)

lr_pipeline_fitted = lr_pipeline.fit(X_train, y_train)
y_preds = lr_pipeline_fitted.predict(X_test)

rmse1 = mean_squared_error(y_test, y_preds, squared = False)
rmse1

55659.17989912858

In [20]:
# Pipeline 2
enc = OneHotEncoder(sparse_output = False)

ct = ColumnTransformer(
    [("standardize", std_s, ["Gr Liv Area", "TotRms AbvGrd"]),
     ("dummify", enc, ["Bldg Type"])],
    remainder = "drop" 
)

lr_pipeline2 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", lr)]
)

lr_pipeline_fitted2 = lr_pipeline2.fit(X_train, y_train)
y_preds2 = lr_pipeline_fitted2.predict(X_test)

rmse2 = mean_squared_error(y_test, y_preds2, squared = False)
rmse2

54456.27691201109

In [21]:
# Pipeline 3

enc = OneHotEncoder(sparse_output = False)

ct_dummy = ColumnTransformer(
    [("dummify", enc, ["Bldg Type"])], remainder = "passthrough"
).set_output(transform = "pandas")

ct_dummy.fit_transform(X_train)

ct_interact = ColumnTransformer(
    [("interaction", PolynomialFeatures(interaction_only = True), ["remainder__Gr Liv Area", "dummify__Bldg Type_1Fam"]),
    ("interaction2", PolynomialFeatures(interaction_only = True), ["remainder__Gr Liv Area", "dummify__Bldg Type_2fmCon"]),
    ("interaction3", PolynomialFeatures(interaction_only = True), ["remainder__Gr Liv Area", "dummify__Bldg Type_Duplex"]),
    ("interaction4", PolynomialFeatures(interaction_only = True), ["remainder__Gr Liv Area", "dummify__Bldg Type_Twnhs"]),
    ("interaction5", PolynomialFeatures(interaction_only = True), ["remainder__Gr Liv Area", "dummify__Bldg Type_TwnhsE"])],
).set_output(transform = "pandas")

lr_pipeline_inter = Pipeline(
    [("dummify", ct_dummy),
     ("interacting", ct_interact),
      ("linear regression", lr)]
).set_output(transform = "pandas")


lr_pipeline_fitted3 = lr_pipeline_inter.fit(X_train, y_train)
y_preds3 = lr_pipeline_fitted3.predict(X_test)

rmse3 = mean_squared_error(y_test, y_preds3, squared = False)
rmse3

54076.09217134571

In [22]:
# Pipeline 4

enc = OneHotEncoder(sparse_output = False)

ct_preprocess = ColumnTransformer(
    [("dummify", enc, ["Bldg Type"]),
     ("polynomial", PolynomialFeatures(degree = 5), ["Gr Liv Area", "TotRms AbvGrd"])], remainder = "drop"
).set_output(transform = "pandas")

ct_preprocess.fit_transform(X_train)

lr_pipeline_poly = Pipeline(
    [("preprocessing", ct_preprocess),
      ("linear regression", lr)]
).set_output(transform = "pandas")

lr_pipeline_fitted4 = lr_pipeline_poly.fit(X_train, y_train) 
y_preds4 = lr_pipeline_fitted4.predict(X_test)

rmse4 = mean_squared_error(y_test, y_preds4, squared = False)
rmse4

113618.49579087284

Best RMSE was Model 3, 54076.

# Part 2 Cross_Val_Score

In [23]:
score1 = cross_val_score(lr_pipeline, X, y, cv=5, scoring='neg_root_mean_squared_error')
-score1.mean()

55483.717958704314

In [24]:
score2 = cross_val_score(lr_pipeline2, X, y, cv=5, scoring='neg_root_mean_squared_error')
-score2.mean()

54107.50773455426

In [25]:
score3 = cross_val_score(lr_pipeline_inter, X, y, cv=5, scoring='neg_root_mean_squared_error')
-score3.mean()

53324.75612669396

In [26]:
score4 = cross_val_score(lr_pipeline_poly, X, y, cv=5, scoring='neg_root_mean_squared_error')
-score4.mean()

58902.911810473175

Model 3 still had the best.

# Part 3

In [27]:
ct_tuning = ColumnTransformer(
  [("size_polynomial", PolynomialFeatures(), ["Gr Liv Area"]),
  ("rooms_polynomial", PolynomialFeatures(), ["TotRms AbvGrd"]),
   ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"])],
  remainder = "drop"
)

lr_pipeline_tuning = Pipeline(
  [("preprocessing", ct_tuning),
  ("linear_regression", lr)]
).set_output(transform="pandas")

degrees = {'preprocessing__size_polynomial__degree': np.arange(1, 11),
           "preprocessing__rooms_polynomial__degree": np.arange(1, 11)}

gscv = GridSearchCV(lr_pipeline_tuning, degrees, cv = 5, scoring='r2')

gscv_fitted = gscv.fit(X, y)

gscv_fitted.cv_results_

{'mean_fit_time': array([0.00697026, 0.00398431, 0.00362024, 0.00364566, 0.00370445,
        0.00372305, 0.00377741, 0.00379524, 0.00436339, 0.00635738,
        0.00483575, 0.00539465, 0.00540013, 0.00503063, 0.00579901,
        0.00667324, 0.00380645, 0.00555258, 0.00809798, 0.00705881,
        0.00570874, 0.00521059, 0.00438414, 0.00524235, 0.00506496,
        0.00546861, 0.00514212, 0.00544734, 0.00605974, 0.00821953,
        0.0062561 , 0.00483413, 0.00759377, 0.00530825, 0.00568347,
        0.00572896, 0.00535688, 0.00619941, 0.0052762 , 0.0061595 ,
        0.00620265, 0.00448532, 0.00486751, 0.00386553, 0.006949  ,
        0.00550823, 0.00738592, 0.00737309, 0.00761728, 0.00727921,
        0.00605106, 0.005793  , 0.00625052, 0.00502234, 0.00690107,
        0.00505962, 0.00687861, 0.00709839, 0.0055419 , 0.00625567,
        0.00456672, 0.0052505 , 0.00558958, 0.00640583, 0.00643249,
        0.00528979, 0.00539827, 0.0055449 , 0.00811195, 0.00717397,
        0.00817895, 0.00720263,

In [30]:
gscv_df = pd.DataFrame(gscv_fitted.cv_results_)
gscv_df = gscv_df[["param_preprocessing__rooms_polynomial__degree",
                   "param_preprocessing__size_polynomial__degree",
                   "mean_test_score"]]
gscv_df.rename(columns = {
    "param_preprocessing__rooms_polynomial__degree" : "room degree",
    "param_preprocessing__size_polynomial__degree" : "size degree",
    "mean_test_score": "r2"
}, inplace = True)

gscv_df.sort_values(by = "r2", ascending = False).head()

Unnamed: 0,room degree,size degree,r2
43,5,4,0.559322
33,4,4,0.559221
2,1,3,0.558259
12,2,3,0.557383
63,7,4,0.555882


The best model was a polynomial with 5th degree room and 4th degree for the size