### Cross Model Evaluation


In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer, make_column_selector

from sklearn import set_config

# Show estimators and pipelines as diagrams when they are displayed in a cell.
set_config(display="diagram")

### The Data

We will work with a housing dataset whose target column is `SalePrice`. The data is read from `data/train.csv` and `data/test.csv`.

In [12]:
# Read the two CSV files shipped in the local data/ directory.
train = pd.read_csv('data/train.csv')
# NOTE(review): `test` is not referenced by any later cell visible here -- confirm it is still needed.
test = pd.read_csv('data/test.csv')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Separate the target column from the predictor columns.
target_name = 'SalePrice'
y = train[target_name]
X = train.drop(columns=target_name)

print(X.shape)

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=22
)

for split in (X_train, X_test):
    print(split.shape)
print(type(X_train), type(y_train))  # should be DataFrame and Series


#### Baseline Predictions


In [None]:
# Mean of the training targets (SalePrice) -- the value a constant baseline predicts.
y_train.values.mean()

In [None]:

# Baseline model: always predict the mean of the TRAINING targets.
# The test-set baseline reuses the training mean as well -- building it from
# y_test.mean() would leak information from the held-out data into the "model"
# and make the baseline look better than it really is.
train_mean = y_train.mean()

baseline_train = np.full(y_train.shape, train_mean)
baseline_test = np.full(y_test.shape, train_mean)

# scikit-learn metrics take their arguments as (y_true, y_pred).
mse_baseline_train = mean_squared_error(y_train, baseline_train)
mse_baseline_test = mean_squared_error(y_test, baseline_test)

print(baseline_train.shape, baseline_test.shape)
print(f'Baseline for training data: {mse_baseline_train}')
print(f'Baseline for testing data: {mse_baseline_test}')

#### Examining the Correlations

In [None]:
# Correlation of every numeric column with the target.
# The frame is restricted to numeric columns explicitly: since pandas 2.0,
# DataFrame.corr() no longer drops string columns silently and raises instead.
numeric_train = train.select_dtypes(include='number')
correlations = numeric_train.corr()['SalePrice'].drop('SalePrice')

# Name of the feature with the largest correlation with SalePrice.
highest_corr = correlations.idxmax()

print(correlations.sort_values(ascending=False))

print(highest_corr)

#### Simple Model


In [None]:

# Linear regression on the single column OverallQual.
simple_features = ['OverallQual']
X1 = X_train[simple_features]

lr = LinearRegression()
lr.fit(X1, y_train)

# Score the fitted model on both splits.
train_preds_1 = lr.predict(X1)
test_preds_1 = lr.predict(X_test[simple_features])

model_1_train_mse = mean_squared_error(y_train, train_preds_1)
model_1_test_mse = mean_squared_error(y_test, test_preds_1)

print(f'Train MSE: {model_1_train_mse: .2f}')
print(f'Test MSE: {model_1_test_mse: .2f}')

#### Using `OneHotEncoder`


In [22]:
# Pull the CentralAir column out of each split as a one-column DataFrame.
central_air_train, central_air_test = (
    frame[['CentralAir']] for frame in (X_train, X_test)
)

In [None]:
# Preview the raw (not yet encoded) CentralAir values.
central_air_train.head()

In [None]:

# Dense one-hot encoder; drop='if_binary' keeps a single indicator column
# for features that have exactly two categories.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so try the current name first and fall back for older installs.
try:
    ohe = OneHotEncoder(sparse_output=False, drop='if_binary')
except TypeError:
    ohe = OneHotEncoder(sparse=False, drop='if_binary')

print(ohe.fit_transform(central_air_train)[:5])

In [25]:
# Learn the encoding from the training column only, then apply it to both splits.
ohe.fit(central_air_train)
model_2_train = ohe.transform(central_air_train)
model_2_test = ohe.transform(central_air_test)

In [None]:
# Regress the target on the encoded CentralAir column alone.
model_2 = LinearRegression()
model_2.fit(model_2_train, y_train)

print(model_2.coef_)

In [27]:
# One-hot encode CentralAir; every other column is passed through unchanged.
one_hot_step = (OneHotEncoder(drop='if_binary'), ['CentralAir'])
col_transformer = make_column_transformer(one_hot_step, remainder='passthrough')

qual_and_air = X_train[['OverallQual', 'CentralAir']]
col_transformer.fit_transform(qual_and_air)


#### Using `make_column_transformer`

In [None]:
# Chain the column transformer and a linear regression into a single estimator.
linreg = LinearRegression()
pipeline_steps = [
    ('col_transformer', col_transformer),
    ('linreg', linreg),
]
pipe_1 = Pipeline(steps=pipeline_steps)
pipe_1.fit(X_train[['OverallQual', 'CentralAir']], y_train)

print(pipe_1.named_steps)  # col_transformer and linreg should be keys
pipe_1


#### Creating an `OrdinalEncoder`

In [None]:
# Categories are listed in the order the encoder should number them (0 through 4).
quality_order = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
oe = OrdinalEncoder(categories=[quality_order])
oe.fit_transform(X_train[['HeatingQC']])

In [None]:
# Raw HeatingQC labels, for comparison with the encoded values.
X_train['HeatingQC'].head()

In [33]:
# CentralAir -> one-hot, HeatingQC -> ordinal codes, all other columns untouched.
heating_levels = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
ordinal_ohe_transformer = make_column_transformer(
    (OneHotEncoder(drop='if_binary'), ['CentralAir']),
    (OrdinalEncoder(categories=[heating_levels]), ['HeatingQC']),
    remainder='passthrough',
)

In [None]:
# First five transformed rows; transformer outputs come first, passthrough columns last.
ordinal_ohe_transformer.fit_transform(X_train[['OverallQual', 'CentralAir', 'HeatingQC']])[:5]

In [None]:
# The same three columns before transformation, for side-by-side comparison.
X_train[['OverallQual', 'CentralAir', 'HeatingQC']].head()

#### Using `OrdinalEncoder`


In [None]:
# Columns fed to the second pipeline.
model_columns = ['OverallQual', 'CentralAir', 'HeatingQC']

# Encode the two categorical columns, then fit a linear regression.
linreg = LinearRegression()
pipe_2 = Pipeline(steps=[
    ('transformer', ordinal_ohe_transformer),
    ('linreg', linreg),
])
pipe_2.fit(X_train[model_columns], y_train)

# Predictions and mean squared error on each split.
pred_train = pipe_2.predict(X_train[model_columns])
pred_test = pipe_2.predict(X_test[model_columns])

pipe_2_train_mse = mean_squared_error(pred_train, y_train)
pipe_2_test_mse = mean_squared_error(pred_test, y_test)

print(pipe_2.named_steps)
print(f'Train MSE: {pipe_2_train_mse: .2f}')
print(f'Test MSE: {pipe_2_test_mse: .2f}')
pipe_2


#### Including `PolynomialFeatures`


In [21]:
# Per-column preprocessing: ordinal codes for HeatingQC, one-hot for CentralAir,
# and a degree-2 polynomial expansion (without a bias column) of OverallQual.
quality_grades = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
heating_step = (OrdinalEncoder(categories=[quality_grades]), ['HeatingQC'])
air_step = (OneHotEncoder(drop='if_binary'), ['CentralAir'])
quadratic_step = (PolynomialFeatures(degree=2, include_bias=False), ['OverallQual'])

poly_ordinal_ohe = make_column_transformer(heating_step, air_step, quadratic_step)
pipe_3 = Pipeline(steps=[
    ('transformer', poly_ordinal_ohe),
    ('linreg', LinearRegression()),
])

In [None]:
# Fit the quadratic pipeline on the three selected training columns.
pipe_3.fit(X_train[['OverallQual', 'CentralAir', 'HeatingQC']], y_train)

In [None]:
# Score the quadratic pipeline on both splits.
quad_columns = ['OverallQual', 'CentralAir', 'HeatingQC']

quad_train_preds = pipe_3.predict(X_train[quad_columns])
quad_test_preds = pipe_3.predict(X_test[quad_columns])

quad_train_mse = mean_squared_error(quad_train_preds, y_train)
quad_test_mse = mean_squared_error(quad_test_preds, y_test)

print(f'Train MSE: {quad_train_mse: .2f}')
print(f'Test MSE: {quad_test_mse: .2f}')

In [40]:
# Columns used by the polynomial-degree comparison in the following cells.
features = ['CentralAir', 'HeatingQC', 'OverallQual', 'GrLivArea', 'KitchenQual', 'FullBath']

In [None]:
# Preview the selected feature columns.
X_train[features].head()

In [42]:
# Column transformer combining:
#   * a polynomial expansion of every numeric column,
#   * ordinal encoding of the HeatingQC and KitchenQual columns,
#   * one-hot encoding of CentralAir.
quality_grades = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
ordinal_columns = ['HeatingQC', 'KitchenQual']

poly_ordinal_ohe = make_column_transformer(
    (PolynomialFeatures(), make_column_selector(dtype_include=np.number)),
    # OrdinalEncoder needs one category list PER encoded column; a single list
    # for two columns fails at fit time with a shape-mismatch error.
    (OrdinalEncoder(categories=[quality_grades] * len(ordinal_columns)), ordinal_columns),
    # The `sparse=False` keyword is omitted: it was removed in scikit-learn 1.4,
    # and the other transformers in this notebook use the encoder's default output.
    (OneHotEncoder(drop='if_binary'), ['CentralAir']),
)

In [None]:
# Mean squared errors for each polynomial degree, in degree order.
train_mses = []
test_mses = []

quality_grades = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
# KitchenQual is listed in `features`; it has to be encoded here, otherwise the
# transformer's default remainder='drop' silently discards it.
ordinal_columns = ['HeatingQC', 'KitchenQual']

# Fit one pipeline per polynomial degree, 1 through 5.
for degree in range(1, 6):
    poly_ordinal_ohe = make_column_transformer(
        # include_bias=False: LinearRegression already fits an intercept, so a
        # constant column would only duplicate it.
        (PolynomialFeatures(degree=degree, include_bias=False),
         make_column_selector(dtype_include=np.number)),
        # One category list per ordinal column.
        (OrdinalEncoder(categories=[quality_grades] * len(ordinal_columns)), ordinal_columns),
        (OneHotEncoder(drop='if_binary'), ['CentralAir']),
    )

    pipe = Pipeline([('transformer', poly_ordinal_ohe),
                     ('linreg', LinearRegression())])

    # Fit on the training split only.
    model = pipe.fit(X_train[features], y_train)

    # Record the error on each split for this degree.
    p_train = model.predict(X_train[features])
    train_mses.append(mean_squared_error(y_train, p_train))

    p_test = model.predict(X_test[features])
    test_mses.append(mean_squared_error(y_test, p_test))

print(train_mses)
print(test_mses)
pipe