## Model 2

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Load the numerical feature table.
# The 'Unnamed: 0' column is discarded right after reading; exporting the CSV
# with 'index=False' would make this step unnecessary.
data_num = pd.read_csv('data/AmesHousingNumData.csv').drop(columns='Unnamed: 0')
data_num.head()

In [None]:
# Separate the numerical data into model inputs and the 'SalePrice' target.
final_data_num = data_num.drop(columns='SalePrice')
target_num = data_num['SalePrice']

# Reduced input set: only the 2 most correlated features.
final_data_numSelect = data_num[['Gr Liv Area', 'Overall Qual']]

# Preview the reduced input set first, then the full numerical input set.
display(final_data_numSelect.head(), final_data_num.head())


In [None]:
# Load the categorical feature table and discard the 'Unnamed: 0' column.
data_cat = pd.read_csv('data/AmesHousingCatData.csv').drop(columns='Unnamed: 0')

# Separate the categorical data into model inputs and the 'SalePrice' target.
final_data_cat = data_cat.drop(columns='SalePrice')
target_cat = data_cat['SalePrice']
display(final_data_cat.head())


In [None]:
# Load the complete preprocessed table and discard the 'Unnamed: 0' column.
data_all = pd.read_csv('data/AmesHousingPreprocessed.csv').drop(columns='Unnamed: 0')

# Separate inputs from the 'SalePrice' target. 'SalePrice.1' is removed from
# the inputs as well (presumably a duplicated target column - TODO confirm).
final_data_all = data_all.drop(columns=['SalePrice', 'SalePrice.1'])
target_all = data_all['SalePrice']
display(final_data_all.head())


In [None]:
def preprocess(df, target, train_size = 0.7, random_state = 1265599650):
    '''
    Convert the pandas inputs to numpy ndarrays and split them into a
    training set and a test set.

    Parameters
    ----------
    df : pandas DataFrame
        Input features; converted with `to_numpy()`.
    target : pandas Series
        Target values belonging to the rows of `df`; converted with
        `to_numpy()`.
    train_size : float or int, default 0.7
        Forwarded unchanged to `train_test_split` (0.7 = 70% training and
        30% testing).
    random_state : int, default 1265599650
        Seed forwarded to `train_test_split`. The default is the seed that
        was previously hard-coded, so existing calls produce the same split.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test) as numpy ndarrays.
    '''
    X_np = df.to_numpy()
    y_np = target.to_numpy()

    # a fixed seed keeps the split reproducible across kernel restarts
    X_train, X_test, y_train, y_test = train_test_split(
        X_np, y_np, train_size=train_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test


### Setting up the model

#### Version 1: Multivariate Linear Model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error


def calc_rmse(X_train, X_test, y_train, y_test, linear_model):
    '''
    Fit `linear_model` on the training data and print the root mean squared
    error of its predictions on the test data.

    The model object is fitted in place, so its fitted parameters
    (`linear_model.intercept_` and `linear_model.coef_`) can be inspected on
    the passed object after the call. Nothing is returned.
    '''
    linear_model.fit(X_train, y_train)

    # evaluate on the held-out samples only
    y_pred = linear_model.predict(X_test)
    test_mse = mean_squared_error(y_test, y_pred)
    test_rmse = np.sqrt(test_mse)

    print(f'The test RMSE for our multivariate linear model is {test_rmse}')


#### Multivariate Polynomial Regression Model

Create new features from the existing ones. Check whether they add predictive value. 

1. Interaction Features

= Generate a new feature from the product of two features. It represents the interaction effect of two variables on the dependent variable. 


2. Polynomial Feature Expansion

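= Generate new features from all polynomial combinations of the features up to the nth degree (powers of each feature as well as products of different features). 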

Note: The second includes the first. The first is implemented to check whether including the interaction features alone improves the model. The second is implemented to check whether features may have a non-linear relationship with the target feature.

Taken from http://www.eamonfleming.com/projects/housing-regression.html


#### Version 2: Interaction Features only

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
def poly_interact(X_train, X_test, y_train, y_test, poly_model_interact):
    '''
    Fit `poly_model_interact` on interaction-only polynomial features and
    print its root mean squared error on the test data.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test feature matrices (same number of columns).
    y_train, y_test : array-like
        Training and test targets.
    poly_model_interact : estimator with `fit` and `predict`
        Fitted in place on the expanded training features, so its fitted
        parameters can be inspected on the passed object afterwards.

    Returns
    -------
    None
        The RMSE is printed, not returned.
    '''
    # change feature matrix into polynomial feature matrix including interaction terms only
    interaction_expander = PolynomialFeatures(interaction_only=True)
    X_train_interact = interaction_expander.fit_transform(X_train)
    # the transformer is fitted on the training data only; the test data is
    # just transformed so that nothing is ever learned from the test set
    X_test_interact = interaction_expander.transform(X_test)

    poly_model_interact.fit(X_train_interact, y_train)

    predictions = poly_model_interact.predict(X_test_interact)

    poly_test_rmse = np.sqrt(mean_squared_error(y_test, predictions))

    print(f'The test RMSE for our interaction only polynomial model is {poly_test_rmse}')


#### Version 3:  All Polynomial Features

In [None]:

def add_poly(X_train, X_test, y_train, y_test, poly_model, deg = 2):
    '''
    Fit `poly_model` on a polynomial expansion of the features and print its
    root mean squared error on the test data.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test feature matrices (same number of columns).
    y_train, y_test : array-like
        Training and test targets.
    poly_model : estimator with `fit` and `predict`
        Fitted in place on the expanded training features, so its fitted
        parameters can be inspected on the passed object afterwards.
    deg : int, default 2
        Degree passed to `PolynomialFeatures`.

    Returns
    -------
    None
        The RMSE is printed, not returned.
    '''
    # change feature matrix into polynomial feature matrix
    poly = PolynomialFeatures(degree = deg)
    X_train_ = poly.fit_transform(X_train)
    # the transformer is fitted on the training data only; the test data is
    # just transformed so that nothing is ever learned from the test set
    X_test_ = poly.transform(X_test)

    poly_model.fit(X_train_, y_train)

    predictions = poly_model.predict(X_test_)

    poly_test_rmse = np.sqrt(mean_squared_error(y_test, predictions))

    print(f'The test RMSE for our polynomial model is {poly_test_rmse}')


### Testing and comparing the versions

In [None]:
# 70/30 train/test split of the two selected numerical features
X_train, X_test, y_train, y_test = preprocess(
    final_data_numSelect,
    target_num,
    train_size=0.7,
)

In [None]:
print('These are the outcomes for just a selection of the numerical data:\n')

# one fresh regressor per model version, so that each keeps its own fit
linear_model_numSelect = LinearRegression()
poly_model_interact_numSelect = LinearRegression()
poly_model_numSelect = LinearRegression()

# Version 1: multivariate linear model
calc_rmse(X_train, X_test, y_train, y_test, linear_model_numSelect)

# Version 2: interaction features only
poly_interact(X_train, X_test, y_train, y_test, poly_model_interact_numSelect)

# Version 3: all polynomial features up to degree 4
add_poly(X_train, X_test, y_train, y_test, poly_model_numSelect, deg=4)

print('\nDegree of 4 turned out to be the best degree if only 2 features are selected')

In [None]:
# 70/30 train/test split of all numerical features
X_train, X_test, y_train, y_test = preprocess(
    final_data_num,
    target_num,
    train_size=0.7,
)

In [None]:
print('These are the outcomes for all the numerical data:\n')

# one fresh regressor per model version, so that each keeps its own fit
linear_model_num = LinearRegression()
poly_model_interact_num = LinearRegression()
poly_model_num = LinearRegression()

# Version 1: multivariate linear model
calc_rmse(X_train, X_test, y_train, y_test, linear_model_num)

# Version 2: interaction features only
poly_interact(X_train, X_test, y_train, y_test, poly_model_interact_num)

# Version 3: all polynomial features up to degree 2
add_poly(X_train, X_test, y_train, y_test, poly_model_num, deg=2)

print('\nDegree of 2 was found to be the best fit')

In [None]:
print('These are the outcomes for all the categorical data:\n')

# 70/30 train/test split of the categorical features
X_train, X_test, y_train, y_test = preprocess(
    final_data_cat,
    target_cat,
    train_size=0.7,
)

# one fresh regressor per model version, so that each keeps its own fit
linear_model_cat = LinearRegression()
poly_model_interact_cat = LinearRegression()
poly_model_cat = LinearRegression()

# Version 1: multivariate linear model
calc_rmse(X_train, X_test, y_train, y_test, linear_model_cat)

# Version 2: interaction features only
poly_interact(X_train, X_test, y_train, y_test, poly_model_interact_cat)

# Version 3: all polynomial features (default degree of add_poly)
add_poly(X_train, X_test, y_train, y_test, poly_model_cat)

In [None]:
print('These are the outcomes for all the combined data\n')

# 70/30 train/test split of the combined (numerical + categorical) features
X_train, X_test, y_train, y_test = preprocess(
    final_data_all,
    target_all,
    train_size=0.7,
)

# one fresh regressor per model version, so that each keeps its own fit
linear_model_all = LinearRegression()
poly_model_interact_all = LinearRegression()
poly_model_all = LinearRegression()

# Version 1: multivariate linear model
calc_rmse(X_train, X_test, y_train, y_test, linear_model_all)

# Version 2: interaction features only
poly_interact(X_train, X_test, y_train, y_test, poly_model_interact_all)

# Version 3: all polynomial features (default degree of add_poly)
add_poly(X_train, X_test, y_train, y_test, poly_model_all)

#### Optional: Evaluate Feature Importance

In [None]:
poly = PolynomialFeatures(degree=2)

# only implement for all combined data: X_train / y_train must be the split of
# final_data_all. Guard against stale kernel state from another split.
assert X_train.shape[1] == len(final_data_all.columns), \
    'X_train does not come from final_data_all - re-run the combined data cell first'
X_train_ = poly.fit_transform(X_train)

# add appropriate names to polynomial features
# (newer scikit-learn releases replaced get_feature_names by
# get_feature_names_out, so use whichever one is available)
if hasattr(poly, 'get_feature_names_out'):
    poly_feature_names = poly.get_feature_names_out(final_data_all.columns)
else:
    poly_feature_names = poly.get_feature_names(final_data_all.columns)
X_train_ = pd.DataFrame(X_train_, columns = poly_feature_names)

# generate list of polynomial features and their correlations with sale price.
# corrwith aligns both operands on their index: X_train_ holds the shuffled
# training rows under a fresh 0..n-1 index, so it has to be paired with y_train
# (same row order) and not with target_all (original row order), otherwise
# every row would be matched with the sale price of a different house.
y_train_series = pd.Series(y_train, index=X_train_.index)
X_train_correlations = X_train_.corrwith(y_train_series)

# sort features from highest positively correlated with sale price
print('The highest positively correlated features with sale price are:')
X_train_correlations.sort_values(ascending = False).head(20)

In [None]:
# ascending order puts the most negatively correlated features first
print('The highest negatively correlated features with sale price are:')
X_train_correlations.sort_values(ascending=True).iloc[:20]