# Polynomial model for predicting Ames housing prices

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

### Importing and splitting data

In [None]:
def import_data(filename):
    '''
    Reads the given csv file into a pandas dataframe and returns it
    without its 'SalePrice' column.
    '''
    return pd.read_csv(filename).drop(columns=['SalePrice'])

# load the three csv exports; import_data removes the 'SalePrice' column from each
num_data = import_data('data/AmesHousingNumData.csv')
cat_data = import_data('data/AmesHousingCatData.csv')
all_data = import_data('data/AmesHousingPreprocessed.csv')

# show the first rows of every table under its own heading
previews = (
    ('Overview of numeric features', num_data),
    ('\nOverview of categorical features', cat_data),
    ('\nOverview of all features', all_data),
)
for heading, frame in previews:
    print(heading)
    display(frame.head())

In [None]:
# keep only the two columns that were picked as the most correlated features
selected_columns = ['Gr Liv Area', 'Overall Qual']
final_data_numSelect = num_data.loc[:, selected_columns]
final_data_numSelect.head()

In [None]:
# separate the combined table into the target column and the remaining input columns
target_column = 'SalePrice.1'
target = all_data[target_column]
final_data_all = all_data.drop(columns=[target_column])

final_data_all.head()

In [None]:
def preprocess(df, target, train_size = 0.7, random_state=1265599650):
    '''
    Converts the pandas inputs to numpy ndarrays and splits them into a
    training and a test set.

    df: dataframe with the input features
    target: pandas object with the target values belonging to the rows of df
    train_size: handed to train_test_split as train_size (default 0.7)
    random_state: seed handed to train_test_split; the default reproduces
                  the split that was used so far

    Returns the tuple X_train, X_test, y_train, y_test.
    '''
    X_np = df.to_numpy()
    y_np = target.to_numpy()

    # split the data according to train_size; a fixed seed keeps runs comparable
    X_train, X_test, y_train, y_test = train_test_split(
        X_np, y_np, train_size=train_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test


### Building the model

#### Version 1: Multivariate Linear Model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error


def linear_model(X_train, X_test, y_train, y_test):
    '''
    Fits a linear regression model on the training data, prints its root mean
    squared error on the test data and returns that error.
    '''
    # named differently from the function so that the function is not shadowed
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)

    # compute and store predictions for unseen feature values
    predictions = regressor.predict(X_test)

    # compute the root mean square error to evaluate performance
    linear_test_rmse = np.sqrt(mean_squared_error(y_test, predictions))

    print(f'The test RMSE for our multivariate linear model is {linear_test_rmse}')

    # hand the value back so that callers can compare the models in code
    return linear_test_rmse


#### Multivariate Polynomial Regression Model

Create new features from the existing ones and check whether they add predictive value. 

- Interaction Features

   Generate a new feature from the product of two features. It represents the interaction effect of two variables on the dependent variable. 


- Polynomial Feature Expansion

    Generate new features from the nth degree of each feature. 

Note: Version 2 adds only the interaction features, to check whether including them alone improves the model or not. Version 3 adds the full polynomial feature expansion, which includes the interaction features as well, to check whether features may have a non-linear relationship with the target feature.

Taken from http://www.eamonfleming.com/projects/housing-regression.html


#### Version 2: Interaction Features only

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
def poly_interact(X_train, X_test, y_train, y_test):
    '''
    Builds and fits a polynomial model that only includes interaction features.
    Calculates, prints and returns the rmse on the test data.
    '''
    # change feature matrix into polynomial feature matrix including interaction terms only;
    # the expansion is fitted on the training data and merely applied to the test data
    interaction_expansion = PolynomialFeatures(interaction_only=True)
    X_train_interact = interaction_expansion.fit_transform(X_train)
    X_test_interact = interaction_expansion.transform(X_test)

    # fit the model
    poly_model_interact = LinearRegression()
    poly_model_interact.fit(X_train_interact, y_train)

    # compute and store predictions for unseen feature values
    predictions = poly_model_interact.predict(X_test_interact)

    # compute the root mean square error to evaluate performance
    poly_test_rmse = np.sqrt(mean_squared_error(y_test, predictions))

    print(f'The test RMSE for our interaction only polynomial model is {poly_test_rmse}')

    return poly_test_rmse

#### Version 3:  All Polynomial Features

In [None]:

def poly_model(X_train, X_test, y_train, y_test, max_degree=6):
    '''
    Builds and fits one polynomial model per degree, including all polynomial
    features. Searches for the degree with the lowest test rmse and plots the
    rmse of every tested degree.

    The tested degrees are 1, ..., max_degree - 1; max_degree itself is not tested.

    Prints and returns the rmse of the best fit.
    '''
    rmses = []
    # start at infinity so that the first tested degree always becomes the
    # current best, independent of the scale of the target values
    min_rmse = float('inf')
    min_degree = None

    # test different degrees (the upper bound is exclusive)
    degrees = range(1, max_degree)
    for degree in degrees:

        # change feature matrix into polynomial feature matrix; the expansion is
        # fitted on the training data and merely applied to the test data
        poly = PolynomialFeatures(degree=degree)
        X_train_ = poly.fit_transform(X_train)
        X_test_ = poly.transform(X_test)

        # named differently from the function so that the function is not shadowed
        regressor = LinearRegression()
        regressor.fit(X_train_, y_train)

        # compute and store predictions for unseen feature values
        predictions = regressor.predict(X_test_)

        # compute the root mean square measure to evaluate performance
        poly_test_rmse = np.sqrt(mean_squared_error(y_test, predictions))
        rmses.append(poly_test_rmse)

        # remember the degree with the lowest test rmse
        # (the selection is made on the test split, it is not a cross-validation)
        if poly_test_rmse < min_rmse:
            min_rmse = poly_test_rmse
            min_degree = degree

    print(f'The test RMSE for our polynomial model is {min_rmse}')

    # plot results
    plt.plot(degrees, rmses)

    # layout
    plt.title('Determine best degree')
    plt.xlabel('Degree')
    plt.ylabel('RMSE')

    plt.show()

    print(f'\nThe best model fit for the polynomial model has degree {min_degree}.')

    return min_rmse

### Testing and comparing the versions

In [None]:
# build the train/test split from the two selected numeric features only
X_train, X_test, y_train, y_test = preprocess(
    df=final_data_numSelect, target=target, train_size=0.7
)

In [None]:
print('These are the outcomes when using two numerical features only:\n')

# run the linear and the interaction-only model on the current split
for evaluate in (linear_model, poly_interact):
    evaluate(X_train, X_test, y_train, y_test)

# the full polynomial model runs with its default degree range
poly_model(X_train, X_test, y_train, y_test)

In [None]:
# build the train/test split from the complete numeric feature table
X_train, X_test, y_train, y_test = preprocess(
    df=num_data, target=target, train_size=0.7
)

In [None]:
print('These are the outcomes for all the numerical data:\n')

# run the linear and the interaction-only model on the current split
for evaluate in (linear_model, poly_interact):
    evaluate(X_train, X_test, y_train, y_test)

# the full polynomial model is called with max_degree=4 here
poly_model(X_train, X_test, y_train, y_test, max_degree=4)


#### Optional: Evaluate Feature Importance

In [None]:
poly = PolynomialFeatures(degree=2)

# expand the current training split; it has to be the split built from
# num_data, because the feature names below are taken from num_data.columns
X_train_ = poly.fit_transform(X_train)

# add appropriate names to polynomial features; newer scikit-learn releases
# only provide get_feature_names_out, older ones only get_feature_names
if hasattr(poly, 'get_feature_names_out'):
    poly_feature_names = poly.get_feature_names_out(num_data.columns)
else:
    poly_feature_names = poly.get_feature_names(num_data.columns)
X_train_ = pd.DataFrame(X_train_, columns=poly_feature_names)

# generate list of polynomial features and their correlations with sale price.
# corrwith aligns both sides on the index: the rows of X_train_ are numbered by
# their position in the training split, so they have to be paired with y_train
# (same positions) and not with `target`, which is indexed by the original rows
y_train_series = pd.Series(y_train, index=X_train_.index)
X_train_correlations = X_train_.corrwith(y_train_series)

# sort features from highest positively correlated with sale price
print('The highest positively correlated features with sale price are:')
X_train_correlations.sort_values(ascending = False).head(20)

In [None]:
# list the 20 features with the lowest correlation values, most negative first
print('The highest negatively correlated features with sale price are:')
X_train_correlations.sort_values(ascending=True).iloc[:20]

#### Note

It was attempted to also test the models with the categorical features alone as well as with the categorical and numerical features combined. Unfortunately, there seems to be a bug which results in very high RMSE values. Since the main goal - to examine whether non-linear relationships exist between the independent variables and the dependent variable - was achieved, it was deemed unnecessary to continue. 