In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
import scipy.linalg as sp_la

# Quiz

# Feature Selection (From Wednesday) will be **postponed to Monday**

# Polynomial regression!!

## Data

Today we will keep working with the set of Craigslist listings for used cars.

All of this section is *exactly the same* as Wednesday.

First I make my converters.

In [None]:
# names for the nine columns selected by `usecols` below, in the same order
columns = ["price", "year", "manufacturer", "model", "condition", "fuel", "odometer", "title_status", "transmission"]

# first pass over the file: read every selected column as text, so that the
# distinct values of each column are known before any converter is applied
data = np.array(np.genfromtxt(
    'data/vehicles.csv',
    delimiter=',',
    usecols=(1,2,3,4,5,7,8,9,11),
    skip_header=1,
    dtype=str,
    encoding='utf-8',
))

# this will contain our converters' lookup tables:
# colValues[k] is the list of unique strings found in column k of `data`
colValues = {
    colIndex: np.unique(data[:, colIndex]).tolist()
    for colIndex in range(data.shape[1])
}

def converter(x, colIndex):
    """Map a raw value to its position in that column's list of unique values.

    x: the value to encode, as it appears in the text version of the data.
    colIndex: key into the module-level `colValues` dict selecting the column.

    Returns the integer index of `x` in `colValues[colIndex]`; `list.index`
    raises ValueError if `x` is not present in that list.
    """
    uniqueValues = colValues[colIndex]
    return uniqueValues.index(x)

Then I load the data.

In [None]:
# This dataset is the mazda subsample from https://www.kaggle.com/austinreese/craigslist-carstrucks-data after some cleanup

# file column number -> key into colValues, for every column that is encoded
# through `converter` instead of being parsed directly as an int
encodedColumns = {3: 2, 4: 3, 5: 4, 7: 5, 9: 7, 11: 8}

# second pass over the same file: this time every selected column ends up as an int
data = np.array(np.genfromtxt(
    'data/vehicles.csv',
    delimiter=',',
    usecols=(1,2,3,4,5,7,8,9,11),
    # `key=key` binds the current value at definition time, one lambda per column
    converters={fileCol: (lambda x, key=key: converter(x, key)) for fileCol, key in encodedColumns.items()},
    skip_header=1,
    dtype=int,
    encoding='utf-8',
))

Let's get some summary statistics and do a **pairplot** so we can see what's going on.

In [None]:
def getSummaryStatistics(data):
    """Print a header line and return per-column summary statistics.

    data: array whose columns are the variables (statistics are taken along axis 0).

    Returns a DataFrame with four rows, in this order: min, max, mean, std.
    """
    print("min, max, mean, std per variable")
    lowest = data.min(axis=0)
    highest = data.max(axis=0)
    average = data.mean(axis=0)
    spread = data.std(axis=0)
    return pd.DataFrame([lowest, highest, average, spread])

def getShapeType(data):
    """Print a header line and return the array's (shape, dtype) pair."""
    print("shape")
    shape, dtype = data.shape, data.dtype
    return (shape, dtype)

# run both reports on the loaded data, statistics first and then shape/dtype;
# each helper prints its own header before its return value is printed here
for report in (getSummaryStatistics, getShapeType):
    print(report(data))

In [None]:
# the first column is plotted on the y axis, every remaining column on the x axis
targetColumn, featureColumns = columns[0], columns[1:]

df = pd.DataFrame(data, columns=columns)
seaborn.pairplot(df, x_vars=featureColumns, y_vars=targetColumn)

plt.show()

# Let's review regression

Regression allows us to:
* determine the *nature* of a relationship between one (or more!) independent variables and a dependent variable
* determine the *strength* of the relationship

Regression *fits* a function to a dataset.

## Polynomial regression 

I want to predict price as a function of the *square* of age. 

It turns out I can do this using **polynomial regression**. The function I will want to fit will be: $\hat{y} = c_0 + c_1 x + c_2 x^2$, and I do this by minimizing the sum of the squares of the residuals $r_i = y_i - \hat{y}_i$.

In terms of matrix math, for $N$ data points, $A$ will just be a matrix of shape ($N, d+1$), where $d$ is the degree of the polynomial, $\vec{c}$ will have shape ($d+1, 1$) (including $c_0$, the intercept), and $\vec{y}$ will have shape ($N, 1$) (as before).

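This is still *linear regression*, because the model is still linear in the coefficients $\vec{c}$: we are still solving a linear least squares problem, just with extra columns in $A$.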
 
 **However**, before I calculate the least squares solution, I first have to convert the age variable. For example, let's say I want the square of age: I have to find the square of each year in the year column.

Let's do it! 

### First, split our data

Let's split our data into **train** and **test**. Let's make sure to sort by time (the `year` column) first, because we don't want to let the future predict the past.

In [None]:
# Sort the rows by column 1 (year). A stable sort keeps rows that share a year
# in their current relative order, so re-running this cell on already-sorted
# data leaves it unchanged and the train/test membership stays the same.
data = data[data[:, 1].argsort(kind='stable')]
print(getSummaryStatistics(data))
print(getShapeType(data))

# The first 80% of the sorted rows become train and the rest become test.
# The split index is computed with integer arithmetic, avoiding a float round trip.
splitIndex = len(data) * 8 // 10
(train, test) = np.split(data, [splitIndex])
print(train.shape, test.shape)

### Second, compute the polynomial terms (powers) of each independent variable.

For example, let's calculate the square of age.

In [1]:
def makePoly(x, poly):
    """Build the polynomial design matrix with columns x**0, x**1, ..., x**poly.

    x: a single independent variable, given as an array of shape (N,) or (N, 1).
    poly: the degree of the polynomial (a non-negative integer).

    Returns a float array A of shape (N, poly+1). Column 0 is all ones (the
    intercept term) and column i holds x raised to the power i.

    Raises ValueError if x holds more than one variable (more than one column).
    """
    # Convert to float BEFORE raising to a power. The data is loaded with an
    # integer dtype, and integer powers wrap around silently when they overflow;
    # the wrapped values would then be copied into A without any warning.
    x = np.asarray(x, dtype=float)

    # np.squeeze drops the length-1 axis so (N, 1) becomes (N,)
    column = np.squeeze(x)
    if column.ndim > 1:
        raise ValueError("makePoly expects a single independent variable, got shape %s" % (x.shape,))

    # one row per data point, one column per power from 0 up to poly
    A = np.zeros([x.shape[0], poly+1])
    for i in range(0, poly+1):
        # i == 0 gives the leading column of ones
        A[:, i] = column**i
    return A

### Third, define updated fit and predict functions that incorporate makePoly

In [2]:
def fit(data, independent, dependent, poly):
    """Fit a polynomial of degree `poly` by least squares.

    data: 2-D array with one row per observation.
    independent: sequence of column indices selecting the independent variable
        (just one for now).
    dependent: column index of the dependent variable.
    poly: degree of the polynomial to fit.

    Returns the coefficients found by scipy.linalg.lstsq for the matrix built
    by makePoly, i.e. the first of the values lstsq returns.
    """
    # pick every row of the requested column(s), keeping the result 2-D
    everyRow = np.arange(data.shape[0])
    features = data[np.ix_(everyRow, independent)]

    # powers of the variable, including the column of 1s for the intercept
    design = makePoly(features, poly)

    # the values we are trying to predict
    target = data[:, dependent]

    # lstsq also returns other results; only the coefficients are kept
    solution = sp_la.lstsq(design, target)
    return solution[0]

def predict(data, independent, poly, c):
    """Evaluate a fitted polynomial on the rows of `data`.

    data: 2-D array with one row per observation.
    independent: sequence of column indices selecting the independent variable(s).
    poly: degree of the polynomial, passed through to makePoly.
    c: coefficients, as returned by fit.

    Returns the product of the makePoly matrix with c.
    """
    # pick every row of the requested column(s), keeping the result 2-D
    everyRow = np.arange(data.shape[0])
    features = data[np.ix_(everyRow, independent)]

    # same expansion as in fit: powers plus the column of 1s for the intercept
    design = makePoly(features, poly)

    return np.dot(design, c)

### Fourth, evaluate using $R^2$

In [None]:
def rsquared(y, yhat):
    """Compute R^2 = 1 - SS_residual / SS_total for predictions yhat against y.

    y: observed values (array or any sequence of numbers).
    yhat: predicted values, with the same number of elements as y.

    Returns the R^2 score. If the two inputs do not hold the same number of
    elements, a message is printed and 0 is returned.
    """
    # Flatten both inputs to 1-D float arrays. Without this, a column vector y
    # of shape (N, 1) against a flat yhat of shape (N,) would broadcast to an
    # (N, N) array of differences and give a wrong score without any error.
    y = np.asarray(y, dtype=float).ravel()
    yhat = np.asarray(yhat, dtype=float).ravel()

    if y.size != yhat.size:
        print("Need y and yhat to be the same length!")
        return 0

    residualSumSquares = ((y - yhat)**2).sum()
    totalSumSquares = ((y - y.mean())**2).sum()
    return 1 - (residualSumSquares / totalSumSquares)

In [None]:
# column 1 is the independent variable, column 0 the dependent one, degree 2
independentColumns, dependentColumn, degree = [1], 0, 2

# fit on train, predict on test, then score the test predictions
c = fit(train, independentColumns, dependentColumn, degree)
yhat = predict(test, independentColumns, degree, c)
rsquared(test[:, dependentColumn], yhat)

Notice this is no longer a line we are plotting!

In [None]:
def plotxyyhat(x, y, c):
    """Plot the data points and overlay the polynomial with coefficients c.

    x: values for the horizontal axis.
    y: values for the vertical axis.
    c: polynomial coefficients, lowest power first (c[0] is the constant term).

    Clears the current figure, draws both series and shows the plot.
    """
    plt.clf()
    plt.plot(x, y, 'o', label='data')

    # evaluate the polynomial on an evenly spaced grid from min(x) to max(x)
    grid = np.linspace(np.min(x), np.max(x))
    fitted = c[0]
    for power, coefficient in enumerate(c[1:], start=1):
        fitted += coefficient*(grid**power)
    plt.plot(grid, fitted, label='least squares fit, y polynomial')

    # axis labels, legend and a faint grid
    plt.xlabel('x')
    plt.ylabel('y')
    plt.legend(framealpha=1, shadow=True)
    plt.grid(alpha=0.25)
    plt.show()

In [None]:
# same fitted curve drawn over each split: train first, then test
# (column 1 on the x axis, column 0 on the y axis)
for subset in (train, test):
    plotxyyhat(subset[:, 1], subset[:, 0], c)

## Now let's try a cubic model

$$y = c_0 + c_1x_1 + c_2x_1^2 + c_3x_1^3$$

What will the shape of $A$ be? What about $\vec{c}$?

Do you think it will perform better or worse than a linear or quadratic model?

## Challenge

How would you fit a polynomial regression where you wanted the square of age and the square root of odometer?