In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.optimize import curve_fit

### Car data

In [None]:
# Read the car price table from disk and list the columns it provides.
filename = "../03-week3/lab/CarsRetailPrice.csv"
car_data = pd.read_csv(filename)

# Keep the column names as a plain Python list; a later cell prints it again.
car_columns = car_data.columns.tolist()
print(car_columns)

In [None]:
# Pull out the two columns of interest; later cells reuse both names.
mileage = car_data["Mileage"]
price = car_data["Price"]

# Raw data: the "x" format string draws markers only, with no connecting line.
axes = plt.gca()
axes.plot(mileage, price, "x")
axes.set_xlabel("Mileage")
axes.set_ylabel("Price")
plt.show()

In [None]:
# Straight-line (degree-1) least-squares fit of price against mileage.
# np.polyfit returns coefficients highest power first: [slope, intercept].
coefs1 = np.polyfit(mileage, price, 1)
slope, intercept = coefs1

# Takes the coefs from the fit, and creates a callable polynomial function
y = np.poly1d(coefs1)

# For smooth plots: evenly spaced mileage grid spanning the data.
# Later cells reuse `x` to draw their own fit lines.
x = np.linspace(mileage.min(), mileage.max(), 100)

plt.plot(mileage, price, "x", label="data")
plt.plot(x, y(x), label="fit")
plt.xlabel("Mileage")
plt.ylabel("Price")
# Write the sign explicitly so a negative slope reads "- 0.17", not "+ -0.17".
sign = "+" if slope >= 0 else "-"
plt.title(f"Price = {intercept:.1f} {sign} {abs(slope):.2f} x mileage")
# Without a legend the labels given to plt.plot are never displayed.
plt.legend()
plt.show()

## Other variables:

In [None]:
# Reminder of which columns are available.
print(car_columns)

# Distinct values of the "Make" column; later cells loop over `makes`.
make_column = car_data["Make"]
makes = make_column.unique()
print(makes)

In [None]:
# One scatter series per make, so each make gets its own colour and legend entry.
for make in makes:
    make_rows = car_data[car_data["Make"] == make]
    plt.plot(make_rows["Mileage"], make_rows["Price"], "x", label=make)
plt.legend()
plt.xlabel("Mileage")
plt.ylabel("Price")
plt.show()

Fit separate categories (breaking the multivariate problem into several bivariate ones)

In [None]:
# Fit an independent straight line (price vs. mileage) to each make and draw
# it over that make's scatter points, using the same colour for both.

# Mileage grid used to draw every fit line. It is rebuilt here from the full
# data set so this cell does not rely on `x` left behind by an earlier cell.
x = np.linspace(car_data["Mileage"].min(), car_data["Mileage"].max(), 100)

for i, make in enumerate(makes):
    tmp_data = car_data[car_data["Make"] == make]
    x_data = tmp_data["Mileage"]
    y_data = tmp_data["Price"]
    # Degree-1 fit -> [slope, intercept]; poly1d turns that into a callable.
    coefs1 = np.polyfit(x_data, y_data, 1)
    y = np.poly1d(coefs1)
    # f"C{i}" selects the i-th colour of the default cycle for points and line.
    plt.plot(x_data, y_data, "x", color=f"C{i}")
    plt.plot(x, y(x), "-", label=make, color=f"C{i}")
plt.legend()
plt.xlabel("Mileage")
plt.ylabel("Price")
plt.show()

### Separate 'models' for given 'make'

In [None]:
# Restrict to one make, then fit a separate line for each of its models.
chev_data = car_data[car_data["Make"] == "Chevrolet"]
chev_models = chev_data["Model"].unique()
print(chev_models)

# Mileage grid used to draw every fit line. It is rebuilt here from the full
# data set so this cell does not rely on `x` left behind by an earlier cell.
x = np.linspace(car_data["Mileage"].min(), car_data["Mileage"].max(), 100)

plt.title("Chevrolet")
for i, model in enumerate(chev_models):
    # Select from chev_data, not car_data: filtering the full table by model
    # name would also pick up any other make's car that shares the name.
    tmp_data = chev_data[chev_data["Model"] == model]
    x_data = tmp_data["Mileage"]
    y_data = tmp_data["Price"]
    # Degree-1 fit -> [slope, intercept]; poly1d turns that into a callable.
    coefs1 = np.polyfit(x_data, y_data, 1)
    y = np.poly1d(coefs1)
    plt.plot(x_data, y_data, "x", color=f"C{i}")
    plt.plot(x, y(x), "-", label=model, color=f"C{i}")
plt.legend()
plt.xlabel("Mileage")
plt.ylabel("Price")
plt.show()

In [None]:
# Small multiples: one panel per make. Each panel shows every model's points
# with its own dashed fit line, plus a thick black line fitted to the whole make.

# Two columns; enough rows to hold every make (ceiling division), at least one.
n_cols = 2
n_rows = max(1, -(-len(makes) // n_cols))
# squeeze=False keeps axs2D two-dimensional even when there is a single row.
fig, axs2D = plt.subplots(n_rows, n_cols, sharex=True, sharey=True, squeeze=False)

# Flatten, so the panels can be looped over with one index
axs = axs2D.flatten()

# Mileage grid used to draw every fit line. It is rebuilt here from the full
# data set so this cell does not rely on `x` left behind by an earlier cell.
x = np.linspace(car_data["Mileage"].min(), car_data["Mileage"].max(), 100)

for i, make in enumerate(makes):
    make_data = car_data[car_data["Make"] == make]
    models = make_data["Model"].unique()

    axs[i].set_title(make)
    for j, model in enumerate(models):
        tmp_data = make_data[make_data["Model"] == model]
        x_data = tmp_data["Mileage"]
        y_data = tmp_data["Price"]
        coefs1 = np.polyfit(x_data, y_data, 1)
        y = np.poly1d(coefs1)
        axs[i].plot(x_data, y_data, "x", color=f"C{j}")
        axs[i].plot(x, y(x), "--", color=f"C{j}")
    # Overall trend for this make, ignoring the model split
    coefs1 = np.polyfit(make_data["Mileage"], make_data["Price"], 1)
    y = np.poly1d(coefs1)
    axs[i].plot(x, y(x), "k-", linewidth=3)

# Hide any panel left over when the makes do not fill the grid
for spare_ax in axs[len(makes):]:
    spare_ax.set_visible(False)

# Axes are shared, so label only the bottom row and the left column
for bottom_ax in axs2D[-1, :]:
    bottom_ax.set_xlabel("Mileage")
for left_ax in axs2D[:, 0]:
    left_ax.set_ylabel("Price")

# Lay out after titles and labels exist so the padding accounts for them
fig.tight_layout(pad=2.0)  # add some space
plt.show()

## Other variables: True multivariate

In [None]:
# 3D scatter of price against two predictors at once: mileage and the
# "Liter" column. `mileage` and `price` are the Series taken from car_data
# in an earlier cell.
litres = car_data["Liter"]

ax = plt.axes(projection="3d")
ax.set_box_aspect(aspect=None, zoom=0.85)

ax.scatter(mileage, litres, price)

# Spelling matches the "Mileage" label used on the 2D plots
ax.set_xlabel("Mileage")
ax.set_ylabel("Litres")
ax.set_zlabel("Price")


plt.show()

In [None]:
# 3D scatter of price against mileage and litres, one colour per make.
ax = plt.axes(projection="3d")
ax.set_box_aspect(aspect=None, zoom=0.85)

for make in makes:
    t_data = car_data[car_data["Make"] == make]
    ax.scatter(t_data["Mileage"], t_data["Liter"], t_data["Price"], label=make)

# Build the legend once, after every series has been added
ax.legend(loc="upper left")

# Spelling matches the "Mileage" label used on the 2D plots
ax.set_xlabel("Mileage")
ax.set_ylabel("Litres")
ax.set_zlabel("Price")


plt.show()

Sometimes a log scale is helpful.

We can either set the axis scale to logarithmic, or simply take the log of the data.

In [None]:
# Same 3D scatter per make, but with mileage and price transformed by log10
# before plotting (the axes themselves stay linear).
ax = plt.axes(projection="3d")
ax.set_box_aspect(aspect=None, zoom=0.85)

for make in makes:
    t_data = car_data[car_data["Make"] == make]
    ax.scatter(
        np.log10(t_data["Mileage"]),
        t_data["Liter"],
        np.log10(t_data["Price"]),
        label=make,
    )

# Build the legend once, after every series has been added
ax.legend(loc="upper left")

# Spelling matches the "Mileage" label used on the 2D plots
ax.set_xlabel("$\\log_{10}$ Mileage")
ax.set_ylabel("Litres")
ax.set_zlabel("$\\log_{10}$ Price")


plt.show()

## Multiple regression

In [None]:
# Predictor matrix for the two-variable fit: column 0 is "Mileage" and
# column 1 is "Liter", one row per car.
predictors = (car_data["Mileage"], car_data["Liter"])
xydata = np.column_stack(predictors)

# Response variable that the fitted model should reproduce.
zdata = car_data["Price"]


def price_f(data, a_offset, b_mil, c_lit):
    """Linear model in two predictors, in the form expected by curve_fit.

    Parameters
    ----------
    data : 2-D array
        One row per observation; column 0 is scaled by ``b_mil`` and
        column 1 by ``c_lit``.
    a_offset : float
        Constant term.
    b_mil : float
        Coefficient applied to column 0 of ``data``.
    c_lit : float
        Coefficient applied to column 1 of ``data``.

    Returns
    -------
    1-D array with one model value per row of ``data``.
    """
    first_col = data[:, 0]
    second_col = data[:, 1]
    first_term = b_mil * first_col
    second_term = c_lit * second_col
    return a_offset + first_term + second_term


# Least-squares fit of price_f to the data. curve_fit returns the best-fit
# parameters (in the order of price_f's arguments after `data`) and their
# estimated covariance matrix.
params, pcov = curve_fit(f=price_f, xdata=xydata, ydata=zdata)
print(params)

In [None]:
from matplotlib import cm

ax = plt.axes(projection="3d")

# Regular grid on which the fitted plane is evaluated:
# 100 mileage values from 0 to 50000 and 20 litre values from 1 to 7.
t_mil = np.linspace(0, 50000, num=100)
t_L = np.linspace(1, 7, num=20)
X, Y = np.meshgrid(t_mil, t_L)


def price_f2(x, y, a_offset, b_mil, c_lit):
    """Evaluate the plane ``a_offset + b_mil * x + c_lit * y``.

    Same linear form as ``price_f`` but with the two predictors passed as
    separate arguments, so it can be called directly on meshgrid arrays.
    ``x`` and ``y`` may be scalars or arrays; the result follows normal
    broadcasting of the arithmetic.
    """
    x_part = b_mil * x
    y_part = c_lit * y
    return a_offset + x_part + y_part


# Evaluate the fitted plane on the grid using the three fitted parameters
# (offset, mileage coefficient, litre coefficient).
Z = price_f2(X, Y, params[0], params[1], params[2])

ax.set_box_aspect(aspect=None, zoom=0.85)
# Mileage and price are shown in thousands; alpha < 1 keeps the points visible
# through the surface.
ax.plot_surface(X / 1000, Y, Z / 1000, cmap=cm.Blues, alpha=0.85)

for make in makes:
    t_data = car_data[car_data["Make"] == make]
    ax.scatter(
        t_data["Mileage"] / 1000, t_data["Liter"], t_data["Price"] / 1000, label=make
    )

# Build the legend once, after every series has been added
ax.legend(loc="upper left")

# Spelling matches the "Mileage" label used on the 2D plots
ax.set_xlabel("Mileage ('000)")
ax.set_ylabel("Litres")
ax.set_zlabel("Price ('000)")

plt.show()

### Include categorical variables

In this case it is a bit silly,
but still informative.

In [None]:
# Indicator (dummy) variable for the categorical "Make" column:
# multiplying the boolean mask by 1.0 gives 1.0 for Cadillac rows, else 0.0.
is_cadillac = 1.0 * (car_data["Make"] == "Cadillac")

# Predictor matrix: column 0 "Mileage", column 1 "Liter", column 2 the indicator.
predictor_columns = (car_data["Mileage"], car_data["Liter"], is_cadillac)
xydata = np.column_stack(predictor_columns)

# Response variable that the fitted model should reproduce.
zdata = car_data["Price"]


def price_f(data, a_offset, b_mil, c_lit, d_cadi):
    """Linear model in three predictors, in the form expected by curve_fit.

    Note: this reuses the name ``price_f`` from the earlier two-predictor
    model, so after this cell runs the name refers to this four-parameter
    version.

    Parameters
    ----------
    data : 2-D array
        One row per observation; columns 0, 1 and 2 are scaled by ``b_mil``,
        ``c_lit`` and ``d_cadi`` respectively.
    a_offset : float
        Constant term.
    b_mil, c_lit, d_cadi : float
        Coefficients applied to columns 0, 1 and 2 of ``data``.

    Returns
    -------
    1-D array with one model value per row of ``data``.
    """
    base = a_offset + b_mil * data[:, 0]
    with_second = base + c_lit * data[:, 1]
    return with_second + d_cadi * data[:, 2]


# Least-squares fit of the four-parameter price_f. This overwrites `params`
# and `pcov` from the earlier two-predictor fit.
params, pcov = curve_fit(f=price_f, xdata=xydata, ydata=zdata)
print(params)