In [None]:
from typing import Union, Tuple, Callable, Any
import inspect
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Data is previously retrieved from https://www.kaggle.com/datasets/CooperUnion/cardataset
DATA_PATH = '../data/data.csv'

In [None]:
def standardise_strings(
    target: Union[pd.core.strings.accessor.StringMethods, pd.core.indexes.base.Index]
) -> Union[pd.core.series.Series, pd.core.indexes.base.Index]:
    """
        Stage 1 cleaning for this price prediction:
        - Lower case for everything
        - Spaces replaced by underscores

        Can work on either Pandas indices (e.g. column headers) or Pandas series (e.g. row data)
        
        :param StringMethods | Index target: the target row or column to standardise
        :return StringMethods | Index result: the standardised row or column

        Note the return types are using typing since this was written pre-3.10.
    """
    result = (
        target
        .str
        .lower()
        .str
        .replace(' ', '_')
    )

    return result

def train_test_val_split(
    df_data: pd.DataFrame,
    validation_split: float=0.2,
    testing_split: float=0.2,
    shuffle_seed:int = 2
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
        Stage 2: Validation framework for this price prediction:
        - Source data shuffled with numpy
        - Split into validation/test sets in specified ratios
        - The remainder reserved for training.

        :param pd.DataFrame df_data: dataset to split
        :param float validation_split: ratio to split source data into a validation set: 20% by default
        :param float testing_split: ratio to split source data into a testing set: 20% by default
        :param int shuffle_seed: random seed for shuffling - default is 2

        :return Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame] df_train, df_val, df_test: all dataframes shuffled and split in their specific ratios.

        Note the return types are using typing since this was written pre-3.10.
    """
    n = len(df_data)
    n_val = int(n * validation_split)
    n_test = int(n * testing_split)
    n_train = n - n_val - n_test

    idx = np.arange(n)
    np.random.seed(shuffle_seed)
    np.random.shuffle(idx)

    df_train = df_data.iloc[idx[:n_train]]
    df_val = df_data.iloc[idx[n_train:n_val + n_train]]
    df_test = df_data.iloc[idx[n_train + n_val:]]

    return df_train, df_val, df_test

def regression_setup(
    df_data: pd.DataFrame, 
    label: str
) -> Tuple[pd.DataFrame, np.ndarray]:
    """
        Stage 3: Extract y-value from the different train/test/val datasets
        The following is done to the source dataframe:
        - Index dropped
        - Y-value extracted to its own numpy array and returned
        - The column used for y-value is deleted from the source dataframe - the modified dataframe is also returned

        :param pd.DataFrame df_data: dataset to use
        :param str label: Desired y-value

        :return Tuple[pd.DataFrame, np.ndarray] df_data, y_data: dataframe with label removed, plus the label data in separate array

        Note the return types are using typing since this was written pre-3.10.
    """
    # Indexes not necessary - so drop them
    df_data = df_data.reset_index(drop=True)

    # Grab the label/y-value as a separate numpy array
    y_data = np.log1p(df_data[label].values)

    # Get rid of the y-value from the datasets so they don't accidentally get used as features...
    del df_data[label]

    return df_data, y_data    

def summary_printout(
    df: pd.DataFrame, 
    col: str
):
    """
        Summary printout for a Pandas dataframe column.

        :param pd.DataFrame df: The source dataframe
        :param str col: The column name

        No returns.
    """
    print(f"Summary printout for {col}")
    print(f"Total unique values: {df[col].nunique()}")
    print(f"Sample: {df[col].unique()[:5]}")    

In [None]:
df_cars = pd.read_csv(DATA_PATH)
df_cars.head()

In [None]:
# Stage 1 Cleaning on both column headers and values.
df_cars.columns = standardise_strings(df_cars.columns)
df_cars.columns

# Original code: df.dtypes == 'object' - not a fan.
# Get rid of magic strings and filters using those :)
object_filter = df_cars.dtypes == type(object)

# This is then used to do the same column conversions as before
string_cols = list(df_cars.dtypes[object_filter].index)

for next_col in string_cols:
    df_cars[next_col] = standardise_strings(df_cars[next_col])

df_cars.head()

In [None]:
for next_col in df_cars.columns:
    summary_printout(df=df_cars, col=next_col)

### First chart

We are expecting a long-tail distribution for this dataset - some cars will be very expensive (e.g. up to $2m) and will make the initial graph look skewed.

In [None]:
# First graph to show everything
sns.histplot(df_cars.msrp, bins=50)

In [None]:
# Second graph to show a better view of the distribution
# There are still a very large number of cars with $1000 price though
filter = df_cars.msrp < 100000
sns.histplot(df_cars.msrp[filter], bins=50)

In [None]:
# Turn prices into logarithmic values (np.log1p so divide by zero error doesn't occur)
# The resulting graph is closer to a normal distribution - despite the weird $1000 values (indicates this may be the minimum price to list)
log_price = np.log1p(df_cars.msrp)
sns.histplot(log_price, bins=50)

### Validation framework

In [None]:
df_cars_train, df_cars_val, df_cars_test = train_test_val_split(df_data=df_cars, validation_split=0.2, testing_split=0.2, shuffle_seed=2)

In [None]:
# Set up labels for each dataset
Y_VAL_LABEL = 'msrp'

df_cars_train, y_cars_train = regression_setup(df_cars_train, Y_VAL_LABEL)
df_cars_val, y_cars_val = regression_setup(df_cars_val, Y_VAL_LABEL)
df_cars_test, y_cars_test = regression_setup(df_cars_test, Y_VAL_LABEL)

### Linear Regression Training

#### Vector-Based Regression Formula Example
$g(x_i) = W_0 + W_1 \cdot x_{i1} + W_2 \cdot x_{i2} ... W_n \cdot x_{in}$

This can also be written as:  
  
$g(x_i) = W_0 + \sum_{j=1}^n W_j \cdot x_{ij}$  
  
or in our case, with 3 features selected:  
  
$g(x_i) = W_0 + \sum_{j=1}^3 W_j \cdot x_{ij}$  

But when all records are included, this can be simplified as:
$g(X) = X \cdot y$ where $X$ is a feature matrix, and y is a vector of weights.

So long as $W_0$ is integrated into the rest of the weights, and $x_0$ is 1.

In [None]:
def linear_regression(
    X: np.ndarray,
    w: list
) -> np.ndarray:
    
    prediction = X.dot(w)

    return prediction

In [None]:
x1 = [1, 148, 24, 1385]
x2 = [1, 132, 25, 2031]
x10 = [1, 453, 11, 86]
w = [7.17, 0.01, 0.04, 0.002]

X = [x1, x2, x10]
X = np.array(X)

# Reverse the log1p done on the MSRP in a previous step to get the "real" MSRP value
np.expm1(linear_regression(X, w))

#### Normal Regression Formula

The above vector formula was using hard-coded weights. Later, the Zoomcamp goes into deriving these ourselves.

This needs to be approximated with something called a **Gram matrix**:

$X \cdot X^T$ where $X^T$ is the feature matrix transposed.

This results in the final equation:

$w = (X^TX)^{-1} X^Ty$

In [64]:
def train_linear_regression(
    X: np.ndarray, 
    y: np.ndarray) -> Tuple[float, np.ndarray]:

    """
        Trains a linear regression model per the "normal" regression formula above.
        
        :param np.ndarray X: matrix of the features
        :param np.ndarray y: matrix of the values/labels

        :return Tuple[float, np.ndarray] - the bias weight + all the others

        Note the return types are using typing since this was written pre-3.10.    
    """

    # Add 1s
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    # Normal equation with transposed X
    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)

    return w_full[0], w_full[1:]

def train_linear_regression_regularized(
    X: np.ndarray,
    y: np.ndarray,
    r: float=0.001
) -> Tuple[float, np.ndarray]:

    """
        Trains a linear regression model per the "normal" regression formula above.
        
        :param np.ndarray X: matrix of the features
        :param np.ndarray y: matrix of the values/labels

        :return Tuple[float, np.ndarray] - the bias weight + all the others

        Note the return types are using typing since this was written pre-3.10.    
    """   

    # Add 1s
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    # Normal equation with transposed X
    XTX = X.T.dot(X)
    # Regularize!
    XTX = XTX + r * np.eye(XTX.shape[0])
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)

    return w_full[0], w_full[1:]    

In [None]:
def prepare_X(
    features: list,
    df_data: pd.DataFrame
) -> np.ndarray:
    #print(f"Features: {features}")
    df_features = df_data[features].fillna(0)
    X = df_features.values
    return X

In [None]:
list_features_base = ['engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'popularity']
features_base = {x: None for x in list_features_base}

In [None]:
# Training
X_cars_train = prepare_X(list_features_base, df_cars_train)
# Already have y_cars_train earlier from the cleaning stage

w0, w = train_linear_regression(X_cars_train, y_cars_train)
y_cars_pred = w0 + X_cars_train.dot(w)

#### Comparing initial model vs. actual values

It's a bit off.

In [None]:
sns.histplot(y_cars_pred, color='red', alpha=0.5, bins=50)
sns.histplot(y_cars_train, color='blue', alpha=0.5, bins=50)

#### Using RMSE to figure things out

RMSE = "Root Mean Squared Error"

In [None]:
def rmse(
    y: np.ndarray, 
    y_pred: np.ndarray
) -> np.float64:
    squared_error = (y - y_pred) ** 2
    mean_squared_error = squared_error.mean()
    root_mean_squared_error = np.sqrt(mean_squared_error)
    return root_mean_squared_error

In [None]:
list_features_base = ['engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'popularity']

# Training
X_cars_train = prepare_X(list_features_base, df_cars_train)
# Already have y_cars_train earlier from the cleaning stage

w0, w = train_linear_regression(X_cars_train, y_cars_train)
y_cars_pred = w0 + X_cars_train.dot(w)

# Validation
X_cars_val = prepare_X(list_features_base, df_cars_val)
y_cars_val_pred = w0 + X_cars_val.dot(w)

rmse(y_cars_val, y_cars_val_pred)

#### Feature Engineering

The Zoomcamp goes really hard on copy/pasting code with new feature sets to do the same thing. 

In this notebook, a few things have been modularised:
- Adding a new feature to the dataframe
- Turning the above linear regression calc into its own method
- Returning the RMSE straight away.

Adding new features was originally done via dictionaries and lambdas to add support for any calculations. This became unusable once a more generic way to add large numbers of features was required.

These methods are commented out in case there's time to fix them.

In [None]:
# def add_new_feature_basic(
#     df_data: pd.DataFrame,
#     feature_name: str,
#     feature_target_col: str,
#     feature_target_val: Any,
#     feature_target_type: str
# ) -> pd.DataFrame:
#     feature_kwargs = {
#         feature_name: lambda x: (x[feature_target_col] == feature_target_val).astype(feature_target_type)
#     }
#     df_new = df_data.assign(**feature_kwargs)

#     return df_new

# def add_new_feature_dynamic(
#     df_data: pd.DataFrame,
#     feature_name: str,
#     feature_calc: Callable
# ) -> pd.DataFrame:
#     feature_kwargs = {
#         feature_name: feature_calc
#     }
#     df_new = df_data.assign(**feature_kwargs)
#     return df_new

In [78]:
def get_most_popular_values(
    df_data: pd.DataFrame,
    col: str
) -> list[str]:
    return list(df_data[col].value_counts().head().index)

def get_doors_numbers() -> list[int]:
    return [2, 3, 4]

def get_base_features() -> list[str]:
    return ['engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'popularity']

def get_age_feature() -> list[str]:
    return ['age']

def get_doors_features() -> list[str]:
    return [f'num_doors_{x}' for x in get_doors_numbers()]

def map_extra_features(
    df_to_map: pd.DataFrame
) -> dict:
    extra_categories = ['make', 'model', 'engine_fuel_type', 'driven_wheels', 'market_category', 'vehicle_size', 'vehicle_style']
    popular_cat_values = {c: get_most_popular_values(df_to_map, c) for c in extra_categories}

    return popular_cat_values

def add_features_to_dataframe(
    df_data: pd.DataFrame,
    base: bool = True,
    age: bool = False,
    doors: bool = False,
    extra_categories: dict = None
) -> Tuple[pd.DataFrame, list[str]]:

    features = []
    df_new = df_data.copy()
    
    if (base):
        features += ['engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'popularity']

    if (age):
        df_new = df_new.assign(age=lambda x: 2017 - x.year)

    if (doors):
        for num_doors in get_doors_numbers():
            key = f'num_doors_{num_doors}'
            df_new[key] = (df_new['number_of_doors'] == num_doors).astype('int')
    
    if (extra_categories):
        for next_cat, values in extra_categories.items():
            for next_val in values:
                key = f'{next_cat}_{next_val}'
                df_new[key] = (df_new[next_cat] == next_val).astype('int')

    return df_new

def linear_regression_evaluation_workflow(
    features: list[str],
    df_train: pd.DataFrame,
    df_val: pd.DataFrame,
    y_train: np.ndarray,
    y_val_actual: np.ndarray,
    regularization_factor: float = None
) -> Tuple[pd.DataFrame, np.float64, np.float64]:
    X_train = prepare_X(features, df_train)

    w0, w = train_linear_regression_regularized(X_train, y_train, r=regularization_factor) if regularization_factor else train_linear_regression(X_train, y_train)

    # Validation
    X_val = prepare_X(features, df_val)
    y_val_pred = w0 + X_val.dot(w)

    return df_train, rmse(y_val_actual, y_val_pred), w0

def linear_regression_training_workflow(
    features: list[str],
    df_train: pd.DataFrame,
    y_train: np.ndarray,
    regularization_factor: float = None
) -> Tuple[np.float64, np.ndarray]:
    X_train = prepare_X(features, df_train)

    w0, w = train_linear_regression_regularized(X_train, y_train, r=regularization_factor) if regularization_factor else train_linear_regression(X_train, y_train)

    return w0, w

def linear_regression_testing_workflow(
    features: list[str],
    df_test: pd.DataFrame,
    y_test: np.ndarray,
    w0: np.float64,
    w: np.ndarray
) -> np.float64:
    X_test = prepare_X(features, df_test)

    y_pred = w0 + X_test.dot(w)

    score = rmse(y_test, y_pred)

    return score


In [None]:
# Again, removed this bit - it was working well until many list/dictionary comprehensions were needed

# With the lambda - must re-assign x to a default argument to evaluate it at the time this dictionary is built
# If not, it will evaluate when the loop below is done, which will have the following cols added to the dataframes:
# num_doors_2: lambda y: (y.number_of_doors == 4).astype('int')
# num_doors_3: lambda y: (y.number_of_doors == 4).astype('int')
# num_doors_4: lambda y: (y.number_of_doors == 4).astype('int')
# Read this for more info: https://stackoverflow.com/questions/36805071/dictionary-comprehension-with-lambda-functions-gives-wrong-results
# So x=x as default argument for lambda y forces evaluation immediately.
# features_doors = {f'num_doors_{x}': lambda y, x=x: (y.number_of_doors == x).astype('int') for x in [2, 3, 4]}

# features_doors_test = {
#     **features_base,
#     **features_age,
#     **features_doors
# }
# linear_regression_evaluation_workflow(
#     added_features=features_doors_test,
#     df_train_base=df_cars_train,
#     df_val_base=df_cars_val,
#     y_train=y_cars_train,
#     y_val_actual=y_cars_val   
# )

In [None]:
# # Same as before with makes this time, but only the 5 most popular?
# popular_makes = list(df_cars['make'].value_counts().head().index)
# features_popular_makes = {f'make_{x}': lambda y, x=x: (y.make == x).astype('int') for x in popular_makes}

# features_new_makes = {
#     **features_base,
#     **features_age,
#     **features_doors,
#     **features_popular_makes
# }

# linear_regression_evaluation_workflow(
#     added_features=features_new_makes,
#     df_train_base=df_cars_train,
#     df_val_base=df_cars_val,
#     y_train=y_cars_train,
#     y_val_actual=y_cars_val   
# )

In [61]:
print("Mapping extra features")
extra_features = map_extra_features(df_cars_train)

print("Creating main features list")
features_all = (get_base_features()
            + get_age_feature()
            + get_doors_features()
            + [f'{key}_{val}' for key, vals in extra_features.items() for val in vals]
)

get_age = True
get_doors = True
get_base = True

print("Creating training df")
df_train_all = add_features_to_dataframe(df_cars_train, base=get_base, age=get_age, doors=get_doors, extra_categories=extra_features)
print("Creating validation df")
df_val_all = add_features_to_dataframe(df_cars_val, base=get_base, age=get_age, doors=get_doors, extra_categories=extra_features)

print(features_all)

print("Running linear regression")
linear_regression_evaluation_workflow(
    features=features_all,
    df_train=df_train_all,
    df_val=df_val_all,
    y_train=y_cars_train,
    y_val_actual=y_cars_val
)

Mapping extra features
Creating main features list
Creating training df
Creating validation df
['engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'popularity', 'age', 'num_doors_2', 'num_doors_3', 'num_doors_4', 'make_chevrolet', 'make_ford', 'make_volkswagen', 'make_toyota', 'make_dodge', 'model_silverado_1500', 'model_tundra', 'model_f-150', 'model_sierra_1500', 'model_tacoma', 'engine_fuel_type_regular_unleaded', 'engine_fuel_type_premium_unleaded_(required)', 'engine_fuel_type_premium_unleaded_(recommended)', 'engine_fuel_type_flex-fuel_(unleaded/e85)', 'engine_fuel_type_diesel', 'driven_wheels_front_wheel_drive', 'driven_wheels_rear_wheel_drive', 'driven_wheels_all_wheel_drive', 'driven_wheels_four_wheel_drive', 'market_category_crossover', 'market_category_flex_fuel', 'market_category_luxury', 'market_category_hatchback', 'market_category_luxury,performance', 'vehicle_size_compact', 'vehicle_size_midsize', 'vehicle_size_large', 'vehicle_style_sedan', 'vehicle_style_4dr_

(             make        model  year                engine_fuel_type  \
 0       chevrolet       cobalt  2008                regular_unleaded   
 1          toyota       matrix  2012                regular_unleaded   
 2          subaru      impreza  2016                regular_unleaded   
 3      volkswagen      vanagon  1991                regular_unleaded   
 4            ford        f-150  2017        flex-fuel_(unleaded/e85)   
 ...           ...          ...   ...                             ...   
 7145          bmw     4_series  2015     premium_unleaded_(required)   
 7146   volkswagen       beetle  2015  premium_unleaded_(recommended)   
 7147          gmc  sierra_1500  2015        flex-fuel_(unleaded/e85)   
 7148  rolls-royce        ghost  2014     premium_unleaded_(required)   
 7149   volkswagen           cc  2017  premium_unleaded_(recommended)   
 
       engine_hp  engine_cylinders transmission_type      driven_wheels  \
 0         148.0               4.0            m

#### So how do we fix this?

#### Regularization!

I really don't understand the math behind this but the effects speak for themselves.

In [86]:
print("Mapping extra features")
extra_features = map_extra_features(df_cars_train)

print("Creating main features list")
features_all = (get_base_features()
            + get_age_feature()
            + get_doors_features()
            + [f'{key}_{val}' for key, vals in extra_features.items() for val in vals]
)

get_age = True
get_doors = True
get_base = True

print("Creating training df")
df_train_all = add_features_to_dataframe(df_cars_train, base=get_base, age=get_age, doors=get_doors, extra_categories=extra_features)
print("Creating validation df")
df_val_all = add_features_to_dataframe(df_cars_val, base=get_base, age=get_age, doors=get_doors, extra_categories=extra_features)

print(features_all)

print("Running linear regression")
linear_regression_evaluation_workflow(
    features=features_all,
    df_train=df_train_all,
    df_val=df_val_all,
    y_train=y_cars_train,
    y_val_actual=y_cars_val,
    regularization_factor=0.001
)

Mapping extra features
Creating main features list
Creating training df
Creating validation df
['engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'popularity', 'age', 'num_doors_2', 'num_doors_3', 'num_doors_4', 'make_chevrolet', 'make_ford', 'make_volkswagen', 'make_toyota', 'make_dodge', 'model_silverado_1500', 'model_tundra', 'model_f-150', 'model_sierra_1500', 'model_tacoma', 'engine_fuel_type_regular_unleaded', 'engine_fuel_type_premium_unleaded_(required)', 'engine_fuel_type_premium_unleaded_(recommended)', 'engine_fuel_type_flex-fuel_(unleaded/e85)', 'engine_fuel_type_diesel', 'driven_wheels_front_wheel_drive', 'driven_wheels_rear_wheel_drive', 'driven_wheels_all_wheel_drive', 'driven_wheels_four_wheel_drive', 'market_category_crossover', 'market_category_flex_fuel', 'market_category_luxury', 'market_category_hatchback', 'market_category_luxury,performance', 'vehicle_size_compact', 'vehicle_size_midsize', 'vehicle_size_large', 'vehicle_style_sedan', 'vehicle_style_4dr_

(             make        model  year                engine_fuel_type  \
 0       chevrolet       cobalt  2008                regular_unleaded   
 1          toyota       matrix  2012                regular_unleaded   
 2          subaru      impreza  2016                regular_unleaded   
 3      volkswagen      vanagon  1991                regular_unleaded   
 4            ford        f-150  2017        flex-fuel_(unleaded/e85)   
 ...           ...          ...   ...                             ...   
 7145          bmw     4_series  2015     premium_unleaded_(required)   
 7146   volkswagen       beetle  2015  premium_unleaded_(recommended)   
 7147          gmc  sierra_1500  2015        flex-fuel_(unleaded/e85)   
 7148  rolls-royce        ghost  2014     premium_unleaded_(required)   
 7149   volkswagen           cc  2017  premium_unleaded_(recommended)   
 
       engine_hp  engine_cylinders transmission_type      driven_wheels  \
 0         148.0               4.0            m

In [70]:
for r in [0.0, 0.00001, 0.0001, 0.001, 0.1, 1, 10]:
    df_result, error_result, bias = linear_regression_evaluation_workflow(
        features=features_all,
        df_train=df_train_all,
        df_val=df_val_all,
        y_train=y_cars_train,
        y_val_actual=y_cars_val,
        regularization_factor=r
    )
    print(f'r: {r}')
    print(f'rmse: {error_result}')
    print(f'bias: {bias}')

r: 0.0
rmse: 33.74198951691965
bias: -329305798173194.06
r: 1e-05
rmse: 0.4608152979726957
bias: 6.807088632309245
r: 0.0001
rmse: 0.46081536379466387
bias: 7.14146160450656
r: 0.001
rmse: 0.460815858442581
bias: 7.131081099941378
r: 0.1
rmse: 0.46087365491137133
bias: 7.000232410790203
r: 1
rmse: 0.4615812838275859
bias: 6.250747847552861
r: 10
rmse: 0.4726098772669202
bias: 4.729512585677039


In [72]:
# 0.001 won that.

df_result, error_result, bias = linear_regression_evaluation_workflow(
    features=features_all,
    df_train=df_train_all,
    df_val=df_val_all,
    y_train=y_cars_train,
    y_val_actual=y_cars_val,
    regularization_factor=0.001
)

print(error_result)

0.460815858442581


#### Time for Testing!

In [118]:
# Didn't realise testing was done on train + test instead of test alone
df_combined = pd.concat([df_cars_train, df_cars_test]).reset_index(drop=True)
y_full_train = np.concatenate([y_cars_train, y_cars_val])

extra_features = map_extra_features(df_combined)

features_all = (get_base_features()
            + get_age_feature()
            + get_doors_features()
            + [f'{key}_{val}' for key, vals in extra_features.items() for val in vals]
)

get_age = True
get_doors = True
get_base = True

df_full_train = add_features_to_dataframe(df_combined, base=get_base, age=get_age, doors=get_doors, extra_categories=extra_features)

In [119]:
w0, w = linear_regression_training_workflow(
    features=features_all,
    df_train=df_full_train,
    y_train=y_full_train,
    regularization_factor=0.001
)

w0

7.1407954427979075

In [103]:
extra_features = map_extra_features(df_cars_test)
features_all = (get_base_features()
            + get_age_feature()
            + get_doors_features()
            + [f'{key}_{val}' for key, vals in extra_features.items() for val in vals]
)
get_age = True
get_doors = True
get_base = True

df_features_test = add_features_to_dataframe(df_cars_test, base=get_base, age=get_age, doors=get_doors, extra_categories=extra_features)

score = linear_regression_testing_workflow(
    features=features_all,
    df_test=df_features_test,
    y_test=y_cars_test,
    w0=w0,
    w=w
)

score # Why is this out of whack compared to the tutorial? And when evaluating gave a score of 0.46?

0.5247955144722601

In [107]:
test_car = df_cars_test.iloc[20].to_dict()

In [128]:
df_car = pd.DataFrame([test_car])
single_car_extra_features = map_extra_features(df_cars_test)
single_car_features_all = (get_base_features()
            + get_age_feature()
            + get_doors_features()
            + [f'{key}_{val}' for key, vals in single_car_extra_features.items() for val in vals]
)
get_age = True
get_doors = True
get_base = True

df_features_single_car = add_features_to_dataframe(df_car, base=get_base, age=get_age, doors=get_doors, extra_categories=single_car_extra_features)

In [132]:
X_test_car = prepare_X(single_car_features_all, df_features_single_car)

y_pred = w0 + X_test_car.dot(w)
#s_single_car.shape, X_test_car.shape)
#print(single_car_features_all)
y_pred = y_pred[0]
y_pred

10.372266404600563

In [134]:
actual_pred = np.expm1(y_pred)
actual_pred

31959.831121668933

In [135]:
actual_cost = np.expm1(y_cars_test[20])
actual_cost

NameError: name 'actaul_cost' is not defined