In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
# Read the advertising dataset; the first CSV column is used as the row index.
df = pd.read_csv("../data/Advertising.csv", index_col=0)

n_rows, n_cols = df.shape
print(f"{n_rows} samples")
print(f"{n_cols - 1} features")  # one column (sales) is the label, not a feature

df.head()

In [None]:
# "sales" is the dependent variable: its value depends on the values of the
# features, so it is used as the label.
# X - design matrix / feature matrix / features / independent variables
# y - label / target / dependent variable
X = df.drop(columns="sales")  # every column except the label
y = df["sales"]  # the label column on its own
X.head(2)

## Scikit-learn steps
1. Train|Test split or Train|val|Test split
2. Scale dataset
    - many algorithms require scaling, some don't
    - different types of scaling (e.g. feature standardization, min-max scaling)
    - fit the scaler on the training data only, then transform both the training and test data with those parameters, to avoid data leakage
3. Fit algorithm to training data
4. Predict on test data
5. Evaluation metrics

## 1. Train|Test split

In [None]:
# Hold out 33 % of the samples as a test set so the model can be evaluated on
# data it has not seen during fitting. train_test_split shuffles the rows;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

print(f"{X_train.shape = }")
print(f"{X_test.shape = }")
print(f"{y_train.shape = }")
print(f"{y_test.shape = }")

In [None]:
# The index labels are no longer in order: the split shuffled the rows.
X_train.head(5)

In [None]:
# Check that y was shuffled together with X: the index labels shown here
# should be the same as those of X_train.
y_train.head(5)

## 2. Feature scaling

- min-max scaling
- values are transformed into the range 0 to 1: $x' = \frac{x - x_{min}}{x_{max} - x_{min}}$

In [None]:
# Create a MinMaxScaler, fit it on the training data only, and transform the
# training data in the same step.
scaler = MinMaxScaler()
scaled_X_train = scaler.fit_transform(X_train)

# The test data is transformed with the parameters learned from the training
# data. Fitting the scaler on the test data instead would leak information
# from the test set and give misleading results.
scaled_X_test = scaler.transform(X_test)

# The training data now only has values between 0 and 1
# (:.2f prints the numbers with two decimals).
print(f"{scaled_X_train.min():.2f} ≤ scaled_X_train ≤ {scaled_X_train.max():.2f}")

# The test data's range can differ, because the scaler was fitted on X_train
# and not on X_test.
print(f"{scaled_X_test.min():.2f} ≤ scaled_X_test ≤ {scaled_X_test.max():.2f}")

In [None]:
# The scaler returns a 2-dimensional NumPy array (one row per sample, one
# column per feature) instead of a DataFrame, so the column names and index
# are gone. Only the first rows are shown rather than dumping the whole array.
scaled_X_train[:5]

## 3. Linear regression
$y = w_0 + w_1x_1 + w_2x_2 + w_3x_3$

In [None]:
# Create a LinearRegression model and fit it to the scaled training data
# (fit returns the fitted estimator itself, so the calls can be chained).
model = LinearRegression().fit(scaled_X_train, y_train)
print(f"Parameters: {model.coef_}")  # estimated coefficients of the linear model
print(f"Intercept parameter: {model.intercept_}")  # independent term of the linear model

## 4. Prediction

In [None]:
# Pick the first test sample. Slicing with [:1] keeps the features as a 2-D
# array with a single row, which is the shape passed to predict below.
test_sample_features = scaled_X_test[:1]
test_sample_target = y_test.iloc[0]

print(f"Scaled features {test_sample_features}, label {test_sample_target}")
print(f"Prediction: {model.predict(test_sample_features)[0]:.2f}")

## 5. Evaluate

Common metrics for regression:
- MAE - mean absolute error
- MSE - mean squared error
- RMSE - root mean squared error

In [69]:
# Evaluate the regression: predict on the whole scaled test set and compare
# the predictions with the true labels.
y_pred = model.predict(scaled_X_test)

# MAE: mean absolute error; MSE: mean squared error;
# RMSE: square root of the MSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"MAE: {mae}, MSE: {mse}, RMSE: {rmse}")

MAE: 1.4937750024728984, MSE: 3.72792833068152, RMSE: 1.9307843822347228
