In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Load the advertising data set; the first CSV column becomes the row index.
df = pd.read_csv("../data/Advertising.csv", index_col=0)

# Every column except the label ("sales") counts as a feature,
# hence one is subtracted from the column count.
n_samples, n_columns = df.shape
print(f"{n_samples} samples")
print(f"{n_columns - 1} features")

df.head()

In [None]:
# "sales" is the dependent variable (its value depends on the values of the
# other columns), so it serves as the label y. The feature matrix X is the
# data frame with that column removed.
y = df["sales"]
X = df.drop(columns="sales")
X.head(2)

## Scikit-learn steps
1. Train|Test split or Train|Val|Test split
2. Scale the dataset
    - many algorithms require scaling, some don't
    - there are different types of scaling (e.g. min-max normalization, standardization)

In [None]:
# Hold out 30 % of the samples as a test set. Splitting before any further
# preprocessing helps avoid data leakage; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
# Normalization (min-max scaling).
# The scaler learns its parameters from the training data only; those same
# parameters are then used to transform both the training and the test data.
# Scaling the test data with its own parameters would leak information and
# might give misleading results.
scaler = MinMaxScaler().fit(X_train)

scaled_X_train, scaled_X_test = (
    scaler.transform(split) for split in (X_train, X_test)
)

# Report the overall value range of each scaled split.
train_low, train_high = scaled_X_train.min(), scaled_X_train.max()
test_low, test_high = scaled_X_test.min(), scaled_X_test.max()
print(f"{train_low:.2f} ≤ scaled_X_train ≤ {train_high:.2f}")
print(f"{test_low:.2f} ≤ scaled_X_test ≤ {test_high:.2f}")

In [None]:
# Fit a linear regression on the scaled training features.
# NOTE(review): the original note says the solver is SVD-based (a closed-form
# least-squares solution) — confirm against the installed scikit-learn version.
model = LinearRegression().fit(scaled_X_train, y_train)

# coef_: estimated coefficients of the linear regression problem.
# intercept_: the independent (constant) term of the linear model.
weights, bias = model.coef_, model.intercept_
print(f"Parameters: {weights}")
print(f"Intercept parameter: {bias}")

In [None]:
# Pick the first test sample. Indexing with [[0]] keeps the result 2-D
# (one row, all feature columns), which is the shape predict() is given here.
test_sample_features = scaled_X_test[[0]]
test_sample_target = y_test.to_numpy()[0]

# predict() returns one value per input row; take the single prediction.
sample_prediction = model.predict(test_sample_features)[0]

print(f"Scaled features {test_sample_features}, label {test_sample_target}")
print(f"Prediction: {sample_prediction:.2f}")

In [28]:
# Predict the labels of the whole (scaled) test set.
y_pred = model.predict(scaled_X_test)

# Error metrics between the true test labels and the predictions;
# RMSE is the square root of MSE.
mse = mean_squared_error(y_test, y_pred)
mae, rmse = mean_absolute_error(y_test, y_pred), np.sqrt(mse)

print(f"MAE: {mae:.2f}, MSE: {mse:.2f}, RMSE: {rmse:.2f}")

MAE: 1.51, MSE: 3.80, RMSE: 1.95
