In [36]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [37]:
df = pd.read_csv("../data/Advertising.csv", index_col=0)

print(f"{df.shape[0]} samples")
print(f"{df.shape[1] -1} features") # subtract one as sales is the label

df.head()

200 samples
3 features


Unnamed: 0,TV,radio,newspaper,sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [38]:
# we drop the Sales column since Sales is a dependet variable, its value depends on the vaule of features
# so it can be considered the label
# x - design matrix/ feature matrix / features / independent variable
# y - 
X, y = df.drop("sales", axis = "columns"), df["sales"] # use tuple unpacking to drop sales from both x and y
X.head(2)

Unnamed: 0,TV,radio,newspaper
1,230.1,37.8,69.2
2,44.5,39.3,45.1


## Scikit-learn steps
1. Train|Test split or Train|val|Test split
2. Scale dataset
    - many algorithms require scaling, some don't
    - different types of scaling (e.g. feature standardization, min-max scaling)
    - scale training data and test data to the training datas parameters to avoid data leakage
3. Fit algorithm to training data
4. Predict on test data
5. Evaluation metrics

## 1. Train|Test

In [47]:
# split the data into a training set and test set
# to avoid data leakage
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) #train_test_split shuffles the values

print(f"{X_train.shape = }")
print(f"{X_test.shape = }")
print(f"{y_train.shape = }")
print(f"{y_test.shape = }")

X_train.shape = (134, 3)
X_test.shape = (66, 3)
y_train.shape = (134,)
y_test.shape = (66,)


In [49]:
# now we see that the index have been shuffled
X_train.head()

Unnamed: 0,TV,radio,newspaper
43,293.6,27.7,1.8
190,18.7,12.1,23.4
91,134.3,4.9,9.3
137,25.6,39.0,9.3
52,100.4,9.6,3.6


In [41]:
# check too se that the index has been correct shuffled throu x and y
y_train.head()

43     20.7
190     6.7
91     11.2
137     9.5
52     10.7
Name: sales, dtype: float64

## 2. Feature scaling

- min-max scaling
- values transformed into 0 to 1

In [42]:
# instansiate an instance from the class MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train) # using the training data to fit the scaler

# use training datas parameters to transform 
# both training and test data, else if we use test datas parameters to scale test data, we have 
# leaked data, which gives misleading results
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

# we have now transformed the data so that we only have values between 0 and 1
print(f"{scaled_X_train.min():.2f} ≤ scaled_X_train ≤ {scaled_X_train.max():.2f}") # 2f so we only have 2 float numbers

# different numbers because we fit the test to X_train
print(f"{scaled_X_test.min():.2f} ≤ scaled_X_test ≤ {scaled_X_test.max():.2f}")

0.00 ≤ scaled_X_train ≤ 1.00
0.01 ≤ scaled_X_test ≤ 1.13


In [43]:
# is now an 1 dimensional array instead of a dataframe
scaled_X_train

array([[0.99053094, 0.55846774, 0.01491054],
       [0.06087251, 0.24395161, 0.22962227],
       [0.45180927, 0.09879032, 0.08946322],
       [0.08420697, 0.78629032, 0.08946322],
       [0.33716605, 0.19354839, 0.03280318],
       [0.26885357, 0.        , 0.08846918],
       [0.63476496, 0.36491935, 0.25149105],
       [0.59621238, 0.6733871 , 0.38170974],
       [0.42272574, 0.74395161, 0.78429423],
       [0.70645925, 0.41532258, 0.10337972],
       [0.4808928 , 0.59072581, 0.1222664 ],
       [0.62292864, 0.88508065, 0.0139165 ],
       [0.74974636, 0.08669355, 0.49204771],
       [0.81501522, 0.76612903, 0.22763419],
       [0.0557998 , 0.92540323, 0.68588469],
       [0.40514034, 0.57459677, 0.13817097],
       [0.30098072, 0.19959677, 0.35188867],
       [0.64389584, 0.57862903, 0.17793241],
       [0.25295908, 0.21774194, 0.05666004],
       [0.65099763, 0.37096774, 0.6500994 ],
       [0.2874535 , 0.72177419, 0.48707753],
       [0.90023673, 0.88306452, 0.04671968],
       [0.

## 3. Linear regression
$y = w_0 + w_1x_1 + w_2x_2 + w_3x_3$

In [44]:
# intansiate an instance from the the class LinearRegression
model = LinearRegression()
model.fit(scaled_X_train, y_train)
print(f"Parameters: {model.coef_}") # Estimated coefficients for the linear regression problem.
print(f"Intercept parameter: {model.intercept_}") # Independent term in the linear model

Parameters: [13.20747617  9.75285112  0.61108329]
Intercept parameter: 2.79115951962436


## Prediction

In [45]:
# create test samples
test_sample_features = scaled_X_test[0].reshape(1,-1)
test_sample_target = y_test.values[0]

print(f"Scaled features {test_sample_features}, label {test_sample_target}")
print(f"Prediction: {model.predict(test_sample_features)[0]:.2f}")

Scaled features [[0.54988164 0.63709677 0.52286282]], label 16.9
Prediction: 16.59


## 5. Evaluate

common metrics for regression case
- mae - mean absolut error
- mse mean squared error
- rmse - root mean squared error

In [46]:
# evaluate the regression 
y_pred = model.predict(scaled_X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"MAE: {mae}, MSE: {mse}, RMSE: {rmse}")

MAE: 1.4937750024728984, MSE: 3.72792833068152, RMSE: 1.9307843822347228
