In [1]:
import numpy as np
import polars as pl
import polars.selectors as cs
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pl.read_csv("../../DATA/Advertising.csv")

In [3]:
df

TV,radio,newspaper,sales
f64,f64,f64,f64
230.1,37.8,69.2,22.1
44.5,39.3,45.1,10.4
17.2,45.9,69.3,9.3
151.5,41.3,58.5,18.5
180.8,10.8,58.4,12.9
…,…,…,…
38.2,3.7,13.8,7.6
94.2,4.9,8.1,9.7
177.0,9.3,6.4,12.8
283.6,42.0,66.2,25.5


In [4]:
X = df.drop(["sales"])
y = df["sales"]

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=101)

In [7]:
from sklearn.preprocessing import StandardScaler

In [8]:
scaler = StandardScaler()

In [9]:
scaler.fit(X_train)

In [10]:
scaled_X_train = scaler.fit_transform(X_train)

In [11]:
scaled_X_test = scaler.transform(X_test)

In [12]:
from sklearn.linear_model import Ridge

In [13]:
ridge_model = Ridge(alpha=100)

In [14]:
ridge_model.fit(X_train, y_train)

In [15]:
y_predict = ridge_model.predict(X_test)

In [16]:
predict_df = pl.DataFrame({
    "y_test": y_test,
    "y_predict": y_predict,
    "error": y_test - y_predict,
    'error %': (y_test - y_predict) / y_test * 100
})

In [17]:
predict_df

y_test,y_predict,error,error %
f64,f64,f64,f64
14.7,15.725804,-1.025804,-6.978258
19.8,19.604452,0.195548,0.987615
11.9,11.454096,0.445904,3.747094
16.7,16.994599,-0.294599,-1.764068
9.5,9.167855,0.332145,3.496258
…,…,…,…
3.2,5.513072,-2.313072,-72.283496
25.4,23.276901,2.123099,8.358659
10.8,12.608394,-1.808394,-16.744386
17.4,18.771754,-1.371754,-7.883643


In [18]:
abs(predict_df['error %']).mean()

11.82462436542779

In [19]:
from sklearn.model_selection import cross_val_score

In [20]:
scores = cross_val_score(
    ridge_model,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)

In [21]:
scores

array([-3.1574411 , -1.61190525, -5.37588672, -2.23984591, -4.3264032 ])

In [22]:
abs(scores.mean())

3.3422964358412406

In [23]:
model = Ridge(alpha=1)

In [24]:
scores = cross_val_score(
    model,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=5
)

In [25]:
abs(scores.mean())

3.3438915436537484

In [26]:
scores

array([-3.13950859, -1.62235574, -5.37385624, -2.24222086, -4.34151629])

In [27]:
model.fit(X_train, y_train)

In [28]:
y_final_test_pred = model.predict(X_test)

In [29]:
y_final_test_pred

array([15.74115767, 19.61056366, 11.44894162, 17.00806141,  9.17280649,
        7.01252104, 20.28986495, 17.299505  ,  9.77595903, 19.22188514,
       12.40488831, 13.89236226, 13.72552082, 21.28780526, 18.42454998,
        9.98195268, 15.55240809,  7.68925252,  7.55621225, 20.40301552,
        7.79215438, 18.24203939, 24.68616024, 22.82189332,  7.979673  ,
       12.65221633, 21.46918628,  8.05238174, 12.42323143, 12.50710008,
       10.77755014, 19.24455241, 10.07036333,  6.70787914, 17.31486436,
        7.76779152,  9.25400524,  8.27843969, 10.58098509, 10.63601976,
       13.01005852,  9.77205953, 10.21476915,  8.04576111, 11.56707235,
       10.08368133,  8.99798999, 16.25396856, 13.23954944, 20.81480029,
       12.49718695, 13.96625435, 17.56292683, 11.14539184, 12.56274201,
        5.50874657, 23.29447311, 12.62393919, 18.77397718, 15.18786347])

In [30]:
pl.DataFrame({
    "y_test": y_test,
    "y_predict": y_final_test_pred,
    "error": y_test - y_final_test_pred,
    "error %": (y_test - y_final_test_pred) / y_test * 100
})

y_test,y_predict,error,error %
f64,f64,f64,f64
14.7,15.741158,-1.041158,-7.082705
19.8,19.610564,0.189436,0.956749
11.9,11.448942,0.451058,3.790407
16.7,17.008061,-0.308061,-1.844679
9.5,9.172806,0.327194,3.444142
…,…,…,…
3.2,5.508747,-2.308747,-72.14833
25.4,23.294473,2.105527,8.289476
10.8,12.623939,-1.823939,-16.888326
17.4,18.773977,-1.373977,-7.896421


In [31]:
model.fit(X_train,y_train)

In [32]:
y_final_test_pred = model.predict(X_test)

In [34]:
from sklearn.metrics import mean_squared_error

In [35]:
mean_squared_error(y_test,y_final_test_pred)

2.298727428933738

In [37]:
X = df.drop('sales')
y = df['sales']

In [38]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)


In [39]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [40]:
model = Ridge(alpha=100)

In [41]:
from sklearn.model_selection import cross_validate

In [42]:
scores = cross_validate(model,X_train,y_train,
                        scoring=['neg_mean_absolute_error','neg_mean_squared_error','max_error'],cv=5)

In [43]:
scores

{'fit_time': array([0.00103498, 0.00103521, 0.00051999, 0.0005095 , 0.00103068]),
 'score_time': array([0.00103545, 0.00052285, 0.0010314 , 0.00103188, 0.00052452]),
 'test_neg_mean_absolute_error': array([-2.31243044, -1.74653361, -2.56211701, -2.01873159, -2.27951906]),
 'test_neg_mean_squared_error': array([ -9.32552967,  -4.9449624 , -11.39665242,  -7.0242106 ,
         -8.38562723]),
 'test_max_error': array([ -6.44988486,  -5.58926073, -10.33914027,  -6.61950405,
         -7.75578515])}

In [46]:
pl.DataFrame(scores)

fit_time,score_time,test_neg_mean_absolute_error,test_neg_mean_squared_error,test_max_error
f64,f64,f64,f64,f64
0.001035,0.001035,-2.31243,-9.32553,-6.449885
0.001035,0.000523,-1.746534,-4.944962,-5.589261
0.00052,0.001031,-2.562117,-11.396652,-10.33914
0.00051,0.001032,-2.018732,-7.024211,-6.619504
0.001031,0.000525,-2.279519,-8.385627,-7.755785
