In [1]:
import numpy as np 

from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.utils import shuffle 
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold

In [2]:
# Read the feature matrix and the matching target prices from disk.
prices = np.load('data/prices.npy')
training_data = np.load('data/training_data.npy')

# Preview the leading four rows of each array (still in on-disk order).
print('The first 4 samples are:\n ', training_data[0:4])
print('The first 4 prices are:\n ', prices[0:4])

# Apply one shared, seeded permutation so that features and targets stay
# aligned and the ordering is reproducible across runs.
training_data, prices = shuffle(training_data, prices, random_state=0)

The first 4 samples are:
  [[2.0150e+03 4.1000e+04 1.9670e+01 1.5820e+03 1.2620e+02 5.0000e+00
  1.0000e+00 0.0000e+00 1.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  1.0000e+00 0.0000e+00]
 [2.0110e+03 4.6000e+04 1.8200e+01 1.1990e+03 8.8700e+01 5.0000e+00
  1.0000e+00 0.0000e+00 0.0000e+00 1.0000e+00 0.0000e+00 0.0000e+00
  1.0000e+00 0.0000e+00]
 [2.0120e+03 8.7000e+04 2.0770e+01 1.2480e+03 8.8760e+01 7.0000e+00
  1.0000e+00 0.0000e+00 1.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  1.0000e+00 0.0000e+00]
 [2.0130e+03 8.6999e+04 2.3080e+01 1.4610e+03 6.3100e+01 5.0000e+00
  1.0000e+00 0.0000e+00 1.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  1.0000e+00 0.0000e+00]]
The first 4 prices are:
  [12.5  4.5  6.   3.5]


In [3]:
# Exercise 1
def normalize_data_v2(train_data, test_data, type=None):
    """Normalize a train/test pair with the requested scheme.

    Parameters
    ----------
    train_data, test_data : array-like
        Feature matrices in whatever form the scikit-learn transformers
        accept (the notebook passes 2-D numpy arrays).
    type : {None, 'standard', 'l1', 'l2'}, default None
        * ``None``       -- return both inputs untouched.
        * ``'standard'`` -- fit a ``StandardScaler`` on ``train_data`` only
          and apply it to both sets, so no test statistics leak into training.
        * ``'l1'`` / ``'l2'`` -- pass both sets through a ``Normalizer`` with
          that norm (no fitting step is involved).

    Returns
    -------
    tuple
        ``(train_data, test_data)`` after normalization.

    Raises
    ------
    ValueError
        If ``type`` is not one of the supported values. Previously an
        unknown value (for example a typo such as ``'Standard'``) silently
        returned the data unnormalized.
    """
    # No normalization requested: hand the inputs back unchanged.
    if type is None:
        return train_data, test_data

    if type == 'standard':
        scaler = StandardScaler()
        # Statistics are estimated on the training split only.
        scaler.fit(train_data)
        return scaler.transform(train_data), scaler.transform(test_data)

    if type in ('l1', 'l2'):
        # The norm name is forwarded as-is; both splits use the same transformer.
        normalizer = Normalizer(norm=type)
        return normalizer.transform(train_data), normalizer.transform(test_data)

    raise ValueError(
        f"Unknown normalization type {type!r}; "
        "expected None, 'standard', 'l1' or 'l2'."
    )

In [4]:
# Exercise 2
# 3-fold cross-validation of an ordinary least-squares model on
# standardized features; MSE and MAE are averaged over the folds.
cv = KFold(n_splits=3, shuffle=True, random_state=42)

mse, mae = [], []
for train_index, test_index in cv.split(training_data):
    print("Train Index: ", train_index)
    print("Test Index: ", test_index)

    # Slice out the current fold's partitions.
    X_train, y_train = training_data[train_index], prices[train_index]
    X_test, y_test = training_data[test_index], prices[test_index]

    # The scaler is fitted on the training partition and applied to both.
    X_train, X_test = normalize_data_v2(X_train, X_test, type='standard')

    # fit() returns the estimator itself, so construction and fitting chain.
    linear_regression_model = LinearRegression().fit(X_train, y_train)
    output = linear_regression_model.predict(X_test)

    # Record this fold's errors.
    mse.append(mean_squared_error(y_test, output))
    mae.append(mean_absolute_error(y_test, output))

print("Mean mse:", np.mean(mse))
print("Mean mae:", np.mean(mae))

Train Index:  [   0    1    2 ... 4873 4874 4876]
Test Index:  [   6    8   12 ... 4875 4877 4878]
Train Index:  [   2    3    4 ... 4875 4877 4878]
Test Index:  [   0    1    7 ... 4869 4872 4876]
Train Index:  [   0    1    6 ... 4876 4877 4878]
Test Index:  [   2    3    4 ... 4871 4873 4874]
Mean mse: 3.1631038
Mean mae: 1.3170942


In [5]:
# Exercise 3
# 3-fold cross-validation of Ridge regression for several regularization
# strengths; per-alpha MSE and MAE are collected and averaged over folds.
cv = KFold(n_splits=3, shuffle=True, random_state=42)

alphas = [1., 10., 100., 1000.]
mse_dict = {}
mae_dict = {}
for train_index, test_index in cv.split(training_data):
    print("Train Index: ", train_index)
    print("Test Index: ", test_index)

    # Slice out the current fold's partitions and standardize them
    # (scaler fitted on the training partition only).
    X_train, y_train = training_data[train_index], prices[train_index]
    X_test, y_test = training_data[test_index], prices[test_index]
    X_train, X_test = normalize_data_v2(X_train, X_test, type='standard')

    for alpha in alphas:
        linear_regression_model = Ridge(alpha=alpha).fit(X_train, y_train)
        output = linear_regression_model.predict(X_test)

        # setdefault creates the per-alpha list the first time an alpha is
        # seen, then the fold's score is appended to it.
        mse_dict.setdefault(alpha, []).append(mean_squared_error(y_test, output))
        mae_dict.setdefault(alpha, []).append(mean_absolute_error(y_test, output))

# Report the fold-averaged errors for every alpha, in the order tested.
for alpha in alphas:
    print(f"Alpha:{alpha}, Mean mse:{np.mean(mse_dict[alpha])}")
    print(f"Alpha:{alpha}, Mean mae:{np.mean(mae_dict[alpha])}\n")
    

Train Index:  [   0    1    2 ... 4873 4874 4876]
Test Index:  [   6    8   12 ... 4875 4877 4878]
Train Index:  [   2    3    4 ... 4875 4877 4878]
Test Index:  [   0    1    7 ... 4869 4872 4876]
Train Index:  [   0    1    6 ... 4876 4877 4878]
Test Index:  [   2    3    4 ... 4871 4873 4874]
Alpha:1.0, Mean mse:3.1630916595458984
Alpha:1.0, Mean mae:1.317070722579956

Alpha:10.0, Mean mse:3.1630642414093018
Alpha:10.0, Mean mae:1.3168717622756958

Alpha:100.0, Mean mse:3.16888689994812
Alpha:100.0, Mean mae:1.3163594007492065

Alpha:1000.0, Mean mse:3.4315528869628906
Alpha:1000.0, Mean mae:1.365923285484314



In [None]:
# Exercise 4
# Fit a final Ridge model (alpha=10) on the whole standardized training set
# and print its learned coefficients and intercept.
# NOTE(review): alpha=10 presumably reflects the lowest mean MSE in the
# Exercise 3 output -- confirm.

# The helper expects a train/test pair; the full set is passed for both and
# the second result is discarded.
X_train, _ = normalize_data_v2(training_data, training_data, type='standard')
linear_regression_model = Ridge(alpha=10).fit(X_train, prices)

# Each value is followed by one blank line.
print(linear_regression_model.coef_, end='\n\n')
print(linear_regression_model.intercept_, end='\n\n')