<a href="https://colab.research.google.com/github/GiorgiaAuroraAdorni/ML-bachelor-course-assignments-sp23/blob/main/assignment%201/deliverable/example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignment 1
Student: Albert Cerfeda

--- 
# IMPORTANT: all the submitted code should be in 2 cells
1) How you trained, evaluated and saved your model
2) How to load your model from a file, load the data and evaluate the model. Cell 2) must run independently (even if cell 1 has not been run)

In [24]:
# Import libraries
import io
import requests
import numpy as np
import matplotlib.pyplot as plt # Library for plotting
import pickle

from sklearn.model_selection import train_test_split    # Function for splitting dataset into train and test
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error          # Function to calculate the Mean Squared Error (our performance measure)


# Load data
data_path = '../data/data.npz'  # path to the .npz file storing the data

# np.load on an .npz archive returns a lazy NpzFile that keeps the file open;
# the context manager closes it once the arrays have been read into memory.
with np.load(data_path) as data:
    x = data['x']
    y = data['y'].reshape(-1, 1)  # column vector, so it can be stacked next to x


# T1
# Use the family of models f(x, theta) = theta_0 + theta_1 * x_1 + theta_2 * x_2 + theta_3 * sin(x_2) + theta_4 * x_1 * x_2 to fit the data:
# - Use the first 80% of the data for training and the remaining 20% for testing
# NOTE(review): the task text above asks for an ordered 80/20 split, while the code
# below performs a shuffled 70/30 split -- confirm which one is intended.

# Split data into training data and test data
# 0.7 means 70% of the data is used for training and 30% for testing
# We shuffle the data to grant the independently and identically distributed properties of the data
train, test = train_test_split(np.hstack((x, y)), train_size=0.7, shuffle=True, random_state=0)

# We standardize every column (the two features and the target) to zero mean and unit
# standard deviation. The statistics come from the training split only and are reused
# for the test split, so both splits share one scale and nothing about the test data
# leaks into the preprocessing.
train_mean = train.mean(0)
train_std = train.std(0)
train = (train - train_mean) / train_std
test = (test - train_mean) / train_std

x_train = train[:, 0:2]
y_train = train[:, -1]  # The last column (the 3rd) is the target we are trying to predict
x_test = test[:, 0:2]
y_test = test[:, -1]

# 2 features - 5 parameters
def linear_function(x, theta):
    """
    Evaluates f(x, theta) = theta_0 + theta_1 * x_1 + theta_2 * x_2 + theta_3 * sin(x_2) + theta_4 * x_1 * x_2.

    :param x: array-like whose last axis holds the two features (x_1, x_2);
        either a single sample of shape (2,) or a batch of shape (n, 2).
    :param theta: sequence holding the five parameters theta_0 ... theta_4.
    :return: the model output, one value per sample.
    """
    x = np.asarray(x)
    # Features are 0-indexed in code: x_1 -> x[..., 0] and x_2 -> x[..., 1]
    x1, x2 = x[..., 0], x[..., 1]
    return theta[0] + theta[1] * x1 + theta[2] * x2 + theta[3] * np.sin(x2) + theta[4] * x1 * x2

def compact_form(x):
    """
    Builds the design matrix for the model family of T1.

    :param x: array-like with two features per sample; a batch of shape (n, 2)
        or a single sample of shape (2,).
    :return: Numpy array of shape (n, 5) whose columns are
        [1, x_1, x_2, sin(x_2), x_1 * x_2].
    """
    # Reshape first so the number of rows is known before the ones column is built;
    # sizing it from the raw input breaks for a single sample of shape (2,).
    x = np.asarray(x).reshape(-1, 2)
    x1, x2 = x[:, 0], x[:, 1]
    return np.column_stack((np.ones(x.shape[0]), x1, x2, np.sin(x2), x1 * x2))

x_train_compact = compact_form(x_train)
x_test_compact = compact_form(x_test)
## Solving using sklearn.LinearRegression()
# fit_intercept=False: compact_form already supplies the constant column for theta_0
d_linear_model = LinearRegression(fit_intercept=False)
d_linear_model.fit(x_train_compact, y_train)
theta = d_linear_model.coef_.T
### Measure performance
# mean_squared_error is documented as (y_true, y_pred); the value is symmetric,
# but the arguments are passed in the documented order.
#### Train set
d_train_pred = d_linear_model.predict(x_train_compact)
d_train_performance = mean_squared_error(y_train, d_train_pred)
#### Test set
d_test_pred = d_linear_model.predict(x_test_compact)
d_test_performance = mean_squared_error(y_test, d_test_pred)

print("== LINEAR REGRESSION ==")
print('train.shape :', train.shape)
print('test.shape :', test.shape)
print(f"MSE Train set:\t{d_train_performance:.2f}")
print(f"MSE Test set:\t{d_test_performance:.2f}")
print(f"Optimal theta:\t{theta}")

# Save the fitted model; the context manager makes sure the file is flushed and closed
with open("./linearmodel.pickle", 'wb') as model_file:
    pickle.dump(d_linear_model, model_file)

# T2 : Solving using Steepest Descent

def calculate_gradient(X, Y, th):
  """
  Gradient of the mean squared error with respect to the parameters.

  :param X: Numpy array, design matrix with one row per sample.
  :param Y: Numpy array, the targets; flattened into a column before use.
  :param th: Numpy array, the parameters; the product X th is flattened into a column.
  :return: column vector (-2 / n) * X^T (Y - X th), where n is the number of rows of X.
  """
  sample_count = X.shape[0]
  targets = Y.reshape(-1, 1)
  predictions = np.dot(X, th).reshape(-1, 1)
  residuals = targets - predictions
  return (- 2 / sample_count) * np.dot(X.T, residuals)

# Calculates the mean squared error
def V(X, Y, theta):
  """
  Mean squared error of the linear model X theta against the targets Y.

  :param X: Numpy array, design matrix with one row per sample.
  :param Y: Numpy array, the targets; flattened into a column before use.
  :param theta: Numpy array, the parameters, either 1-D or a column vector.
  :return: float, the mean of the squared residuals.
  """
  # Both sides are forced into columns (as calculate_gradient does); otherwise a 1-D
  # theta makes (n, 1) - (n,) broadcast to an (n, n) matrix and the mean is wrong.
  residuals = Y.reshape(-1, 1) - np.dot(X, theta).reshape(-1, 1)
  return np.mean(residuals ** 2)


# Seed the global NumPy generator so the random starting point is reproducible
np.random.seed(0)
x_train_compact = compact_form(x_train)
n_params = x_train_compact.shape[1]
theta = np.random.uniform(size=(n_params, 1))

# Hyperparameters
eps = 0.1       # step size
steps = 100000  # number of GD steps

## History for plotting: column i of thetas_history and row i of loss_history
## describe the state after i updates (index 0 is the random starting point)
thetas_history = np.zeros(shape=(n_params, steps + 1))
loss_history = np.zeros(shape=(steps + 1, 1))
thetas_history[:, 0] = theta[:, 0]
loss_history[0, 0] = V(x_train_compact, y_train, theta)

# Perform GD
for i in range(1, steps + 1):
  grad = calculate_gradient(x_train_compact, y_train, theta)
  theta = theta - eps * grad
  # log theta and loss
  thetas_history[:, i] = theta[:, 0]
  loss_history[i, 0] = V(x_train_compact, y_train, theta)


print("\n== Steepest Descent ==")
print(f"MSE Train set:\t{loss_history[-1, 0]:.2f}")
print(f"Optimal theta:\t{theta}")


# T3 (Bonus)

# print("Everything useful")



== LINEAR REGRESSION ==
train.shape : (1400, 3)
test.shape : (600, 3)
MSE Train set:	0.49
MSE Test set:	0.48
Optimal theta:	[ 0.00205442  0.04163966  0.13865613 -1.21626924  0.05262409]

== Steepest Descent ==
MSE Train set:	0.49
Optimal theta:	[[ 0.00205442]
 [ 0.04163966]
 [ 0.13865613]
 [-1.21626924]
 [ 0.05262409]]


# Example on how to use baseline model:

In [22]:
# Import libraries
import joblib
import io
import requests
import numpy as np

def evaluate_predictions(y_true, y_pred):
    """
    Evaluates the mean squared error between the values in y_true and the values
    in y_pred.
    ### YOU CAN NOT EDIT THIS FUNCTION ###
    :param y_true: Numpy array, the true target values from the test set;
    :param y_pred: Numpy array, the values predicted by your model.
    :return: float, the mean squared error between the two arrays.
    """
    # Show both shapes before they are checked
    print(y_true.shape, y_pred.shape)
    # The shapes must match exactly; a mismatch such as (n,) vs (n, 1) would
    # otherwise broadcast in the subtraction below instead of failing.
    assert y_true.shape == y_pred.shape
    return ((y_true - y_pred) ** 2).mean()


def load_model(filename):
    """
    Reads back a Scikit-learn model that was stored with joblib.dump.
    Example loader only: you are free to write your own function instead;
    further examples can be found in src/utils.py.
    :param filename: string, path to the file storing the model.
    :return: the model.
    """
    return joblib.load(filename)

# Load the data
# This will be replaced with our private test data when grading the assignment

# This cell has to run on a fresh kernel, so everything it uses beyond the imports
# at the top of the cell is brought into scope here.
import pickle


def compact_form(x):
    """
    Builds the design matrix the saved model expects.

    :param x: array-like with two features per sample; a batch of shape (n, 2)
        or a single sample of shape (2,).
    :return: Numpy array of shape (n, 5) whose columns are
        [1, x_1, x_2, sin(x_2), x_1 * x_2].
    """
    x = np.asarray(x).reshape(-1, 2)
    x1, x2 = x[:, 0], x[:, 1]
    return np.column_stack((np.ones(x.shape[0]), x1, x2, np.sin(x2), x1 * x2))


# Load data from url
url = 'https://drive.switch.ch/index.php/s/TeDwnbYsBKRuJjv/download'
response = requests.get(url)
response.raise_for_status()  # fail loudly instead of handing an error page to np.load
data = np.load(io.BytesIO(response.content))

# Alternatively you can load the data from file
# data_path = '../data/data.npz'
# data = np.load(data_path)

# x is a Numpy array of shape (n_samples, n_features) with the inputs
x = data.f.x
# y is a Numpy array of shape (n_samples, ) with the targets
y = data.f.y

# Load the trained model
# pickle can execute arbitrary code while loading: only open files you created yourself
baseline_model_path = "./linearmodel.pickle"
with open(baseline_model_path, 'rb') as model_file:
    baseline_model = pickle.load(model_file)


# Change input
# NOTE(review): the training cell fits the model on standardized features and a
# standardized target, while here it receives raw features and its output is compared
# with raw targets. The training mean/std are not stored with the model, so this cannot
# be undone here -- confirm, then persist and apply the training statistics.
x = compact_form(x)

# Predict on the given samples
y_pred = baseline_model.predict(x)

############################################################################
# STOP EDITABLE SECTION: do not modify anything below this point.
############################################################################

# Evaluate the prediction using MSE
# The arguments are passed as (y_pred, y) although the signature is (y_true, y_pred);
# the squared difference is symmetric, so the reported value is the same either way.
mse = evaluate_predictions(y_pred, y)
print(f'MSE on whole dataset: {mse}')

# NOTE: NOW THIS CELL IS NOT WORKING SINCE YOU NEED TO CHANGE THE INPUT.
# DO IT AND EVERYTHING RUNS SMOOTH


(2000,) (2000,)
MSE on whole dataset: 4.4063525700419355
