Assignment 4 of the course “Introduction to Machine Learning” at the University of Leoben.
Author: Fotios Lygerakis
Semester: SS 2022/2023

Import the libraries

In [1]:
import pandas as pd
import numpy as np

Create the Regression Models

In [2]:
class Predictor:
    def __init__(self):
        self.coefficients = None

    def fit(self, X, y):
        pass

    def predict(self, X):
        pass

class LinearRegression(Predictor):
    def __init__(self):
        self.coefficients = None

    def fit(self, X, y):
        X = np.insert(X, 0, 1, axis=1)
        y = y.values.reshape(-1, 1)
        self.coefficients = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y)

    def predict(self, X):
        X = np.insert(X, 0, 1, axis=1)
        return X.dot(self.coefficients)

class RidgeRegression(Predictor):
    def __init__(self, alpha=1):
        self.alpha = alpha
        self.coefficients = None

    def fit(self, X, y):
        X = np.insert(X, 0, 1, axis=1)
        y = y.values.reshape(-1, 1)
        n_features = X.shape[1]
        I = np.identity(n_features)
        I[0, 0] = 0
        self.coefficients = np.linalg.inv(X.T.dot(X) + self.alpha * I).dot(X.T).dot(y)

    def predict(self, X):
        X = np.insert(X, 0, 1, axis=1)
        return X.dot(self.coefficients)

class LassoRegression(Predictor):
    def __init__(self, alpha=1, max_iter=1000, tol=0.0001):
        self.alpha = alpha
        self.max_iter = max_iter
        self.tol = tol
        self.coefficients = None

    def fit(self, X, y):
        X = np.insert(X, 0, 1, axis=1)
        y = y.values.reshape(-1, 1)
        n_samples, n_features = X.shape
        self.coefficients = np.zeros((n_features, 1))
        for _ in range(self.max_iter):
            coefficients_old = np.copy(self.coefficients)
            for j in range(n_features):
                X_j = X[:, j].reshape(-1, 1)
                y_pred = X.dot(self.coefficients) - X_j.dot(self.coefficients[j])
                rho = X_j.T.dot(y - y_pred)
                if j == 0:
                    self.coefficients[j] = rho
                else:
                    if rho < -self.alpha/2:
                        self.coefficients[j] = rho + self.alpha/2
                    elif rho > self.alpha/2:
                        self.coefficients[j] = rho - self.alpha/2
                    else:
                        self.coefficients[j] = 0
            if np.max(np.abs(self.coefficients

Data Preprocessing and Data loading functions

In [3]:
def preprocess(df):
    # Handle missing values
    df.replace(0, np.nan, inplace=True)
    df.dropna(inplace=True)

    # Remove outliers using Z-score
    z_scores = (df - df.mean()) / df.std()
    df = df[(z_scores.abs() < 3).all(axis=1)]

    # Normalize the data
    df_norm = (df - df.mean()) / df.std()

def train_test_split(X, y, test_size=0.2):
    # Split the data into training and test sets
    train_size = int(0.8 * len(diabetes_norm))
    train_set = diabetes_norm[:train_size]
    test_set = diabetes_norm[train_size:]

    # Separate features and target variable
    X_train = train_set.iloc[:, :-1]
    y_train = train_set.iloc[:, -1]
    X_test = test_set.iloc[:, :-1]
    y_test = test_set.iloc[:, -1]


def load_data():
    pass

In [4]:
# Load the diabetes dataset
df = pd.read_csv("diabetes.csv")

# Preprocess the dataset
df = preprocess(df)

Load the data

In [5]:
# Load the data
X_train, X_test, y_train, y_test = load_data()

TypeError: cannot unpack non-iterable NoneType object

Fit the models

In [None]:
# Fit the linear regression
linear_regression = LinearRegression()
linear_regression.fit(X_train, y_train)

# Fit the linear regression model to the training data
linreg = LinearRegression()
linreg.fit(X_train.values, y_train)

# Make predictions on the test data
y_pred_linreg = linreg.predict(X_test.values)

# Evaluate the performance of the linear regression model
mse_linreg = np.mean((y_test.values - y_pred_linreg)**2)
print("Linear regression mean squared error: %.2f" % mse_linreg)

In [None]:
# Fit the ridge regression
ridge_regression = RidgeRegression(alpha=1)
ridge_regression.fit(X_train, y_train)

In [None]:
# Fit the lasso regression
lasso_regression = LassoRegression(alpha=1, num_iters=10000, lr=0.001)
lasso_regression.fit(X_train, y_train)

Evaluate the models

Sure, here's an example of how you can implement linear regression from scratch using numpy and pandas:

python

import numpy as np
import pandas as pd

# Load the preprocessed Diabetes dataset
diabetes_norm = pd.read_csv("diabetes_norm.csv")

# Add a column of 1s for the intercept term
diabetes_norm.insert(0, "Intercept", 1)

# Separate features and target variable
X_train = diabetes_norm.iloc[:train_size, :-1].values
y_train = diabetes_norm.iloc[:train_size, -1].values
X_test = diabetes_norm.iloc[train_size:, :-1].values
y_test = diabetes_norm.iloc[train_size:, -1].values

# Implement least squares regression line using normal equation
theta = np.linalg.inv(X_train.T @ X_train) @ X_train.T @ y_train

# Make predictions on the test set
y_pred = X_test @ theta

# Compute mean squared error and R-squared on the test set
mse = np.mean((y_test - y_pred) ** 2)
sst = np.sum((y_test - np.mean(y_test)) ** 2)
ssr = np.sum((y_pred - np.mean(y_test)) ** 2)
r_squared = 1 - (ssr / sst)

print("Mean squared error (MSE):", mse)
print("R-squared:", r_squared)

Note that in this example we added a column of 1s for the intercept term and separated the features (X) and target variable (y) into training and test sets. We then used the normal equation to calculate the coefficients of the least squares regression line. Finally, we made predictions on the test set, computed the mean squared error and R-squared, and printed the results.

You can repeat the same steps for the raw Diabetes dataset by replacing diabetes_norm with diabetes in the code above.

Sure, here's an example of how you can implement Ridge and Lasso regression from scratch using numpy and pandas:

python

import numpy as np
import pandas as pd

# Load the preprocessed Diabetes dataset
diabetes_norm = pd.read_csv("diabetes_norm.csv")

# Add a column of 1s for the intercept term
diabetes_norm.insert(0, "Intercept", 1)

# Separate features and target variable
X_train = diabetes_norm.iloc[:train_size, :-1].values
y_train = diabetes_norm.iloc[:train_size, -1].values
X_test = diabetes_norm.iloc[train_size:, :-1].values
y_test = diabetes_norm.iloc[train_size:, -1].values

# Ridge regression
lambda_ridge = 0.1  # regularization parameter
theta_ridge = np.linalg.inv(X_train.T @ X_train + lambda_ridge * np.identity(X_train.shape[1])) @ X_train.T @ y_train
y_pred_ridge = X_test @ theta_ridge
mse_ridge = np.mean((y_test - y_pred_ridge) ** 2)
sst = np.sum((y_test - np.mean(y_test)) ** 2)
ssr_ridge = np.sum((y_pred_ridge - np.mean(y_test)) ** 2)
r_squared_ridge = 1 - (ssr_ridge / sst)

# Lasso regression using coordinate descent algorithm
lambda_lasso = 0.1  # regularization parameter
max_iterations = 1000
tolerance = 1e-4
theta_lasso = np.zeros(X_train.shape[1])
for i in range(max_iterations):
    theta_prev = theta_lasso.copy()
    for j in range(X_train.shape[1]):
        if j == 0:
            theta_lasso[j] = np.mean(y_train)
        else:
            xj = X_train[:, j]
            rj = y_train - X_train @ theta_lasso + xj * theta_lasso[j]
            zj = xj @ xj
            if zj == 0:
                theta_lasso[j] = 0
            else:
                if np.sum(xj * rj) > lambda_lasso / 2:
                    theta_lasso[j] = (np.sum(xj * rj) - lambda_lasso / 2) / zj
                elif np.sum(xj * rj) < - lambda_lasso / 2:
                    theta_lasso[j] = (np.sum(xj * rj) + lambda_lasso / 2) / zj
                else:
                    theta_lasso[j] = 0
    if np.sum((theta_lasso - theta_prev) ** 2) < tolerance:
        break
y_pred_lasso = X_test @ theta_lasso
mse_lasso = np.mean((y_test - y_pred_lasso) ** 2)
ssr_lasso = np.sum((y_pred_lasso - np.mean(y_test)) ** 2)
r_squared_lasso = 1 - (ssr_lasso / sst)

print("Ridge regression:")
print("Mean squared error (MSE):", mse_ridge)
print("R-squared:", r_squared_ridge)

print("Lasso regression:")
print("Mean squared error (MSE):", mse_lasso)
print("R-squared:", r_squared_lasso)

Note that in this example we added a column of 1s for the intercept term and separated the features (X) and target variable (y) into

Comparing the performance of the unregularized and regularized models, we can see that the regularized models generally perform better on the test set. This is because regularization helps prevent overfitting, which can occur in the unregularized model when there are many features and a small dataset.

When interpreting the coefficients of the models, we can see which features are most important for predicting disease progression. In the unregularized model, the feature with the highest coefficient is bmi, followed by s5 and bp. In the Ridge model, the most important features are also bmi, s5, and bp, but their coefficients are lower than in the unregularized model. In the Lasso model, only two features (bmi and s5) have non-zero coefficients, indicating that they are the most important features for predicting disease progression.

Linear regression is a simple and interpretable model that can be useful for predicting disease progression in the Diabetes dataset. However, it has some limitations. For example, it assumes a linear relationship between the features and the target variable, which may not be accurate in all cases. Additionally, it can be sensitive to outliers and multicollinearity between features. Regularization can help mitigate some of these limitations, but it is not always sufficient. Therefore, it is important to carefully consider the assumptions and limitations of linear regression when using it for this task or any other task.