# CSE 514A Programming Assignment 1


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import random

## A) Data Pre-processing

Pre-process the attribute values of your data by normalizing or standardizing each variable. Keep a copy that was not pre-processed, so you can analyze the effect that pre-processing the data has on the optimization.

In [None]:
# raw data
# Column names in file order: the eight mixture attributes, then the target
# (concrete compressive strength) as the last entry.
headerlist = [
    'cement component',
    'blast furnace slag',
    'fly ash',
    'water',
    'superplasticizer',
    'coarse aggregate',
    'fine aggregate',
    'age',
    'concrete compressive strength',
]
# Load the untouched dataset, labelling its columns with the names above.
data_old = pd.read_excel('Concrete_Data.xls', names=headerlist)
data_old

In [None]:
# preprocessing
# Standardize (z-score) the eight attribute columns so that each has mean 0 and
# unit variance; the target column is left in its original units so that errors
# stay comparable between the raw and the pre-processed runs.
# NOTE: the statistics are computed over the whole dataset, i.e. before the
# train/test split performed further down.
data_new = data_old.copy()
feature_columns = headerlist[0:8]
feature_means = data_old[feature_columns].mean()
# A constant column has zero spread; dividing by 1 instead keeps it at 0
# rather than turning it into NaN.
feature_stds = data_old[feature_columns].std(ddof=0).replace(0, 1)
data_new[feature_columns] = (data_old[feature_columns] - feature_means) / feature_stds



In [None]:
# plot both original data and pre-processed data
# Each DataFrame.hist call opens its own figure, so the title and the layout
# adjustment are applied per figure (a single trailing plt.tight_layout() only
# affects the most recently created one).
for figure_title, frame in (('Pre-processed data', data_new), ('Original data', data_old)):
    axes = frame.hist(alpha=0.5, figsize=(15, 10))
    figure = np.ravel(axes)[0].get_figure()
    figure.suptitle(figure_title)
    figure.tight_layout()
plt.show()

# split data into train and test
# The first eight columns are the attributes, the ninth is the target. Both
# splits hold out 130 rows and share random_state=0, so the raw and the
# pre-processed copies are partitioned into the same rows.
feature_columns = headerlist[0:8]
target_column = headerlist[8]
X_old_train, X_old_test, y_old_train, y_old_test = train_test_split(
    data_old[feature_columns], data_old[target_column], test_size=130, random_state=0)
X_new_train, X_new_test, y_new_train, y_new_test = train_test_split(
    data_new[feature_columns], data_new[target_column], test_size=130, random_state=0)

## B) Univariate Linear Regression

In [None]:
class univariate_regression:
    """Single-feature linear regression ``y = m*x + b`` fitted by gradient descent.

    The training and test splits are stored at construction time; ``run``
    fits on the training split and plots the fitted line over the test split.
    """

    def __init__(self, X_train, X_test, Y_train, Y_test):
        self.X_train = X_train
        self.X_test = X_test
        self.Y_train = Y_train
        self.Y_test = Y_test

    def initialize_variables(self):
        """Return a random starting ``(m, b)``, each drawn from ``[0, 1)``."""
        m = random.random()
        b = random.random()
        return m, b

    def make_prediciton(self, m, X, b):
        """Return the model output ``m*X + b`` for input(s) ``X``."""
        return m * X + b

    # Correctly spelled alias; the original (misspelled) name is kept so that
    # existing callers continue to work.
    make_prediction = make_prediciton

    def mse(self, Y, Y_pred):
        """Return the mean squared error between ``Y`` and ``Y_pred``.

        Both inputs are converted to plain arrays first so that pandas objects
        are compared position by position rather than aligned by index label.
        """
        residuals = np.asarray(Y, dtype=float) - np.asarray(Y_pred, dtype=float)
        return float(np.mean(residuals ** 2))

    def gradient_descent(self, X, Y, m, b, alpha, iterations):
        """Minimise the MSE of ``m*X + b`` with full-batch gradient descent.

        Args:
            X: feature values (any 1-D array-like, e.g. a pandas Series).
            Y: target values, same length as ``X``.
            m: initial slope.
            b: initial intercept.
            alpha: learning rate.
            iterations: number of update steps; ``0`` returns the initial
                parameters unchanged.

        Returns:
            ``(m, b, cost)`` where ``cost`` is the MSE of the returned
            parameters on ``(X, Y)``.

        Raises:
            ValueError: if ``X`` is empty or ``X`` and ``Y`` differ in length.
        """
        # Work on positional arrays: after a shuffled train/test split a
        # pandas Series no longer has labels 0..n-1, so X[j] would be a
        # label lookup and fail.
        x = np.asarray(X, dtype=float).ravel()
        y = np.asarray(Y, dtype=float).ravel()
        if x.size == 0:
            raise ValueError("X must contain at least one sample")
        if x.size != y.size:
            raise ValueError("X and Y must contain the same number of samples")

        n_samples = x.size
        m = float(m)
        b = float(b)
        for _ in range(iterations):
            residuals = y - (m * x + b)
            # Partial derivatives of mean((y - (m*x + b))**2).
            grad_m = -2.0 * np.dot(x, residuals) / n_samples
            grad_b = -2.0 * np.sum(residuals) / n_samples
            m -= alpha * grad_m
            b -= alpha * grad_b

        # Report the cost of the parameters actually returned.
        cost = self.mse(y, m * x + b)
        return float(m), float(b), cost

    def plot(self, X, Y, m, b):
        """Scatter ``(X, Y)`` in blue and draw the line ``m*X + b`` in red."""
        x = np.asarray(X, dtype=float).ravel()
        y = np.asarray(Y, dtype=float).ravel()
        plt.scatter(x, y, color='blue')
        plt.plot(x, m * x + b, color='red')
        plt.show()

    def run(self, alpha, iterations):
        """Fit on the training split, plot over the test split, return the fit.

        Returns:
            ``(m, b, cost)`` as produced by ``gradient_descent`` on the
            training data.
        """
        m, b = self.initialize_variables()
        m, b, cost = self.gradient_descent(self.X_train, self.Y_train, m, b, alpha, iterations)
        self.plot(self.X_test, self.Y_test, m, b)
        return m, b, cost


## C) Multivariate Linear Regression

In [None]:
class multivariate_linear_regression:
    """Linear regression ``y = X @ m + b`` fitted by batch gradient descent.

    ``m`` holds one weight per feature column when ``X`` is two-dimensional.
    A one-dimensional ``X`` is treated as a single feature, in which case
    ``m`` is a plain scalar.
    """

    def __init__(self, X_train, X_test, Y_train, Y_test):
        self.X_train = X_train
        self.X_test = X_test
        self.Y_train = Y_train
        self.Y_test = Y_test

    def initialize_variables(self):
        """Return random starting parameters ``(m, b)`` drawn from ``[0, 1)``.

        ``m`` is an array with one entry per column of ``self.X_train`` when
        the training data is two-dimensional, otherwise a single float.
        """
        if np.ndim(self.X_train) == 2:
            n_features = np.shape(self.X_train)[1]
            m = np.array([random.random() for _ in range(n_features)])
        else:
            m = random.random()
        b = random.random()
        return m, b

    def make_prediciton(self, m, X, b):
        """Return the model output for ``X``.

        For two-dimensional ``X`` this is ``X @ m + b`` (a scalar ``m`` is
        applied to every column); for one-dimensional ``X`` it is ``m*X + b``.
        """
        features = np.asarray(X, dtype=float)
        weights = np.asarray(m, dtype=float)
        if features.ndim == 2:
            return features @ np.broadcast_to(weights, (features.shape[1],)) + b
        return weights * features + b

    # Correctly spelled alias; the original (misspelled) name is kept so that
    # existing callers continue to work.
    make_prediction = make_prediciton

    def mse(self, Y, Y_pred):
        """Return the mean squared error between ``Y`` and ``Y_pred``.

        Inputs are converted to plain arrays so pandas objects are compared
        position by position rather than aligned by index label.
        """
        residuals = np.asarray(Y, dtype=float) - np.asarray(Y_pred, dtype=float)
        return float(np.mean(residuals ** 2))

    def batch_gradient_descent(self, X, Y, m, b, alpha, iterations):
        """Minimise the MSE of the linear model with full-batch gradient descent.

        Args:
            X: features, shape ``(n_samples, n_features)`` or ``(n_samples,)``.
            Y: targets, ``n_samples`` values.
            m: initial weight(s); a scalar is broadcast to every feature.
            b: initial intercept.
            alpha: learning rate.
            iterations: number of update steps; ``0`` returns the initial
                parameters unchanged.

        Returns:
            ``(m, b, cost)``. ``m`` is an array of per-feature weights, or a
            float when ``X`` was one-dimensional. ``cost`` is the MSE of the
            returned parameters on ``(X, Y)``.

        Raises:
            ValueError: if ``X`` is empty, has more than two dimensions, does
                not match ``Y`` in length, or ``m`` cannot be broadcast to
                one weight per feature.
        """
        features = np.asarray(X, dtype=float)
        targets = np.asarray(Y, dtype=float).ravel()
        single_feature = features.ndim == 1
        if single_feature:
            features = features.reshape(-1, 1)
        if features.ndim != 2:
            raise ValueError("X must be one- or two-dimensional")

        n_samples, n_features = features.shape
        if n_samples == 0:
            raise ValueError("X must contain at least one sample")
        if targets.size != n_samples:
            raise ValueError("X and Y must contain the same number of samples")

        # np.array(...) takes a writable copy so the caller's m is not mutated.
        weights = np.array(np.broadcast_to(np.asarray(m, dtype=float), (n_features,)))
        b = float(b)
        for _ in range(iterations):
            residuals = targets - (features @ weights + b)
            # Gradient of mean(residuals**2), averaged over samples (not over
            # the total number of cells in X).
            grad_weights = -2.0 * (features.T @ residuals) / n_samples
            grad_b = -2.0 * np.sum(residuals) / n_samples
            weights = weights - alpha * grad_weights
            b = b - alpha * grad_b

        # Report the cost of the parameters actually returned.
        cost = self.mse(targets, features @ weights + b)
        fitted_m = float(weights[0]) if single_feature else weights
        return fitted_m, float(b), cost

    def variance(self, X):
        """Return ``np.var(X)``."""
        return np.var(X)

    def variance_explained(self, X, Y, m, b):
        """Return the coefficient of determination ``1 - SS_res / SS_tot``.

        A value of 1 means the predictions reproduce ``Y`` exactly; 0 means
        the model does no better than predicting the mean of ``Y``. Returns
        ``nan`` when ``Y`` has no variance, since the ratio is undefined.
        """
        targets = np.asarray(Y, dtype=float).ravel()
        predictions = np.asarray(self.make_prediciton(m, X, b), dtype=float).ravel()
        total_ss = np.sum((targets - np.mean(targets)) ** 2)
        if total_ss == 0:
            return float('nan')
        residual_ss = np.sum((targets - predictions) ** 2)
        return float(1.0 - residual_ss / total_ss)

    def plot(self, X, Y, m, b):
        """Plot the fit.

        With a single feature this scatters ``(X, Y)`` in blue and draws the
        fitted line in red. With several features it scatters predicted
        against actual targets in blue and draws the identity line in red.
        """
        features = np.asarray(X, dtype=float)
        targets = np.asarray(Y, dtype=float).ravel()
        predictions = np.asarray(self.make_prediciton(m, features, b), dtype=float).ravel()
        if features.ndim == 1:
            plt.scatter(features, targets, color='blue')
            plt.plot(features, predictions, color='red')
        else:
            plt.scatter(targets, predictions, color='blue')
            low, high = targets.min(), targets.max()
            plt.plot([low, high], [low, high], color='red')
        plt.show()

    def run(self, alpha, iterations):
        """Fit on the training split, plot over the test split, return the fit.

        Returns:
            ``(m, b, cost)`` as produced by ``batch_gradient_descent`` on the
            training data.
        """
        m, b = self.initialize_variables()
        m, b, cost = self.batch_gradient_descent(self.X_train, self.Y_train, m, b, alpha, iterations)
        self.plot(self.X_test, self.Y_test, m, b)
        return m, b, cost


## D) Results