In [50]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of
# every file found, one path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with the file name to get the full path
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

from sklearn.model_selection import train_test_split # Split data into train and validation sets
from numpy import linalg # Calculate the inverse of a matrix
import matplotlib.pyplot as plt # Data visualization
from time import process_time # Calculate elapsed CPU time


In [51]:
# The normal equation

def calculate_theta(X,y):
    """
    Calculates the theta vector using the normal equation:
    theta = inverse(X^T X) . X^T . y

    If inverting X^T X fails with ``LinAlgError`` (for example because the
    matrix is singular), a warning is printed and the pseudo-inverse
    ``pinv()`` is used instead. Any other exception is propagated to the
    caller instead of being reported as a non-invertible matrix.

    :param X: inputs (feature values), one row per example - data frame
              or 2-D Numpy array of floats
    :param y: outputs (actual target values) - Numpy array or data frame of floats

    :return: new theta, flattened to one dimension - Numpy array of floats

    """
    # Calculate transpose of X
    X_transpose = X.transpose()

    # Calculate the dot product between X_transpose and X
    gram = np.dot(X_transpose, X)

    # Calculate the inverse of X^T X. Only a failed inversion triggers the
    # pseudo-inverse fallback; a bare ``except`` here would also swallow
    # KeyboardInterrupt and unrelated errors.
    try:
        gram_inverse = linalg.inv(gram)

    except linalg.LinAlgError:
        print("\033[93mWarning: Non-invertible Matrix! pinv() will be used\033[0m")
        gram_inverse = linalg.pinv(gram)

    # Calculate the dot product between the inverse and X_transpose
    projector = np.dot(gram_inverse, X_transpose)

    # Calculate the dot product between the projector and y
    theta = np.dot(projector, y)

    # Flatten so a column-vector y still yields a 1-D theta
    return theta.reshape(-1)

In [52]:
# The hypothesis
def h(x, theta):
    """
    Linear-regression hypothesis: the predicted target for each input row.

    :param x: inputs (feature values), one row per example - data frame of floats
    :param theta: theta vector (weights) - Numpy array of floats

    :return: predicted targets, the dot product of x and theta - Numpy array of floats

    """
    # One prediction per row of x: the weighted sum of that row's features
    predictions = np.dot(x, theta)
    return predictions

In [53]:
# The cost function

def J(X,y,theta):
    """
    Squared-error cost of the hypothesis h on a data set:
    the sum of squared prediction errors divided by twice the number of examples.

    :param X: inputs (feature values) - data frame of floats
    :param y: outputs (actual target values) - Numpy array of floats
    :param theta: theta vector (weights) - Numpy array of floats

    :return: total error - float

    """
    # Scaling constant 1/(2m), where m is the number of examples
    n_examples = len(X)
    scale = 1/(2 * n_examples)

    # Prediction errors; y is flattened so it lines up with the predictions
    residuals = h(X, theta) - y.reshape(-1)

    # Sum of squared errors expressed as residuals^T . residuals
    squared_error_sum = np.dot(residuals.transpose(), residuals)

    return scale * squared_error_sum

In [72]:
#Loading Data

from sklearn.datasets import make_regression
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score


# Dataset 1: synthetic regression problem with a single informative feature
X, y = make_regression(
    n_samples=1000,
    n_features=1,
    n_informative=1,
    n_targets=1,
    noise=20,
    random_state=13,
)
# Wrap the arrays in data frames so columns can be inserted and selected later
X, y = pd.DataFrame(X), pd.DataFrame(y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2
)


# Dataset 2: the diabetes data set bundled with scikit-learn
X_d, y_d = load_diabetes(return_X_y=True)
X_d, y_d = pd.DataFrame(X_d), pd.DataFrame(y_d)
X_d_train, X_d_test, y_d_train, y_d_test = train_test_split(
    X_d, y_d, test_size=0.2, random_state=2
)

In [56]:
# Initialize with DataSet_1

# Start measuring elapsed CPU time
start = process_time()

# Calculate the number of examples
m_train = len(X_train)
m_valid = len(X_test)

# Create a list of ones
ones_train = [1] * m_train
ones_valid = [1] * m_valid

# Insert ones as the first column, since X_0 for all examples
# should be one (the intercept term).
# The guard keeps this cell idempotent: without it, re-running the cell
# would add a second "X_0" column, which makes X^T X singular and changes
# the length of theta.
if "X_0" not in X_train.columns:
    X_train.insert(0, "X_0", ones_train)
if "X_0" not in X_test.columns:
    X_test.insert(0, "X_0", ones_valid)

# Number of features including X_0 (the column is in place by now)
n = X_train.shape[1]

# Find the theta vector using the normal equation
theta_train = calculate_theta(X_train,y_train)

# Stop the timer and convert the elapsed CPU time to milliseconds
end = process_time()
execution_time = (end - start)*1000

# Display theta and the CPU execution time of training
print("\nExecution time: {} milliseconds".format(execution_time))
print("\nCalculated\033[1m θ\033[0m: {}\n".format(theta_train))

# Calculate and display the cost value on the training dataset
cost_train = J(X_train, y_train[0].values, theta_train)
print("The training cost is: {}".format(cost_train))


Execution time: 6.199999999999761 milliseconds

Calculated[1m θ[0m: [ 0.57982696 89.15929199]

The training cost is: 205.96296512388238


In [68]:
# Show the theta fitted on the DataSet_1 training split again for reference
print("\nCalculated\033[1m θ\033[0m: {}\n".format(theta_train))

# Evaluate the fitted theta on the held-out split: squared-error cost J
# and the R^2 score of the predictions h(X_test, theta_train)
cost_valid = J(X_test, y_test[0].values, theta_train)
print("The validation cost is: {}".format(cost_valid))
print("Score is :", r2_score(y_test, h(X_test,theta_train)))


Calculated[1m θ[0m: [ 0.57982696 89.15929199]

The validation cost is: 192.63057937510982
Score is : 0.9586262587330123


In [69]:
# Compare actual results with predicted results
result = pd.DataFrame(index=X_test.index)
result['Actual CoA'] = y_test
result['Predicted CoA'] = h(X_test, theta_train)
result.head()

Unnamed: 0,Actual CoA,Predicted CoA
37,-213.64879,-177.963376
726,-168.093422,-159.441406
846,-26.760272,-25.533165
295,-27.651345,3.694949
924,55.600652,87.034217


In [70]:
#### Moving on to the second dataset, DataSet_2

In [73]:
# Initialize with DataSet_2

# Start measuring elapsed CPU time
start = process_time()

# Calculate the number of examples
m_train_d = len(X_d_train)
m_valid_d = len(X_d_test)

# Create a list of ones
ones_train_d = [1] * m_train_d
ones_valid_d = [1] * m_valid_d

# Insert ones as the first column, since X_0 for all examples
# should be one (the intercept term).
# The guard keeps this cell idempotent: without it, re-running the cell
# would add a second "X_0" column, which makes X^T X singular and changes
# the length of theta.
if "X_0" not in X_d_train.columns:
    X_d_train.insert(0, "X_0", ones_train_d)
if "X_0" not in X_d_test.columns:
    X_d_test.insert(0, "X_0", ones_valid_d)

# Number of features including X_0 (the column is in place by now)
n = X_d_train.shape[1]

# Find the theta vector using the normal equation
theta_train_d = calculate_theta(X_d_train,y_d_train)

# Stop the timer and convert the elapsed CPU time to milliseconds
end = process_time()
execution_time = (end - start)*1000

# Display theta and the CPU execution time of training
print("\nExecution time: {} milliseconds".format(execution_time))
print("\nCalculated\033[1m θ\033[0m: {}\n".format(theta_train_d))

# Calculate and display the cost value on the training dataset
cost_train_d = J(X_d_train, y_d_train[0].values, theta_train_d)
print("The training cost is: {}".format(cost_train_d))


Execution time: 83.71800000000107 milliseconds

Calculated[1m θ[0m: [ 151.88331005   -9.15865318 -205.45432163  516.69374454  340.61999905
 -895.5520019   561.22067904  153.89310954  126.73139688  861.12700152
   52.42112238]

The training cost is: 1407.1067961714602


In [74]:
# Show the theta fitted on the DataSet_2 training split again for reference
print("\nCalculated\033[1m θ\033[0m: {}\n".format(theta_train_d))

# Evaluate the fitted theta on the held-out split: squared-error cost J
# and the R^2 score of the predictions h(X_d_test, theta_train_d)
cost_valid_d = J(X_d_test, y_d_test[0].values, theta_train_d)
print("The validation cost is: {}".format(cost_valid_d))
print("Score is :", r2_score(y_d_test, h(X_d_test,theta_train_d)))


Calculated[1m θ[0m: [ 151.88331005   -9.15865318 -205.45432163  516.69374454  340.61999905
 -895.5520019   561.22067904  153.89310954  126.73139688  861.12700152
   52.42112238]

The validation cost is: 1547.2283357830304
Score is : 0.4399338661568972


In [77]:
# Compare actual results with predicted results on the DataSet_2 test split.
# The table reuses the index of X_d_test so rows line up with the original samples.
# NOTE(review): the "CoA" column labels look carried over from another
# data set - confirm they are the intended names.
result_d = pd.DataFrame(index=X_d_test.index)
result_d['Actual CoA'] = y_d_test
result_d['Predicted CoA'] = h(X_d_test, theta_train_d)
# Display only the first rows
result_d.head()

Unnamed: 0,Actual CoA,Predicted CoA
212,73.0,154.121388
422,233.0,204.818351
67,97.0,124.937554
89,111.0,106.089509
390,277.0,258.534858
