# Source Code: Multiple Variables Regression

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn

# Report the installed scikit-learn version so the run environment is recorded
print('Using Scikit-Learn version', sklearn.__version__)

Using Scikit-Learn version 0.18.1


In [2]:
# Read the training CSV and the held-out check CSV into DataFrames
data_train, data_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/reg2_data.csv', 'data/reg2_check.csv')
)

## Splitting Datasets into Features and Targets

In [3]:
# Separate the target columns from the feature columns in each dataset
target_fields = ['TGT0','TGT1','TGT2','TGT3']

# Training set: everything except the targets is a feature
features_train = data_train.drop(target_fields, axis=1)
targets_train = data_train[target_fields]

# Testing set: same column split
features_test = data_test.drop(target_fields, axis=1)
targets_test = data_test[target_fields]

## Data Exploration

In [4]:
# Count the feature and target columns
num_features = len(features_train.columns)
num_targets = len(targets_train.columns)

# Report rows x columns for each dataset, then the feature/target column counts
print("The training dataset dataset has {} data points with {} variables each.".format(
    data_train.shape[0], data_train.shape[1]))
print("The testing dataset dataset has {} data points with {} variables each.".format(
    data_test.shape[0], data_test.shape[1]))
print("The dataset has {} number of feature variables.".format(num_features))
print("The dataset has {} number of target variables.".format(num_targets))

The training dataset dataset has 1000 data points with 14 variables each.
The testing dataset dataset has 105 data points with 14 variables each.
The dataset has 10 number of feature variables.
The dataset has 4 number of target variables.


In [5]:
# Summarize the training frame: index range, column names, non-null counts, dtypes and memory usage
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
Value00    1000 non-null float64
Value01    1000 non-null float64
Value02    1000 non-null float64
Value03    1000 non-null float64
Value04    1000 non-null float64
Value05    1000 non-null float64
Value06    1000 non-null float64
Value07    1000 non-null float64
Value08    1000 non-null float64
Value09    1000 non-null float64
TGT0       1000 non-null float64
TGT1       1000 non-null float64
TGT2       1000 non-null float64
TGT3       1000 non-null float64
dtypes: float64(14)
memory usage: 109.5 KB


In [6]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for every training column
data_describe = data_train.describe()

# Bare expression so the notebook renders the summary table as the cell output
data_describe

Unnamed: 0,Value00,Value01,Value02,Value03,Value04,Value05,Value06,Value07,Value08,Value09,TGT0,TGT1,TGT2,TGT3
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,-0.05377,-0.04727,0.02119,-0.01912,-0.0383,-0.04683,-0.03379,-0.01643,0.05873,-0.00908,87.47173,91.66473,94.51842,90.35287
std,0.974953,0.990043,0.982625,1.016896,0.999852,1.008939,0.991402,1.005754,0.954923,0.946831,169.21764,143.648337,168.817263,196.360771
min,-2.76,-2.84,-3.02,-2.81,-3.74,-3.07,-2.84,-3.12,-3.39,-2.99,-355.86,-368.27,-436.15,-564.11
25%,-0.73,-0.71,-0.6225,-0.76,-0.68,-0.72,-0.7125,-0.7625,-0.5525,-0.6525,-27.2675,-9.2475,-16.0525,-33.8525
50%,-0.06,-0.08,-0.005,-0.02,-0.03,-0.03,0.0,-0.05,0.05,-0.01,86.165,92.915,86.595,91.37
75%,0.6,0.64,0.69,0.6725,0.63,0.6375,0.62,0.6525,0.67,0.64,198.67,189.0275,213.725,222.695
max,2.93,2.66,2.87,3.21,3.31,3.43,3.17,3.8,2.98,3.09,723.19,585.31,650.52,705.46


# Building Machine Learning Model

## Analytical Linear Regression (Ordinary Least Squares)

In [7]:
# Analytical Solution using Normal Equation
def analytic_normal(features_train, features_test, targets_train):
    """Fit ordinary least squares in closed form and predict on both datasets.

    A column of ones is prepended to the training features so that the
    intercept is estimated together with the coefficients. The least-squares
    solution is obtained with the Moore-Penrose pseudo-inverse, which equals
    (X^T X)^-1 X^T Y whenever X has full column rank but, unlike an explicit
    matrix inverse, stays well defined when features are collinear.

    Parameters
    ----------
    features_train : array-like, shape (n_samples, n_features)
        Training features (a DataFrame or anything ``np.asarray`` accepts).
    features_test : array-like, shape (n_test_samples, n_features)
        Features to predict on in addition to the training set.
    targets_train : array-like, shape (n_samples, n_targets) or (n_samples,)
        Training targets.

    Returns
    -------
    bias : ndarray, shape (n_targets,)  (scalar for 1-D targets)
        Fitted intercept per target.
    weights : ndarray, shape (n_features, n_targets)  ((n_features,) for 1-D targets)
        Fitted coefficients.
    pred_trn : ndarray
        Predictions for ``features_train``.
    pred_tst : ndarray
        Predictions for ``features_test``.
    """
    # Work on plain float arrays so DataFrames and ndarrays behave the same
    X_trn = np.asarray(features_train, dtype=float)
    X_tst = np.asarray(features_test, dtype=float)
    Y = np.asarray(targets_train, dtype=float)

    # Features + Bias Matrix: first column of ones carries the intercept
    n_training_samples = X_trn.shape[0]
    X = np.c_[np.ones(n_training_samples), X_trn]

    # Solve for Theta = pinv(X).Y  (== (X(T).X)^(-1).X(T).Y for full-rank X)
    # Singular values below eps * max(X.shape) * largest are treated as zero,
    # so exactly collinear columns do not blow up the solution.
    cutoff = np.finfo(float).eps * max(X.shape)
    theta = np.dot(np.linalg.pinv(X, cutoff), Y)

    # Split Theta into Bias (row of the ones column) and Weights
    bias = theta[0]
    weights = theta[1:]

    # Predictions on Training and Testing Sets
    pred_trn = np.dot(X_trn, weights) + bias
    pred_tst = np.dot(X_tst, weights) + bias

    return bias, weights, pred_trn, pred_tst

In [8]:
# Fit the closed-form model; keep its intercepts, coefficients and the
# predictions it makes on the training and testing features
bias, weights, pred_trn, pred_tst = analytic_normal(features_train, features_test, targets_train)

### Results

In [26]:
# Create Dataframe for Linear Coefficients
# Rows are features and columns are targets; labels are taken from the data
# itself instead of being hard-coded.
inhouse_lc = pd.DataFrame(weights, index=features_train.columns, columns=targets_train.columns)

# Create Dataframe for Bias Value (one row per target)
inhouse_bias = pd.DataFrame(bias, index=targets_train.columns, columns=['Bias'])

# Merge Dataframe
# `join_axes` was removed from pd.concat in pandas 1.0; reindexing the
# concatenated frame on the coefficient table's index is the equivalent.
inhouse_result = pd.concat([inhouse_lc.T, inhouse_bias], axis=1).reindex(inhouse_lc.T.index)

# Display with features (and the bias) as rows, targets as columns
inhouse_result.T

Unnamed: 0,TGT0,TGT1,TGT2,TGT3
Value00,20.905332,41.552513,50.182144,82.005159
Value01,88.158113,45.44959,62.034891,69.4342
Value02,13.217732,43.906325,69.67326,63.395746
Value03,9.643977,40.14146,52.256499,73.24341
Value04,8.246295,33.614186,5.06202,14.806817
Value05,75.144577,36.345816,5.585402,81.583236
Value06,77.771108,50.226606,31.071148,13.283668
Value07,65.36556,46.032369,98.183041,67.698966
Value08,13.210945,35.0446,53.477449,79.023733
Value09,74.468893,82.573827,49.184078,43.227789


In [10]:
# Create Dataframe for R-Squared Values
from sklearn.metrics import r2_score

# Score the closed-form predictions against the true targets of each dataset
rsquared = np.array([
    r2_score(targets_train, pred_trn),
    r2_score(targets_test, pred_tst),
])
rs_rows = np.array(['Training Data', 'Testing Data'])

# One-column table, labelled at construction time
inhouse_rs = pd.DataFrame(rsquared, index=rs_rows, columns=['R-Squared Accuracy'])

inhouse_rs



Unnamed: 0,R-Squared Accuracy
Training Data,0.99945
Testing Data,0.999839


## Analytical Linear Regression (Ordinary Least Squares) Using Scikit-Learn

In [11]:
# ML Linear Regression using Scikit-Learn
from sklearn.linear_model import LinearRegression

# Fit a single estimator (default constructor arguments) on the training
# features against all target columns at once
LinReg = LinearRegression()
LinReg.fit(features_train, targets_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

### Results

In [23]:
# Create Dataframe for Linear Coefficients
# coef_ is transposed so that rows are features and columns are targets;
# labels are taken from the data itself instead of being hard-coded.
sklearn_lc = pd.DataFrame(LinReg.coef_.T, index=features_train.columns, columns=targets_train.columns)

# Create Dataframe for Bias Value (one row per target)
sklearn_bias = pd.DataFrame(LinReg.intercept_, index=targets_train.columns, columns=['Bias'])

# Merge Dataframe
# `join_axes` was removed from pd.concat in pandas 1.0; reindexing the
# concatenated frame on the coefficient table's index is the equivalent.
sklearn_result = pd.concat([sklearn_lc.T, sklearn_bias], axis=1).reindex(sklearn_lc.T.index)

# Display with features (and the bias) as rows, targets as columns
sklearn_result.T

Unnamed: 0,TGT0,TGT1,TGT2,TGT3
Value00,20.905332,41.552513,50.182144,82.005159
Value01,88.158113,45.44959,62.034891,69.4342
Value02,13.217732,43.906325,69.67326,63.395746
Value03,9.643977,40.14146,52.256499,73.24341
Value04,8.246295,33.614186,5.06202,14.806817
Value05,75.144577,36.345816,5.585402,81.583236
Value06,77.771108,50.226606,31.071148,13.283668
Value07,65.36556,46.032369,98.183041,67.698966
Value08,13.210945,35.0446,53.477449,79.023733
Value09,74.468893,82.573827,49.184078,43.227789


In [13]:
# Create Dataframe for R-Squared Values
# Use the estimator's own score() on each dataset
rsquared = np.array([
    LinReg.score(features_train, targets_train),
    LinReg.score(features_test, targets_test),
])
rs_rows = np.array(['Training Data', 'Testing Data'])

# One-column table, labelled at construction time
sklearn_rs = pd.DataFrame(rsquared, index=rs_rows, columns=['R-Squared Accuracy'])

sklearn_rs

Unnamed: 0,R-Squared Accuracy
Training Data,0.99945
Testing Data,0.999839


# Building Neural Network Model

## Single-Layer Neural Network Using Google TensorFlow

- Optimized Using Gradient Descent
- No Hidden Layer
- No Activation Function
- The prediction is therefore Y = X.W + b

In [14]:
# Import TF Library
import tensorflow as tf
# Report the installed TensorFlow version so the run environment is recorded
print("Using Google TensorFlow version", tf.__version__)

Using Google TensorFlow version 1.0.0


In [15]:
# Build NN Architecture
def neural_network(learning_rate, epochs, features_train, targets_train,
                   features_test, targets_test, size_in, size_out,
                   log_dir='./logs/1/train'):
    """Train a single dense layer (y = x.W + b) with full-batch gradient descent.

    The default graph is reset, a linear layer with weights initialised to
    ones and biases to 1.0 is built, and `epochs` gradient-descent steps on
    the mean squared error are run over the whole training set. Every 10th
    epoch the merged summaries (histograms, MSE, R-squared) are written to
    `log_dir` for TensorBoard.

    Parameters
    ----------
    learning_rate : float
        Step size passed to the gradient descent optimizer.
    epochs : int
        Number of full-batch training steps.
    features_train, targets_train : array-like
        Training data fed to the `[None, size_in]` / `[None, size_out]` placeholders.
    features_test : array-like
        Features to predict on after training.
    targets_test : array-like
        Kept for interface compatibility; predictions do not depend on it.
    size_in, size_out : int
        Number of feature columns and target columns.
    log_dir : str, optional
        Directory for the TensorBoard event file (default './logs/1/train').

    Returns
    -------
    tuple
        (slope, intercept, pred_train, pred_test): the trained weights, the
        trained biases, and the layer's output for the training and testing
        features.
    """
    # Start TensorFlow Graph
    tf.reset_default_graph()

    # Inputs
    with tf.name_scope("Inputs"):
        x = tf.placeholder(tf.float32, shape=[None, size_in], name="features")

    # Targets
    with tf.name_scope("Targets"):
        y = tf.placeholder(tf.float32, shape=[None, size_out], name="targets")

    # Single-Layer Dense Neural Network
    with tf.name_scope("Dense"):
        w = tf.Variable(tf.ones([size_in, size_out]), name="weights")
        b = tf.Variable(tf.constant(1.0, shape=[size_out]), name="biases")
        y_ = tf.add(tf.matmul(x, w), b)
        tf.summary.histogram("Weights", w)
        tf.summary.histogram("Biases", b)
        tf.summary.histogram("Predictions", y_)

    # Mean Squared Error (MSE) Cost Function
    with tf.name_scope("MSE-Cost"):
        mse = tf.reduce_mean(tf.square(y_ - y), name="MSE")
        tf.summary.scalar("Mean_Squared_Error", mse)

    # Gradient Descent Optimizer
    with tf.name_scope("Training"):
        train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(mse)

    # R-Squared Accuracy Score
    with tf.name_scope("R-Squared"):
        model_error = tf.reduce_sum(tf.square(tf.subtract(y, y_)))
        # Total sum of squares is taken about each target's own mean (axis=0);
        # a single mean over all targets would inflate it whenever the target
        # columns have different means.
        target_means = tf.reduce_mean(y, axis=0)
        average_error = tf.reduce_sum(tf.square(tf.subtract(y, target_means)))
        # R-Squared Value
        accuracy = tf.subtract(1.0, (tf.div(model_error, average_error)))
        tf.summary.scalar("R_Squared_Value", accuracy)

    # Initialization
    summ = tf.summary.merge_all()
    train_feed = {x: features_train, y: targets_train}

    # The context manager closes the session (and frees its resources) on exit
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(log_dir)
        try:
            writer.add_graph(sess.graph)

            # Training Set
            for i in range(epochs):
                if i % 10 == 0:
                    # Log summaries for the state *before* this epoch's update
                    s = sess.run(summ, feed_dict=train_feed)
                    writer.add_summary(s, i)
                sess.run(train_step, feed_dict=train_feed)
        finally:
            # Flush pending events and release the file handle even on failure
            writer.close()

        # Results
        slope, intercept = sess.run([w, b])
        pred_train = sess.run(y_, feed_dict={x: features_train})
        pred_test = sess.run(y_, feed_dict={x: features_test})

    return (slope, intercept, pred_train, pred_test)

print('Build successful!')

Build successful!


In [16]:
# Setting up Hyperparameters
# learning_rate: step size handed to the gradient descent optimizer
# epochs: number of training iterations passed to neural_network
learning_rate = 0.01
epochs = 5000

# Echo the chosen settings so they are recorded with the run
print('Using {} learning rate on {} training epochs'.format(learning_rate, epochs))

Using 0.01 learning rate on 5000 training epochs


In [17]:
# Run Training
# Train on the training set, then collect the learned parameters and the
# predictions for both datasets
(slope, intercept, pred_train, pred_test) = neural_network(
    learning_rate, epochs,
    features_train, targets_train,
    features_test, targets_test,
    num_features, num_targets
)

### Results

In [24]:
# Create Dataframe for Linear Coefficients
# Rows are features and columns are targets; labels are taken from the data
# itself instead of being hard-coded.
tensorflow_lc = pd.DataFrame(slope, index=features_train.columns, columns=targets_train.columns)

# Create Dataframe for Bias Value (one row per target)
tensorflow_bias = pd.DataFrame(intercept, index=targets_train.columns, columns=['Bias'])

# Merge Dataframe
# `join_axes` was removed from pd.concat in pandas 1.0; reindexing the
# concatenated frame on the coefficient table's index is the equivalent.
tensorflow_result = pd.concat([tensorflow_lc.T, tensorflow_bias], axis=1).reindex(tensorflow_lc.T.index)

# Display with features (and the bias) as rows, targets as columns
tensorflow_result.T

Unnamed: 0,TGT0,TGT1,TGT2,TGT3
Value00,20.904984,41.551949,50.181709,82.004158
Value01,88.157341,45.449181,62.034386,69.433388
Value02,13.217596,43.905907,69.672462,63.395287
Value03,9.644102,40.14188,52.256523,73.244209
Value04,8.246093,33.613712,5.061895,14.806627
Value05,75.143692,36.345295,5.58531,81.582352
Value06,77.770287,50.226204,31.070923,13.283564
Value07,65.365891,46.032795,98.18235,67.699799
Value08,13.211051,35.044247,53.477032,79.022911
Value09,74.467979,82.572884,49.18364,43.2272


In [19]:
# Create Dataframe for R-Squared Values
# Score the network's predictions against the true targets of each dataset
tf_rsquared = np.array([
    r2_score(targets_train, pred_train),
    r2_score(targets_test, pred_test),
])
tf_rs_rows = np.array(['Training Data', 'Testing Data'])

# One-column table, labelled at construction time
tensorflow_rs = pd.DataFrame(tf_rsquared, index=tf_rs_rows, columns=['R-Squared Accuracy'])

tensorflow_rs



Unnamed: 0,R-Squared Accuracy
Training Data,0.99945
Testing Data,0.999839


# Conclusion

## Merging Dataframes

In [25]:
# Merge Result Dataframes: Linear Coefficients & Biases
# Stack the three result tables; `keys` adds an outer index level naming the method
combined_result = pd.concat(
    [inhouse_result, sklearn_result, tensorflow_result],
    keys=['In-House', 'Scikit-Learn', 'TensorFlow']
)

# Transposed for display: parameters as rows, (method, target) pairs as columns
combined_result.T

Unnamed: 0_level_0,In-House,In-House,In-House,In-House,Scikit-Learn,Scikit-Learn,Scikit-Learn,Scikit-Learn,TensorFlow,TensorFlow,TensorFlow,TensorFlow
Unnamed: 0_level_1,TGT0,TGT1,TGT2,TGT3,TGT0,TGT1,TGT2,TGT3,TGT0,TGT1,TGT2,TGT3
Value00,20.905332,41.552513,50.182144,82.005159,20.905332,41.552513,50.182144,82.005159,20.904984,41.551949,50.181709,82.004158
Value01,88.158113,45.44959,62.034891,69.4342,88.158113,45.44959,62.034891,69.4342,88.157341,45.449181,62.034386,69.433388
Value02,13.217732,43.906325,69.67326,63.395746,13.217732,43.906325,69.67326,63.395746,13.217596,43.905907,69.672462,63.395287
Value03,9.643977,40.14146,52.256499,73.24341,9.643977,40.14146,52.256499,73.24341,9.644102,40.14188,52.256523,73.244209
Value04,8.246295,33.614186,5.06202,14.806817,8.246295,33.614186,5.06202,14.806817,8.246093,33.613712,5.061895,14.806627
Value05,75.144577,36.345816,5.585402,81.583236,75.144577,36.345816,5.585402,81.583236,75.143692,36.345295,5.58531,81.582352
Value06,77.771108,50.226606,31.071148,13.283668,77.771108,50.226606,31.071148,13.283668,77.770287,50.226204,31.070923,13.283564
Value07,65.36556,46.032369,98.183041,67.698966,65.36556,46.032369,98.183041,67.698966,65.365891,46.032795,98.18235,67.699799
Value08,13.210945,35.0446,53.477449,79.023733,13.210945,35.0446,53.477449,79.023733,13.211051,35.044247,53.477032,79.022911
Value09,74.468893,82.573827,49.184078,43.227789,74.468893,82.573827,49.184078,43.227789,74.467979,82.572884,49.18364,43.2272


In [21]:
# Merge Result Dataframes: R-Squared Values
# Stack the three score tables; `keys` adds an outer index level naming the method
combined_accuracy = pd.concat(
    [inhouse_rs, sklearn_rs, tensorflow_rs],
    keys=['In-House', 'Scikit-Learn', 'TensorFlow']
)

combined_accuracy

Unnamed: 0,Unnamed: 1,R-Squared Accuracy
In-House,Training Data,0.99945
In-House,Testing Data,0.999839
Scikit-Learn,Training Data,0.99945
Scikit-Learn,Testing Data,0.999839
TensorFlow,Training Data,0.99945
TensorFlow,Testing Data,0.999839
