# Using Scikit-Learn Linear Regression ML Model

In [1]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
print('Using Scikit-Learn version {}.'.format(sklearn.__version__))

# Read the training set and the separate check (test) set from disk
data_train = pd.read_csv('data/reg2_data.csv')
data_test = pd.read_csv('data/reg2_check.csv')

# Peel the target columns away from the feature columns in each set
target_fields = ['TGT0','TGT1','TGT2','TGT3']
targets_train = data_train[target_fields]
features_train = data_train.drop(target_fields, axis=1)
targets_test = data_test[target_fields]
features_test = data_test.drop(target_fields, axis=1)

# Column counts on either side of the split
num_features = len(features_train.columns)
num_targets = len(targets_train.columns)

# Report what was loaded (rows x columns of the full frames, then the split widths)
print("The training dataset dataset has {} data points with {} variables each.".format(
    data_train.shape[0], data_train.shape[1]))
print("The testing dataset dataset has {} data points with {} variables each.".format(
    data_test.shape[0], data_test.shape[1]))
print("The dataset has {} number of feature variables.".format(num_features))
print("The dataset has {} number of target variables.".format(num_targets))


Using Scikit-Learn version 0.18.1.
The training dataset dataset has 1000 data points with 14 variables each.
The testing dataset dataset has 105 data points with 14 variables each.
The dataset has 10 number of feature variables.
The dataset has 4 number of target variables.


## Building the Machine Learning Models

### Linear Regression

In [2]:
from sklearn.linear_model import LinearRegression

# Least-squares fit on all target columns at once; fit() returns the fitted estimator
LinReg = LinearRegression().fit(features_train, targets_train)

# score() reports R-squared for whichever split is passed in
print("Linear Regression R-Squared Score on Training Data:",
      LinReg.score(features_train, targets_train))
print("Linear Regression R-Squared Score on Testing Data:",
      LinReg.score(features_test, targets_test))


Linear Regression R-Squared Score on Training Data: 0.999450351427
Linear Regression R-Squared Score on Testing Data: 0.999838707527


In [3]:
# Coefficient table: one row per feature, one column per target.
# coef_ is transposed so that features run down the rows.
cf = pd.DataFrame(
    LinReg.coef_.T,
    index=features_train.columns,
    columns=['TGT0 Linear Coef', 'TGT1 Linear Coef', 'TGT2 Linear Coef', 'TGT3 Linear Coef'],
)

cf

Unnamed: 0,TGT0 Linear Coef,TGT1 Linear Coef,TGT2 Linear Coef,TGT3 Linear Coef
Value00,20.905332,41.552513,50.182144,82.005159
Value01,88.158113,45.44959,62.034891,69.4342
Value02,13.217732,43.906325,69.67326,63.395746
Value03,9.643977,40.14146,52.256499,73.24341
Value04,8.246295,33.614186,5.06202,14.806817
Value05,75.144577,36.345816,5.585402,81.583236
Value06,77.771108,50.226606,31.071148,13.283668
Value07,65.36556,46.032369,98.183041,67.698966
Value08,13.210945,35.0446,53.477449,79.023733
Value09,74.468893,82.573827,49.184078,43.227789


In [4]:
# Intercept table: one fitted bias value per target column
bt = pd.DataFrame(
    LinReg.intercept_,
    index=targets_train.columns,
    columns=['Zero Intercept / Bias Value'],
)

bt

Unnamed: 0,Zero Intercept / Bias Value
TGT0,100.104347
TGT1,100.019108
TGT2,100.096213
TGT3,99.801738


In [5]:
# R-squared summary table: one row per data split
rs_rows = np.array(['Training Data', 'Testing Data'])
rsquared = np.array([
    LinReg.score(features_train, targets_train),
    LinReg.score(features_test, targets_test),
])

rs = pd.DataFrame(rsquared, index=rs_rows, columns=['R-Squared Accuracy'])

rs

Unnamed: 0,R-Squared Accuracy
Training Data,0.99945
Testing Data,0.999839


# Using a Deep Learning Model

In [6]:
import tensorflow as tf
print("Using Google TensorFlow version", tf.__version__)

# BUILD OUR MODEL
def neural_network(learning_rate, epochs, features_train, targets_train,
                   features_test, targets_test, size_in, size_out,
                   log_dir='./logs/1/train'):
    """Fit a single dense layer (y_ = x @ w + b) with full-batch gradient descent.

    The default TensorFlow graph is reset on every call, the model is trained
    for ``epochs`` steps on the whole training set, and TensorBoard summaries
    (weights, biases, predictions, MSE and R-squared on the training set) are
    written every 10th step.

    Args:
        learning_rate: Step size passed to ``tf.train.GradientDescentOptimizer``.
        epochs: Number of gradient-descent steps; each step uses the full
            training set.
        features_train: Training inputs, fed to a float32 placeholder of
            shape ``[None, size_in]``.
        targets_train: Training targets, fed to a float32 placeholder of
            shape ``[None, size_out]``.
        features_test: Test inputs, same column layout as ``features_train``.
        targets_test: Test targets, same column layout as ``targets_train``.
        size_in: Number of feature columns.
        size_out: Number of target columns.
        log_dir: Directory that receives the TensorBoard event file.

    Returns:
        Tuple ``(slope, intercept, pred_train, pred_test)``: the trained
        weights (shape ``[size_in, size_out]``), the trained biases (shape
        ``[size_out]``), and the model's predictions for the training and
        test inputs.
    """
    # Start from an empty default graph so repeated calls in the same kernel
    # do not pile ops (and summaries) on top of an earlier model.
    tf.reset_default_graph()

    # Inputs
    with tf.name_scope("Inputs"):
        x = tf.placeholder(tf.float32, shape=[None, size_in], name="features")

    # Targets
    with tf.name_scope("Targets"):
        y = tf.placeholder(tf.float32, shape=[None, size_out], name="targets")

    # Single-Layer Dense Neural Network (weights start at 1, biases at 1.0)
    with tf.name_scope("Dense"):
        w = tf.Variable(tf.ones([size_in, size_out]), name="weights")
        b = tf.Variable(tf.constant(1.0, shape=[size_out]), name="biases")
        y_ = tf.add(tf.matmul(x, w), b)
        tf.summary.histogram("Weights", w)
        tf.summary.histogram("Biases", b)
        tf.summary.histogram("Predictions", y_)

    # Mean Squared Error (MSE) Cost Function
    with tf.name_scope("MSE-Cost"):
        mse = tf.reduce_mean(tf.square(y_ - y), name="MSE")
        tf.summary.scalar("Mean_Squared_Error", mse)

    # Gradient Descent Optimizer
    with tf.name_scope("Training"):
        train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(mse)

    # R-Squared Accuracy Score
    with tf.name_scope("R-Squared"):
        model_error = tf.reduce_sum(tf.square(tf.subtract(y, y_)))
        # The baseline is "predict each target's own mean", so the mean is
        # taken per target column (axis=0). A single mean pooled over every
        # column would add between-target spread to the denominator and
        # overstate R-squared.
        target_means = tf.reduce_mean(y, axis=0)
        average_error = tf.reduce_sum(tf.square(tf.subtract(y, target_means)))
        # R-Squared Value
        accuracy = tf.subtract(1.0, (tf.div(model_error, average_error)))
        tf.summary.scalar("R_Squared_Value", accuracy)

    summ = tf.summary.merge_all()

    # Both the event writer and the session hold resources; make sure they are
    # released (and pending events written out) even if training raises.
    writer = tf.summary.FileWriter(log_dir)
    try:
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            writer.add_graph(sess.graph)

            # Training Set: every step sees the full training data
            train_feed = {x: features_train, y: targets_train}
            for i in range(epochs):
                if i % 10 == 0:
                    # Log summaries before taking the step, so step 0 records
                    # the untrained model.
                    writer.add_summary(sess.run(summ, feed_dict=train_feed), i)
                sess.run(train_step, feed_dict=train_feed)

            # Results: fetched as plain arrays, so they outlive the session
            slope = sess.run(w)
            intercept = sess.run(b)
            pred_train = sess.run(y_, feed_dict={x: features_train, y: targets_train})
            pred_test = sess.run(y_, feed_dict={x: features_test, y: targets_test})
    finally:
        writer.close()

    return (slope, intercept, pred_train, pred_test)


Using Google TensorFlow version 1.0.0


In [7]:
# Hyperparameters for the gradient-descent run
learning_rate = 0.01
epochs = 5000

# Train the TensorFlow model; collect its weights, biases and predictions
(slope, intercept, pred_train, pred_test) = neural_network(
    learning_rate=learning_rate,
    epochs=epochs,
    features_train=features_train,
    targets_train=targets_train,
    features_test=features_test,
    targets_test=targets_test,
    size_in=num_features,
    size_out=num_targets,
)


In [8]:
from sklearn.metrics import r2_score

# Score the TensorFlow predictions against the true targets for each split
tf_r2_train = r2_score(targets_train, pred_train)
tf_r2_test = r2_score(targets_test, pred_test)

print("TensorFlow Linear Regression R-Squared Score on Training Data:", tf_r2_train)
print("TensorFlow Linear Regression R-Squared Score on Testing Data:", tf_r2_test)
print("Zero Intercept / Bias Value:", intercept)


TensorFlow Linear Regression R-Squared Score on Training Data: 0.999450351336
TensorFlow Linear Regression R-Squared Score on Testing Data: 0.999838683522
Zero Intercept / Bias Value: [ 100.10345459  100.01828003  100.09541321   99.80091858]




In [9]:
# TensorFlow weight table: one row per feature, one column per target
tf_cf = pd.DataFrame(
    slope,
    index=features_train.columns,
    columns=['TGT0 TF LC', 'TGT1 TF LC', 'TGT2 TF LC', 'TGT3 TF LC'],
)

tf_cf

Unnamed: 0,TGT0 TF LC,TGT1 TF LC,TGT2 TF LC,TGT3 TF LC
Value00,20.904984,41.551949,50.181709,82.004158
Value01,88.157341,45.449181,62.034386,69.433388
Value02,13.217596,43.905907,69.672462,63.395287
Value03,9.644102,40.14188,52.256523,73.244209
Value04,8.246093,33.613712,5.061895,14.806627
Value05,75.143692,36.345295,5.58531,81.582352
Value06,77.770287,50.226204,31.070923,13.283564
Value07,65.365891,46.032795,98.18235,67.699799
Value08,13.211051,35.044247,53.477032,79.022911
Value09,74.467979,82.572884,49.18364,43.2272


In [10]:
# TensorFlow bias table: one trained bias value per target column
tf_bt = pd.DataFrame(
    intercept,
    index=targets_train.columns,
    columns=['TensorFlow Zero Intercept / Bias Value'],
)

tf_bt

Unnamed: 0,TensorFlow Zero Intercept / Bias Value
TGT0,100.103455
TGT1,100.01828
TGT2,100.095413
TGT3,99.800919


In [11]:
# TensorFlow R-squared summary table: one row per data split
tf_rs_rows = np.array(['Training Data', 'Testing Data'])
tf_rsquared = np.array([
    r2_score(targets_train, pred_train),
    r2_score(targets_test, pred_test),
])

tf_rs = pd.DataFrame(tf_rsquared, index=tf_rs_rows, columns=['TensorFlow R-Squared Accuracy'])

tf_rs



Unnamed: 0,TensorFlow R-Squared Accuracy
Training Data,0.99945
Testing Data,0.999839
