In [1]:
# THIRD-PARTY IMPORTS
# For array management and some calculations
import numpy as np
# Used for timing the running of codes
import time
# Ridge Methods
from sklearn.linear_model.ridge import Ridge
# For plotting
import matplotlib.pyplot as plt
# Prevents extraneous printing of messages during a grid search
import warnings
# For making parameter lists in hyperparameter tuning
from itertools import product

# LOCAL IMPORTS
from RegressionSupport import *

#############################
# IMPORTS
#############################
# SYSTEM LEVEL IMPORTS
# Import files from other directories
import sys

# THIRD-PARTY IMPORTS
# For array handling
import numpy as np
import matplotlib.pyplot as plt

# LOCAL IMPORTS
# Linear regression codes
from LinearRegression import LinearRegressionAnalysis
# Ridge regression codes
from RidgeRegression import RidgeRegressionAnalysis
# Kernel ridge regression codes
from KernelRidgeRegression import KernelRidgeRegressionAnalysis
# Support methods, including graphing capabilities
from RegressionSupport import *
# Changing the import directory
sys.path.append('../DataSets/')
# Data sets (mostly physics related)
from DataSets import *
from ElectronGas import *
from NuclearBindingEnergy import *
from EquationOfState import *


%matplotlib



Using matplotlib backend: MacOSX


In [2]:
# Load the data set to extrapolate.  EquationOfState is imported from the
# DataSets directory and its return values are not defined in this notebook;
# presumably X_tot / y_tot are the x and y values of the full data set and
# design_matrix is a precomputed feature matrix -- TODO confirm.
# design_matrix is not used anywhere else in this notebook.
X_tot, y_tot, design_matrix = EquationOfState()
# Disabled alternative: inspect the data size, or swap in a synthetic cubic
# data set for testing.
#print(len(X_tot))
#X_tot = np.arange(1, 100)
#y_tot = X_tot ** 3

# Number of leading points of y_tot treated as known (training) data; the
# functions below extrapolate every point after this index.
training_dim = 80

In [3]:
#############################
# KNOWN DATA SEQ (SEQUENTIAL)
#############################
def sequential (X_train, y_train, y_tot, training_dim, params, verbose=True, seq=2):
    """
        Fits a ridge regression model once on the training data and then
        extrapolates the rest of the data set one point at a time, feeding each
        prediction back in as input for the next one.

        Inputs:
            X_train (a list or numpy array): the x component of the training data
            y_train (a list or numpy array): the y component of the training data
            y_tot (a numpy array): the total set of data points (training plus
                validation).  Must support slicing followed by .tolist().
            training_dim (an int): the size of the training data (i.e. the number
                of points from y_tot that are used in the training).  Must be at
                least 2, because every prediction uses the two most recent points.
            params (a list): contains the parameters of the ridge regression
                algorithm.  In order: normalize, alpha, and solver.
            verbose (a boolean): True case: prints the MSE score of the extrapolated
                data when compared to the true data.
            seq (an int): accepted so the signature matches
                sequential_autoregression, but not used here; the prediction input
                is always the two most recent points.
        Returns:
            y_return (a list): the known points followed by the extrapolated data
                points, or None if a prediction raised an exception
            Unnamed (a float): the MSE error between the true data and the
                predicted data, or 1e10 if a prediction raised an exception
        Side effects:
            Draws the known points and each extrapolated point on the current
            pyplot figure, pausing one second after every extrapolated point.  The
            x positions are read from the module-level X_tot, which must therefore
            be defined before this function is called.

    """
    # To ensure that all parameters are present
    assert len(params)==3
    normalize, alpha, solver = params

    # Set up the model
    # NOTE(review): the `normalize` keyword was removed from Ridge in newer
    # scikit-learn releases -- confirm against the installed version.
    r = Ridge (normalize = normalize, alpha = alpha, solver = solver)

    # Fit the model to the training data
    r.fit(X_train, y_train)

    # Start from the known points and plot them
    y_return = y_tot[:training_dim].tolist()
    plt.scatter (X_tot[:training_dim], y_return)

    # Extrapolate one point per iteration; i is the index of the point being
    # predicted, so it also selects the matching x position for plotting
    for i in range(len(y_return), len(y_tot)):
        # The model input is the two most recent (known or predicted) points
        next_input = [[y_return[-2], y_return[-1]]]
        try:
            prediction = r.predict(next_input)
        except Exception:
            # Only ordinary errors are treated as a failed extrapolation;
            # KeyboardInterrupt and SystemExit still propagate to the caller
            print ('Overflow encountered on predicton')
            return None, 1e10
        y_return.append(prediction[0])
        plt.scatter(X_tot[i], y_return[-1])
        plt.pause(1.0)

    # Compute the error once so the printed and returned values always agree
    mse_value = mse(y_tot, y_return)

    # Print the MSE error if needed
    if verbose:
        print ('RIDGE MSE VALUE: ', mse_value)

    # Return the predicted points and the MSE error
    return y_return, mse_value

#############################
# KNOWN DATA CR SEQ (CONTINUOUS RETRAIN, SEQUENTIAL)
#############################
def sequential_autoregression (X_train, y_train, y_tot,
    training_dim, params, verbose=True, seq=2):
    """
        Extrapolates the rest of a data set one point at a time with ridge
        regression, periodically re-fitting the model on the known points plus
        the points extrapolated so far.

        Inputs:
            X_train (a list or numpy array): the x component of the training data.
                Only used for the initial fit when the number of known points is
                not a multiple of seq (see below).
            y_train (a list or numpy array): the y component of the training data.
                Used under the same condition as X_train.
            y_tot (a numpy array): the total set of data points.  Must support
                slicing followed by .tolist().
            training_dim (an int): the size of the training data (i.e. the number
                of points from y_tot that are used in the training).  Must be at
                least 2, because every prediction uses the two most recent points.
            params (a list): contains the parameters of the ridge regression
                algorithm.  In order: normalize, alpha, and solver.
            verbose (a boolean): True case: prints a progress report on every
                re-fit and the MSE score of the extrapolated data when compared to
                the true data (default value is True).
            seq (an int): the model is re-fit whenever the number of accumulated
                points is a multiple of seq (default value is 2).  The prediction
                input itself is always the two most recent points, and
                time_series_data is called with its default arguments.
        Returns:
            y_return (a list): the known points and the extrapolated data points
            Unnamed (a float): the MSE error between the true data and the predicted
                data
        Side effects:
            Draws the known points and each extrapolated point on the current
            pyplot figure, pausing one second after each draw.  The x positions
            are read from the module-level X_tot, which must therefore be defined
            before this function is called.

    """
    # To ensure that all parameters are present
    assert len(params)==3
    normalize, alpha, solver = params

    # Set up the model
    # NOTE(review): the `normalize` keyword was removed from Ridge in newer
    # scikit-learn releases -- confirm against the installed version.
    r = Ridge (normalize = normalize, alpha = alpha, solver = solver)

    # Add the known training data to the predicted points list
    y_return = y_tot[:training_dim].tolist()
    plt.scatter (X_tot[:training_dim], y_return)
    plt.pause(1.0)

    # If the first pass through the loop will not re-fit the model (the number
    # of known points is not a multiple of seq), fit it on the supplied training
    # data so that predict is never called on an unfitted model
    if len(y_return) < len(y_tot) and len(y_return) % seq != 0:
        r.fit(X_train, y_train)

    # Extrapolate one point per iteration; i is the index of the point being
    # predicted, so it also selects the matching x position for plotting
    for i in range(len(y_return), len(y_tot)):
        # Re-fit the model on everything accumulated so far (known plus
        # extrapolated points) each time the list length is a multiple of seq
        if len(y_return) % seq == 0:
            # Format the data
            X_train, y_train = time_series_data(y_return)
            # Fit the model
            r.fit(X_train, y_train)
            if verbose:
                print ("RETRAIN")
                print(len(y_train))
                print(r)
        # Predict the next point in the data set and add it to the list
        next_input = [[y_return[-2], y_return[-1]]]
        prediction = r.predict(next_input)
        y_return.append(prediction[0])
        plt.scatter(X_tot[i], y_return[-1])
        plt.pause(1.0)

    # Compute the error once so the printed and returned values always agree
    mse_value = mse(y_tot, y_return)

    # Print the MSE error if needed
    if verbose:
        print ('RIDGE CONTINUOUS RETRAIN MSE VALUE: ', mse_value)

    # Return the predicted list
    return y_return, mse_value



In [4]:
# Ridge parameters in the order both extrapolation functions unpack them:
# normalize, alpha, solver.
# NOTE(review): alpha = 0 applies no regularization penalty, so the fit is
# effectively ordinary least squares -- confirm this is intended for a ridge
# regression study.
params = [True, 0, 'auto']

In [5]:
# Format the first training_dim points of y_tot into model inputs and targets.
# NOTE(review): the recorded output further down shows 78 targets for 80
# points, which suggests each input is two consecutive points and the target is
# the point that follows -- confirm against time_series_data.
X_train, y_train = time_series_data (y_tot[:training_dim])

In [6]:
# Single fit on the training data, then point-by-point extrapolation.
y1, mse1 = sequential(
    X_train,
    y_train,
    y_tot,
    training_dim,
    params,
    verbose=True,
    seq=2,
)


RIDGE MSE VALUE:  4497.778193496928


In [7]:
# Point-by-point extrapolation with periodic re-fitting of the model.
y2, mse2 = sequential_autoregression(
    X_train,
    y_train,
    y_tot,
    training_dim,
    params,
    verbose=True,
    seq=2,
)

RETRAIN
78
Ridge(alpha=0, normalize=True)
RETRAIN
80
Ridge(alpha=0, normalize=True)
RETRAIN
82
Ridge(alpha=0, normalize=True)
RETRAIN
84
Ridge(alpha=0, normalize=True)
RETRAIN
86
Ridge(alpha=0, normalize=True)
RIDGE CONTINUOUS RETRAIN MSE VALUE:  4497.778193490564


In [8]:
# Show the two MSE scores side by side: mse1 comes from the single-fit run and
# mse2 from the run that re-fits the model during extrapolation.
print ('SEQUENTIAL ONLY: ', mse1)
print ('SEQUENTIAL AUTOREGRESSION: ', mse2)

SEQUENTIAL ONLY:  4497.778193496928
SEQUENTIAL AUTOREGRESSION:  4497.778193490564


In [9]:
# Overlay the true data and both extrapolations.  These calls draw on the
# current pyplot figure, so if the figure used by the extrapolation functions
# is still open the lines are added on top of its scatter points.
# The "seq" line is drawn thicker so it stays visible where the "seq_auto" line
# is plotted over it.
# NOTE(review): y1 is None if sequential hit its failure path, in which case
# the second plot call would fail -- the recorded run did not hit that path.
plt.plot(X_tot, y_tot, label="true")
plt.plot(X_tot, y1, label="seq", linewidth=4)
plt.plot(X_tot, y2, label='seq_auto')
plt.legend()

<matplotlib.legend.Legend at 0x105cd3dd8>

In [10]:
# Ratio of the single-fit MSE score to the last true data point.
# NOTE(review): if mse returns a mean *squared* error, this ratio still carries
# the units of y and is not a relative error -- confirm what quantity was
# intended here (e.g. sqrt(mse1) / y_tot[-1]).
print (mse1/y_tot[-1])

1.5093046419339269


In [11]:
# Scratch arithmetic on two literals; it reads no variable from this notebook.
# NOTE(review): purpose not recorded -- consider removing or explaining.
print(1e-5/0.3)

3.3333333333333335e-05
