-
Notifications
You must be signed in to change notification settings - Fork 1
/
Support.py
129 lines (122 loc) · 5.05 KB
/
Support.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
##################################################
# Support
# Part of the Library: Sequential Regression Extrapolation
# Julie Butler Hartley
# Version 1.0.0
# Date Created: February 28, 2021
# Last Modified: February 28, 2021
##################################################
##############################
# IMPORTS
##############################
# THIRD-PARTY IMPORTS
import csv
import numpy as np
##############################
# FORMAT SEQUENTIAL DATA
##############################
def format_sequential_data (y, seq=2):
"""
Inputs:
y (a list or NumPy array): the y values of a data set
seq (an int): the length of the sequence. Default value is 2
Returns:
inputs (a list): the inputs for a machine learning model using
sequential data formatting
outputs (a list): the outputs for a machine learning model using
sequential data formatting
Formats a given list or array in sequential formatting using the
given sequence lenght. Default sequence length is two.
Explanation of sequential formatting:
Typically data points of the form (x,y) are used to train a machine
learning model. This teaches the model the relationship between the
x data and the y data in the training range. This model works well
for interpolation, but not so well for extrapolation. A better data
model for extrapolation would be one that learns the patterns in the y
data to better guess what y value should come next. Therefore, this
method formats the data in a sequential pattern so that the points are
of the form ((y1, y2, ..., yn), yn+1) where n is the lenght of the
sequence (seq).
"""
# Make sure seq is an int
assert isinstance(seq, int)
# Set up the input and output lists
inputs = []
outputs = []
# Cycle through the whole y list/array and separate the points into
# sequential format
for i in range(0, len(y)-seq):
inputs.append(y[i:i+seq])
outputs.append(y[i+seq])
# Return the input and output lists. NOTE: the data type of the return
# values is LIST
return inputs, outputs
##############################
# FORMAT DATA
##############################
def formatData (filename, delimiter):
"""
Inputs:
filename (a string): a file name representing the file where
the data is stored. The import code expects the file to
contain only the matrix elements with columns separated by
a set delimiter and rows separated by a new line.
delimiter (a string): the delimiter between the columns in the
text file
Returns:
formattedData (a 2D numpy array): a 2D array storing the matrix.
The columns of the matrix can be accessed via
formattedData[:,i], and the rows of the matrix can be accessed
via formattedData[i].
Imports data from a file and formats it to be used in the matrix
extrapolation codes. Note, this does not format the data in an
unusual way. Many matrices in Python are formatted using this format.
"""
# Lists to hold the data
formattedData = []
val = []
# Get every row from the file and store them in order in val
with open(filename, 'r') as f:
reader = csv.reader(f, dialect='excel', delimiter=delimiter)
for row in reader:
val.append(row)
# Convert all if the elements to floats and store in formattedData
for row in range(len(val)):
formattedData.append([float(i) for i in val[row]])
# Return the formatted matrix as an array
return np.asarray(formattedData)
##############################
# GENERATE POLYNOMIAL RANDOM NOISE
##############################
def generate_polynomial_random_noise (degree, coef, num_points):
"""
Inputs:
degree (an int): the highest degree in the polynomial
coef (a list or NumPy array): the length must be the same as
degree. The coefficients for each term in the polynomial.
num_points (an int): the number of points in the data set.
Returns:
x, y (NumPy arrays): the x and y components of the generated
polynomial
Generates a polynomial with random noise for testing purposes.
The maximum size of the random noise of 5% of the largest generated
y value.
"""
# Make sure inputs are all acceptable
assert isinstance(degree, int)
assert isinstance(num_points, int)
assert len(coef) == degree+1
# Creating the input and output data
x = np.random.randint(-10, 10, size=(num_points))
y = np.zeros(num_points)
for i in np.arange(0, degree+1):
y = y + coef[i]*x**i
# Add the noise
maxy = np.max(y)
if maxy < 0:
maxy = -1*maxy
y_5per = 0.05*maxy
if y_5per < 1:
y_5per = 1
#y = [i + np.random.randint(-y_5per, y_5per) for i in y]
return x, y