In [1]:
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import numpy as np
import numbers

In [2]:
# Slot indices for the two fitting methods; the per-method result lists
# (e.g. coefs, predicted) are indexed with these.
NNLS, OLS = 0, 1

In [3]:
def get_mode_name(mode):
    '''Return the display name of a fitting mode.

    Args:
        mode: Mode selector; compared against the notebook-level OLS constant.

    Returns:
        'OLS' when mode equals OLS, otherwise 'NNLS'.
    '''
    # Decide from the argument itself so the result never depends on whatever
    # loop variable happens to exist in the notebook namespace.
    return 'OLS' if mode == OLS else 'NNLS'

def get_file_label(mode):
    '''Map a regressand name to a short, filesystem-friendly label.

    Returns 'Energy' when 'Energy' occurs in `mode`, and 'Cycles' otherwise.
    '''
    if 'Energy' in mode:
        return 'Energy'
    return 'Cycles'

def display_scores(scores):
    '''Print the scores followed by their mean and standard deviation.

    `scores` must provide .mean() and .std() (e.g. a NumPy array).
    '''
    # Each statistic is computed right before it is printed, one line per row.
    rows = (
        ("Scores:", lambda s: s),
        ("Mean:", lambda s: s.mean()),
        ("Standard deviation:", lambda s: s.std()),
    )
    for label, stat in rows:
        print(label, stat(scores))

In [4]:
# Load the measurement table; later cells read the regressand column, the
# regressor columns and the 'Bench' column from this frame.
# NOTE: the path is relative, so it resolves against the kernel's working directory.
df = pd.read_csv("stats_and_modeling/COMBINED/48/1/0/data-HW-and-TL.csv")

In [5]:
# Columns of df used as regressors, in the order the fitted coefficients are reported.
param_list = [
    'Executed insns (no MULS)',
    'MULS insns',
    'Taken branches',
    'RAM data reads',
    'RAM writes',
    'Flash data reads',
    'Flash insn reads',
    'BL insns',
    'PUSH/POP PC/LR',
]

In [6]:
# Name of the df column that is used as the regressand (y).
target_column = 'HW Cycles (IT adjusted)'

In [7]:
# Multiplicative factor applied to the regressand before fitting (1.0 leaves it unchanged).
adjust = 1.0

In [8]:
# Number of decimals the bounded-fit coefficients are rounded to; the cell that
# reads it skips rounding when this is None.
# NOTE(review): this rebinds the builtin round() for the rest of the notebook;
# renaming it requires updating the cell that reads it as well.
round = 5

In [9]:
# Pull the regressand column out of the frame as an array and show it.
y = df[target_column].values
print(y)

[8.73163200e+07 1.07736480e+08 5.18880000e+05 4.00933000e+08
 4.85280000e+07 3.06888000e+07 1.96536000e+07 1.10808000e+07
 4.73133600e+08 1.10771520e+08 1.26065760e+08 9.85972800e+07
 4.35984048e+09 7.06290000e+09 7.85467200e+07 4.79568000e+06
 3.00807840e+08 4.88630000e+07 3.05136000e+06 2.33052370e+10
 1.21296000e+07 1.71552000e+06 2.04345024e+09 1.25347008e+09
 3.46752000e+06 1.62528000e+06 3.28766400e+07 9.60000000e+05
 4.51644000e+08 1.91541600e+08 1.68254208e+09 9.10933920e+08
 4.88007552e+09 9.38337600e+07 2.73515000e+10 3.21152160e+08
 1.21051680e+08 2.43754000e+07 2.34907200e+07 5.86128000e+06
 5.53805000e+07 7.66747200e+07 5.76350400e+07 1.16400000e+06
 1.14739200e+07 3.31665600e+07 5.44980624e+09 2.33492000e+08
 4.98379536e+09 2.41238400e+07 1.53757440e+08 1.17700800e+07
 7.48320480e+09 8.34768000e+06 8.59372800e+07 9.59155200e+07
 7.29811200e+07 3.14754240e+08 2.91341280e+08 3.39794880e+08
 2.50866240e+08 1.40255040e+08 6.03558720e+08 2.43208320e+08
 1.26459005e+10 3.152730

In [10]:
# Adjust the regressand by the configured scale factor.
# NOTE: y is rebound from its own previous value, so re-running this cell
# applies the factor again.
y = y * adjust

In [11]:
# Python dict literal, held as a string, mapping parameter names to the
# coefficient each one is pinned to; "{}" pins nothing.
fixed = "{}"

In [12]:
# Turn the `fixed` string into a dict of {parameter name: pinned coefficient}.
# NOTE(review): eval() executes arbitrary Python; if `fixed` can ever come from
# outside this notebook, ast.literal_eval would be the safer parser — confirm.
param_value_dict = eval(fixed)
# View of the dict's keys, i.e. the names of the pinned parameters.
fixed_params = param_value_dict.keys()
# Filled by the following cell with the parameters that remain free to fit.
unconstrained_params = []

In [13]:
# Remove from y the contribution of every parameter whose coefficient was
# pinned to a number, and keep only the remaining parameters as regressors.
if param_value_dict:
    # Start from a fresh list on every run: param_list is rebound to this list
    # below, so appending to the old object while iterating param_list would
    # never terminate if the cell were executed a second time.
    unconstrained_params = []
    for param in param_list:
        # .get() yields None for parameters that are not pinned, and None is
        # not a Number, so both conditions collapse into one check.
        fixed_value = param_value_dict.get(param)
        if isinstance(fixed_value, numbers.Number):
            # Subtract the contribution of the param from the Y vector
            contribution = df.loc[:, param].values * fixed_value
            residual = y - contribution
            print('')
            print("Ratio of residual/original")
            print(residual / y)
            print('')
            y = residual
        else:
            unconstrained_params.append(param)
    # Reset param list to the free-running parameters only.
    param_list = unconstrained_params

In [26]:
# Regressor matrix: one column per entry of param_list, in that order.
x = df[param_list].values
print(x)

[[64561161        0  9052161 ... 36368388   303105   299008]
 [77611017  1081344  4521985 ... 42037252   270337   266240]
 [  274441        0    69633 ...   184324    12289     8192]
 ...
 [ 1241011     4200   136003 ...   708506     1001      600]
 [13199515  1390200  1507904 ...  7735108     1001      600]
 [19311311    40200  2117703 ... 11016506     1001      600]]


In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
from sklearn.base import clone

In [16]:
# Estimate the out-of-sample error of the unconstrained fit with 10-fold
# cross-validation, printing the coefficients and scores of every fold.
regressor = LinearRegression(fit_intercept=False)
# A fixed seed keeps the shuffled fold assignment, and therefore the printed
# per-fold scores, identical from one run of the notebook to the next.
rkf = RepeatedKFold(n_splits=10, n_repeats=1, random_state=0)
scrs = []
count = 0
for count, (train_index, test_index) in enumerate(rkf.split(x), start=1):
    # Fit an unfitted copy so no state carries over between folds.
    clone_regressor = clone(regressor)
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clone_regressor.fit(X_train, y_train)
    y_pred = clone_regressor.predict(X_test)
    # Fold-specific names: the notebook later binds `mse` and `rmse` to
    # per-method lists, which this loop must not overwrite when re-run.
    fold_mse = mean_squared_error(y_test, y_pred)
    fold_rmse = np.sqrt(fold_mse)
    scrs.append(fold_rmse)
    print("")
    print(f"-------Model using Fold No {count}-------")
    print("Coef: " + str(clone_regressor.coef_))
    print("Score: " + str(fold_rmse))
    print("R2 Score: " + str(r2_score(y_test, y_pred)))


-------Model using Fold No 1-------
Coef: [0.98777416 1.00319263 2.00646011 0.99653949 1.00891084 0.99402997
 0.01485318 0.92286535 1.03637941]
Score: 490659.1926598857
R2 Score: 0.9999992604707232

-------Model using Fold No 2-------
Coef: [0.99385605 1.00344452 2.00746889 0.99819099 1.00534879 0.99537279
 0.004107   0.87889436 1.06670761]
Score: 1380716.8840697845
R2 Score: 0.9999997436135263

-------Model using Fold No 3-------
Coef: [0.98650345 1.00174827 2.00652426 0.99717642 1.01366091 0.98920752
 0.0168086  0.90246316 1.07950319]
Score: 909149.7937060053
R2 Score: 0.999999753788797

-------Model using Fold No 4-------
Coef: [0.98899155 0.99975834 2.00049634 0.9987508  1.00257162 0.98858037
 0.01388548 0.94788958 1.04377647]
Score: 584954.5252588143
R2 Score: 0.9999998336440497

-------Model using Fold No 5-------
Coef: [0.98962122 1.00102727 2.00312652 0.99843005 1.00509811 0.99152094
 0.0121476  0.94049977 1.03598358]
Score: 43067.3379007102
R2 Score: 0.9999999794984502

-----

In [17]:
# Convert the per-fold RMSE list to an array so display_scores can call
# .mean() and .std() on it.
scrs_array = np.asarray(scrs)
display_scores(scrs_array)

Scores: [ 490659.19265989 1380716.88406978  909149.79370601  584954.52525881
   43067.33790071 1496670.59289686  264965.34853819 1267810.69104451
  152120.03581225   48306.98848295]
Mean: 663842.1390369962
Standard deviation: 534218.0415918153


In [18]:
from sklearn.model_selection import cross_val_score
# Evaluate score by cross validation
regressor2 = LinearRegression(fit_intercept=False)
scores = cross_val_score(regressor2, x, y, scoring="neg_mean_squared_error", cv=10)
# The scorer reports negated MSE values, so flip the sign before taking the root.
try:
    rmse_scores = np.sqrt(-scores)
except (TypeError, ValueError):
    # Show the offending values, then stop: carrying on would reach
    # display_scores with an undefined (or stale) rmse_scores.
    print("### np.sqrt(-scores) failed, scores = " + str(scores))
    raise
display_scores(rmse_scores)

Scores: [ 586006.57272413 1570941.83569659 2498356.61550217 1271106.334445
  174371.71746824  103857.87057609  165611.47428437  119682.43460323
   52809.12027529   30558.75104579]
Mean: 657330.272662089
Standard deviation: 802159.1993910861


In [19]:
# Two-slot holders (one slot per fitting method); each name gets its own list.
coefs, predicted, outliers = ([None] * 2 for _ in range(3))

In [20]:
# Fit the unconstrained least-squares model (no intercept) on the complete
# data set, show its coefficients, and store the results in the OLS slot.
print()
print("-------Final Model using all data  -------")
regressor3 = LinearRegression(fit_intercept=False).fit(x, y)
pred = regressor3.predict(x)
print(param_list)
# Widen the print width so the coefficient vector stays on one line.
with np.printoptions(linewidth=200):
    print(regressor3.coef_)
coefs[OLS], predicted[OLS] = regressor3.coef_, pred


-------Final Model using all data  -------
['Executed insns (no MULS)', 'MULS insns', 'Taken branches', 'RAM data reads', 'RAM writes', 'Flash data reads', 'Flash insn reads', 'BL insns', 'PUSH/POP PC/LR']
[0.98963003 1.001042   2.00309931 0.99842728 1.00505224 0.99153485 0.01213653 0.94068954 1.03602931]


In [21]:
# Use constrained non-negative coefficients

print("Coefficients constrained to non-negative values, least-squares method")
from scipy.optimize import lsq_linear

# Bounds [0, +inf) apply to every coefficient.
lb = 0
# np.inf is the supported spelling; the np.Inf alias was removed in NumPy 2.0.
ub = np.inf
res = lsq_linear(x, y, bounds=(lb, ub))

# Round the coefficients if requested to.
# `round` here is the notebook-level setting (number of decimals, or None),
# not the builtin function.
print(param_list)
if round is not None:
    res.x = np.round(res.x, round)

# Widen the print width so the coefficient vector stays on one line.
with np.printoptions(linewidth=200):
    print(res.x)

Coefficients constrained to non-negative values, least-squares method
['Executed insns (no MULS)', 'MULS insns', 'Taken branches', 'RAM data reads', 'RAM writes', 'Flash data reads', 'Flash insn reads', 'BL insns', 'PUSH/POP PC/LR']
[0.98963 1.00104 2.0031  0.99843 1.00505 0.99153 0.01214 0.94069 1.03603]


In [22]:
# Record the bounded fit in the NNLS slot; its predictions are the regressor
# matrix multiplied by the coefficient vector.
coefs[NNLS] = res.x
predicted[NNLS] = x @ res.x

In [23]:
# Relative-error cut-off (5%, stored as a fraction) above which a benchmark is
# listed as an outlier.
threshold = 5 / 100.0

In [24]:
# One two-slot list per reported metric (one slot per fitting method); the
# generator builds a separate list for every name, so none of them are aliased.
(
    mean_abs_percentage_error,
    percentage_error_vect,
    mean_percentage_error,
    median_percentage_error,
    mean_squared_RE,
    rmsre,
    stddev_abs_percentage_error,
    stddev_relative_error,
    mse,
    rmse,
) = ([None] * 2 for _ in range(10))

In [25]:
# Report relative- and absolute-error statistics for each fitting method and
# list the benchmarks whose relative error exceeds `threshold`.
for i in [NNLS, OLS]:
    mode_name = get_mode_name(i)
    # Element-wise predicted/actual ratio; 1.0 means a perfect prediction.
    ratio = predicted[i] / y
    ones = y / y

    # One entry per benchmark: a (bench, predicted, actual, error in %) tuple
    # when the relative error exceeds the threshold, otherwise None.
    outliers[i] = [
        (bench, pred_val, actual, 100 * (pred_val - actual) / actual)
        if abs(pred_val - actual) / actual > threshold else None
        for (bench, pred_val, actual) in zip(df.loc[:, 'Bench'], predicted[i], y)
    ]

    # Determine and print mean(abs(relative error)).
    mean_abs_percentage_error[i] = mean_absolute_error(ones, ratio)
    print("MAPE_%s = %.5f%%" % (mode_name, mean_abs_percentage_error[i] * 100.0))

    # Determine and print mean(percentage error).
    percentage_error_vect[i] = ratio - ones
    mean_percentage_error[i] = (percentage_error_vect[i]).mean()
    print("MEAN(percentage_error_%s) = %.5f%%" % (mode_name, mean_percentage_error[i] * 100.0))

    # Determine and print the median error.
    median_percentage_error[i] = np.median(percentage_error_vect[i])
    print("MEDIAN(percentage_error_%s) = %.5f%%" % (mode_name, median_percentage_error[i] * 100.0))

    # Determine and print root of mean square relative error.
    mean_squared_RE[i] = mean_squared_error(ones, ratio)
    rmsre[i] = np.sqrt(mean_squared_RE[i])
    print("rootMSRE_%s = %.5f%%" % (mode_name, rmsre[i] * 100.0))

    # NOTE(review): this measures the spread of the *signed* relative error
    # around the mean *absolute* error — confirm that is the intended definition.
    stddev_abs_percentage_error[i] = np.sqrt(mean_squared_error(np.full(y.shape, mean_abs_percentage_error[i]), ratio - 1.0))
    print("STDDEV(MAPE_%s) = %.5f%%" % (mode_name, stddev_abs_percentage_error[i] * 100.0))

    # Spread of the signed relative error around its own mean.
    stddev_relative_error[i] = np.sqrt(mean_squared_error(np.full(y.shape, mean_percentage_error[i]), ratio - 1.0))
    print("STDDEV(percentage_error_%s) = %.5f%%" % (mode_name, stddev_relative_error[i] * 100.0))

    mse[i] = mean_squared_error(y, predicted[i])
    rmse[i] = np.sqrt(mse[i])
    # Print this method's value only, not the whole two-slot list.
    print(("RMSE Score %s:" % mode_name) + str(rmse[i]))
    print(("R2 Score %s:" % mode_name) + str(r2_score(y, predicted[i])))

    flagged = [elt for elt in outliers[i] if elt]
    print("List of %d/%d outliers using %s at threshold %.2f%% (predicted, actual, error in %%):" % (len(flagged), len(outliers[i]), mode_name, threshold*100.0))
    print("=================================================")
    for elt in flagged:
        print("%s: %.9f, %.9f, %5.2f%%" % elt)


MAPE_NNLS = 0.08754%
MEAN(percentage_error_NNLS) = 0.06642%
MEDIAN(percentage_error_NNLS) = 0.01757%
rootMSRE_NNLS = 0.29271%
STDDEV(MAPE_NNLS) = 0.28585%
STDDEV(percentage_error_NNLS) = 0.28507%
RMSE Score NNLS:[334206.27722814295, None]
R2 Score NNLS:0.9999999866274003
List of 0/229 outliers using NNLS at threshold 5.00% (predicted, actual, error in %):
MAPE_OLS = 0.08749%
MEAN(percentage_error_OLS) = 0.06627%
MEDIAN(percentage_error_OLS) = 0.01742%
rootMSRE_OLS = 0.29267%
STDDEV(MAPE_OLS) = 0.28586%
STDDEV(percentage_error_OLS) = 0.28507%
RMSE Score OLS:[334206.27722814295, 334180.41482944106]
R2 Score OLS:0.99999998662947
List of 0/229 outliers using OLS at threshold 5.00% (predicted, actual, error in %):
