##Importing Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression, Lasso
import statsmodels.api as sm
from math import sqrt

##Extract Information given

In [None]:
import yaml

information = {}

with open('../yaml/information.yaml') as f:
    information = yaml.load(f)

information

{'benchmarks': ['557.xz r',
  '531.deepsjeng r',
  '520.omnetpp r',
  '526.blender r',
  '511.povray r',
  '507.cactuBSSN r'],
 'events': ['cycles',
  'instructions',
  'branch-misses',
  'L2-load-misses',
  'L2-store-misses',
  'L1-dcache-load-misses',
  'L1-icache-load-misses',
  'dTLB-load-misses',
  'dTLB-store-misses',
  'iTLB-load-misses',
  'branch-load-misses']}

##Dataset Read and fetching means of all events values

In [None]:
filename = '557.xz_r'

if filename not in information['benchmarks']:
  print('Filename does not matches with the benchmarks given!!!')
else:
  dataset = pd.read_csv('../data/' + filename + '.csv')
  columns_means_values = dict(dataset.describe(include='all').loc['mean'])


Filename does not matches with the benchmarks given!!!


##Unnecessary column removal

In [None]:
 
dataset_columns = list(dataset.columns)
dataset_columns = dataset_columns[2:] #Remove first two columns as its not needed

print(information['events'])
print(dataset_columns)

if set(information['events']) == set(dataset_columns[:-1]):
  print("Data given is correct")
else:
  print("Data given is not correct. Columns does not match!!!")

events_remove_from_dataset = ['instructions','cycles']
for event in events_remove_from_dataset:
  dataset_columns.remove(event)



['cycles', 'instructions', 'branch-misses', 'L2-load-misses', 'L2-store-misses', 'L1-dcache-load-misses', 'L1-icache-load-misses', 'dTLB-load-misses', 'dTLB-store-misses', 'iTLB-load-misses', 'branch-load-misses']
['L2-load-misses', 'dTLB-store-misses', 'L2-store-misses', 'branch-misses', 'branch-load-misses', 'L1-dcache-load-misses', 'instructions', 'dTLB-load-misses', 'cycles', 'iTLB-load-misses', 'L1-icache-load-misses', 'cpi']
Data given is correct


##Dependent variable extraction

In [None]:

cpi = dataset['cpi']

updated_dataset = dataset[dataset_columns[:-1]]
updated_dataset.columns = dataset_columns[:-1]

X = updated_dataset
y = cpi

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.167)



##Model training and prediction

In [None]:

regressor = Lasso(alpha=0.00001,positive=True,selection='random',normalize=True,max_iter=1000000)
reg = regressor.fit(X_train, y_train)


y_pred = reg.predict(X_test)

print(reg.coef_)


[3.09515048e+01 9.12345366e+01 8.62517450e+01 2.74719562e+01
 3.91497848e+00 1.02885073e+01 3.29845249e+01 7.63901810e+03
 1.92949728e+02]


##Error Analysis

In [None]:

rmse = sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:- ",rmse)
y1 = regressor.predict(X_train)
rsq = r2_score(y_train,y1) #R-Squared on the training data
print('R-square: ',rsq)
rad = 1-(((1-rsq)*(len(y_train)-1))/(len(y_train)-1-X_train.shape[1]))
print('Adjusted R-square: ', rad)
res = y_test-y_pred
print('Residuals :', res)
F = (rsq/(1-rsq))*((len(y_test)-1-X_test.shape[1])/X_test.shape[1])
print('F value :',F)
regressor.score(X,y)

residual_average = 0.0
for i in range(len(res)):
  residual_average += abs(res.iloc[i])

residual_average = residual_average/len(res)
print('Residuals avg: ', residual_average)

reg2 = sm.OLS(y_train, X_train).fit()

print(reg2.summary())


dataset_columns = updated_dataset.columns

coeff = {}
ind = 0

for column in dataset_columns:

  coeff[column] = reg.coef_[ind]
  ind += 1

b0 = reg.intercept_

cpi_cal = b0

for column in dataset_columns:
  cpi_cal += (coeff[column] * columns_means_values[column])

print('mean cpi - ', columns_means_values['cpi'])
print('calculated cpi - ', cpi_cal)


RMSE:-  0.01795266729744212
R-square:  0.9887537494317477
Adjusted R-square:  0.9886759506192079
Residuals : 275    -0.032604
1340    0.018773
223    -0.006755
711    -0.001197
951     0.004259
          ...   
698     0.006056
606    -0.003221
753     0.000584
548    -0.004283
1117    0.006284
Name: cpi, Length: 264, dtype: float64
F value : 2481.2561191098007
Residuals avg:  0.012649639065581924
                                 OLS Regression Results                                
Dep. Variable:                    cpi   R-squared (uncentered):                   0.979
Model:                            OLS   Adj. R-squared (uncentered):              0.979
Method:                 Least Squares   F-statistic:                              6801.
Date:                Tue, 08 Dec 2020   Prob (F-statistic):                        0.00
Time:                        14:49:57   Log-Likelihood:                          1345.1
No. Observations:                1311   AIC:                           