# Part 1: Generating the Master Dataset Files

In [1]:
import pandas as pd
import numpy as np

# Read in master CSV file
df = pd.read_csv('../mem_bound_all_data.csv', index_col = 0)

# Drop columns with NaN
df = df.dropna(axis=1,how='any')
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,inst_per_warp,kernelname,branch_efficiency,warp_execution_efficiency,warp_nonpred_execution_efficiency,inst_replay_overhead,shared_load_transactions_per_request,shared_store_transactions_per_request,local_load_transactions_per_request,local_store_transactions_per_request,...,single_precision_fu_utilization,double_precision_fu_utilization,flop_hp_efficiency,flop_sp_efficiency,flop_dp_efficiency,sysmem_read_utilization,sysmem_write_utilization,architecture,application_name,input
0,56.00056,bpnn_adjust_weights_cuda,1.0,0.999995,0.999994,0.002058,0.0,0.0,0.0,0.0,...,2,1,0.0,0.0,0.047434,0,1,P100,backprop,-100000_bpnn_adjust_weights_cuda
1,184.0,bpnn_layerforward_CUDA,1.0,0.943953,0.761888,0.000633,0.645833,0.696429,0.0,0.0,...,6,0,0.0,0.004695,0.0,0,1,P100,backprop,-100000_bpnn_layerforward_CUDA
2,56.0056,bpnn_adjust_weights_cuda,1.0,0.99995,0.999948,0.016234,0.0,0.0,0.0,0.0,...,3,2,0.0,0.0,0.043993,0,1,P100,backprop,-10000_bpnn_adjust_weights_cuda
3,184.0,bpnn_layerforward_CUDA,1.0,0.943953,0.761888,0.006918,0.645833,0.696429,0.0,0.0,...,6,0,0.0,0.003017,0.0,0,1,P100,backprop,-10000_bpnn_layerforward_CUDA
4,56.00056,bpnn_adjust_weights_cuda,1.0,0.999995,0.999994,0.002492,0.0,0.0,0.0,0.0,...,2,1,0.0,0.0,0.048306,0,1,P100,backprop,-100016_bpnn_adjust_weights_cuda


In [2]:
cols = df.columns.tolist()
cols

['inst_per_warp',
 'kernelname',
 'branch_efficiency',
 'warp_execution_efficiency',
 'warp_nonpred_execution_efficiency',
 'inst_replay_overhead',
 'shared_load_transactions_per_request',
 'shared_store_transactions_per_request',
 'local_load_transactions_per_request',
 'local_store_transactions_per_request',
 'gld_transactions_per_request',
 'gst_transactions_per_request',
 'shared_store_transactions',
 'shared_load_transactions',
 'local_load_transactions',
 'local_store_transactions',
 'gld_transactions',
 'gst_transactions',
 'sysmem_read_transactions',
 'sysmem_write_transactions',
 'l2_read_transactions',
 'l2_write_transactions',
 'dram_read_transactions',
 'dram_write_transactions',
 'global_hit_rate',
 'local_hit_rate',
 'gld_requested_throughput',
 'gst_requested_throughput',
 'gld_throughput',
 'gst_throughput',
 'local_memory_overhead',
 'tex_cache_hit_rate',
 'l2_tex_read_hit_rate',
 'l2_tex_write_hit_rate',
 'dram_read_throughput',
 'dram_write_throughput',
 'tex_cache_t

In [3]:
len(cols)

120

In [4]:
## Amount of kernels
len(df['kernelname'].value_counts())

38

In [5]:
# Define peak memory bandwidth p100 732
peak_mem_bw = {
    "V100": 898.048 * (1024*1024*1024),
    "P100": 749.0 * (1024*1024*1024),
}
mem_bw_thresh = 0.75

# Add a column specifying if the case is memory bound
df_archs = []
for arch in peak_mem_bw.keys():
    df_tmp = df[df["architecture"] == arch].copy()
    new_col = (
        df_tmp["dram_read_throughput"] + df_tmp["dram_write_throughput"]
    ) / peak_mem_bw[arch]
    new_col = new_col > mem_bw_thresh
    df_tmp["memory_bound"] = new_col
    df_archs.append(df_tmp.copy())
df_merged = pd.concat(df_archs).sort_index()
df_merged.head()

Unnamed: 0,inst_per_warp,kernelname,branch_efficiency,warp_execution_efficiency,warp_nonpred_execution_efficiency,inst_replay_overhead,shared_load_transactions_per_request,shared_store_transactions_per_request,local_load_transactions_per_request,local_store_transactions_per_request,...,double_precision_fu_utilization,flop_hp_efficiency,flop_sp_efficiency,flop_dp_efficiency,sysmem_read_utilization,sysmem_write_utilization,architecture,application_name,input,memory_bound
0,56.00056,bpnn_adjust_weights_cuda,1.0,0.999995,0.999994,0.002058,0.0,0.0,0.0,0.0,...,1,0.0,0.0,0.047434,0,1,P100,backprop,-100000_bpnn_adjust_weights_cuda,False
1,184.0,bpnn_layerforward_CUDA,1.0,0.943953,0.761888,0.000633,0.645833,0.696429,0.0,0.0,...,0,0.0,0.004695,0.0,0,1,P100,backprop,-100000_bpnn_layerforward_CUDA,False
2,56.0056,bpnn_adjust_weights_cuda,1.0,0.99995,0.999948,0.016234,0.0,0.0,0.0,0.0,...,2,0.0,0.0,0.043993,0,1,P100,backprop,-10000_bpnn_adjust_weights_cuda,False
3,184.0,bpnn_layerforward_CUDA,1.0,0.943953,0.761888,0.006918,0.645833,0.696429,0.0,0.0,...,0,0.0,0.003017,0.0,0,1,P100,backprop,-10000_bpnn_layerforward_CUDA,False
4,56.00056,bpnn_adjust_weights_cuda,1.0,0.999995,0.999994,0.002492,0.0,0.0,0.0,0.0,...,1,0.0,0.0,0.048306,0,1,P100,backprop,-100016_bpnn_adjust_weights_cuda,False


In [6]:
# Convert bool "memory_bound" column to integers
df_merged["memory_bound"]= df_merged["memory_bound"].astype('int')

In [7]:
# Here we have our master dataframe (df_merged).
# Assume the numerical data from this dataframe is used to
# scale everything (also leave out `memory_bound` column).
from sklearn.preprocessing import StandardScaler

# Helper funciton to return non-numerical column list
def _get_string_cols(df_in, str_cols=None):
    # Automatically detect non numerical columns
    str_cols = str_cols or []
    for col in df_in:
        if df_in[col].dtype == 'object':
            str_cols.append(col)
    return str_cols
        
# Convert numerical columns to out training/test
drop_cols = _get_string_cols(df_merged, ['memory_bound'])
df_col_ref = df_merged.drop(drop_cols, axis=1)
data_to_scale = df_col_ref.values
scaler = StandardScaler().fit(data_to_scale)
scaled_data_ = scaler.transform(data_to_scale)

# Add column to df_merged called 'master_index'
df_merged['master_index'] = [int(i) for i in range(len(df_merged.index))]

In [9]:
df_col_ref

Unnamed: 0,inst_per_warp,branch_efficiency,warp_execution_efficiency,warp_nonpred_execution_efficiency,inst_replay_overhead,shared_load_transactions_per_request,shared_store_transactions_per_request,local_load_transactions_per_request,local_store_transactions_per_request,gld_transactions_per_request,...,cf_fu_utilization,special_fu_utilization,half_precision_fu_utilization,single_precision_fu_utilization,double_precision_fu_utilization,flop_hp_efficiency,flop_sp_efficiency,flop_dp_efficiency,sysmem_read_utilization,sysmem_write_utilization
0,56.00056,1.0,0.999995,0.999994,0.002058,0.000000,0.000000,0.0,0.0,15.999891,...,1,0,0,2,1,0.0,0.000000,0.047434,0,1
1,184.00000,1.0,0.943953,0.761888,0.000633,0.645833,0.696429,0.0,0.0,12.000020,...,1,3,0,6,0,0.0,0.004695,0.000000,0,1
2,56.00560,1.0,0.999950,0.999948,0.016234,0.000000,0.000000,0.0,0.0,15.998914,...,1,0,0,3,2,0.0,0.000000,0.043993,0,1
3,184.00000,1.0,0.943953,0.761888,0.006918,0.645833,0.696429,0.0,0.0,12.000200,...,1,3,0,6,0,0.0,0.003017,0.000000,0,1
4,56.00056,1.0,0.999995,0.999994,0.002492,0.000000,0.000000,0.0,0.0,15.999891,...,1,0,0,2,1,0.0,0.000000,0.048306,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78325,123.00000,1.0,1.000000,0.952235,0.042877,0.000000,0.000000,0.0,0.0,4.000000,...,1,0,0,1,0,0.0,0.001050,0.000000,0,1
78326,120.00000,1.0,1.000000,0.951041,0.049354,0.000000,0.000000,0.0,0.0,4.000000,...,1,0,0,1,0,0.0,0.000000,0.000000,0,1
78327,121.00000,1.0,1.000000,0.951446,0.047521,0.000000,0.000000,0.0,0.0,4.000000,...,1,0,0,1,0,0.0,0.001094,0.000000,0,1
78328,123.00000,1.0,1.000000,0.952235,0.040356,0.000000,0.000000,0.0,0.0,4.000000,...,1,0,0,1,0,0.0,0.002153,0.000000,0,1


In [12]:

df_col_ref_2 = df_merged.drop(drop_cols, axis=1)
data_to_scale_2 = df_col_ref_2.values


data_test_scale = np.zeros(data_to_scale_2.shape)
print(data_test_scale)

for j in range(data_to_scale_2.shape[1]):
    scaler_2 = StandardScaler().fit(data_to_scale_2[:,j].reshape(-1, 1))
    scaled_data_2 = scaler_2.transform(data_to_scale_2[:,j].reshape(-1, 1))
    for i in range(len(scaled_data_2)):
        data_test_scale[i,j] = scaled_data_2[i]
data_test_scale


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


array([[-0.0635103 ,  0.38663447,  0.49002807, ...,  0.        ,
         0.        , -1.7320287 ],
       [-0.06273309,  0.38663447, -0.03722028, ...,  0.        ,
         0.        , -1.73198447],
       [-0.06351026,  0.38663447,  0.48960471, ...,  0.        ,
         0.        , -1.73194025],
       ...,
       [-0.06311562,  0.38663447,  0.49007511, ...,  0.        ,
         0.        ,  1.73194025],
       [-0.06310348,  0.38663447,  0.49007511, ...,  0.        ,
         0.        ,  1.73198447],
       [-0.06312776,  0.38663447,  0.49007511, ...,  0.        ,
         0.        ,  1.7320287 ]])

In [13]:
data_test_scale.max()

177.38292448377587

In [14]:
import matplotlib.pyplot as plt
import numpy as np
fig = plt.figure()
%matplotlib inline

plt.plot(scaled_data_)
#plt.xscale('log')
plt.yscale('log')

plt.show()
#for i in range(scaled_data_.shape[0]):
#    print(scaled_data_[i].max())

KeyboardInterrupt: 

In [15]:
# Lets create a dataframe (df_joined) with each row
# corresponding to a specific type of run.
# The V100 and P100 metrics are included in the same row,
# with `_V100` appended to the metric label for V100, etc.
# This means we have 2x the number of metrics for each row.
base = 'V100'
other = 'P100'

df_all = df_merged.copy()
unique_col = [] #column that has matches of kernels run on both architectures
for k, i in zip(df_all['kernelname'].values, df_all['input'].values):
    unique_col.append(k+'_'+i)
df_all['unique_index'] = unique_col
df_all.set_index('unique_index', inplace=True)

# Create 'base' and 'other' dataframes for join
df_b = df_all[df_all['architecture'] == base].copy()
df_o = df_all[df_all['architecture'] == other].copy()

# Final join operation, and drop rows with NaN elements
df_joined = df_b.join(df_o, lsuffix='_'+base, rsuffix='_'+other)
df_joined = df_joined.dropna(axis=0,how='any')
df_joined[['master_index_V100','master_index_P100']].head()

Unnamed: 0_level_0,master_index_V100,master_index_P100
unique_index,Unnamed: 1_level_1,Unnamed: 2_level_1
bpnn_adjust_weights_cuda_-100000_bpnn_adjust_weights_cuda,19996,0.0
bpnn_layerforward_CUDA_-100000_bpnn_layerforward_CUDA,19997,1.0
bpnn_adjust_weights_cuda_-10000_bpnn_adjust_weights_cuda,19998,2.0
bpnn_layerforward_CUDA_-10000_bpnn_layerforward_CUDA,19999,3.0
bpnn_adjust_weights_cuda_-100016_bpnn_adjust_weights_cuda,20000,4.0


In [16]:
# We now have our "master" dataframe with all of our data
# in one place.  In `df_merged` each row corresponds to a
# distinct run on a distinct architecture
print("df_merged.shape", df_merged.shape)

# `scaled_data_` now has our scaled data representation
# of all numerical data in `df_merged`.
# The row index of the 2-D numpy array is the same as
# the 'master_index' column of `df_merged`
print("scaled_data_.shape", scaled_data_.shape)

# Each row of `df_joined` has corresponds to a specific
# type of run (kernel + input), with both architectures
# stored in the same row. All column labels are appended
# with the name of the architecture (e.g. `'_V100'`)
# This means we have 2x the number of metrics for each row.
# Note: Column `'master_index_V100'` corresponds to the row
# in "scaled_data_" for V100 (same for '_P100')
print("df_joined.shape", df_joined.shape)

# We are assuming here that the columns of `df_col_ref`
# are ordered in the same way as `scaled_data_`.
df_col_ref[:0]

df_merged.shape (78330, 122)
scaled_data_.shape (78330, 116)
df_joined.shape (32291, 244)


Unnamed: 0,inst_per_warp,branch_efficiency,warp_execution_efficiency,warp_nonpred_execution_efficiency,inst_replay_overhead,shared_load_transactions_per_request,shared_store_transactions_per_request,local_load_transactions_per_request,local_store_transactions_per_request,gld_transactions_per_request,...,cf_fu_utilization,special_fu_utilization,half_precision_fu_utilization,single_precision_fu_utilization,double_precision_fu_utilization,flop_hp_efficiency,flop_sp_efficiency,flop_dp_efficiency,sysmem_read_utilization,sysmem_write_utilization


In [22]:
# Save df_merged to 'df_master.parquet'
df_merged.to_parquet('df_master.parquet')
df_merged.to_csv('df_master.csv')

# Save df_joined to 'df_master_joined.parquet'
df_joined.to_parquet('df_master_joined.parquet')

# Save df_col_ref to 'df_column_reference.parquet'
df_col_ref[:0].to_parquet('df_column_reference.parquet')

# Save scaled_data_ to 'master_scaled_data.npy'
np.save('master_scaled_data.npy', scaled_data_)

# Part 2: Using the Master Dataset Files

The code in "Part 1" does not need to be repeated in the future, because we wrote the primary results into persistent files (unless you want to add data to your master dataset, etc).

In [18]:
# Master dataframe with all columns, including `memory_bound` and `master_index`.
# Each row corresponds to distinct architecture and run

df_master = pd.read_parquet('df_master.parquet')
df_master.shape

(78330, 122)

In [19]:
# Combining V100 and P100 on same row for same run
# We are deleting cases where there is no run for either of the architectures
# Every column name is appended with the name of the architecture (e.g. "_V100");
# This includes the `master_index` (e.g `master_index_V100`)

df_joined = pd.read_parquet('df_master_joined.parquet')
df_joined.shape

(32291, 244)

In [20]:
# This is an "empty" dataframe (meaning no rows), containing
# column names for numerical data only.
# The column nmaes can be used to index the columns of the
# scaled data (in master_scaled_data.npy)

df_columns_only = pd.read_parquet('df_column_reference.parquet')
df_columns_only.shape

(0, 116)

In [21]:
# This is a 2-D numpy array corresponding to the numerical data in 'df_master.parquet'
# The data has been scaled using the StandardScaler in scikitlearn

# Notes: 
#   - The row indices correspond to the `master_index` column of 'df_master.parquet'
#   - The columns correspond to the columns in 'df_column_reference.parquet'.
#     (e.g. can use `df.get_loc(column-name)` to get the column index)

master_data_scaled = np.load('master_scaled_data.npy')
master_data_scaled.shape

(78330, 116)