In [2]:
import pandas as pd
import numpy as np

In [3]:
## Creating load_data for deep hyper
# Combining V100 and P100 on same row for same run
# We are deleting cases where there is no run for either of the architectures
# Every column name is appended with the name of the architecture (e.g. "_V100");
# This includes the `master_index` (e.g `master_index_V100`)

# Load the joined P100/V100 table and cast both master-index columns to int64
# in one `astype` call. The columns are addressed by name (dict keys) instead
# of by attribute assignment (`df.col = ...`): pandas only supports attribute
# access for existing, identifier-like column names and discourages it for
# setting columns. These index columns are later used to select rows of the
# scaled NumPy array, so they must be integers.
df_joined = pd.read_parquet('df_master_joined.parquet').astype(
    {
        'master_index_P100': 'int64',  # Make sure index is integer
        'master_index_V100': 'int64',  # Make sure index is integer
    }
)
df_joined.shape

# Column-name reference: an "empty" dataframe (zero rows) whose columns
# cover the numerical data only.
# The column names can be used to index the columns of the
# scaled data (in master_scaled_data.npy).

df_columns_only = pd.read_parquet(path='df_column_reference.parquet')
df_columns_only


# Scaled numerical data from 'df_master.parquet', stored as a 2-D numpy array.
# The data has been scaled with the StandardScaler from scikit-learn.

# Notes:
#   - Row indices of the array correspond to the `master_index` column of
#     'df_master.parquet'.
#   - Columns of the array correspond to the columns in
#     'df_column_reference.parquet'
#     (e.g. `df_columns_only.columns.get_loc(column_name)` gives the column index).

master_data_scaled = np.load(file='master_scaled_data.npy')
master_data_scaled.shape

(78330, 116)

In [9]:
from sklearn.model_selection import train_test_split

# Work on a copy of the full joined table.
df = df_joined.copy()

# Inputs: rows of the scaled array selected by the P100 master index.
data_index = df['master_index_P100'].to_numpy()
data = master_data_scaled[data_index]

# Targets: rows of the scaled array selected by the V100 master index.
target_index = df['master_index_V100'].to_numpy()
target = master_data_scaled[target_index]

# Hold out 33% of the paired rows for testing; the fixed random_state makes
# the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.33,
    random_state=42,
)

In [10]:
# Report the shape of each split, one per line.
print(
    f"X_train.shape {X_train.shape}",
    f"y_train.shape {y_train.shape}",
    f"X_test.shape {X_test.shape}",
    f"y_test.shape {y_test.shape}",
    sep="\n",
)

X_train.shape (21634, 116)
y_train.shape (21634, 116)
X_test.shape (10657, 116)
y_test.shape (10657, 116)
