In [1]:
!git clone https://github.com/afrenkai/DS-3010-Final.git

Cloning into 'DS-3010-Final'...
remote: Enumerating objects: 80, done.[K
remote: Counting objects: 100% (80/80), done.[K
remote: Compressing objects: 100% (56/56), done.[K
remote: Total 80 (delta 27), reused 61 (delta 17), pack-reused 0 (from 0)[K
Receiving objects: 100% (80/80), 16.16 MiB | 14.63 MiB/s, done.
Resolving deltas: 100% (27/27), done.


In [2]:
%cd DS-3010-Final

/content/DS-3010-Final


In [61]:
!ls
!pip install torcheval

balls.pth  data.ipynb  main.py		    requirements.txt
cv.py	   LICENSE     Preprocessing.ipynb  setup.bat
Data	   Main.ipynb  README.md	    sgemm_product.csv
Collecting torcheval
  Downloading torcheval-0.0.7-py3-none-any.whl.metadata (8.6 kB)
Downloading torcheval-0.0.7-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.2/179.2 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torcheval
Successfully installed torcheval-0.0.7


In [62]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import lightgbm as lgb

import torcheval.metrics as tm

In [5]:
train_df = pd.read_csv('Data/SGEMM_train.csv')
val_df = pd.read_csv('Data/SGEMM_val.csv')

# preprocessing

In [6]:
def norm(x, xmin, xmax, a, b):
  '''
  Restricts x values to range of [xmin, xmax]
  '''
  numerator = x - xmin
  denominator = xmax - xmin
  return (numerator / denominator) * (b - a) + a

In [7]:
cols_to_combine = ['Run1 (ms)', 'Run2 (ms)', 'Run3 (ms)', 'Run4 (ms)']

In [8]:
def preprocess(df: pd.DataFrame):
  df['DELTA_RUNTIME'] = df.apply(
      lambda row: np.mean([row['Run1 (ms)'], row['Run2 (ms)'], row['Run3 (ms)'], row['Run4 (ms)']]),
      axis=1
  )
  for col in df.columns:
    if col in cols_to_combine:
      df = df.drop(col, axis = 1)
  min = 0
  max = 1


  df = df.apply(
      lambda row: (norm(row, row.min(), row.max(), min, max))
  )
  x = df.iloc[:, :14]
  y = df.iloc[:, -1:]
  return x, y


# LightGBM

In [44]:
from sklearn.metrics import r2_score
x_tr, y_tr = preprocess(train_df)

train_data = lgb.Dataset(x_tr, label=y_tr)
x_val, y_val = preprocess(val_df)
# Create a LightGBM dataset for testing with features X_val and labels Y_val,
# and specify the reference dataset as train_data for consistent evaluation
val_data = lgb.Dataset(x_val, label=y_val, reference=train_data)
params = {
    'objective': 'regression',
    'metric': 'mse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
}

num_round = 100
bst = lgb.train(params, train_data, num_round, valid_sets=[
                val_data])

from sklearn.metrics import mean_squared_error as mse
from lightgbm import LGBMRegressor

# Create an instance of the LightGBM Regressor with the RMSE metric.
model = LGBMRegressor(metric='mse')

# Train the model using the training data.
model.fit(x_tr, y_tr)

y_train = model.predict(x_tr)
y_v = model.predict(x_val)
print("Training MSE:", mse(y_tr, y_train))
print("Validation MSE:", mse(y_val, y_v))

print('train r2:', r2_score(y_tr, y_train))
print('val r2:', r2_score(y_val, y_v))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009292 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 40
[LightGBM] [Info] Number of data points in the train set: 193280, number of used features: 14
[LightGBM] [Info] Start training from score 0.061354
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009052 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 40
[LightGBM] [Info] Number of data points in the train set: 193280, number of used features: 14
[LightGBM] [Info] Start training from score 0.061354
Training MSE: 0.00012048252219381041
Validation MSE: 0.0001191615359407208
train r2: 0.9901529624802171
val r2: 0.9903041142135522


# Random Forest

In [46]:
#Random Forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

#Want to predict Delta_Runtime via random forest
X = train_df.drop('DELTA_RUNTIME', axis=1)
y = train_df['DELTA_RUNTIME']


#Create Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=69, verbose = 1, n_jobs = 16)
#Fit Random Forest
rf.fit(X, y)
#See validation error based on validation set
y_pred = rf.predict(val_df.drop('DELTA_RUNTIME', axis=1))
mse = mean_squared_error(val_df['DELTA_RUNTIME'], y_pred)
print(f'Validation MSE: {mse}')
#Use CV to find best random forest





[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:   48.1s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:  2.5min finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.3s


Validation MSE: 0.14641877729097943


[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    1.0s finished


# Simple Neural Net

In [10]:
class GPUNN(nn.Module):
  def __init__(self, in_feat, out_feat):
    super(GPUNN, self).__init__()
    self.device = 'cuda' if torch.cuda.is_available else 'cpu'
    self.L1 = nn.Linear(in_feat, 64, device=self.device)
    self.L2 = nn.Linear(64, out_feat, device = self.device)
    self.relu = nn.ReLU()
    self.bn1 = nn.BatchNorm1d(64, device = self.device)
    self.bn2 = nn.BatchNorm1d(out_feat, device = self.device)

  def forward(self, x):
    x = self.bn1(self.relu(self.L1(x)))
    x = self.bn2(self.L2(x))
    return x



In [51]:
def train(model: nn.Module, train_dl: DataLoader, batch_size, device, n_epochs, optimizer, criterion):
  model.train()
  for batch, (data, target) in enumerate(train_dl):

    data, target = data.to(device).float(), target.to(device).float()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out, target)
    loss.backward()
    optimizer.step()
    if batch % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                n_epochs, batch * len(data), len(train_dl.dataset),
                100. * batch / len(train_dl), loss.item()))

  torch.save(model.state_dict(), 'balls.pth')




In [81]:
def test(model, device, test_loader, criterion):
    model.eval()
    test_loss = 0
    losses = []
    r2s = []
    r2 = tm.R2Score().to(device)
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device).float(), target.to(device).float()
            output = model(data)
            test_loss = criterion(output, target)
            losses.append(test_loss)
            r2 = tm.R2Score()
            r2.update(output, target)
        r2s.append(r2.compute())
    # print(test_loss / len(test_loader.dataset))
    print(type(r2s[0].detach().cpu().numpy()))
    return (np.mean([ten.detach().cpu().numpy() for ten in losses]), r2s)






In [49]:
def create_dls(x: pd.DataFrame, y:pd.DataFrame):
  x = x.loc[:, :].values
  y = y.loc[:, :].values
  x_ten = torch.tensor(x)
  y_ten = torch.tensor(y)
  ds = TensorDataset(x_ten, y_ten)
  dl = DataLoader(ds, batch_size = 32)
  return ds, dl

In [64]:
x_tr, y_tr = preprocess(train_df)
model = GPUNN(len(x_tr.columns), len(y_tr.columns))
print(model)
_, train_dl = create_dls(x_tr, y_tr)
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-4)
train(model, train_dl, 32, model.device, 10, optimizer, criterion)

GPUNN(
  (L1): Linear(in_features=14, out_features=64, bias=True)
  (L2): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
  (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [82]:
x_val, y_val = preprocess(val_df)
_, val_dl = create_dls(x_val, y_val)
criterion = nn.MSELoss()
print(f'Neural Network Validation Mean Squared Error: {test(model, model.device, val_dl, criterion)}')

<class 'numpy.ndarray'>
Neural Network Validation Mean Squared Error: (np.float32(0.13344504), [tensor(-13.4533, device='cuda:0')])


In [72]:
model.load_state_dict(torch.load('balls.pth', weights_only=True))
# Print model's state_dict
print("Model's state_dict:")
model.state_dict()

Model's state_dict:


OrderedDict([('L1.weight',
              tensor([[-1.8914e-03,  2.3810e-01,  8.4527e-02,  1.7204e-01, -2.5414e-01,
                       -4.6591e-03,  1.2225e-01, -6.8267e-02,  6.5934e-04, -2.4809e-01,
                       -1.1737e-01, -7.3755e-02,  1.4262e-01,  1.5098e-01],
                      [ 1.1689e-01,  4.6500e-02, -2.2184e-01,  3.0327e-02,  1.1563e-01,
                       -8.3968e-02,  2.7062e-01, -1.7804e-01,  1.6714e-01,  2.2119e-01,
                       -2.0030e-01, -8.6430e-02, -1.6962e-01, -1.8974e-01],
                      [ 1.1384e-01, -1.8544e-01,  1.5184e-01,  1.4528e-01, -1.1466e-01,
                        2.2361e-01, -1.6375e-01,  1.7745e-01,  9.2666e-02,  2.4153e-01,
                       -2.6441e-02, -1.9462e-01, -1.8542e-01,  3.3772e-02],
                      [-5.4599e-03, -2.0889e-01, -1.4373e-02,  1.0746e-01, -1.9390e-01,
                        1.4463e-02,  2.2057e-01,  1.9347e-01, -2.7768e-01,  2.4790e-01,
                       -2.0094e-01, -1.16