# Imports and Mounting

In [1]:
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
from scipy.stats import beta
from sklearn.impute import SimpleImputer

from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)

Mounted at /content/gdrive


# Downloading all files

## Iterating through A100

In [2]:
%cd '/content/gdrive/MyDrive/benchmarks_bs1/A100/chat'

/content/gdrive/MyDrive/benchmarks_bs1/A100/chat


In [3]:
A100_subfolders = [f.path for f in os.scandir() if f.is_dir()]

In [4]:
A100_subfolders

['./h2oai--h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2',
 './tatsu-lab--alpaca-7B',
 './project-baize--baize-v2-7B',
 './metaai--llama-13B',
 './openaccess-ai-collective--manticore-13b-chat-pyg',
 './BAIR--koala-7b',
 './lmsys--vicuna-7B',
 './Neutralzz--BiLLa-7B-SFT',
 './BAIR--koala-13b',
 './nomic-ai--gpt4all-13b-snoozy',
 './togethercomputer--RedPajama-INCITE-7B-Chat',
 './OpenAssistant--oasst-sft-1-pythia-12b',
 './camel-ai--CAMEL-13B-Combined-Data',
 './metaai--llama-7B',
 './Salesforce--xgen-7b-8k-inst',
 './FreedomIntelligence--phoenix-inst-chat-7b',
 './StabilityAI--stablelm-tuned-alpha-7b',
 './metaai--Llama-2-7b-chat-hf',
 './metaai--Llama-2-13b-chat-hf',
 './databricks--dolly-v2-12b',
 './lmsys--vicuna-13B',
 './lmsys--fastchat-t5-3b-v1.0']

In [5]:
A100_files = {}

for folder in A100_subfolders:
    files = os.listdir(folder)
    for file in files:
        file_path = os.path.join(folder, file)
        print(f"Processing file: {file_path}")
        # Do something with the file, e.g., open and read its contents
        with open(file_path, 'r') as f:
            A100_files[folder] = {}
            A100_files[folder]['model'] = pd.read_json(f)
            # Process the content as needed

Processing file: ./h2oai--h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2/benchmark_batch_1.json
Processing file: ./tatsu-lab--alpaca-7B/benchmark_batch_1.json
Processing file: ./project-baize--baize-v2-7B/benchmark_batch_1.json
Processing file: ./metaai--llama-13B/benchmark_batch_1.json
Processing file: ./openaccess-ai-collective--manticore-13b-chat-pyg/benchmark_batch_1.json
Processing file: ./BAIR--koala-7b/benchmark_batch_1.json
Processing file: ./lmsys--vicuna-7B/benchmark_batch_1.json
Processing file: ./Neutralzz--BiLLa-7B-SFT/benchmark_batch_1.json
Processing file: ./BAIR--koala-13b/benchmark_batch_1.json
Processing file: ./nomic-ai--gpt4all-13b-snoozy/benchmark_batch_1.json
Processing file: ./togethercomputer--RedPajama-INCITE-7B-Chat/benchmark_batch_1.json
Processing file: ./OpenAssistant--oasst-sft-1-pythia-12b/benchmark_batch_1.json
Processing file: ./camel-ai--CAMEL-13B-Combined-Data/benchmark_batch_1.json
Processing file: ./metaai--llama-7B/benchmark_batch_1.json
P

## Iterating through A40

In [6]:
%cd '/content/gdrive/MyDrive/benchmarks_bs1/A40/chat'

/content/gdrive/MyDrive/benchmarks_bs1/A40/chat


In [7]:
A40_subfolders = [f.path for f in os.scandir() if f.is_dir()]

In [8]:
A40_files = {}

for folder in A40_subfolders:
    files = os.listdir(folder)
    for file in files:
        file_path = os.path.join(folder, file)
        print(f"Processing file: {file_path}")
        # Do something with the file, e.g., open and read its contents
        with open(file_path, 'r') as f:
            A40_files[folder] = {}
            A40_files[folder]['model'] = pd.read_json(f)
            # Process the content as needed

Processing file: ./BAIR--koala-7b/benchmark_batch_1.json
Processing file: ./h2oai--h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2/benchmark_batch_1.json
Processing file: ./openaccess-ai-collective--manticore-13b-chat-pyg/benchmark_batch_1.json
Processing file: ./tatsu-lab--alpaca-7B/benchmark_batch_1.json
Processing file: ./metaai--llama-13B/benchmark_batch_1.json
Processing file: ./project-baize--baize-v2-7B/benchmark_batch_1.json
Processing file: ./lmsys--vicuna-7B/benchmark_batch_1.json
Processing file: ./Neutralzz--BiLLa-7B-SFT/benchmark_batch_1.json
Processing file: ./togethercomputer--RedPajama-INCITE-7B-Chat/benchmark_batch_1.json
Processing file: ./BAIR--koala-13b/benchmark_batch_1.json
Processing file: ./Salesforce--xgen-7b-8k-inst/benchmark_batch_1.json
Processing file: ./nomic-ai--gpt4all-13b-snoozy/benchmark_batch_1.json
Processing file: ./metaai--llama-7B/benchmark_batch_1.json
Processing file: ./FreedomIntelligence--phoenix-inst-chat-7b/benchmark_batch_1.json
Pro

In [9]:
A40_files.keys()

dict_keys(['./BAIR--koala-7b', './h2oai--h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2', './openaccess-ai-collective--manticore-13b-chat-pyg', './tatsu-lab--alpaca-7B', './metaai--llama-13B', './project-baize--baize-v2-7B', './lmsys--vicuna-7B', './Neutralzz--BiLLa-7B-SFT', './togethercomputer--RedPajama-INCITE-7B-Chat', './BAIR--koala-13b', './Salesforce--xgen-7b-8k-inst', './nomic-ai--gpt4all-13b-snoozy', './metaai--llama-7B', './FreedomIntelligence--phoenix-inst-chat-7b', './camel-ai--CAMEL-13B-Combined-Data', './StabilityAI--stablelm-tuned-alpha-7b', './BlinkDL--RWKV-4-Raven-7B-v12-Eng98%-Other2%-20230521-ctx8192.pth', './metaai--Llama-2-13b-chat-hf', './OpenAssistant--oasst-sft-1-pythia-12b', './metaai--Llama-2-7b-chat-hf', './databricks--dolly-v2-12b', './lmsys--vicuna-13B', './lmsys--fastchat-t5-3b-v1.0'])

## Iterating through V100

In [10]:
%cd '/content/gdrive/MyDrive/benchmarks_bs1/V100/chat'

/content/gdrive/MyDrive/benchmarks_bs1/V100/chat


In [11]:
V100_subfolders = [f.path for f in os.scandir() if f.is_dir()]

In [12]:
V100_files = {}

for folder in V100_subfolders:
    files = os.listdir(folder)
    for file in files:
        file_path = os.path.join(folder, file)
        print(f"Processing file: {file_path}")
        # Do something with the file, e.g., open and read its contents
        with open(file_path, 'r') as f:
            V100_files[folder] = {}
            V100_files[folder]['model'] = pd.read_json(f)
            # Process the content as needed

Processing file: ./project-baize--baize-v2-7B/benchmark_batch_1.json
Processing file: ./tatsu-lab--alpaca-7B/benchmark_batch_1.json
Processing file: ./lmsys--vicuna-7B/benchmark_batch_1.json
Processing file: ./h2oai--h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2/benchmark_batch_1.json
Processing file: ./metaai--llama-13B/benchmark_batch_1.json
Processing file: ./openaccess-ai-collective--manticore-13b-chat-pyg/benchmark_batch_1.json
Processing file: ./BAIR--koala-7b/benchmark_batch_1.json
Processing file: ./Neutralzz--BiLLa-7B-SFT/benchmark_batch_1.json
Processing file: ./metaai--Llama-2-13b-chat-hf/benchmark_batch_1.json
Processing file: ./StabilityAI--stablelm-tuned-alpha-7b/benchmark_batch_1.json
Processing file: ./OpenAssistant--oasst-sft-1-pythia-12b/benchmark_batch_1.json
Processing file: ./nomic-ai--gpt4all-13b-snoozy/benchmark_batch_1.json
Processing file: ./BAIR--koala-13b/benchmark_batch_1.json
Processing file: ./togethercomputer--RedPajama-INCITE-7B-Chat/benchmark_

# Functions for modeling and graphing

In [13]:
def plot_length_energy(dictionary, model_name, color='c', showplot=True):
  model_data = dictionary[model_name]['model']

  plt.scatter(model_data["response_length"], model_data["energy"], alpha=0.2, s=5, color=color)
  plt.legend([model_name], loc='upper left')

  if showplot:
    plt.show()

In [14]:
def make_linear_model(dictionary, model_name):
  model_data = dictionary[model_name]['model']

  X = model_data["response_length"]
  y = model_data["energy"]

  #X = sm.add_constant(X)

  model = sm.OLS(y, X).fit()

  dictionary[model_name]['linear X'] = model.params[0]

  return model

In [15]:
def make_quadratic_model(dictionary, model_name):
  model_data = dictionary[model_name]['model']

  X = model_data["response_length"]
  y = model_data["energy"]

  df = pd.DataFrame({'X': X, 'X_squared': X**2, 'y': y})
  #X_quad = sm.add_constant(df[['X', 'X_squared']])

  model = sm.OLS(df['y'], df[['X', 'X_squared']]).fit()

  dictionary[model_name]['quad X'] = model.params[0]
  dictionary[model_name]['quad X2'] = model.params[1]

  return model

In [16]:
def make_cubic_model(dictionary, model_name):
  model_data = dictionary[model_name]['model']

  X = model_data["response_length"]
  y = model_data["energy"]

  df = pd.DataFrame({'X': X, 'X_squared': X**2, 'X_cubed': X**3,'y': y})

  model = sm.OLS(df['y'], df[['X', 'X_squared', 'X_cubed']]).fit()

  dictionary[model_name]['cubic X'] = model.params[0]
  dictionary[model_name]['cubic X2'] = model.params[1]
  dictionary[model_name]['cubic X3'] = model.params[2]

  return model

In [17]:
def plot_resid(model):
  # Obtain residuals
  residuals = model.resid

  # Plot residuals
  plt.scatter(model.fittedvalues, residuals)
  plt.xlabel('Fitted values')
  plt.ylabel('Residuals')
  plt.title('Residual Plot')
  plt.axhline(y=0, color='r', linestyle='--')  # Add a horizontal line at y=0 for reference
  plt.show()

# Determining best polynomial using CV

In [18]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

np.random.seed(42)

# Set the maximum degree of the polynomial
max_degree = 10
best_degree_count = [0] * 10

y_sum = []
poly_mses = []

#
# V100
#

for name in V100_subfolders:
  # extract the model data
  model_data = V100_files[name]['model']

  # subset variables
  X = model_data["response_length"].values.reshape(-1, 1)
  y = model_data["energy"]

  # Perform k-fold cross-validation for each degree
  cv_scores = []
  kf = KFold(n_splits=5, shuffle=True, random_state=42)

  for degree in range(1, max_degree + 1):

    # Create polynomial features
    poly = PolynomialFeatures(degree=degree)
    X_poly = poly.fit_transform(X)

    # Fit the linear regression model
    model = LinearRegression(fit_intercept=False)

    # Calculate cross-validation score
    score = np.mean(cross_val_score(model, X_poly, y, cv=kf, scoring='neg_mean_squared_error'))
    cv_scores.append(score)

  # record metrics
  y_sum.append(np.mean(y))
  poly_mses.append(cv_scores)
  # Find the degree with the highest cross-validation score
  best_degree = np.argmax(cv_scores) + 1
  best_degree_count[best_degree - 1] += 1

#
# A100
#

for name in A100_subfolders:
  # extract the model data
  model_data = A100_files[name]['model']

  # subset variables
  X = model_data["response_length"].values.reshape(-1, 1)
  y = model_data["energy"]

  # Perform k-fold cross-validation for each degree
  cv_scores = []
  kf = KFold(n_splits=5, shuffle=True, random_state=42)

  for degree in range(1, max_degree + 1):

    # Create polynomial features
    poly = PolynomialFeatures(degree=degree)
    X_poly = poly.fit_transform(X)

    # Fit the linear regression model
    model = LinearRegression(fit_intercept=False)

    # Calculate cross-validation score
    score = np.mean(cross_val_score(model, X_poly, y, cv=kf, scoring='neg_mean_squared_error'))
    cv_scores.append(score)

  # record metrics
  y_sum.append(np.mean(y))
  poly_mses.append(cv_scores)

  # Find the degree with the highest cross-validation score
  best_degree = np.argmax(cv_scores) + 1
  best_degree_count[best_degree - 1] += 1

#
# A40
#

for name in A40_subfolders:
  # extract the model data
  model_data = A40_files[name]['model']

  # subset variables
  X = model_data["response_length"].values.reshape(-1, 1)
  y = model_data["energy"]

  # Perform k-fold cross-validation for each degree
  cv_scores = []
  kf = KFold(n_splits=5, shuffle=True, random_state=42)

  for degree in range(1, max_degree + 1):

    # Create polynomial features
    poly = PolynomialFeatures(degree=degree)
    X_poly = poly.fit_transform(X)

    # Fit the linear regression model
    model = LinearRegression(fit_intercept=False)

    # Calculate cross-validation score
    score = np.mean(cross_val_score(model, X_poly, y, cv=kf, scoring='neg_mean_squared_error'))
    cv_scores.append(score)

  # record metrics
  y_sum.append(np.mean(y))
  poly_mses.append(cv_scores)
  # Find the degree with the highest cross-validation score
  best_degree = np.argmax(cv_scores) + 1
  best_degree_count[best_degree - 1] += 1

In [19]:
best_degree_count

[2, 15, 20, 11, 15, 4, 0, 0, 0, 0]

In [20]:
# Convert the 2D list to a numpy array
poly_mses = np.array(poly_mses)

# Calculate column-wise averages
column_averages = np.mean(poly_mses, axis=0)

column_averages

array([ -23521.55505991,  -17219.27437267,  -17191.82147695,
        -17191.01365286,  -17197.26185958,  -17337.20822109,
        -32007.66710557,  -60217.20202604, -100337.5944204 ,
       -150335.60239122])

In [21]:
mean_mses = [np.mean(x) for x in poly_mses]

it seems that a cubic polynomal is the best model

## train test confirmation

In [None]:
from sklearn.model_selection import train_test_split

for name in V100_subfolders:
  # extract the model data
  model_data = A40_files[name]['model']

  # subset variables
  X = model_data["response_length"].values.reshape(-1, 1)
  y = model_data["energy"]

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  for degree in range(1, max_degree + 1):

    # Create polynomial features
    poly = PolynomialFeatures(degree=degree)
    X_poly = poly.fit_transform(X)

    # Fit the linear regression model
    model = LinearRegression(fit_intercept=False)

    # Calculate cross-validation score
    score = np.mean(cross_val_score(model, X_poly, y, cv=kf, scoring='neg_mean_squared_error'))
    cv_scores.append(score)


# Using lasso regression to find significant terms

## determining appropriate shrinkage factor

In [None]:
from sklearn.linear_model import LassoCV

# Set the maximum degree of the polynomial
max_degree = 10

alphas = [0.001, 0.01, 0.1, 1, 10, 100]

best_alphas = []

#
# V100
#

for name in V100_subfolders:
  # extract the model data
  model_data = V100_files[name]['model']

  # subset variables
  X = model_data["response_length"].values.reshape(-1, 1)
  y = model_data["energy"]

  # Create polynomial features
  poly = PolynomialFeatures(degree=degree)
  X_poly = poly.fit_transform(X)

  # Create a LassoCV model with cross-validation
  lasso_cv = LassoCV(alphas=alphas, cv=KFold(n_splits=5, shuffle=True, random_state=42), max_iter=20000, fit_intercept=False)

  # Fit the model to the data
  lasso_cv.fit(X_poly, y)

  # Get the best alpha determined by cross-validation
  best_alphas.append(lasso_cv.alpha_)

#
# A100
#

for name in A100_subfolders:
  # extract the model data
  model_data = A100_files[name]['model']

  # subset variables
  X = model_data["response_length"].values.reshape(-1, 1)
  y = model_data["energy"]

  # Create polynomial features
  poly = PolynomialFeatures(degree=degree)
  X_poly = poly.fit_transform(X)

  # Create a LassoCV model with cross-validation
  lasso_cv = LassoCV(alphas=alphas, cv=KFold(n_splits=5, shuffle=True, random_state=42), max_iter=20000, fit_intercept=False)

  # Fit the model to the data
  lasso_cv.fit(X_poly, y)

  # Get the best alpha determined by cross-validation
  best_alphas.append(lasso_cv.alpha_)


#
# A40
#

for name in A40_subfolders:
  # extract the model data
  model_data = A40_files[name]['model']

  # subset variables
  X = model_data["response_length"].values.reshape(-1, 1)
  y = model_data["energy"]

  # Create polynomial features
  poly = PolynomialFeatures(degree=degree)
  X_poly = poly.fit_transform(X)

  # Create a LassoCV model with cross-validation
  lasso_cv = LassoCV(alphas=alphas, cv=KFold(n_splits=5, shuffle=True, random_state=42), max_iter=20000, fit_intercept=False)

  # Fit the model to the data
  lasso_cv.fit(X_poly, y)

  # Get the best alpha determined by cross-validation
  best_alphas.append(lasso_cv.alpha_)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

In [None]:
from collections import Counter

element_counts = Counter(best_alphas)

# Display the counts
for element, count in element_counts.items():
    print(f"{element}: {count} times")

0.001: 35 times
0.01: 19 times
0.1: 9 times
1.0: 2 times
10.0: 2 times


i guess we'll use 0.001!

In [None]:
from sklearn.linear_model import Lasso

# Set the maximum degree of the polynomial
max_degree = 10
alpha = 0.001

coef_list = []

#
# V100
#

for name in V100_subfolders:
  # extract the model data
  model_data = V100_files[name]['model']

  # subset variables
  X = model_data["response_length"].values.reshape(-1, 1)
  y = model_data["energy"]

  # Create polynomial features
  poly = PolynomialFeatures(degree=degree)
  X_poly = poly.fit_transform(X)

  # Create and fit Lasso model
  lasso = Lasso(alpha=alpha, max_iter=20000, fit_intercept=False)  # You can adjust the alpha value
  lasso.fit(X_poly, y)

  # Get the coefficients
  coef_list.append(lasso.coef_)

#
# A100
#

for name in A100_subfolders:
  # extract the model data
  model_data = A100_files[name]['model']

  # subset variables
  X = model_data["response_length"].values.reshape(-1, 1)
  y = model_data["energy"]

  # Create polynomial features
  poly = PolynomialFeatures(degree=degree)
  X_poly = poly.fit_transform(X)

  # Create and fit Lasso model
  lasso = Lasso(alpha=alpha, max_iter=20000, fit_intercept=False)  # You can adjust the alpha value
  lasso.fit(X_poly, y)

  # Get the coefficients
  coef_list.append(lasso.coef_)

#
# A40
#

for name in A40_subfolders:
  # extract the model data
  model_data = A40_files[name]['model']

  # subset variables
  X = model_data["response_length"].values.reshape(-1, 1)
  y = model_data["energy"]

  # Create polynomial features
  poly = PolynomialFeatures(degree=degree)
  X_poly = poly.fit_transform(X)

  # Create and fit Lasso model
  lasso = Lasso(alpha=alpha, max_iter=20000, fit_intercept=False)  # You can adjust the alpha value
  lasso.fit(X_poly, y)

  # Get the coefficients
  coef_list.append(lasso.coef_)


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [None]:
# Convert the 2D list to a numpy array
coef_list = np.array(coef_list)

# Calculate column-wise averages
count_less_than_threshold = np.sum(coef_list < 0.00001, axis=0)

In [None]:
count_less_than_threshold

array([ 1,  0, 14, 67, 67, 67, 67, 67, 67, 67, 67])