In [None]:
from openai import OpenAI
import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.cluster import KMeans
import pickle
# Register tqdm's pandas integration so that `progress_apply` (used further down
# when embedding the explanations) is available and shows a "Processing" bar.
tqdm.pandas(desc="Processing")

# Module-level OpenAI client used by get_embedding. It is constructed without
# explicit arguments, so credentials presumably come from the environment — confirm.
client = OpenAI()
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding of ``text`` produced by the OpenAI embeddings endpoint.

    Newline characters in ``text`` are replaced by spaces before the request
    is issued through the module-level ``client``.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    flattened = text.replace("\n", " ")
    response = client.embeddings.create(input=[flattened], model=model)
    first_item = response.data[0]
    return first_item.embedding

In [None]:
# Load the pickled neuron explanations for GPT-2 small.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open('neuron_explanation_gpt2_small.pkl', 'rb') as file:
    neuron_explanation_gpt2_small = pickle.load(file)

# Order the records by their last field (the explanation score), best first.
sorted_neuron_explanation = list(neuron_explanation_gpt2_small)
sorted_neuron_explanation.sort(key=lambda record: record[-1], reverse=True)

In [None]:
# Build a DataFrame with one row per neuron; each record supplies the four
# columns in this order.
neuron_explanation_df = pd.DataFrame(
    sorted_neuron_explanation,
    columns=[
        'layer_id',
        'neuron_id',
        'explanation',
        'explanation score',
    ],
)

In [None]:
# Keep only well-explained neurons: explanation score strictly above the threshold.
# NOTE: despite the variable name, the cut-off is a fixed score, not a computed percentile.
EXPLANATION_SCORE_THRESHOLD = 0.53

# .copy() makes the selection an independent frame, so the columns assigned to it
# later ('ada_embedding', 'similarity') are not written into a slice of
# neuron_explanation_df (which triggers pandas' SettingWithCopyWarning).
top_10_percent = neuron_explanation_df[
    neuron_explanation_df["explanation score"] > EXPLANATION_SCORE_THRESHOLD
].copy()

In [None]:
# Embed every neuron's explanation text (one API request per row, with a progress bar).
# The column keeps the name 'ada_embedding' although the model is text-embedding-3-small.
top_10_percent['ada_embedding'] = top_10_percent['explanation'].progress_apply(
    lambda explanation_text: get_embedding(explanation_text, model='text-embedding-3-small')
)

In [None]:
# Stack the per-neuron embedding vectors into a single 2-D array, one row per neuron.
embedding_matrix = np.vstack(top_10_percent['ada_embedding'].to_numpy())

In [None]:
# Sanity check: (number of selected neurons, embedding length).
embedding_matrix.shape

In [None]:
# Load the precomputed dataset concept embeddings.
# NOTE(review): pickle.load can execute arbitrary code; only open files you trust.
# The similarity cell below reads this as CBT_V_embedding.T[:, j], i.e. one concept
# per row — presumably an array of shape (n_concepts, embedding_dim); confirm.
with open('data/CBT_V_concepts_embedding.pkl', 'rb') as file:
    CBT_V_embedding = pickle.load(file)

In [None]:
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Computed as the dot product of ``vec1`` and ``vec2`` divided by the
    product of their Euclidean norms.

    Args:
        vec1: First vector (array-like).
        vec2: Second vector (array-like) of the same length.

    Returns:
        The cosine of the angle between the two vectors.
    """
    magnitude_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return np.dot(vec1, vec2) / magnitude_product

In [None]:
# For every neuron embedding, find the dataset concept with the highest cosine
# similarity. All pairs are computed at once as the dot product of L2-normalised
# rows, which is the matrix form of cosine_similarity(a, b).
neuron_vectors = np.asarray(embedding_matrix, dtype=float)
concept_vectors = np.asarray(CBT_V_embedding, dtype=float)  # one concept per row

neuron_unit = neuron_vectors / np.linalg.norm(neuron_vectors, axis=1, keepdims=True)
concept_unit = concept_vectors / np.linalg.norm(concept_vectors, axis=1, keepdims=True)
pairwise_similarity = neuron_unit @ concept_unit.T  # shape: (n_neurons, n_concepts)

# argmax/max report the true best match even when every similarity is <= 0;
# a running maximum initialised to 0 would report index 0 and score 0 in that case.
ind = pairwise_similarity.argmax(axis=1).tolist()               # best concept index per neuron
similarity_score = pairwise_similarity.max(axis=1).tolist()     # best similarity per neuron

In [None]:
# Attach each neuron's best-match similarity. similarity_score is in the same row
# order as top_10_percent because embedding_matrix was stacked from that frame.
top_10_percent["similarity"] = similarity_score

In [None]:
# Rank neurons from most to least similar to their best-matching dataset concept.
sorted_top_10_percent = top_10_percent.sort_values(
    by='similarity',
    ascending=False,
)

In [None]:
# create neuron prune dict based on selected neuron tuples
def get_dict_from_tuples(tuples):
    """Group ``(layer_id, neuron_id)`` pairs into a dict keyed by layer name.

    Args:
        tuples: Iterable of sequences whose first two items are the layer id
            and the neuron id. Integer-valued floats and NumPy integers are
            accepted.

    Returns:
        Dict mapping ``"transformer.h.<layer_id>.mlp.act"`` to the list of
        neuron ids for that layer, in the order they were encountered.
    """
    returned_dict = {}
    for e in tuples:
        # Normalise to plain ints: a float layer id such as 3.0 would otherwise
        # produce the name "transformer.h.3.0.mlp.act", and NumPy scalars would
        # end up inside the pickled dict.
        layer_id = int(e[0])
        neuron_id = int(e[1])
        layer_name = f"transformer.h.{layer_id}.mlp.act"
        returned_dict.setdefault(layer_name, []).append(neuron_id)
    return returned_dict

In [None]:
# select top k neurons, create tuples list
k = 1000
# head(k) returns at most k rows, so a frame with fewer than k neurons does not
# raise IndexError the way positional .iloc[i] lookups over range(k) would.
top_k_neurons = sorted_top_10_percent.head(k)
# Pair the two columns directly instead of materialising a row Series per neuron.
tuple_list = list(zip(top_k_neurons["layer_id"], top_k_neurons["neuron_id"]))

# The file name keeps the requested k even when fewer neurons were available.
with open(f'SNIP_CBT_V_top_{k}.pkl', 'wb') as file:
    pickle.dump(get_dict_from_tuples(tuple_list), file)