In [1]:
# Imports for extracting neuron data such as az://openaipublic/neuron-explainer/data/collated-activations/1/1.json

import json
import pandas as pd
import numpy as np
import os
import glob
import re
import sys
import time
import datetime
import logging
import argparse
import subprocess

# Download GPT-4 explanations dataset

Based on the instructions in [the automated-interpretability repository](https://github.com/openai/automated-interpretability), this code downloads the activation file for every neuron, re-encodes each file's tokens as GPT-2 XL token IDs, and rewrites the file as compact JSON.

In [56]:
# Setup and helpers for downloading per-neuron activation files such as
# az://openaipublic/neuron-explainer/data/collated-activations/1/1.json
# to a local directory and re-encoding their tokens.

# Set up GPT-2 XL tokenization. `tokenizer` is bound as the default tokenizer
# argument of data_processing(), defined later in this cell.
# NOTE(review): GPT2Model is imported but not used in the visible cells.
from transformers import GPT2Tokenizer, GPT2Model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-xl', timeout = 1000)

# Set up workers for parallel processing: `num_cores` is the CPU count reported
# by multiprocessing and is used as joblib's n_jobs in download_and_process_all().
from joblib import Parallel, delayed
import multiprocessing as mp
num_cores = mp.cpu_count()
print(f"Number of cores: {num_cores}")

# Set up logging: set the root logger's level to CRITICAL.
# NOTE(review): `logging` is already imported in the first cell.
import logging
logging.getLogger().setLevel(logging.CRITICAL)

def download_data(layer, neuron, dir="../data"):
    """
    Download one neuron's collated-activations JSON file with azcopy.

    Creates `dir` if it does not exist, prints `layer neuron` as a progress
    marker, then runs `azcopy copy <url> <dir>` for the blob
    .../neuron-explainer/data/collated-activations/{layer}/{neuron}.json.

    Args:
        layer: Layer index used in the blob path.
        neuron: Neuron index used in the blob file name.
        dir: Local destination passed to azcopy (created if missing).

    Returns:
        The exit code of the azcopy process.

    Raises:
        FileNotFoundError: If the `azcopy` executable cannot be found.
    """
    os.makedirs(dir, exist_ok=True)
    print(layer, neuron)

    source_url = (
        "https://openaipublic.blob.core.windows.net/neuron-explainer/data/"
        f"collated-activations/{layer}/{neuron}.json"
    )

    # Use an argument list rather than a shell string: a destination that
    # contains spaces, quotes, `$` or backticks is then passed to azcopy
    # verbatim instead of being interpreted by the shell.
    return subprocess.call(["azcopy", "copy", source_url, str(dir)])

def data_processing(data_dir, tokenizer = tokenizer, data_format = "neuron-viewer"):
    """
    Re-encode the tokens of a neuron-viewer JSON file in place.

    For every sample under the keys 'random_sample',
    'random_sample_by_quantile' (a list of sample lists) and
    'most_positive_activation_records', the sample's 'tokens' list of strings
    is joined into one string and replaced by `tokenizer.encode(...)` of that
    string. The file is then rewritten as compact JSON (no whitespace after
    separators).

    Args:
        data_dir: Path of the JSON file to rewrite (a file path, despite the name).
        tokenizer: Object providing `encode(str)`; defaults to the module-level
            GPT-2 XL tokenizer.
        data_format: Input layout; only "neuron-viewer" is supported.

    Raises:
        ValueError: If `data_format` is not "neuron-viewer". The check runs
            before the file is touched, so an unsupported format can no longer
            truncate the input file.
    """
    if data_format != "neuron-viewer":
        raise ValueError(
            f"Unsupported data_format {data_format!r}; expected 'neuron-viewer'."
        )

    def encode_sample(sample):
        # Replace the list of token strings with the tokenizer's encoding of
        # their concatenation.
        sample['tokens'] = tokenizer.encode(''.join(sample['tokens']))

    # Read json data
    with open(data_dir) as f:
        data = json.load(f)

    for sample in data['random_sample']:
        encode_sample(sample)
    for sample_list in data['random_sample_by_quantile']:
        for sample in sample_list:
            encode_sample(sample)
    for sample in data['most_positive_activation_records']:
        encode_sample(sample)

    # Overwrite the original file with the compact, token-encoded version.
    with open(data_dir, 'w') as f:
        json.dump(data, f, separators=(',', ':'))

def download_and_process(layer, neuron, dir="../data"):
    """
    Fetch one neuron's JSON file and token-encode it, unless it is already on disk.

    Args:
        layer: Layer index forwarded to download_data.
        neuron: Neuron index; also determines the local file name `{neuron}.json`.
        dir: Local directory that holds (or will hold) the file.
    """
    json_path = f"{dir}/{neuron}.json"

    # Any file already present at the target path is treated as finished work.
    # NOTE(review): this presumably also skips a file that was downloaded but
    # never processed (e.g. after an interrupted run) -- confirm.
    if not os.path.exists(json_path):
        # Download first, then re-encode the freshly downloaded file in place.
        download_data(layer, neuron, dir)
        data_processing(json_path)
    else:
        print(f"Data already exists for layer {layer}, neuron {neuron}.")

def download_and_process_all(layers = 48, neurons = 6400, model = "gpt2-xl"):
    """
    Run download_and_process for every (layer, neuron) pair in parallel.

    Args:
        layers: Number of layers; indices 0..layers-1 are processed.
        neurons: Number of neurons per layer; indices 0..neurons-1 are processed.
        model: Name used for the local sub-directory `../data/{model}/{layer}`.
    """
    # One lazily generated joblib task per neuron, iterated layer by layer.
    tasks = (
        delayed(download_and_process)(layer_idx, neuron_idx, f"../data/{model}/{layer_idx}")
        for layer_idx in range(layers)
        for neuron_idx in range(neurons)
    )

    # Spread the tasks over as many workers as there are CPU cores.
    worker_pool = Parallel(n_jobs=num_cores)
    worker_pool(tasks)

Number of cores: 10


In [57]:
%%capture

# Download and token-encode every neuron file using the defaults of
# download_and_process_all (48 layers x 6400 neurons, model "gpt2-xl").
# Files that already exist locally are skipped by download_and_process.
download_and_process_all()

Data already exists for layer 0, neuron 0.
Data already exists for layer 0, neuron 5.
Data already exists for layer 0, neuron 1.
Data already exists for layer 0, neuron 2.
Data already exists for layer 0, neuron 3.
0 8
Data already exists for layer 0, neuron 4.
Data already exists for layer 0, neuron 10.
INFO: Scanning...
INFO: Autologin not specified.
0 6
0 11
INFO: Scanning...
INFO: Autologin not specified.
INFO: Scanning...
INFO: Autologin not specified.
0 12
0 7
INFO: Scanning...
INFO: Autologin not specified.
INFO: Scanning...
INFO: Autologin not specified.
INFO: Any empty folders will not be processed, because source and/or destination doesn't have full folder support

Job 60312c5a-0ba9-eb4b-4330-11f594eec5ea has started
Log file is located at: /Users/esben/.azcopy/60312c5a-0ba9-eb4b-4330-11f594eec5ea.log

INFO: Any empty folders will not be processed, because source and/or destination doesn't have full folder support
INFO: Any empty folders will not be processed, because source 

In [28]:
# Sequential (non-parallel) download and token-encoding of layers 1-47,
# 6400 neurons each, into ../data/<layer>/<neuron>.json.
# NOTE(review): layer 0 is not covered by this loop, existing files are
# downloaded again rather than skipped, and the target layout differs from
# download_and_process_all (which uses ../data/<model>/<layer>) -- confirm
# this is intended.
for l in range(1, 48):
    current_dir = f"../data/{l}"
    for n in range(0, 6400):
        current_data = f"../data/{l}/{n}.json"
        download_data(l,n, current_dir)
        data_processing(current_data)

2 6399
INFO: Scanning...
INFO: Autologin not specified.
INFO: Any empty folders will not be processed, because source and/or destination doesn't have full folder support

Job face1ccc-4521-1948-752a-6dd10e8d33fb has started
Log file is located at: /Users/esben/.azcopy/face1ccc-4521-1948-752a-6dd10e8d33fb.log

100.0 %, 1 Done, 0 Failed, 0 Pending, 0 Skipped, 1 Total, 2-sec Throughput (Mb/s): 0.9455


Job face1ccc-4521-1948-752a-6dd10e8d33fb summary
Elapsed Time (Minutes): 0.0334
Number of File Transfers: 1
Number of Folder Property Transfers: 0
Number of Symlink Transfers: 0
Total Number of Transfers: 1
Number of File Transfers Completed: 1
Number of Folder Transfers Completed: 0
Number of File Transfers Failed: 0
Number of Folder Transfers Failed: 0
Number of File Transfers Skipped: 0
Number of Folder Transfers Skipped: 0
TotalBytesTransferred: 236547
Final Job Status: Completed

