In [1]:
import numpy as np  # numpy!
import seaborn as sns # visualisation!
import matplotlib.pyplot as plt # visualisation!
import pandas as pd # dataframes & data analysis!
from ast import literal_eval
import re #for Regex
from json import loads, dumps
import json

In [3]:
# Show every column whenever a DataFrame is rendered.
pd.set_option('display.max_columns', None)

# Read the asset data from disk.
assets = pd.read_csv('all_assets.csv')

# Sanity check: display only the first row.
assets.head(1)

Unnamed: 0,type,name,organization,description,created_date,url,model_card,modality,analysis,size,dependencies,training_emissions,training_time,training_hardware,quality_control,access,license,intended_uses,prohibited_uses,monitoring,feedback,adaptation,output_space,terms_of_service,monthly_active_users,user_distribution,failures,datasheet,sample,included,excluded
0,model,GenSLM,Argonne National Laboratory,,2022-10-11,https://www.biorxiv.org/content/10.1101/2022.1...,,text; genome sequence,,25B parameters (dense),"['SARS-CoV-2 genome dataset', 'BV-BRC dataset']",,,,,open,MIT,,,,,,,,,,,,,,


In [4]:
# (rows, columns) of the frame as loaded from the CSV
assets.shape

(580, 31)

In [5]:
# Substitute the placeholder string 'unknown' for every missing value.
# fillna returns a new frame, so `assets` itself is left untouched.
non_empty_assets = assets.fillna(value='unknown')

In [6]:
# Columns that cleaning() label-encodes when the caller does not pass its own list.
DEFAULT_ID_COLUMNS = (
    'type', 'organization', 'description', 'created_date', 'url',
    'datasheet', 'modality', 'size', 'sample', 'analysis', 'dependencies',
    'included', 'excluded', 'quality_control', 'access', 'license',
    'intended_uses', 'prohibited_uses', 'monitoring', 'feedback',
    'model_card', 'training_emissions', 'training_time',
    'training_hardware', 'adaptation', 'output_space', 'terms_of_service',
    'monthly_active_users', 'user_distribution', 'failures',
)


# Function to clean and transform the dataset
def cleaning(df, id_columns=None):
    """Label-encode columns of ``df`` and add a 1-based row id.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    id_columns : iterable of str, optional
        Columns to label-encode. Defaults to ``DEFAULT_ID_COLUMNS``.

    Returns
    -------
    pandas.DataFrame
        The input columns plus one ``<col>_id`` column per encoded column
        and an ``llm_data_id`` column numbering the rows 1..n.

    Raises
    ------
    KeyError
        If any requested column is absent from ``df``.

    Notes
    -----
    Ids are the pandas categorical codes shifted by one, so they start at 1
    and follow the sorted order of the distinct values. Missing values have
    categorical code -1 and therefore receive id 0.
    """
    columns = list(DEFAULT_ID_COLUMNS if id_columns is None else id_columns)

    # explode() only expands list-like cells; scalar cells (e.g. the string
    # representation of a list read from a CSV) are returned unchanged.
    # Both branches yield a new frame so the caller's object is never mutated.
    if 'dependencies' in df.columns:
        out = df.explode('dependencies')
    else:
        out = df.copy()

    # Fail early with a message that names every missing column at once.
    missing = [col for col in columns if col not in out.columns]
    if missing:
        raise KeyError(f"cleaning(): columns not found in DataFrame: {missing}")

    # Label encode each column by converting to categorical codes (1-based).
    for col in columns:
        out[f'{col}_id'] = out[col].astype('category').cat.codes + 1

    # Positional 1..n identifier, independent of the (possibly duplicated) index.
    out['llm_data_id'] = range(1, len(out) + 1)

    return out


# Run the cleaning step on the 'unknown'-filled frame and keep the result under its own name.
clean_assets = cleaning(non_empty_assets)
# (rows, columns) of the cleaned frame
clean_assets.shape

(580, 62)

In [7]:
# List the column labels of the cleaned frame, including the added *_id columns.
clean_assets.columns

Index(['type', 'name', 'organization', 'description', 'created_date', 'url',
       'model_card', 'modality', 'analysis', 'size', 'dependencies',
       'training_emissions', 'training_time', 'training_hardware',
       'quality_control', 'access', 'license', 'intended_uses',
       'prohibited_uses', 'monitoring', 'feedback', 'adaptation',
       'output_space', 'terms_of_service', 'monthly_active_users',
       'user_distribution', 'failures', 'datasheet', 'sample', 'included',
       'excluded', 'type_id', 'organization_id', 'description_id',
       'created_date_id', 'url_id', 'datasheet_id', 'modality_id', 'size_id',
       'sample_id', 'analysis_id', 'dependencies_id', 'included_id',
       'excluded_id', 'quality_control_id', 'access_id', 'license_id',
       'intended_uses_id', 'prohibited_uses_id', 'monitoring_id',
       'feedback_id', 'model_card_id', 'training_emissions_id',
       'training_time_id', 'training_hardware_id', 'adaptation_id',
       'output_space_id', 'terms

In [8]:
# Display the cleaned frame: original columns followed by the *_id codes and llm_data_id.
clean_assets

Unnamed: 0,type,name,organization,description,created_date,url,model_card,modality,analysis,size,dependencies,training_emissions,training_time,training_hardware,quality_control,access,license,intended_uses,prohibited_uses,monitoring,feedback,adaptation,output_space,terms_of_service,monthly_active_users,user_distribution,failures,datasheet,sample,included,excluded,type_id,organization_id,description_id,created_date_id,url_id,datasheet_id,modality_id,size_id,sample_id,analysis_id,dependencies_id,included_id,excluded_id,quality_control_id,access_id,license_id,intended_uses_id,prohibited_uses_id,monitoring_id,feedback_id,model_card_id,training_emissions_id,training_time_id,training_hardware_id,adaptation_id,output_space_id,terms_of_service_id,monthly_active_users_id,user_distribution_id,failures_id,llm_data_id
0,model,GenSLM,Argonne National Laboratory,unknown,2022-10-11,https://www.biorxiv.org/content/10.1101/2022.1...,unknown,text; genome sequence,unknown,25B parameters (dense),"['SARS-CoV-2 genome dataset', 'BV-BRC dataset']",unknown,unknown,unknown,unknown,open,MIT,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,3,13,478,92,401,39,56,93,10,259,235,32,25,86,3,33,167,70,22,190,167,21,76,120,9,32,39,3,2,1,1
1,model,h2oGPT,H2O AI,Series of models fine-tuned on well-known LLMs...,2023-06-16,https://arxiv.org/pdf/2306.08161.pdf,https://huggingface.co/h2oai/h2ogpt-oasst1-512...,text; text,Evaluated on EleutherAI evaluation harness.,20B parameters (dense),"['GPT-NeoX', 'H2O AI OpenAssistant', 'h2oGPT R...",unknown,unknown,unspecified number of 48GB A100 NVIDIA GPUs,unknown,open,Apache 2.0,unknown,unknown,unknown,https://huggingface.co/h2oai/h2ogpt-oasst1-512...,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,3,70,362,175,119,39,66,82,10,62,101,32,25,86,3,2,167,70,22,157,130,21,76,122,9,32,39,3,2,1,2
2,model,H2O Danube,H2O AI,H2O Danube is a language model trained on 1T t...,2024-01-30,https://arxiv.org/pdf/2401.16818.pdf,https://huggingface.co/h2oai/h2o-danube-1.8b-base,text; text,Evaluated on common sense and world knowledge ...,1.8B parameters (dense),[],unknown,unknown,8x H100 GPUs on a single node,unknown,open,Apache 2.0,unknown,Users are encouraged to use the large language...,unknown,https://huggingface.co/h2oai/h2o-danube-1.8b-b...,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,3,70,181,261,156,39,66,14,10,92,305,32,25,86,3,2,167,64,22,156,129,21,76,78,9,32,39,3,2,1,3
3,application,Character,Character AI,Character allows users to converse with variou...,2022-09-16,https://beta.character.ai/,unknown,unknown,unknown,unknown,[],unknown,unknown,unknown,unknown,limited,unknown,unknown,unknown,unknown,unknown,unknown,AI-generated chat conversations,https://beta.character.ai/tos,unknown,unknown,unknown,unknown,unknown,unknown,unknown,1,40,79,85,179,39,72,205,10,259,305,32,25,86,2,55,167,70,22,190,167,21,76,120,9,1,4,3,2,1,4
4,model,Bark,Suno,Bark is a text-to-audio model that can generat...,2023-04-20,https://github.com/suno-ai/bark,https://github.com/suno-ai/bark/blob/main/mode...,text; audio,unknown,unknown,['AudioLM'],unknown,unknown,unknown,unknown,open,MIT,unknown,unknown,unknown,https://huggingface.co/spaces/suno/bark/discus...,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,3,167,62,151,260,39,51,205,10,259,13,32,25,86,3,33,167,70,22,173,26,21,76,120,9,32,39,3,2,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
575,model,Firefly Image 2,Adobe,Firefly Image 2 is the next generation of gene...,2023-10-10,https://firefly.adobe.com/,unknown,text; image,unknown,unknown,[],unknown,unknown,unknown,unknown,closed,unknown,creative generation of digital art and images,"AI/ML training, attempting to create abusive, ...",unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,3,6,147,215,235,39,57,205,10,259,305,32,25,86,1,55,154,4,22,190,167,21,76,120,9,32,39,3,2,1,576
576,model,Firefly Vector,Adobe,Firefly Vector is the world’s first generative...,2023-10-10,https://firefly.adobe.com/,unknown,text; vector graphic,unknown,unknown,[],unknown,unknown,unknown,unknown,closed,unknown,creative generation of digital art and images,"AI/ML training, attempting to create abusive, ...",unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,3,6,148,215,235,39,68,205,10,259,305,32,25,86,1,55,154,4,22,190,167,21,76,120,9,32,39,3,2,1,577
577,model,Firefly Design,Adobe,Firefly Design powers instant generation of am...,2023-10-10,https://firefly.adobe.com/,unknown,text; template design,unknown,unknown,[],unknown,unknown,unknown,unknown,closed,unknown,creative generation of digital art and images,"AI/ML training, attempting to create abusive, ...",unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,3,6,146,215,235,39,65,205,10,259,305,32,25,86,1,55,154,4,22,190,167,21,76,120,9,32,39,3,2,1,578
578,application,Firefly,Adobe,Adobe Firefly is a standalone web application....,2023-03-21,https://firefly.adobe.com/,unknown,unknown,unknown,unknown,"['Firefly Image 2', 'Firefly Vector', 'Firefly...",unknown,unknown,unknown,unknown,limited,unknown,creative generation of digital art and images,"AI/ML training, attempting to create abusive, ...",unknown,unknown,unknown,AI-generated creations,https://www.adobe.com/legal/licenses-terms/ado...,unknown,unknown,unknown,unknown,unknown,unknown,unknown,1,6,29,142,235,39,72,205,10,259,76,32,25,86,2,55,154,4,22,190,167,21,76,120,9,2,23,3,2,1,579


In [9]:
# create tables

def _lookup_table(df, column):
    """Return the distinct (``<column>_id``, ``<column>``) pairs of ``df``.

    The result is sorted by the id column and has a fresh 0..n-1 index.
    """
    id_column = f'{column}_id'
    return (
        df[[id_column, column]]
        .drop_duplicates()
        .sort_values(by=id_column)
        .reset_index(drop=True)
    )


def _parse_dependencies(raw):
    """Convert one raw ``dependencies`` cell into a list of names.

    Cells hold the string form of a Python list and are parsed with
    ``literal_eval``. The 'unknown' placeholder written by the earlier
    ``fillna`` step is not a valid literal, so it becomes an empty list
    instead of raising ``ValueError``.
    """
    if raw == 'unknown':
        return []
    return literal_eval(raw)


# Main table: one row per asset, with every descriptive column replaced by its id.
llm_table = clean_assets[['llm_data_id','name', 'type_id',
                          'organization_id', 'description_id', 'created_date_id', 'url_id',
                          'datasheet_id', 'modality_id', 'size_id', 'sample_id', 'analysis_id',
                          'dependencies_id', 'included_id', 'excluded_id', 'quality_control_id',
                          'access_id', 'license_id', 'intended_uses_id', 'prohibited_uses_id',
                          'monitoring_id', 'feedback_id', 'model_card_id',
                          'training_emissions_id', 'training_time_id', 'training_hardware_id',
                          'adaptation_id', 'output_space_id', 'terms_of_service_id',
                          'monthly_active_users_id', 'user_distribution_id', 'failures_id']].drop_duplicates()

# Lookup tables: one (id, value) table per encoded column.
type_table = _lookup_table(clean_assets, 'type')
organization_table = _lookup_table(clean_assets, 'organization')
description_table = _lookup_table(clean_assets, 'description')
created_date_table = _lookup_table(clean_assets, 'created_date')
url_table = _lookup_table(clean_assets, 'url')
datasheet_table = _lookup_table(clean_assets, 'datasheet')
modality_table = _lookup_table(clean_assets, 'modality')
size_table = _lookup_table(clean_assets, 'size')
sample_table = _lookup_table(clean_assets, 'sample')
analysis_table = _lookup_table(clean_assets, 'analysis')

dependencies_table = _lookup_table(clean_assets, 'dependencies')
#make dependencies a list of strings
dependencies_table['dependencies'] = dependencies_table['dependencies'].apply(_parse_dependencies)

included_table = _lookup_table(clean_assets, 'included')
excluded_table = _lookup_table(clean_assets, 'excluded')
quality_control_table = _lookup_table(clean_assets, 'quality_control')
access_table = _lookup_table(clean_assets, 'access')
license_table = _lookup_table(clean_assets, 'license')
intended_uses_table = _lookup_table(clean_assets, 'intended_uses')
prohibited_uses_table = _lookup_table(clean_assets, 'prohibited_uses')
monitoring_table = _lookup_table(clean_assets, 'monitoring')
feedback_table = _lookup_table(clean_assets, 'feedback')
model_card_table = _lookup_table(clean_assets, 'model_card')
training_emissions_table = _lookup_table(clean_assets, 'training_emissions')
training_time_table = _lookup_table(clean_assets, 'training_time')
training_hardware_table = _lookup_table(clean_assets, 'training_hardware')
adaptation_table = _lookup_table(clean_assets, 'adaptation')
output_space_table = _lookup_table(clean_assets, 'output_space')
terms_of_service_table = _lookup_table(clean_assets, 'terms_of_service')
monthly_active_users_table = _lookup_table(clean_assets, 'monthly_active_users')
user_distribution_table = _lookup_table(clean_assets, 'user_distribution')
failures_table = _lookup_table(clean_assets, 'failures')



In [10]:
# Display the main table (asset name plus the id of every encoded column).
llm_table

Unnamed: 0,llm_data_id,name,type_id,organization_id,description_id,created_date_id,url_id,datasheet_id,modality_id,size_id,sample_id,analysis_id,dependencies_id,included_id,excluded_id,quality_control_id,access_id,license_id,intended_uses_id,prohibited_uses_id,monitoring_id,feedback_id,model_card_id,training_emissions_id,training_time_id,training_hardware_id,adaptation_id,output_space_id,terms_of_service_id,monthly_active_users_id,user_distribution_id,failures_id
0,1,GenSLM,3,13,478,92,401,39,56,93,10,259,235,32,25,86,3,33,167,70,22,190,167,21,76,120,9,32,39,3,2,1
1,2,h2oGPT,3,70,362,175,119,39,66,82,10,62,101,32,25,86,3,2,167,70,22,157,130,21,76,122,9,32,39,3,2,1
2,3,H2O Danube,3,70,181,261,156,39,66,14,10,92,305,32,25,86,3,2,167,64,22,156,129,21,76,78,9,32,39,3,2,1
3,4,Character,1,40,79,85,179,39,72,205,10,259,305,32,25,86,2,55,167,70,22,190,167,21,76,120,9,1,4,3,2,1
4,5,Bark,3,167,62,151,260,39,51,205,10,259,13,32,25,86,3,33,167,70,22,173,26,21,76,120,9,32,39,3,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
575,576,Firefly Image 2,3,6,147,215,235,39,57,205,10,259,305,32,25,86,1,55,154,4,22,190,167,21,76,120,9,32,39,3,2,1
576,577,Firefly Vector,3,6,148,215,235,39,68,205,10,259,305,32,25,86,1,55,154,4,22,190,167,21,76,120,9,32,39,3,2,1
577,578,Firefly Design,3,6,146,215,235,39,65,205,10,259,305,32,25,86,1,55,154,4,22,190,167,21,76,120,9,32,39,3,2,1
578,579,Firefly,1,6,29,142,235,39,72,205,10,259,76,32,25,86,2,55,154,4,22,190,167,21,76,120,9,2,23,3,2,1


In [11]:
# Display the dependency lookup table; `dependencies` now holds parsed lists.
dependencies_table

Unnamed: 0,dependencies_id,dependencies
0,1,"[AI-HUB dataset, National Institute of Korean ..."
1,2,[AI21 Paraphrase API]
2,3,[AI21 Summarize API]
3,4,"[AST, BERT, MuLan dataset]"
4,5,"[Alpaca, GPT-4, Dolly, ShareGPT, LLaMA, Vicuna]"
...,...,...
300,301,[text-davinci-002]
301,302,[text-davinci-003]
302,303,"[w2v-BERT, SoundStream]"
303,304,[web_clean]


In [12]:
#map llm_ids to dependency list
def get_dependency_id(dependencies_table, llm_table):
    """Attach, for each dependency list, the ``llm_data_id`` of every entry.

    Dependency names are matched case-insensitively against
    ``llm_table['name']``. When several rows share a name, the id of the
    first one (in row order) is used.

    Parameters
    ----------
    dependencies_table : pandas.DataFrame
        Must contain a ``dependencies`` column whose cells are lists of names.
    llm_table : pandas.DataFrame
        Must contain ``name`` and ``llm_data_id`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``dependencies_table`` object, modified in place with a new
        ``dependencies_llm_ids`` column. Each cell is a list aligned with the
        row's ``dependencies`` list, holding ``np.nan`` where no match exists.
    """
    # Build the case-insensitive name -> id lookup once, instead of lower-casing
    # and scanning the whole table for every single dependency.
    name_to_id = {}
    for name, llm_id in zip(llm_table['name'], llm_table['llm_data_id']):
        if isinstance(name, str):
            # setdefault keeps the first occurrence of a duplicated name.
            name_to_id.setdefault(name.lower(), llm_id)

    mapped_dependencies = []
    for dependencies_list in dependencies_table['dependencies']:
        dependencies_llm_ids = []
        for dependency in dependencies_list:
            if isinstance(dependency, str):
                dependencies_llm_ids.append(name_to_id.get(dependency.lower(), np.nan))
            else:
                # A non-string entry cannot match any name.
                dependencies_llm_ids.append(np.nan)
        mapped_dependencies.append(dependencies_llm_ids)

    dependencies_table['dependencies_llm_ids'] = mapped_dependencies
    return dependencies_table
    
# Add the matched llm_data_id list to every row of the dependency table, then display it.
dependencies_table = get_dependency_id(dependencies_table, llm_table)   
dependencies_table

Unnamed: 0,dependencies_id,dependencies,dependencies_llm_ids
0,1,"[AI-HUB dataset, National Institute of Korean ...","[nan, nan]"
1,2,[AI21 Paraphrase API],[73]
2,3,[AI21 Summarize API],[nan]
3,4,"[AST, BERT, MuLan dataset]","[nan, nan, 355]"
4,5,"[Alpaca, GPT-4, Dolly, ShareGPT, LLaMA, Vicuna]","[440, 308, 101, nan, 506, 170]"
...,...,...,...
300,301,[text-davinci-002],[296]
301,302,[text-davinci-003],[297]
302,303,"[w2v-BERT, SoundStream]","[353, 352]"
303,304,[web_clean],[289]


In [13]:
# Count how many entries of a list are not null (NaN / None)
def count_non_null(lst):
    """Return the number of elements of ``lst`` that ``pd.notnull`` reports as present."""
    present_flags = pd.notnull(lst)
    return sum(1 for is_present in present_flags if is_present)

# Count the matched ids in each row, then add the per-row counts together
matched_per_row = dependencies_table['dependencies_llm_ids'].apply(count_non_null)
total_non_null_count = matched_per_row.sum()
print(total_non_null_count)

432


### To JSON

In [14]:
# Single source of truth for the output file, so the status messages always
# name the file that is actually written.
output_path = "dependencies_table.json"

# Serialise the table as a list of records; the loads/dumps round trip
# re-emits that JSON with 4-space indentation.
result = dependencies_table.to_json(orient="records")
parsed = loads(result)
parsed = dumps(parsed, indent=4)
try:
    with open(output_path, "w", encoding="utf-8") as outfile:
        outfile.write(parsed)
except OSError as err:
    # Report file-system failures only; any other exception is a bug and
    # is allowed to propagate instead of being silently swallowed.
    print(f"Failed saving: {output_path} ({err})")
else:
    print(f"Successfully saved {output_path}")

Successfully saved test_dependencies_table.json


### To CSV

In [15]:
# Write every table to its own CSV file (row index omitted), in the order listed.
csv_exports = [
    ('llm_table.csv', llm_table),
    ('type_table.csv', type_table),
    ('organization_table.csv', organization_table),
    ('created_date_table.csv', created_date_table),
    ('url_table.csv', url_table),
    ('modality_table.csv', modality_table),
    ('size_table.csv', size_table),
    ('dependencies_table.csv', dependencies_table),
    ('access_table.csv', access_table),
    ('model_card_table.csv', model_card_table),
    ('description_table.csv', description_table),
    ('datasheet_table.csv', datasheet_table),
    ('sample_table.csv', sample_table),
    ('analysis_table.csv', analysis_table),
    ('included_table.csv', included_table),
    ('excluded_table.csv', excluded_table),
    ('quality_control_table.csv', quality_control_table),
    ('license_table.csv', license_table),
    ('intended_uses_table.csv', intended_uses_table),
    ('prohibited_uses_table.csv', prohibited_uses_table),
    ('monitoring_table.csv', monitoring_table),
    ('feedback_table.csv', feedback_table),
    ('training_emissions_table.csv', training_emissions_table),
    ('training_time_table.csv', training_time_table),
    ('training_hardware_table.csv', training_hardware_table),
    ('adaptation_table.csv', adaptation_table),
    ('output_space_table.csv', output_space_table),
    ('terms_of_service_table.csv', terms_of_service_table),
    ('monthly_active_users_table.csv', monthly_active_users_table),
    ('user_distribution_table.csv', user_distribution_table),
    ('failures_table.csv', failures_table),
]

for csv_path, table in csv_exports:
    table.to_csv(csv_path, index=False)

In [16]:
# Inspect the Python type of the `dependencies` column object.
type(dependencies_table.dependencies)	

pandas.core.series.Series