## Importing Relevant Libraries

In [55]:
import numpy as np  # numpy!
import seaborn as sns # visualisation!
import matplotlib.pyplot as plt # visualisation!
import pandas as pd # dataframes & data analysis!

import re #for Regex

## Importing Data

In [56]:
# Load the dataset
assets = pd.read_csv('assets.csv')

In [57]:
#Quick Check
assets.head(1)

Unnamed: 0,type,name,organization,description,created_date,url,datasheet,modality,size,sample,...,model_card,training_emissions,training_time,training_hardware,adaptation,output_space,terms_of_service,monthly_active_users,user_distribution,failures
0,dataset,ToyMix,Mila-Quebec AI Institute,ToyMix is the smallest dataset of three extens...,2023-10-09,https://arxiv.org/pdf/2310.04292.pdf,,"molecules, tasks",13B labels of quantum and biological nature.,[],...,,,,,,,,,,


In [58]:
#Quick shape check
assets.shape

(568, 31)

In [59]:
#Quick Columns check
assets.columns

Index(['type', 'name', 'organization', 'description', 'created_date', 'url',
       'datasheet', 'modality', 'size', 'sample', 'analysis', 'dependencies',
       'included', 'excluded', 'quality_control', 'access', 'license',
       'intended_uses', 'prohibited_uses', 'monitoring', 'feedback',
       'model_card', 'training_emissions', 'training_time',
       'training_hardware', 'adaptation', 'output_space', 'terms_of_service',
       'monthly_active_users', 'user_distribution', 'failures'],
      dtype='object')

## Data Cleaning and Transformation

In [60]:
import re
import pandas as pd

# Function to classify link types
def link_name(column):
    # Check if 'pdf' is in the column, classify as 'Scientific paper' if found
    pdf_match = re.findall('pdf', str(column))
    if pdf_match and pdf_match[0] == 'pdf':
        return 'Scientific paper'
    
    # Check if 'github' is in the column, classify as 'Github Link' if found
    github_match = re.findall('github', str(column))
    if github_match and github_match[0] == 'github':
        return 'Github Link'
    
    # Check if 'huggingface' is in the column, classify as 'Huggingface Link' if found
    huggingface_match = re.findall('huggingface', str(column))
    if huggingface_match and huggingface_match[0] == 'huggingface':
        return 'Huggingface Link'
    
    # Check if 'blog' is in the column, classify as 'Blog' if found
    blog_match = re.findall('blog', str(column))
    if blog_match and blog_match[0] == 'blog':
        return 'Blog'
    
    # Return 'Other' if none of the above conditions are met
    return 'Other'

# Function to classify model card types
def model_card_type(column):
    # Check if 'google' is in the column, classify as 'Google' if found
    google_match = re.findall('google', str(column))
    if google_match and google_match[0] == 'google':
        return 'Google'
    
    # Check if 'github' is in the column, classify as 'Github' if found
    github_match = re.findall('github', str(column))
    if github_match and github_match[0] == 'github':
        return 'Github'
    
    # Check if 'huggingface' is in the column, classify as 'Huggingface' if found
    huggingface_match = re.findall('huggingface', str(column))
    if huggingface_match and huggingface_match[0] == 'huggingface':
        return 'Huggingface'
    
    # Check if 'arxiv' is in the column, classify as 'Arxiv' if found
    arxiv_match = re.findall('arxiv', str(column))
    if arxiv_match and arxiv_match[0] == 'arxiv':
        return 'Arxiv'
    
    # Return 'Other' if none of the above conditions are met
    return 'Other'

# Function to clean and transform the dataset
def cleaning(df):
    # Make a copy of the DataFrame to avoid modifying the original one
    df = df.copy()
    
    # Explode the 'dependencies' column if it contains lists
    df = df.explode('dependencies')
    
    # Apply the 'link_name' function to the 'url' column to classify link types
    df['link_type'] = df['url'].apply(link_name)
    
    # Apply the 'model_card_type' function to the 'model_card' column to classify model card types
    df['model_type'] = df['model_card'].apply(model_card_type)
    
    # List of columns to be label encoded
    id_columns = ['type', 'organization', 'created_date', 'link_type',
                  'modality', 'size', 'dependencies', 'access', 'model_type']
    
    # Loop through chosen columns to label encode each column
    for col in id_columns:
        # Label encode each column by converting to categorical codes
        df[f'{col}_id'] = df[col].astype('category').cat.codes
    
    # Return the cleaned and transformed DataFrame
    return df

In [61]:
assets = cleaning(assets)

In [62]:
assets.columns

Index(['type', 'name', 'organization', 'description', 'created_date', 'url',
       'datasheet', 'modality', 'size', 'sample', 'analysis', 'dependencies',
       'included', 'excluded', 'quality_control', 'access', 'license',
       'intended_uses', 'prohibited_uses', 'monitoring', 'feedback',
       'model_card', 'training_emissions', 'training_time',
       'training_hardware', 'adaptation', 'output_space', 'terms_of_service',
       'monthly_active_users', 'user_distribution', 'failures', 'link_type',
       'model_type', 'type_id', 'organization_id', 'created_date_id',
       'link_type_id', 'modality_id', 'size_id', 'dependencies_id',
       'access_id', 'model_type_id'],
      dtype='object')

## Creating And Saving Cleaned Tables

In [63]:
# Creating the tables
llm_table = assets[['name', 'description','datasheet', 'sample','analysis','included','excluded','quality_control',
                   'license','intended_uses','prohibited_uses','monitoring','feedback','training_emissions','training_time',
                   'training_hardware','adaptation','output_space','terms_of_service','monthly_active_users',
                    'user_distribution','failures','type_id', 'organization_id', 'created_date_id',
                   'link_type_id', 'modality_id', 'size_id', 'dependencies_id',
                   'access_id', 'model_type_id']].drop_duplicates()
type_table = assets[['type_id', 'type']].drop_duplicates()
organization_table = assets[['organization_id', 'organization']].drop_duplicates()
created_date_table = assets[['created_date_id', 'created_date']].drop_duplicates()
url_table = assets[['link_type_id', 'link_type','url']].drop_duplicates()
modality_table = assets[['modality_id', 'modality']].drop_duplicates()
size_table = assets[['size_id', 'size']].drop_duplicates()
dependencies_table = assets[['dependencies_id', 'dependencies']].drop_duplicates()
access_table = assets[['access_id', 'access']].drop_duplicates()
model_card_table = assets[['model_type_id', 'model_type','model_card']].drop_duplicates()

In [64]:
llm_table.shape

(568, 31)

In [65]:
# Saving each table to a CSV file
llm_table.to_csv('llm_table.csv', index=False)
type_table.to_csv('type_table.csv', index=False)
organization_table.to_csv('organization_table.csv', index=False)
created_date_table.to_csv('created_date_table.csv', index=False)
url_table.to_csv('url_table.csv', index=False)
modality_table.to_csv('modality_table.csv', index=False)
size_table.to_csv('size_table.csv', index=False)
dependencies_table.to_csv('dependencies_table.csv', index=False)
access_table.to_csv('access_table.csv', index=False)
model_card_table.to_csv('model_card_table.csv', index=False)