In [None]:
import numpy as np  # numpy!
import seaborn as sns # visualisation!
import matplotlib.pyplot as plt # visualisation!
import pandas as pd # dataframes & data analysis!
from ast import literal_eval
import re #for Regex
from dotenv import load_dotenv
import os
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import json

### MongoDB Connection

In [None]:
from pymongo import MongoClient

# Load secrets (the MongoDB connection string) from a local .env file.
load_dotenv()

uri = os.getenv('URI')
if not uri:
    # MongoClient(None) silently falls back to localhost:27017, so a missing
    # variable would make every later cell talk to the wrong deployment.
    raise RuntimeError("Environment variable 'URI' is not set; add it to your .env file.")

# Create a new client and connect to the server
client = MongoClient(uri)
# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    # Best-effort connectivity check: report the failure but keep `client` defined.
    print(e)

In [None]:
# Handle to the 'lighthouse' database; the functions in later cells read `db`.
db = client['lighthouse']

# Get the main model collection.
collection = db['llm_datas']
# Fetch a single document; it is the cell's last expression, so it is displayed.
collection.find_one()

# Optional: get all documents in the collection
#documents = collection.find()

# Optional: print the documents
#for document in documents:
#    print(document)

In [None]:
# Same 'lighthouse' database handle as in the previous cell.
db = client['lighthouse']
# Names of all collections in the database. update_llm_data_with_objects reads
# this module-level list when fuzzy-matching reference fields to collections.
collection_names = db.list_collection_names()
print(collection_names)

In [None]:
## loop through documents in LLM_data
# update field objectids for each model
def update_llm_data_with_objects(database=None, known_collections=None):
    """Replace reference ids in ``llm_datas`` with the ``_id`` of the referenced document.

    For every document in ``llm_datas``, each field whose name ends in ``_id``
    (except ``_id`` and ``llm_data_id``) is treated as a reference. The target
    collection is ``llm_exclusions`` for ``excluded_id``; for any other field it
    is the entry of ``known_collections`` that best fuzzy-matches
    ``'llm_' + <field name without its last '_' segment>``. The target
    collection is searched for a document with the same field/value pair and,
    when one is found, the field is overwritten with that document's ``_id``.

    Args:
        database: Mapping-like database handle (``database[name]`` returns a
            collection). Defaults to the module-level ``db``.
        known_collections: Candidate collection names for the fuzzy match.
            Defaults to the module-level ``collection_names``.

    Returns:
        None. Progress is reported with ``print``.
    """
    if database is None:
        database = db
    if known_collections is None:
        known_collections = collection_names

    llm_collection = database['llm_datas']
    skipped_fields = ('_id', 'llm_data_id')
    # The target collection depends only on the field name, so resolve each
    # field once instead of re-running the fuzzy match for every document.
    target_by_field = {}

    def resolve_target(field):
        """Return the name of the collection referenced by ``field`` (or None)."""
        if field not in target_by_field:
            field_name = 'llm_' + field.rsplit("_", 1)[0]
            if field_name == 'llm_excluded':
                # Special case: the fuzzy match is bypassed for this field.
                target = 'llm_exclusions'
            else:
                # extractOne returns None when there are no candidates.
                best_match = process.extractOne(field_name, known_collections, scorer=fuzz.ratio)
                target = best_match[0] if best_match else None
            target_by_field[field] = target
        return target_by_field[field]

    for doc in llm_collection.find():
        updates = {}
        model_name = doc.get("name")
        document_id = doc["_id"]
        for field, ref_id in doc.items():
            # Only reference fields are rewritten.
            if not field.endswith("_id") or field in skipped_fields:
                continue
            target_name = resolve_target(field)
            if target_name is None:
                continue

            # Perform lookup for the corresponding collection
            referenced_doc = database[target_name].find_one({field: ref_id})
            if referenced_doc:  # Check if referenced document exists
                updates[field] = referenced_doc['_id']  # Update with the objectid from collection

        if not updates:
            # An empty $set is rejected by MongoDB servers older than 5.0 and is
            # a no-op on newer ones, so skip the round trip entirely.
            print(f"Document with ID {document_id}, model: {model_name} has no references to update.")
            continue

        result = llm_collection.update_one({"_id": document_id}, {"$set": updates})

        if result.matched_count > 0:
            print(f"Document with ID {document_id}, model: {model_name} updated successfully.")
        else:
            print(f"No document found with ID {document_id}.")
                
        
# Run the update; this writes to the `llm_datas` collection.
update_llm_data_with_objects()

#### Join ObjectIDs of models with their dependencies

In [None]:
def update_dependencies_id(database=None):
    """Rewrite ``dependencies_llm_ids`` in ``llm_dependencies`` to ``llm_datas`` ``_id`` values.

    For each document in ``llm_dependencies``, every entry of
    ``dependencies_llm_ids`` is looked up in ``llm_datas`` by ``llm_data_id``
    and replaced with the ``_id`` of the document found. ``None`` entries are
    kept as ``None``. Entries with no matching ``llm_datas`` document are kept
    unchanged, which also makes a second run harmless once the ids have
    already been converted. Documents without a ``dependencies_llm_ids`` field
    are skipped.

    Args:
        database: Mapping-like database handle (``database[name]`` returns a
            collection). Defaults to the module-level ``db``.

    Returns:
        None. Progress is reported with ``print``.
    """
    if database is None:
        database = db

    dependencies_collection = database['llm_dependencies']
    # Lookup table for model ids; fetched once rather than per entry.
    llm_data_table = database['llm_datas']

    for doc in dependencies_collection.find():
        document_id = doc["_id"]
        dependency_ids = doc.get("dependencies_llm_ids")
        if dependency_ids is None:
            print(f"Document with ID {document_id} has no dependencies_llm_ids; skipped.")
            continue

        object_id_list = []
        for dependency_id in dependency_ids:
            if dependency_id is None:
                # Preserve placeholders so list positions stay aligned.
                object_id_list.append(None)
                continue

            # Lookup id in main llm table
            referenced_doc = llm_data_table.find_one({'llm_data_id': dependency_id})
            if referenced_doc is None:
                # Previously this raised TypeError on referenced_doc["_id"].
                print(f"No llm_datas document with llm_data_id {dependency_id}; keeping value as is.")
                object_id_list.append(dependency_id)
            else:
                object_id_list.append(referenced_doc["_id"])

        print(f'Ids to add: {object_id_list}')
        updates = {'dependencies_llm_ids': object_id_list}
        result = dependencies_collection.update_one({"_id": document_id}, {"$set": updates})

        if result.matched_count > 0:
            print(f"Document with ID {document_id} updated successfully.")
        else:
            print(f"No document found with ID {document_id}.")

# Run the update; this writes to the `llm_dependencies` collection.
update_dependencies_id()

#### Join ObjectIDs of scores table with llm_data

In [None]:
# Collection with the test-score documents.
scores_collection = db['llm_test_scores']
# for doc in scores_collection.find():
#     print(doc['llm_data_model'])

# Display one document to inspect its fields.
scores_collection.find_one()

In [None]:
# Give every score document an llm_data_model field, set to None.
scores_collection = db['llm_test_scores']

# The $set payload applied to each matched document.
update_operation = {"$set": {"llm_data_model": None}}

# An empty filter matches every document in the collection.
result = scores_collection.update_many({}, update_operation)

# Report how many documents were modified.
print(f"Modified {result.modified_count} documents.")

In [None]:
# populate correct name to llm_test_scores llm_data_model field
# Define the mapping between names and model names
scores_collection = db['llm_test_scores']

# Load the JSON mapping into a plain dict (keys are names, values are model names).
# The context manager closes the file handle even if parsing fails;
# JSON is read as UTF-8 rather than the platform's default encoding.
with open('Stanford_HELM_model_matches.JSON', encoding='utf-8') as f:
    data = json.load(f)
#print(data)

# Function to update documents based on the mapping
def update_llm_data_model(collection, model_mapping):
    """Write each mapped name onto the documents of its matching model.

    ``model_mapping`` maps a name to a ``model`` value. For every entry whose
    model is not the sentinel ``'no_match'``, all documents in ``collection``
    with that ``model`` get ``llm_data_model`` set to the name, and the number
    of modified documents is printed.
    """
    matched_pairs = (
        (name, model)
        for name, model in model_mapping.items()
        if model != 'no_match'
    )
    for name, model in matched_pairs:
        selector = {"model": model}
        change = {"$set": {"llm_data_model": name}}
        outcome = collection.update_many(selector, change)
        print(f"Updated {outcome.modified_count} documents for model: {model} with llm_data_model: {name}")

# Call the function to update the documents, applying the mapping loaded
# into `data` to the `llm_test_scores` collection.
update_llm_data_model(scores_collection, data)


In [68]:
# use llm_data_model field to compare name with model_name in llm_data
# to get ObjectID of score in llm_data

# Collections
llm_collection = db['llm_datas']  # Collection with model data
scores_collection = db['llm_test_scores'] # Collection with test scores

# Get all models in scores table that are in llm_datas table:
# map llm_data_model name to the _id of a score document carrying that name.
# If several score documents share a name, the last one read wins.
model_id_mapping = {}
for doc in scores_collection.find({}, {"_id": 1, "llm_data_model": 1}):
    # .get() tolerates score documents that never received the field.
    mapped_name = doc.get('llm_data_model')
    if mapped_name is not None:
        model_id_mapping[mapped_name] = doc["_id"]

# populate score id in llm_datas table
# Only _id and name are read below, so fetch just those two fields.
for doc in llm_collection.find({}, {"_id": 1, "name": 1}):
    llm_data_name = doc.get("name")
    if llm_data_name not in model_id_mapping:
        continue

    model_id = doc["_id"]
    test_scores_id = model_id_mapping[llm_data_name]
    result = llm_collection.update_one(
        {"_id": model_id},
        {"$set": {"test_scores_id": test_scores_id}}
    )
    print(f"Updated document {model_id} with test_id {test_scores_id}")
  


Updated document 669c43febcb5bda17074b731 with test_id 669a2393db0298c298c50927
Updated document 669c43febcb5bda17074b73f with test_id 669a2393db0298c298c5093e
Updated document 669c43febcb5bda17074b77f with test_id 66999aa18638ce80c646d131
Updated document 669c43febcb5bda17074b78a with test_id 66999aae8638ce80c646d137
Updated document 669c43febcb5bda17074b7ba with test_id 669a2393db0298c298c5092e
Updated document 669c43febcb5bda17074b7bb with test_id 669a2393db0298c298c5092d
Updated document 669c43febcb5bda17074b7bc with test_id 669a2393db0298c298c5092a
Updated document 669c43febcb5bda17074b7bd with test_id 669a2393db0298c298c5092b
Updated document 669c43febcb5bda17074b7bf with test_id 66999aa18638ce80c646d12e
Updated document 669c43febcb5bda17074b7d3 with test_id 669a2393db0298c298c5092f
Updated document 669c43febcb5bda17074b7dd with test_id 66999a498638ce80c646d126
Updated document 669c43febcb5bda17074b7f0 with test_id 669a2393db0298c298c50940
Updated document 669c43febcb5bda17074b82