This notebook takes a list of GitHub repository URLs and reports which reproducibility checks each repository's README satisfies.

In [24]:
import requests
import base64
import re

import google.generativeai as genai
import os

# Configure the Gemini client with the key stored in the GEMINI_API_KEY
# environment variable (raises KeyError if the variable is not set).
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

In [25]:
# https://github.com/RRinTransportation/rr-measure-basic

def get_readme_from_github(url, timeout=10):
    """
    Fetch and decode the README of a GitHub repository.

    The owner and repository name are taken from the first two path segments
    of ``url``, so links that point deeper into the repository (for example
    ``/tree/main``) or carry a query string or fragment still resolve to the
    right repository.

    Args:
        url: Repository URL including the scheme, e.g.
            "https://github.com/<owner>/<repo>".
        timeout: Seconds to wait for the GitHub API before giving up.

    Returns:
        The README text on success. On any failure (URL that cannot be
        parsed, network error, non-200 response) a string starting with
        "Error:" is returned instead of raising, so callers should check for
        that prefix before treating the result as README content.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urlsplit

    parsed = urlsplit(url or "")
    segments = [segment for segment in parsed.path.split("/") if segment]
    if not parsed.netloc or len(segments) < 2:
        return f"Error: Unable to parse GitHub owner/repo from URL: {url!r}"

    owner, repo = segments[0], segments[1]
    # Clone-style URLs end in ".git", which is not part of the repository name.
    if repo.endswith(".git"):
        repo = repo[:-len(".git")]
    if not repo:
        return f"Error: Unable to parse GitHub owner/repo from URL: {url!r}"

    # GitHub API URL for the repository README
    api_url = f"https://api.github.com/repos/{owner}/{repo}/readme"
    headers = {
        "Accept": "application/vnd.github.v3+json"
    }

    try:
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(api_url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        return f"Error: Unable to fetch README ({exc})"

    if response.status_code != 200:
        return f"Error: Unable to fetch README (status code: {response.status_code})"

    # The API returns the file body base64-encoded in the "content" field.
    readme_content_base64 = response.json().get("content", "")
    # errors="replace" keeps a README with stray non-UTF-8 bytes usable.
    return base64.b64decode(readme_content_base64).decode("utf-8", errors="replace")

# Example usage: the URL includes a /tree/... suffix and a query string;
# get_readme_from_github only uses the owner and repository segments.
url = "https://github.com/RRinTransportation/rr-measure-basic/tree/main?tab=readme-ov-file"
readme_content = get_readme_from_github(url)
# Uncomment to inspect the fetched README text:
# print(readme_content)

The functions below are the individual "checks" to run. To add a new check, just write a new function. Each check takes a string (the markdown of the README) as input and returns true (the README satisfies the check) or false (it does not).

In [48]:
# Gemini model handle.
# NOTE(review): none of the keyword-based checks defined in this cell reference
# `model` - confirm whether a later cell needs it.
model = genai.GenerativeModel("gemini-1.5-flash")

def check_keywords(readme_text, keywords):
    """
    Find which keywords occur in a README, ignoring case.

    Each keyword is used as a regular-expression fragment (callers pass
    patterns such as an escaped "g++" or a "--key=value" template), and a
    match must not be directly attached to a word character on either side.

    Args:
        readme_text: README contents to search.
        keywords: Iterable of keyword strings / regex fragments.

    Returns:
        dict mapping every keyword that matched to the first matching text
        found in ``readme_text``. Keywords without a match are omitted.
    """
    found = {}
    for keyword in keywords:
        # Lookarounds instead of \b...\b: \b needs a word character on one side,
        # so patterns that begin or end with punctuation (e.g. "--lr=0.1",
        # "g++") could never match in ordinary text. For keywords made of word
        # characters the two forms are equivalent. The non-capturing group
        # keeps any alternation inside a keyword from escaping the boundaries.
        pattern = rf"(?<!\w)(?:{keyword})(?!\w)"
        match = re.search(pattern, readme_text, re.IGNORECASE)
        if match:
            found[keyword] = match.group(0)
    return found

def readme_not_empty(readme, min_length=150):
    """
    Check that the README has real content beyond GitHub's default stub.

    Args:
        readme: README text; may be empty or None.
        min_length: Number of characters the README must exceed. The default
            of 150 allows for the title GitHub adds to a new repository.

    Returns:
        bool: True if the README is longer than ``min_length`` characters,
        False otherwise (including for an empty or missing README).
    """
    # Same guard as the other checks, so a missing README is simply a failed check.
    if not readme:
        return False
    return len(readme) > min_length

def data_location(readme):
    """
    Check if the README states where the data can be found.

    Args:
        readme: README text (markdown); may be empty or None.

    Returns:
        bool: True if at least one data-related keyword occurs in the README,
        False otherwise (including for an empty or missing README).
    """
    # NOTE(review): the bare "data" entry already matches every README that
    # contains one of the longer "... data ..." phrases below.
    keywords = [
        "data", "dataset", "data source", "download", "available", "access",
        "input data", "training data", "data link", "data directory",
        "data location", "data path", "data folder", "data archive"
    ]

    # Return a real bool (not an empty dict) so every check has the same return type.
    if not readme:
        return False

    return bool(check_keywords(readme, keywords))

def enviroment_setup(readme):
    """
    Check if the README explains how to set up the environment / dependencies.

    The function name keeps its original spelling because the check registry
    refers to it by this name.

    Args:
        readme: README text (markdown); may be empty or None.

    Returns:
        bool: True if at least one setup-related keyword occurs in the README,
        False otherwise (including for an empty or missing README).
    """
    # Entries are regular-expression fragments, so regex metacharacters that
    # are meant literally ("." and "+") are escaped, and raw strings are used
    # to avoid invalid-escape warnings from the Python compiler.
    keywords = [
        # General setup terms
        "requirements", "dependencies", "environment", "install", "setup",
        "virtualenv", "pip", "docker", "build", "configuration",
        r"environment\.yaml", r"requirements\.txt",

        # MATLAB-specific terms
        "MATLAB", "matlabpath", "toolbox", "mex", "matlab script", "matlab install",

        # C/C++-specific terms
        "makefile", "gcc", r"g\+\+", "cmake", "make", "compile", "C compiler", r"C\+\+ compiler",

        # Java-specific terms
        "JDK", "Maven", "Gradle", "java -jar", "javac", "java version", "Java SDK",

        # R-specific terms
        "R package", "CRAN", r"install\.packages", "Rscript", "R environment", "R version"
    ]

    # Return a real bool (not an empty dict) so every check has the same return type.
    if not readme:
        return False

    return bool(check_keywords(readme, keywords))


def commands(readme):
    """
    Check if the README includes commands that can be run to reproduce results.

    Args:
        readme: README text (markdown); may be empty or None.

    Returns:
        bool: True if at least one command-related keyword occurs in the
        README, False otherwise (including for an empty or missing README).
    """
    keywords = [
        "run", "execute", "executable", "command", "reproduce", "steps", "usage",
        "experiment", "reproduction", "how to run", "command line",
        "terminal", "CLI", "script", "bash", "shell", "notebook", "ipynb", "notebooks"
    ]

    # Return a real bool (not an empty dict) so every check has the same return type.
    if not readme:
        return False

    return bool(check_keywords(readme, keywords))

def parameters(readme):
    """
    Check if the README documents parameters needed to reproduce results.

    Checks for 2 things:
    1 - keywords about parameters
    2 - anything of the format --[something]=[something]

    Args:
        readme: README text (markdown); may be empty or None.

    Returns:
        bool: True if either kind of evidence is found, False otherwise
        (including for an empty or missing README).
    """
    keywords = ["parameters", "hyperparameters", "configuration", "settings", "args", "arguments"]

    # The flag pattern is searched directly rather than handed to
    # check_keywords: a flag starts with "-", and a pattern that is wrapped in
    # word-boundary anchors cannot match a flag that follows whitespace or
    # starts a line. The lookbehind only rejects "--" glued to a preceding word.
    flag_pattern = r"(?<!\w)--[a-zA-Z0-9_-]+=[a-zA-Z0-9_-]+"

    # Return a real bool (not an empty dict) so every check has the same return type.
    if not readme:
        return False

    if re.search(flag_pattern, readme):
        return True

    return bool(check_keywords(readme, keywords))
    

The `check_functions` dictionary below registers every check. After the loop has run, `check_repos` lists, for each check, all repos that satisfy it.

In [49]:
# Registry of checks: display name -> function that takes the README text and
# returns a truthy value when the check is satisfied.
check_functions = {
    "readme not empty": readme_not_empty,
    "environment setup": enviroment_setup,
    "data location": data_location,
    "commands": commands,
    "parameters": parameters,
    # ADD NEW FUNCTIONS HERE
}

# One (initially empty) list of repository names per registered check.
check_repos = {check_name: [] for check_name in check_functions}

# Earlier example list, kept for reference:
#github_urls = ["https://github.com/uctb/UCTB/", "https://github.com/marsauto/europilot", "https://github.com/hengli/camodocal"] # ADD URLS HERE

# Repositories to evaluate, one GitHub URL per entry in the form
# https://github.com/<owner>/<repo>.
github_urls = [
    "https://github.com/I24-MOTION/data_tutorial",
    "https://github.com/I24-MOTION/VT_tools",
    "https://github.com/I24-MOTION/i24-video-dataset-utils",
    "https://github.com/I24-MOTION/i24_track_stack",
    "https://github.com/I24-MOTION/i24_rcs",
    "https://github.com/I24-MOTION/i24_database_api",
    "https://github.com/I24-MOTION/I24_logging",
    "https://github.com/I24-MOTION/i24_configparse",
    "https://github.com/I24-MOTION/I24-3D-dataset",
    "https://github.com/I24-MOTION/I24-postprocessing-lite",
    "https://github.com/I24-MOTION/I24M_documentation",
    "https://github.com/I24-MOTION/I24M_improvement_tracker"
]

# Run every registered check against each repository's README and record the
# repository name under each check it satisfies.
for url in github_urls:
    readme = get_readme_from_github(url)
    # get_readme_from_github reports failures as an "Error: ..." string; treat
    # that as a missing README so the error text is never scanned for keywords.
    if readme.startswith("Error:"):
        readme = ""
    # Fifth "/"-separated piece of https://github.com/<owner>/<repo> is the repo name.
    repo_name = url.split("/")[4]
    for check, check_function in check_functions.items():
        if check_function(readme):
            check_repos[check].append(repo_name)

check_repos

{'readme not empty': ['data_tutorial',
  'VT_tools',
  'i24-video-dataset-utils',
  'i24_rcs',
  'i24_database_api',
  'I24_logging',
  'i24_configparse',
  'I24-3D-dataset',
  'I24-postprocessing-lite',
  'I24M_documentation',
  'I24M_improvement_tracker'],
 'environment setup': ['data_tutorial',
  'VT_tools',
  'i24-video-dataset-utils',
  'i24_rcs',
  'i24_database_api',
  'I24_logging',
  'i24_configparse',
  'I24-3D-dataset',
  'I24-postprocessing-lite',
  'I24M_documentation',
  'I24M_improvement_tracker'],
 'data location': ['data_tutorial',
  'VT_tools',
  'i24-video-dataset-utils',
  'i24_rcs',
  'i24_database_api',
  'i24_configparse',
  'I24-3D-dataset',
  'I24-postprocessing-lite',
  'I24M_documentation',
  'I24M_improvement_tracker'],
 'commands': ['data_tutorial',
  'VT_tools',
  'i24-video-dataset-utils',
  'i24_rcs',
  'i24_database_api',
  'I24_logging',
  'I24-3D-dataset',
  'I24-postprocessing-lite'],
 'parameters': ['i24_database_api',
  'I24_logging',
  'i24_config