In [None]:
# This Python notebook is designed to:
# 1. Create a list of GitHub repositories from the awesome list found in the README.md file at https://github.com/jamesmurdza/awesome-ai-devtools
# 2. For each GitHub repository in the list, read the README file and extract the installation instructions (e.g., sections titled "How to install", code blocks containing "pip install", "git clone .git", etc.)
# 3. Generate a JSON-LD file with the following fields for each repository:
#    - field1: URL (the GitHub link of the repository)
#    - field2: Text (the extracted installation instructions)
#    - field3: Tokens (the individual tokens of the text in field2)


In [2]:
import requests
from bs4 import BeautifulSoup
import re
import json
from rdflib import Graph, Literal, RDF, URIRef
from rdflib.namespace import FOAF, XSD

In [20]:
   def fetch_repo_list(url):
       response = requests.get(url)
       soup = BeautifulSoup(response.text, 'html.parser')
       # Adjusted regex pattern based on your findings
       repo_links = soup.find_all('a', href=re.compile('https://github\.com/[^/]+/[^/]+'))
       return [link['href'] for link in repo_links]

In [21]:

# Function to extract README content using GitHub API
def fetch_readme_content(repo_url):
    api_url = f"https://api.github.com/repos/{'/'.join(repo_url.split('/')[-2:])}/readme"
    headers = {'Accept': 'application/vnd.github.v3.raw'}
    response = requests.get(api_url, headers=headers)
    if response.status_code == 200:
        return response.text
    else:
        return None

In [22]:
# Function to extract installation instructions from README content
def extract_installation_instructions(readme_content):
    # Simple regex to find installation blocks, can be improved
    matches = re.findall(r'```[bash|sh].*?pip install.*?```', readme_content, re.DOTALL)
    return matches

In [23]:
# Function to generate JSON-LD
def generate_jsonld(repos_data):
    g = Graph()
    for repo in repos_data:
        repo_uri = URIRef(repo['URL'])
        g.add((repo_uri, RDF.type, FOAF.Document))
        g.add((repo_uri, FOAF.topic, Literal(repo['Text'], datatype=XSD.string)))
        # Tokens can be added similarly
    g.serialize(destination="output.jsonld", format='json-ld')

In [24]:

# Main execution
awesome_list_url = "https://raw.githubusercontent.com/jamesmurdza/awesome-ai-devtools/main/README.md"
repos_urls = fetch_repo_list(awesome_list_url)
print(repos_urls)

[]


In [8]:
repos_data = []

for repo_url in repos_urls:
    readme_content = fetch_readme_content(repo_url)
    if readme_content:
        instructions = extract_installation_instructions(readme_content)
        repos_data.append({
            "URL": repo_url,
            "Text": " ".join(instructions),
            "Tokens": []  # Tokenization can be added as needed
        })

generate_jsonld(repos_data)