In [8]:
# This Python notebook is designed to:
# 1. Create a list of GitHub repositories from the awesome list found in the README.md file at https://github.com/jamesmurdza/awesome-ai-devtools
# 2. For each GitHub repository in the list, read the README file and extract the installation instructions (e.g., sections titled "How to install", code blocks containing "pip install", "git clone .git", etc.)
# 3. Generate a JSON-LD file with the following fields for each repository:
#    - field1: URL (the GitHub link of the repository)
#    - field2: Text (the extracted installation instructions)
#    - field3: Tokens (the individual tokens of the text in field2)


In [9]:
import requests
from bs4 import BeautifulSoup
import re
import json
from rdflib import Graph, Literal, RDF, URIRef
from rdflib.namespace import FOAF, XSD

In [10]:
import requests
import re

def fetch_raw_markdown(url):
    response = requests.get(url)
    return response.text if response.status_code == 200 else None

def extract_github_urls(markdown_content):
    # This regex matches GitHub repository URLs
    pattern = re.compile(r'https://github\.com/[a-zA-Z0-9_-]+/[a-zA-Z0-9_-]+')
    return pattern.findall(markdown_content)

# URL to the raw markdown content of the awesome list
raw_url = "https://raw.githubusercontent.com/jamesmurdza/awesome-ai-devtools/main/README.md"

markdown_content = fetch_raw_markdown(raw_url)
if markdown_content:
    repos_urls = extract_github_urls(markdown_content)
    print(repos_urls)
else:
    print("Failed to fetch the markdown content.")

['https://github.com/silvanmelchior/IncognitoPilot', 'https://github.com/features/preview', 'https://github.com/smallcloudai/refact', 'https://github.com/codota/TabNine', 'https://github.com/rubberduck-ai/rubberduck-vscode', 'https://github.com/rsaryev/talk-codebase', 'https://github.com/beimzhan/shell-whiz', 'https://github.com/smol-ai/developer', 'https://github.com/paul-gauthier/aider', 'https://github.com/AntonOsika/gpt-engineer', 'https://github.com/0xpayne/gpt-migrate', 'https://github.com/melih-unsal/DemoGPT', 'https://github.com/kuafuai/DevOpsGPT', 'https://github.com/sweepai/sweep', 'https://github.com/mattzcarey/code-review-gpt', 'https://github.com/Codium-ai/pr-agent', 'https://github.com/keerthanpg/SwePT', 'https://github.com/Yuyz0112/vx', 'https://github.com/smol-ai/developer', 'https://github.com/morph-labs/rift', 'https://github.com/kesor/chatgpt-code-plugin']


In [11]:
# for each url found in the repos_urls, extract the installation instructions found in each readme file. For instance, the first url is https://github.com/silvanmelchior/IncognitoPilot; you need to find the readme.md file and extract the installation instructions found in the readme.md. In this case you should extract the line of comments and code text in here https://github.com/silvanmelchior/IncognitoPilot/blob/main/README.md#package-installation-gpt-via-openai-api

In [12]:
# Function to generate JSON-LD
def generate_jsonld(repos_data):
    g = Graph()
    for repo in repos_data:
        repo_uri = URIRef(repo['URL'])
        g.add((repo_uri, RDF.type, FOAF.Document))
        g.add((repo_uri, FOAF.topic, Literal(repo['Text'], datatype=XSD.string)))
        # Tokens can be added similarly
    g.serialize(destination="output.jsonld", format='json-ld')

In [13]:

# Main execution
awesome_list_url = "https://raw.githubusercontent.com/jamesmurdza/awesome-ai-devtools/main/README.md"
repos_urls = fetch_repo_list(awesome_list_url)
print(repos_urls)

NameError: name 'fetch_repo_list' is not defined

In [None]:
repos_data = []

for repo_url in repos_urls:
    readme_content = fetch_readme_content(repo_url)
    if readme_content:
        instructions = extract_installation_instructions(readme_content)
        repos_data.append({
            "URL": repo_url,
            "Text": " ".join(instructions),
            "Tokens": []  # Tokenization can be added as needed
        })

generate_jsonld(repos_data)

In [None]:
import requests
import re

def fetch_readme_content(repo_url):
    """
    Fetches the README content from a GitHub repository.
    """
    # Convert the GitHub repository URL to the raw README.md URL
    readme_url = repo_url.replace("github.com", "raw.githubusercontent.com") + "/main/README.md"
    response = requests.get(readme_url)
    if response.status_code == 200:
        return response.text
    else:
        return None

def extract_installation_instructions(readme_content):
    """
    Extracts installation instructions from the README content.
    """
    # Define keywords to search for in the README content
    keywords = ["installation", "setup", "install", "how to", "getting started", "quick start"]
    # Combine keywords into a regex pattern
    pattern = re.compile("|".join(keywords), re.IGNORECASE)
    # Split the README content into sections
    sections = re.split(r'#+ ', readme_content)
    # Filter sections that contain any of the keywords
    installation_sections = [section for section in sections if pattern.search(section)]
    return installation_sections

# Example usage
repo_url = "https://github.com/silvanmelchior/IncognitoPilot"
readme_content = fetch_readme_content(repo_url)
if readme_content:
    installation_instructions = extract_installation_instructions(readme_content)
    print("\n\n".join(installation_instructions))
else:
    print("Failed to fetch the README content.")

In [None]:
import requests
import re

def fetch_raw_markdown(url):
    """
    Fetches the raw markdown content from a URL.
    """
    response = requests.get(url)
    return response.text if response.status_code == 200 else None

def extract_github_urls(markdown_content):
    """
    Extracts GitHub repository URLs from markdown content.
    """
    pattern = re.compile(r'https://github\.com/[a-zA-Z0-9_-]+/[a-zA-Z0-9_-]+')
    return pattern.findall(markdown_content)

def fetch_readme_content(repo_url):
    """
    Fetches the README content from a GitHub repository.
    """
    readme_url = repo_url.replace("github.com", "raw.githubusercontent.com") + "/main/README.md"
    response = requests.get(readme_url)
    return response.text if response.status_code == 200 else None

def extract_installation_instructions(readme_content):
    """
    Extracts installation instructions from the README content.
    """
    keywords = ["installation", "setup", "install", "how to", "getting started", "quick start"]
    pattern = re.compile("|".join(keywords), re.IGNORECASE)
    sections = re.split(r'#+ ', readme_content)
    installation_sections = [section for section in sections if pattern.search(section)]
    return installation_sections

# Main execution
awesome_list_url = "https://raw.githubusercontent.com/jamesmurdza/awesome-ai-devtools/main/README.md"
markdown_content = fetch_raw_markdown(awesome_list_url)
if markdown_content:
    repos_urls = extract_github_urls(markdown_content)
    for repo_url in repos_urls:
        readme_content = fetch_readme_content(repo_url)
        if readme_content:
            installation_instructions = extract_installation_instructions(readme_content)
            print(f"Installation instructions for {repo_url}:\n{' '.join(installation_instructions)}\n")
        else:
            print(f"Failed to fetch README content for {repo_url}")
else:
    print("Failed to fetch the markdown content of the awesome list.")

Installation instructions for https://github.com/silvanmelchior/IncognitoPilot:
:package: Installation (GPT via OpenAI API)

This section shows how to install **Incognito Pilot** using a GPT model via OpenAI's API. For

- **Code Llama / Llama 2**, check [Installation for Llama 2](/docs/INSTALLATION_LLAMA.md) instead, and for
- **GPT on Azure**, check [Installation with Azure](/docs/INSTALLATION_AZURE.md) instead.
- If you don't have docker, you can install **Incognito Pilot** on your system directly, using the development setup (see below).

Follow these steps:

1. Install [docker](https://www.docker.com/).
2. Create an empty folder somewhere on your system.
   This will be the working directory to which **Incognito Pilot** has access to.
   The code interpreter can read your files in this folder and store any results.
   In the following, we assume it to be */home/user/ipilot*.
3. Create an [OpenAI account](https://platform.openai.com),
   add a [credit card](https://platform.openai.c

In [50]:
import requests
import re
import json
from nltk.tokenize import word_tokenize

def fetch_raw_markdown(url):
    response = requests.get(url)
    return response.text if response.status_code == 200 else None

def extract_github_urls(markdown_content):
    pattern = re.compile(r'https://github\.com/[a-zA-Z0-9_-]+/[a-zA-Z0-9_-]+')
    return pattern.findall(markdown_content)

def fetch_readme_content(repo_url):
    readme_url = repo_url.replace("github.com", "raw.githubusercontent.com") + "/main/README.md"
    response = requests.get(readme_url)
    return response.text if response.status_code == 200 else None

def extract_installation_instructions(readme_content):
    keywords = ["installation", "setup", "install", "how to", "getting started", "quick start"]
    pattern = re.compile("|".join(keywords), re.IGNORECASE)
    sections = re.split(r'#+ ', readme_content)
    installation_sections = [section for section in sections if pattern.search(section)]
    return installation_sections

def tokenize_text(text):
    return word_tokenize(text)

# Create an heuristic classifier to cluster the repo_url by complexity of the installation instructions: 
# complexity = 0 if the installation instructions contain in token and text: "pip install", "package manager install",
# complexity = 1 if the installation instructions contain: "container", "docker container", "docker componse up"
# complexity = 2 if the installation instructions contain: "from source", "git clone", ".git"
# append the heuristic classifier to the repos_data


# Main execution
awesome_list_url = "https://raw.githubusercontent.com/jamesmurdza/awesome-ai-devtools/main/README.md"
markdown_content = fetch_raw_markdown(awesome_list_url)
repos_data = []

if markdown_content:
    repos_urls = extract_github_urls(markdown_content)
    for repo_url in repos_urls:
        readme_content = fetch_readme_content(repo_url)
        if readme_content:
            installation_instructions = extract_installation_instructions(readme_content)
            instructions_text = " ".join(installation_instructions)
            tokens = tokenize_text(instructions_text)

            # Heuristic classifier
            complexity = -1  # Default complexity
            if any(word in tokens for word in ["pip install", "package manager install"]):
                complexity = 0
            elif any(word in instructions_text for word in ["container", "docker container", "docker compose up"]):
                complexity = 1
            elif any(word in instructions_text for word in ["from source", "git clone", ".git"]):
                complexity = 2

            repos_data.append({
                "url": repo_url,
                "text": instructions_text,
                "tokens": tokens,
                "level complexity": complexity
            })
else:
    print("Failed to fetch the markdown content of the awesome list.")

# Output to a JSON file
with open('output.json', 'w') as outfile:
    json.dump(repos_data, outfile, indent=4)

In [62]:
import pandas as pd
columns_long_list = ['url', 'text', 'tokens', 'level complexity']
columns_short_list = ['url', 'text', 'tokens']
df = pd.read_json('output.json')
df.head(3)

FileNotFoundError: File output2.json does not exist

In [54]:
df.groupby(['level complexity'])['tokens'].count()

level complexity
-1    5
 1    2
 2    8
Name: tokens, dtype: int64

In [55]:
df['token_len'] = df.tokens.apply(lambda x: len(x))
query_len_summary = df.groupby('level complexity')['token_len'].quantile([.5, .7, .8, .9, .95])
display(pd.DataFrame(query_len_summary))

Unnamed: 0_level_0,Unnamed: 1_level_0,token_len
level complexity,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,0.5,107.0
-1,0.7,519.8
-1,0.8,671.4
-1,0.9,768.2
-1,0.95,816.6
1,0.5,745.5
1,0.7,894.5
1,0.8,969.0
1,0.9,1043.5
1,0.95,1080.75


In [57]:
df['text'] = df.tokens.apply(lambda x: len(x))
query_len_summary = df.groupby('level complexity')['text'].quantile([.5, .7, .8, .9, .95])
display(pd.DataFrame(query_len_summary))

Unnamed: 0_level_0,Unnamed: 1_level_0,text
level complexity,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,0.5,107.0
-1,0.7,519.8
-1,0.8,671.4
-1,0.9,768.2
-1,0.95,816.6
1,0.5,745.5
1,0.7,894.5
1,0.8,969.0
1,0.9,1043.5
1,0.95,1080.75


In [40]:
import pandas as pd
from collections import Counter

# Assuming df is your DataFrame and 'field3' is the column with tokens
# Step 1: Aggregate Tokens
all_tokens = sum(df['tokens'].tolist(), [])

# Step 2: Count Frequencies
token_counts = Counter(all_tokens)



# Step 3: Summarize Most Frequent Tokens
most_common_tokens = token_counts.most_common(10)  # Adjust the number to get more or fewer tokens

# Convert the most common tokens to a DataFrame for a nicer display
summary_df = pd.DataFrame(most_common_tokens, columns=['Token', 'Frequency'])

print(summary_df)

  Token  Frequency
0     :        379
1     `        338
2     .        268
3   the        243
4     *        211
5     ,        180
6     |        180
7     (        168
8     )        168
9     -        152


In [56]:
# Define the specific tokens you're interested in
import re
specific_tokens = [token for token in summary_df['Token'] if re.search(r'docker', token, re.IGNORECASE)]
print(specific_tokens)
# Filter the DataFrame for rows where the 'Token' column contains any of the specific tokens
filtered_df = summary_df[summary_df['Token'].isin(specific_tokens)]

print(filtered_df)


[]
Empty DataFrame
Columns: [Token, Frequency]
Index: []
