In [None]:
# This Python notebook is designed to:
# 1. Create a list of GitHub repositories from the awesome list found in the README.md file at https://github.com/jamesmurdza/awesome-ai-devtools
# 2. For each GitHub repository in the list, read the README file and extract the installation instructions (e.g., sections titled "How to install", code blocks containing "pip install", "git clone .git", etc.)
# 3. Generate a JSON-LD file with the following fields for each repository:
#    - field1: URL (the GitHub link of the repository)
#    - field2: Text (the extracted installation instructions)
#    - field3: Tokens (the individual tokens of the text in field2)


## AWESOME LIST OF AI DEVTOOLS

In [None]:
# for each url found in the repos_urls, extract the installation instructions found in each readme file. For instance, the first url is https://github.com/silvanmelchior/IncognitoPilot; you need to find the readme.md file and extract the installation instructions found in the readme.md. In this case you should extract the line of comments and code text in here https://github.com/silvanmelchior/IncognitoPilot/blob/main/README.md#package-installation-gpt-via-openai-api

In [1]:
import requests
import re
import json
from nltk.tokenize import word_tokenize

def fetch_raw_markdown(url):
    response = requests.get(url)
    return response.text if response.status_code == 200 else None

def extract_github_urls(markdown_content):
    pattern = re.compile(r'https://github\.com/[a-zA-Z0-9_-]+/[a-zA-Z0-9_-]+')
    return pattern.findall(markdown_content)

def fetch_readme_content(repo_url):
    readme_url = repo_url.replace("github.com", "raw.githubusercontent.com") + "/main/README.md"
    response = requests.get(readme_url)
    return response.text if response.status_code == 200 else None

# def extract_installation_instructions(readme_content):
#     keywords = ["installation", "setup", "install", "how to", "getting started", "quick start"]
#     pattern = re.compile("|".join(keywords), re.IGNORECASE)
#     sections = re.split(r'#+ ', readme_content)
#     installation_sections = [section for section in sections if pattern.search(section)]
#     return installation_sections

def extract_installation_instructions(readme_content):
    sections = re.split(r'#+ ', readme_content)
    installation_sections = [section for section in sections if re.search(r'installation|setup|install|how to|getting started|quick start', section, re.IGNORECASE)]
    return installation_sections

def differentiate_comments(installation_text):
    code_comments = re.findall(r'```.*?```', installation_text, re.DOTALL)
    text_comments = re.sub(r'```.*?```', '', installation_text, flags=re.DOTALL)
    return text_comments, code_comments

def tokenize_text(text):
    return word_tokenize(text)

# from nltk.tokenize import word_tokenize, sent_tokenize
# from nltk.stem import WordNetLemmatizer
# from nltk.corpus import stopwords
# import nltk
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')
   
# def tokenize_and_lemmatize(text):
#     lemmatizer = WordNetLemmatizer()
#     stop_words = set(stopwords.words('english'))
#     tokens = [lemmatizer.lemmatize(word) for word in word_tokenize(text) if word not in stop_words]
#     return tokens


# Create an heuristic classifier to cluster the repo_url by complexity of the installation instructions: 
# complexity = 0 if the installation instructions contain in token and text: "pip install", "package manager install",
# complexity = 1 if the installation instructions contain: "container", "docker container", "docker componse up"
# complexity = 2 if the installation instructions contain: "from source", "git clone", ".git"
# append the heuristic classifier to the repos_data


# Main execution
awesome_list_url = "https://raw.githubusercontent.com/jamesmurdza/awesome-ai-devtools/main/README.md"
markdown_content = fetch_raw_markdown(awesome_list_url)
repos_data = []

if markdown_content:
    repos_urls = extract_github_urls(markdown_content)
    for repo_url in repos_urls:
        readme_content = fetch_readme_content(repo_url)
        if readme_content:
            installation_instructions = extract_installation_instructions(readme_content)
            instructions_text = " ".join(installation_instructions)
            tokens = tokenize_text(instructions_text)

            # Heuristic classifier
            complexity = -1  # Default complexity
            if any(word in tokens for word in ["pip install", "package manager install"]):
                complexity = 0
            elif any(word in instructions_text for word in ["container", "docker container", "docker compose up"]):
                complexity = 1
            elif any(word in instructions_text for word in ["from source", "git clone", ".git"]):
                complexity = 2

            repos_data.append({
                "url": repo_url,
                "text": instructions_text,
                "tokens": tokens,
                "level complexity": complexity
            })
else:
    print("Failed to fetch the markdown content of the awesome list.")

# Output to a JSON file
with open('data/corpus-awesome_list.json', 'w') as outfile:
    json.dump(repos_data, outfile, indent=4)

In [2]:
import pandas as pd
columns_long_list = ['url', 'text', 'tokens', 'level complexity']
columns_short_list = ['url', 'text', 'tokens']
df = pd.read_json('data/corpus-awesome_list.json')
df.head(3)

  from pandas.core import (


Unnamed: 0,url,text,tokens,level complexity
0,https://github.com/silvanmelchior/IncognitoPilot,:package: Installation (GPT via OpenAI API)\n\...,"[:, package, :, Installation, (, GPT, via, Ope...",1
1,https://github.com/smallcloudai/refact,Running Refact Self-Hosted in a Docker Contain...,"[Running, Refact, Self-Hosted, in, a, Docker, ...",1
2,https://github.com/rubberduck-ai/rubberduck-vs...,Quick Install\n\nYou can install Rubberduck fr...,"[Quick, Install, You, can, install, Rubberduck...",-1


In [3]:
df['level complexity'].value_counts()

level complexity
 2    8
-1    5
 1    2
Name: count, dtype: int64

In [4]:
df.groupby(['level complexity'])['tokens'].count()

level complexity
-1    5
 1    2
 2    8
Name: tokens, dtype: int64

In [5]:
df['token_len'] = df.tokens.apply(lambda x: len(x))
query_len_summary = df.groupby('level complexity')['token_len'].quantile([.5, .7, .8, .9, .95])
display(pd.DataFrame(query_len_summary))

Unnamed: 0_level_0,Unnamed: 1_level_0,token_len
level complexity,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,0.5,107.0
-1,0.7,519.8
-1,0.8,671.4
-1,0.9,768.2
-1,0.95,816.6
1,0.5,746.5
1,0.7,895.1
1,0.8,969.4
1,0.9,1043.7
1,0.95,1080.85


In [6]:
df['text'] = df.tokens.apply(lambda x: len(x))
query_len_summary = df.groupby('level complexity')['text'].quantile([.5, .7, .8, .9, .95])
display(pd.DataFrame(query_len_summary))

Unnamed: 0_level_0,Unnamed: 1_level_0,text
level complexity,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,0.5,107.0
-1,0.7,519.8
-1,0.8,671.4
-1,0.9,768.2
-1,0.95,816.6
1,0.5,746.5
1,0.7,895.1
1,0.8,969.4
1,0.9,1043.7
1,0.95,1080.85


In [7]:
import pandas as pd
from collections import Counter

# Assuming df is your DataFrame and 'field3' is the column with tokens
# Step 1: Aggregate Tokens
all_tokens = sum(df['tokens'].tolist(), [])

# Step 2: Count Frequencies
token_counts = Counter(all_tokens)



# Step 3: Summarize Most Frequent Tokens
most_common_tokens = token_counts.most_common(10)  # Adjust the number to get more or fewer tokens

# Convert the most common tokens to a DataFrame for a nicer display
summary_df = pd.DataFrame(most_common_tokens, columns=['Token', 'Frequency'])

print(summary_df)

  Token  Frequency
0     :        373
1     `        338
2     .        268
3   the        242
4     *        227
5     |        186
6     ,        180
7     (        169
8     )        169
9     -        148


In [8]:
# Define the specific tokens you're interested in
import re
specific_tokens = [token for token in summary_df['Token'] if re.search(r'docker', token, re.IGNORECASE)]
print(specific_tokens)
# Filter the DataFrame for rows where the 'Token' column contains any of the specific tokens
filtered_df = summary_df[summary_df['Token'].isin(specific_tokens)]

print(filtered_df)


[]
Empty DataFrame
Columns: [Token, Frequency]
Index: []


---

## PaperWithCodes

In [None]:
!pip install requests nltk pandas paperswithcode-client

In [10]:
import requests
import re
   
def fetch_repositories():
    response = requests.get("https://paperswithcode.com/api/v1/repositories/")
    return response.json()['results'] if response.status_code == 200 else []

In [11]:
def fetch_readme_content(repo_url):
    readme_url = repo_url
    response = requests.get(readme_url)
    return response.text if response.status_code == 200 else None

In [12]:

def extract_installation_instructions(readme_content):
    sections = re.split(r'#+ ', readme_content)
    installation_sections = [section for section in sections if re.search(r'installation|setup|install|how to|getting started|quick start', section, re.IGNORECASE)]
    return installation_sections

In [13]:
def differentiate_comments(installation_text):
    code_comments = re.findall(r'```.*?```', installation_text, re.DOTALL)
    text_comments = re.sub(r'```.*?```', '', installation_text, flags=re.DOTALL)
    return text_comments, code_comments

In [14]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
   
def tokenize_and_lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    tokens = [lemmatizer.lemmatize(word) for word in word_tokenize(text) if word not in stop_words]
    return tokens

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ccugutrillague/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ccugutrillague/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ccugutrillague/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
import pandas as pd
   
def output_to_files(data):
    df = pd.DataFrame(data)
    df.to_json("data/corpus-paperwithcodes.json", orient="records", lines=True)
    df.to_csv("data/corpus-paperwithcodes.csv", index=False)

In [17]:
def process_repository(repo):
    data = {}
    readme_content = fetch_readme_content(repo['url'])
    if readme_content:
        installation_sections = extract_installation_instructions(readme_content)
        for section in installation_sections:
            text_comments, code_comments = differentiate_comments(section)
            tokens = tokenize_and_lemmatize(section)
            data = {
                "url": repo['url'],
                "readme_url": repo['url'],
                "text": section,
                "comments": text_comments,
                "code-comments": " ".join(code_comments),
                "sentence": sent_tokenize(section),
                "tokens": tokens
            }
    return data

def main():
    repos = fetch_repositories()
    processed_data = []
    for repo in repos:
        repo_data = process_repository(repo)
        if repo_data:
            processed_data.append(repo_data)
    
    # Output to files
    output_to_files(processed_data)

def output_to_files(data):
    df = pd.DataFrame(data)
    df.to_json("data/corpus-paperwithcodes.json", orient="records", lines=True)
    df.to_csv("data/corpus-paperwithcodes.csv", index=False)

if __name__ == "__main__":
    main()

In [None]:
from paperswithcode import PapersWithCodeClient

client = PapersWithCodeClient()
papers = client.repository_list()
print(papers.next_page)

In [None]:
paper = papers.results[4]
paper

---
## Github -repos

In [18]:
def getRepostitoryTopics(url, GitHub_Token, not_found):
    header = {'Authorization': 'Bearer ' + GitHub_Token}
    reposUrl = f"https://api.github.com/repos/{url}"
    reposr = requests.get(reposUrl, headers = header)
    reposj = reposr.json()
    try:
        topics = reposj["topics"]
        return (not_found, topics)
    except:
        return (not_found + 1, [])

In [19]:
repo_dict = {}
GitHub_Token = "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"
listTopics = ['LLM']

In [20]:
# def get_repositories_by_topic(topic):
#     header = {'Authorization': 'Bearer ' + GitHub_Token}
#     reposUrl = f"https://api.github.com/search/repositories?q=topic:{topic}&per_page=50"
#     reposr = requests.get(reposUrl, headers = header)
#     reposj = reposr.json()
#     return reposj["items"]

def get_repositories_by_topic(topic):
    header = {'Authorization': 'Bearer ' + GitHub_Token}
    reposUrl = f"https://api.github.com/search/repositories?q=topic:{topic}&per_page=50"
    reposr = requests.get(reposUrl, headers=header)
    
    # Check if the request was successful
    if reposr.status_code == 200:
        reposj = reposr.json()
        # Check if 'items' key exists in the response
        if "items" in reposj:
            return reposj["items"]
        else:
            print(f"'items' key not found in response. Response JSON: {reposj}")
            return []
    else:
        print(f"GitHub API request failed with status code: {reposr.status_code}")
        return []

In [21]:
urls_with_topics = {}
for topic in listTopics:
    repositories_json = get_repositories_by_topic(topic)
    for repository in repositories_json:
        topics = repository["topics"]
        combined_topics_string = '\t'.join(topics)
        if(not "LLM" in combined_topics_string):
            urls_with_topics[repository["html_url"]] = topics

In [22]:
print(urls_with_topics)

{'https://github.com/ollama/ollama': ['go', 'golang', 'llama', 'llama2', 'llm', 'llms', 'mistral', 'ollama'], 'https://github.com/geekan/MetaGPT': ['agent', 'gpt', 'hacktoberfest', 'llm', 'metagpt', 'multi-agent'], 'https://github.com/run-llama/llama_index': ['agents', 'application', 'data', 'fine-tuning', 'framework', 'llamaindex', 'llm', 'rag', 'vector-database'], 'https://github.com/QuivrHQ/quivr': ['ai', 'api', 'chatbot', 'chatgpt', 'database', 'docker', 'frontend', 'html', 'javascript', 'llm', 'openai', 'postgresql', 'privacy', 'rag', 'react', 'rest-api', 'security', 'typescript', 'vector', 'ycombinator'], 'https://github.com/milvus-io/milvus': ['anns', 'cloud-native', 'distributed', 'embedding-database', 'embedding-similarity', 'embedding-store', 'faiss', 'golang', 'hnsw', 'image-search', 'llm', 'nearest-neighbor-search', 'tensor-database', 'vector-database', 'vector-search', 'vector-similarity', 'vector-store'], 'https://github.com/JushBJJ/Mr.-Ranedeer-AI-Tutor': ['ai', 'educati

In [23]:
import requests
import nltk
import json
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

def fetch_readme_content(repo_url):
    readme_url = repo_url.replace("github.com", "raw.githubusercontent.com") + "/main/README.md"
    response = requests.get(readme_url)
    return response.text if response.status_code == 200 else None
# todo: split text into sentences; tokenize the text; lemmatize and lowercase all tokens; remove stop words (preprocessing)

def extract_installation_instructions(readme_content):
    keywords = ["installation", "setup", "install", "how to", "getting started", "quick start"]
    pattern = re.compile("|".join(keywords), re.IGNORECASE)
    sections = re.split(r'#+ ', readme_content)
    installation_sections = [section for section in sections if pattern.search(section)]
    return installation_sections

# def tokenize_text(text):
#     return word_tokenize(text)
def process_text(text):
    sentences = sent_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    
    processed_sentences = []
    processed_tokens = []
    
    for sentence in sentences:
        tokens = word_tokenize(sentence)
        lemmatized_tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.lower() not in stop_words]
        
        processed_sentences.append(' '.join(lemmatized_tokens))  # Join tokens to form the processed sentence
        processed_tokens.extend(lemmatized_tokens)  # Extend the list of processed tokens
    
    return processed_tokens, processed_sentences

def classify_complexity(text):
    complexity = -1  # Default complexity
    if any(word in text for word in ["pip install", "package manager install"]):
        complexity = 0
    elif any(word in text for word in ["container", "docker container", "docker compose up"]):
        complexity = 1
    elif any(word in text for word in ["from source", "git clone", ".git"]):
        complexity = 2
    return complexity

# from sklearn.decomposition import PCA
# from sklearn.mixture import GaussianMixture
# from sklearn.feature_extraction.text import TfidfVectorizer
# import numpy as np

# def cluster_data(texts, topics, sentence_lengths, code_counts):
#     # Vectorize the texts and topics
#     vectorizer = TfidfVectorizer()
#     text_features = vectorizer.fit_transform(texts + topics).toarray()
    
#     # Combine all features
#     features = np.hstack((text_features, np.array(sentence_lengths).reshape(-1, 1), np.array(code_counts).reshape(-1, 1)))
    
#     # Optional: Apply PCA for dimensionality reduction
#     pca = PCA(n_components=0.95)  # Keep 95% of variance
#     reduced_features = pca.fit_transform(features)
    
#     # Cluster using Gaussian Mixture Model
#     gmm = GaussianMixture(n_components=3)  # We want to cluster into 3 groups
#     gmm.fit(reduced_features)
#     cluster_labels = gmm.predict(reduced_features)
#     probabilities = gmm.predict_proba(reduced_features)
    
#     return cluster_labels, probabilities

# Iterate through the URLs and perform the tasks
output_data = []

for url, topics in urls_with_topics.items():
    readme_url = url.replace("github.com", "raw.githubusercontent.com") + "/main/README.md"  # Define readme_url for each repository
    readme_content = fetch_readme_content(url)
    if readme_content:
        installation_instructions = extract_installation_instructions(readme_content)
        for instruction in installation_instructions:
            tokens = process_text(instruction)
            sentence = process_text(instruction)
            complexity = classify_complexity(instruction)
            # cluster_labels, probabilities = cluster_data([instruction], topics, [len(sentence)], [len(tokens)])
            output_data.append({
                "url": url,
                "readme_url": readme_url,
                "topic": topics,
                "text": instruction,
                'sentence': sentence,
                "token": tokens,
                # "cluster_labels": cluster_labels.tolist(),
                # "probabilities": probabilities.tolist()
                "level of complexity": complexity
            })

# Output to a JSON file
with open('data/corpus-github.json', 'w') as outfile:
    json.dump(output_data, outfile, indent=4)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ccugutrillague/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ccugutrillague/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ccugutrillague/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
