In [None]:
# A Jupyter notebook that crawls README.md files from Papers with Code using Python.
# 1. Script to get the text of the installation instructions from a README file available on Papers with Code. For each file:
# 2. It extracts the content (code comments and text comments) from the "Installation" section of the README. Please extract the part of the text that relates to or matches any of the following: |installation|setup|install|how|Getting started/quick start|
# 3. Create a dictionary with all the links and text per repository and output a CSV file containing: url, text, tokenization of the text

In [50]:
import os
import re

import requests

def getRepostitoryTopics(url, GitHub_Token, not_found):
    """Look up the topics of one GitHub repository through the REST API.

    Args:
        url: Repository path appended to ``https://api.github.com/repos/``
            (presumably of the form ``owner/name`` -- confirm against callers).
        GitHub_Token: Token sent as a ``Bearer`` authorization header.
        not_found: Running count of repositories whose topics could not be read.

    Returns:
        A ``(not_found, topics)`` tuple. When the response carries a
        ``"topics"`` entry the counter is returned unchanged together with that
        list; otherwise the counter is incremented by one and the list is empty.

    Raises:
        requests.RequestException: If the HTTP request itself fails or times out.
    """
    header = {'Authorization': 'Bearer ' + GitHub_Token}
    reposUrl = f"https://api.github.com/repos/{url}"
    # A timeout keeps one stalled connection from hanging the whole notebook.
    reposr = requests.get(reposUrl, headers=header, timeout=30)
    try:
        reposj = reposr.json()
    except ValueError:
        # The body was not JSON (e.g. an HTML error page): count it as not found.
        return (not_found + 1, [])
    # Error payloads are JSON objects without "topics"; anything that is not a
    # dict cannot carry the key either. Checking explicitly replaces the former
    # bare ``except:``, which also swallowed KeyboardInterrupt and real bugs.
    if isinstance(reposj, dict) and "topics" in reposj:
        return (not_found, reposj["topics"])
    return (not_found + 1, [])

In [51]:
# Empty dictionary; it is not referenced by any of the other visible cells.
repo_dict = {}
# GitHub API token. Read it from the GITHUB_TOKEN environment variable so the
# secret never has to be saved inside the notebook; the original placeholder is
# kept as a fallback when the variable is unset or empty.
GitHub_Token = os.environ.get("GITHUB_TOKEN") or "YOUR-TOKEN"

In [52]:
# Topics to search for on GitHub; the collection loop issues one search per entry.
listTopics = ['LLM']

In [53]:
def get_repositories_by_topic(topic):
    """Search GitHub for repositories tagged with ``topic``.

    A single request is made with ``per_page=50`` and no pagination, so at most
    the first 50 results are returned. The module-level ``GitHub_Token`` is
    sent as a ``Bearer`` authorization header.

    Args:
        topic: Topic name placed in the ``topic:`` search qualifier.

    Returns:
        The ``"items"`` list from the search response.

    Raises:
        requests.HTTPError: If the API answers with an error status (bad token,
            rate limit, ...). Previously this surfaced as ``KeyError: 'items'``.
        requests.RequestException: If the request fails or times out.
    """
    header = {'Authorization': 'Bearer ' + GitHub_Token}
    reposr = requests.get(
        "https://api.github.com/search/repositories",
        headers=header,
        # Passing the query through ``params`` lets requests URL-encode topics
        # that contain spaces or other reserved characters.
        params={"q": f"topic:{topic}", "per_page": 50},
        timeout=30,
    )
    # Fail with the real HTTP status instead of a missing-key error further down.
    reposr.raise_for_status()
    return reposr.json()["items"]

In [57]:
# Build a mapping of repository HTML URL -> list of topics, covering the search
# results of every entry in listTopics.
urls_with_topics = {}
for topic in listTopics:
    repositories_json = get_repositories_by_topic(topic)
    for repository in repositories_json:
        topics = repository["topics"]
        combined_topics_string = '\t'.join(topics)
        # Skip repositories whose joined topics contain the upper-case text "LLM".
        # NOTE(review): this match is case-sensitive and the topics in the recorded
        # output are lower-case ('llm'), so the filter appears never to skip
        # anything -- confirm the intended condition.
        if "LLM" in combined_topics_string:
            continue
        urls_with_topics[repository["html_url"]] = topics

In [58]:
# Show the collected mapping of repository URL -> topic list for inspection.
print(urls_with_topics)

{'https://github.com/ollama/ollama': ['go', 'golang', 'llama', 'llama2', 'llm', 'llms', 'mistral', 'ollama'], 'https://github.com/geekan/MetaGPT': ['agent', 'gpt', 'hacktoberfest', 'llm', 'metagpt', 'multi-agent'], 'https://github.com/StanGirard/quivr': ['ai', 'api', 'chatbot', 'chatgpt', 'database', 'docker', 'frontend', 'html', 'javascript', 'llm', 'openai', 'postgresql', 'privacy', 'rag', 'react', 'rest-api', 'security', 'typescript', 'vector', 'ycombinator'], 'https://github.com/run-llama/llama_index': ['agents', 'application', 'data', 'fine-tuning', 'framework', 'llamaindex', 'llm', 'rag', 'vector-database'], 'https://github.com/milvus-io/milvus': ['anns', 'cloud-native', 'distributed', 'embedding-database', 'embedding-similarity', 'embedding-store', 'faiss', 'golang', 'hnsw', 'image-search', 'llm', 'nearest-neighbor-search', 'tensor-database', 'vector-database', 'vector-search', 'vector-similarity', 'vector-store'], 'https://github.com/JushBJJ/Mr.-Ranedeer-AI-Tutor': ['ai', 'educ

In [59]:
import requests
import re
import json
from nltk.tokenize import word_tokenize

def fetch_readme_content(repo_url, branches=("main",), timeout=30):
    """Download a repository's ``README.md`` from raw.githubusercontent.com.

    Args:
        repo_url: Repository page URL containing ``github.com``; a trailing
            slash is tolerated.
        branches: Branch names to try, in order. The default keeps the original
            behaviour of looking at ``main`` only; pass ``("main", "master")``
            to also cover repositories that use ``master``.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        The README text from the first branch that answers with HTTP 200, or
        ``None`` when no branch does (including when the request itself fails).
    """
    if isinstance(branches, str):
        # A bare string would otherwise be iterated character by character.
        branches = (branches,)
    # Swap only the host part, and drop a trailing "/" so that the path does not
    # end up containing "//main/README.md".
    raw_base = repo_url.rstrip("/").replace("github.com", "raw.githubusercontent.com", 1)
    for branch in branches:
        readme_url = f"{raw_base}/{branch}/README.md"
        try:
            response = requests.get(readme_url, timeout=timeout)
        except requests.RequestException:
            # Best effort: treat a network failure like a missing README so that
            # one unreachable repository does not abort the whole crawl.
            continue
        if response.status_code == 200:
            return response.text
    return None

def extract_installation_instructions(readme_content):
    """Return the README sections that mention installation-related keywords.

    The text is cut into sections at Markdown ATX headings (``#``, ``##``, ...
    followed by a space). A section is kept when any of the keywords occurs
    anywhere in it, case-insensitively. As before, the leading ``#`` markers
    are removed, so each section starts with its heading text, and the text
    before the first heading counts as a section of its own.

    Headings are only recognised at the start of a line (up to three leading
    spaces) and outside fenced code blocks. Splitting on every ``"# "`` used to
    break sections apart at shell comments inside code fences and at mid-line
    text such as ``"C# "``.

    Args:
        readme_content: Full README text; an empty or ``None`` value yields ``[]``.

    Returns:
        A list of matching section strings, in document order.
    """
    if not readme_content:
        return []

    keywords = ["installation", "setup", "install", "how to", "getting started", "quick start"]
    pattern = re.compile("|".join(keywords), re.IGNORECASE)
    heading = re.compile(r' {0,3}#+ ')

    sections = []
    current = []
    in_fence = False
    for line in readme_content.splitlines(keepends=True):
        if line.lstrip().startswith(("```", "~~~")):
            # Opening or closing fence: "# ..." lines in between are code, not headings.
            in_fence = not in_fence
        marker = None if in_fence else heading.match(line)
        if marker:
            sections.append("".join(current))
            # Keep the heading text but drop the "#" markers.
            current = [line[marker.end():]]
        else:
            current.append(line)
    sections.append("".join(current))

    return [section for section in sections if pattern.search(section)]

def tokenize_text(text):
    """Tokenize ``text`` by delegating to NLTK's ``word_tokenize``.

    NOTE(review): ``word_tokenize`` usually needs NLTK tokenizer data to be
    downloaded first, and no visible cell does that -- confirm the setup.
    """
    tokens = word_tokenize(text)
    return tokens

def classify_complexity(text):
    """Assign an installation-complexity level to ``text`` by keyword matching.

    Levels are checked in order and the first one with a hit wins:

    * ``0`` -- ``"pip install"`` or ``"package manager install"``
    * ``1`` -- ``"container"``, ``"docker container"`` or ``"docker compose up"``
    * ``2`` -- ``"from source"``, ``"git clone"`` or ``".git"``
    * ``-1`` -- none of the above

    Matching is by plain substring and is case-insensitive, so capitalised
    wording such as "Building From Source" is recognised too. Note that
    ``".git"`` also matches inside longer strings such as ``".github"``.

    Args:
        text: Text of one installation section.

    Returns:
        The integer complexity level described above.
    """
    # Ordered from the simplest to the most involved installation route.
    levels = (
        (0, ("pip install", "package manager install")),
        (1, ("container", "docker container", "docker compose up")),
        (2, ("from source", "git clone", ".git")),
    )
    # The markers are all lower-case, so lower-casing the text once makes the
    # comparison case-insensitive.
    haystack = text.lower()
    for level, markers in levels:
        if any(marker in haystack for marker in markers):
            return level
    return -1  # Default complexity

# Iterate through the URLs and perform the tasks
# One record is produced per matching README section of each repository.
output_data = []

for url, topics in urls_with_topics.items():
    # Raw-content location recorded with every record; it assumes that the README
    # lives on the `main` branch.
    readme_url = url.replace("github.com", "raw.githubusercontent.com") + "/main/README.md"  # Define readme_url for each repository
    readme_content = fetch_readme_content(url)
    if readme_content:
        installation_instructions = extract_installation_instructions(readme_content)
        for instruction in installation_instructions:
            tokens = tokenize_text(instruction)
            complexity = classify_complexity(instruction)
            output_data.append({
                "url": url,
                "readme_url": readme_url,
                "topic": topics,
                "text": instruction,
                "token": tokens,
                "level of complexity": complexity
            })

# Output to a JSON file
output_path = 'data/corpus-topic.json'
# Create the target folder on a fresh checkout; open() would otherwise fail
# with FileNotFoundError when `data/` does not exist yet.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# An explicit encoding keeps the file independent of the platform default.
with open(output_path, 'w', encoding='utf-8') as outfile:
    json.dump(output_data, outfile, indent=4)