In [1]:
import requests

# Scraper Function
def scrape_github_repo_to_text(repo_url, access_token=None):
    # Extract owner and repository name from URL
    repo_name = repo_url.rstrip('/').split('/')[-1]
    owner_name = repo_url.rstrip('/').split('/')[-2]
    
    # GitHub API headers
    headers = {'Authorization': f'token {access_token}'} if access_token else {}
    
    # GitHub API URLs
    repo_api_url = f"https://api.github.com/repos/{owner_name}/{repo_name}"
    contents_api_url = f"{repo_api_url}/contents"
    
    # Get repository metadata
    repo_metadata = requests.get(repo_api_url, headers=headers).json()
    if 'message' in repo_metadata and 'Not Found' in repo_metadata['message']:
        print("Repository not found. Check the URL or your access token.")
        return
    
    print("Repository Metadata:")
    print(f"Name: {repo_metadata['name']}")
    print(f"Description: {repo_metadata.get('description', 'No description')}")
    print(f"Stars: {repo_metadata['stargazers_count']}")
    print(f"Forks: {repo_metadata['forks_count']}")
    
    # Recursive function to get all files and their content
    all_files = []

    def fetch_contents(api_url, path=""):
        contents = requests.get(api_url, headers=headers).json()
        for item in contents:
            if item['type'] == 'file':
                file_content = requests.get(item['download_url'], headers=headers).text
                all_files.append({
                    "path": f"{path}/{item['name']}",
                    "content": file_content
                })
                break
            elif item['type'] == 'dir':
                fetch_contents(item['_links']['self'], path=f"{path}/{item['name']}")

    fetch_contents(contents_api_url)

    print(f"Fetched {len(all_files)} files from the repository.")
    return all_files

# Example Usage
repo_url = "https://github.com/ros2/ros2_documentation"  # Replace with the GitHub repository URL
access_token = "your_personal_access_token"  # Replace with your PAT or leave None for public repos

repo_data = scrape_github_repo_to_text(repo_url)
print(repo_data)

# # Processing Files for RAG
# from sentence_transformers import SentenceTransformer
# from qdrant_client import QdrantClient
# from qdrant_client.http.models import PointStruct

# # Initialize models and database
# model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # Example model
# qdrant = QdrantClient("http://localhost:6333")  # Update with your Qdrant setup
# qdrant_collection = "github_docs"

# # Ensure the collection exists in Qdrant
# qdrant.recreate_collection(
#     collection_name=qdrant_collection,
#     vector_size=model.get_sentence_embedding_dimension(),
#     distance="Cosine",
# )

# # Encode and insert into Qdrant
# for file in repo_data:
#     vector = model.encode(file['content'], convert_to_tensor=False).tolist()
#     point = PointStruct(
#         id=file['path'],  # Use file path as a unique ID
#         vector=vector,
#         payload={"path": file['path'], "content": file['content']},
#     )
#     qdrant.upsert(collection_name=qdrant_collection, points=[point])

# print("Data encoded and stored in Qdrant vector database.")


Repository Metadata:
Name: ros2_documentation
Description: ROS 2 docs repository
Stars: 555
Forks: 1067
Fetched 3 files from the repository.
[{'path': '/.devcontainer/devcontainer.json', 'content': '{\n\t"name": "ROS 2 Documentation",\n\t"build": {\n\t\t"dockerfile": "../docker/image/Dockerfile"\n\t},\n\t"workspaceMount": "source=${localWorkspaceFolder},target=/tmp/doc_repository,type=bind",\n\t"workspaceFolder": "/tmp/doc_repository",\n\t"postCreateCommand": "pip3 install --no-warn-script-location --user -r requirements.txt -c constraints.txt --break-system-packages",\n\t"features": {\n\t\t"ghcr.io/devcontainers/features/git:1": {}\n\t},\n\t"customizations": {\n\t\t"vscode": {\n\t\t\t"extensions": [\n\t\t\t\t"ritwickdey.LiveServer"\n\t\t\t]\n\t\t}\n\t}\n}'}, {'path': '/.github/workflows/test.yml', 'content': 'name: Test\n\non: pull_request\n\njobs:\n  test:\n    runs-on: ubuntu-22.04\n    steps:\n      - name: Checkout\n        uses: actions/checkout@v4\n\n      - name: Setup Python\n