In [1]:
import os
import requests

def download_github_document_by_format(owner, repo, file_path, base_save_path, desired_format, token=None):
    """
    Downloads a document from a GitHub repository if it matches the desired format,
    creates a subfolder for the format if it doesn't exist, and saves the file.

    Args:
        owner (str): The owner of the repository.
        repo (str): The name of the repository.
        file_path (str): The path to the file in the repository.
        base_save_path (str): The base directory where files will be saved.
        desired_format (str): The desired file format (e.g., 'pdf', 'md').
        token (str, optional): A GitHub personal access token for authenticated requests (if needed).

    Returns:
        str: The full path to the saved file if successful, None otherwise.
    """
    # Get the file extension
    _, extension = os.path.splitext(file_path)
    extension = extension.lstrip('.')  # Remove the leading dot from the extension

    # Check if the file matches the desired format
    if extension.lower() != desired_format.lower():
        print(f"Skipping download: {file_path} does not match the desired format {desired_format}.")
        return None

    # Construct the subfolder path based on the desired format
    subfolder_path = os.path.join(base_save_path, desired_format)
    os.makedirs(subfolder_path, exist_ok=True)  # Create the subfolder if it doesn't exist

    # Construct the full save path
    file_name = os.path.basename(file_path)
    full_save_path = os.path.join(subfolder_path, file_name)

    # GitHub API URL to fetch the file metadata
    url = f"https://api.github.com/repos/{owner}/{repo}/contents/{file_path}"
    headers = {}

    # Add the token to headers if provided
    if token:
        headers['Authorization'] = f"token {token}"

    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        file_data = response.json()
        download_url = file_data.get('download_url')

        if download_url:
            file_response = requests.get(download_url)
            if file_response.status_code == 200:
                # Save the file to the constructed path
                with open(full_save_path, 'wb') as file:
                    file.write(file_response.content)
                print(f"File downloaded successfully and saved to {full_save_path}")
                return full_save_path
            else:
                print(f"Failed to download file from download_url: {file_response.status_code}")
        else:
            print("Download URL not found in the API response.")
    else:
        print(f"Failed to fetch file metadata: {response.status_code} - {response.json().get('message', '')}")

    return None



In [None]:
# Define the parameters for the file to download
owner = "cciliayang"  # GitHub repository owner
repo = "api-testing"  # GitHub repository name
file_path = "additional_text_file.txt"  # Path to the file in the repository
base_save_path = "./downloads"  # Base directory to save files
desired_format = "txt"  # Desired file format
token = None  # Optional: GitHub personal access token

# Call the method
download_github_document_by_format(owner, repo, file_path, base_save_path, desired_format, token)


In [5]:
import requests

def fetch_files_by_extension(owner, repo, extension, token):
    """
    Fetches all files with a specific extension from a GitHub repository using the GitHub GraphQL API.

    Args:
        owner (str): The owner of the repository.
        repo (str): The name of the repository.
        extension (str): The file extension to filter (e.g., '.txt').
        token (str): GitHub personal access token.

    Returns:
        list: A list of file paths matching the specified extension.
    """
    url = "https://api.github.com/graphql"
    headers = {
        "Authorization": f"Bearer {token}"
    }

    # GraphQL query to fetch the file tree
    query = """
    query($owner: String!, $repo: String!, $cursor: String) {
        repository(owner: $owner, name: $repo) {
            object(expression: "HEAD:") {
                ... on Tree {
                    entries(first: 100, after: $cursor) {
                        pageInfo {
                            hasNextPage
                            endCursor
                        }
                        nodes {
                            name
                            type
                            path
                        }
                    }
                }
            }
        }
    }
    """
    variables = {
        "owner": owner,
        "repo": repo,
        "cursor": None
    }

    matching_files = []

    while True:
        response = requests.post(url, json={"query": query, "variables": variables}, headers=headers)

        if response.status_code != 200:
            print(f"Failed to fetch files: {response.status_code} - {response.text}")
            break

        data = response.json()
        entries = data.get("data", {}).get("repository", {}).get("object", {}).get("entries", {})
        nodes = entries.get("nodes", [])

        # Filter files based on the extension
        for node in nodes:
            if node["type"] == "blob" and node["name"].endswith(extension):
                matching_files.append(node["path"])

        # Pagination handling
        page_info = entries.get("pageInfo", {})
        if not page_info.get("hasNextPage"):
            break
        variables["cursor"] = page_info["endCursor"]

    return matching_files


In [6]:
# Define parameters
owner = "cciliayang"  # GitHub repository owner
repo = "api-testing"  # GitHub repository name
extension = ".txt"  # Desired file extension
token = "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"  # GitHub personal access token

# Fetch files
files = fetch_files_by_extension(owner, repo, extension, token)

# Print matching files
if files:
    print("Files matching the extension:")
    for file in files:
        print(file)
else:
    print("No matching files found.")


No matching files found.


In [None]:
import requests

def fetch_txt_files(owner, repo, token, branch="main"):
    """
    Fetches all .txt files from the root directory of a GitHub repository using the GitHub GraphQL API.
    """
    url = "https://api.github.com/graphql"
    headers = {
        "Authorization": f"Bearer {token}"
    }

    # GraphQL query to fetch the root directory's tree
    query = """
    query($owner: String!, $repo: String!, $branch: String!) {
        repository(owner: $owner, name: $repo) {
            object(expression: $branch) {
                ... on Tree {
                    entries {
                        name
                        type
                        path
                    }
                }
            }
        }
    }
    """
    variables = {
        "owner": owner,
        "repo": repo,
        "branch": branch
    }

    response = requests.post(url, json={"query": query, "variables": variables}, headers=headers)

    # Debugging print
    print(f"Response status: {response.status_code}")
    print(f"Response JSON: {response.json()}")

    if response.status_code != 200:
        print(f"Error querying repository: {response.status_code} - {response.text}")
        return []

    # Parse the response
    data = response.json()
    entries = data.get("data", {}).get("repository", {}).get("object", {}).get("entries", [])
    if not entries:
        print("No files found in the root directory.")
        return []

    # Filter files by .txt extension
    txt_files = [entry["path"] for entry in entries if entry["type"] == "blob" and entry["name"].endswith(".txt")]

    return txt_files

# Usage
owner = "cciliayang"  # GitHub repository owner
repo = "api-testing"  # GitHub repository name
#extension = ".txt"  # Desired file extension
token = "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"

txt_files = fetch_txt_files(owner, repo, token)

if txt_files:
    print("Found .txt files:")
    for file in txt_files:
        print(file)
else:
    print("No .txt files found in the root directory.")



Response status: 200
Response JSON: {'data': {'repository': {'object': {}}}}
No files found in the root directory.
No .txt files found in the root directory.


In [11]:
import requests

headers = {"Authorization": "Bearer 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"}  # Replace with your token
owner = "cciliayang"  # GitHub repository owner
repo = "api-testing"  # GitHub repository name

url = f"https://api.github.com/repos/{owner}/{repo}"  # Construct API URL
response = requests.get(url, headers=headers)

# Print the response JSON
print(response.json())


{'id': 896609985, 'node_id': 'R_kgDONXEuwQ', 'name': 'api-testing', 'full_name': 'cciliayang/api-testing', 'private': False, 'owner': {'login': 'cciliayang', 'id': 107127001, 'node_id': 'U_kgDOBmKg2Q', 'avatar_url': 'https://avatars.githubusercontent.com/u/107127001?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/cciliayang', 'html_url': 'https://github.com/cciliayang', 'followers_url': 'https://api.github.com/users/cciliayang/followers', 'following_url': 'https://api.github.com/users/cciliayang/following{/other_user}', 'gists_url': 'https://api.github.com/users/cciliayang/gists{/gist_id}', 'starred_url': 'https://api.github.com/users/cciliayang/starred{/owner}{/repo}', 'subscriptions_url': 'https://api.github.com/users/cciliayang/subscriptions', 'organizations_url': 'https://api.github.com/users/cciliayang/orgs', 'repos_url': 'https://api.github.com/users/cciliayang/repos', 'events_url': 'https://api.github.com/users/cciliayang/events{/privacy}', 'received_events_url': '

In [12]:
import requests

def get_branches(owner, repo, token):
    """
    Fetches the list of branches for a given GitHub repository.

    Args:
        owner (str): The owner of the repository.
        repo (str): The name of the repository.
        token (str): GitHub personal access token.

    Returns:
        list: A list of branch names.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/branches"
    headers = {
        "Authorization": f"Bearer {token}"
    }

    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        branches = [branch['name'] for branch in response.json()]
        return branches
    else:
        print(f"Failed to fetch branches: {response.status_code} - {response.text}")
        return []

# Usage
owner = "cciliayang"  # GitHub repository owner
repo = "api-testing"  # GitHub repository name
#extension = ".txt"  # Desired file extension
token = "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"

branches = get_branches(owner, repo, token)
if branches:
    print("Branches found:")
    for branch in branches:
        print(branch)
else:
    print("No branches found or failed to fetch branches.")


Branches found:
main


In [None]:
import requests

def fetch_all_files(owner, repo, token, branch="main"):
    """
    Fetches all files from a GitHub repository using the GraphQL API.

    Args:
        owner (str): The owner of the repository.
        repo (str): The name of the repository.
        token (str): GitHub personal access token.
        branch (str): The branch to search (default is 'main').

    Returns:
        list: A list of file details (name, type, path).
    """
    url = "https://api.github.com/graphql"
    headers = {
        "Authorization": f"Bearer {token}"
    }

    query = """
    query($owner: String!, $repo: String!, $branch: String!) {
        repository(owner: $owner, name: $repo) {
            object(expression: $branch) {
                ... on Tree {
                    entries {
                        name
                        type
                        path
                    }
                }
            }
        }
    }
    """
    variables = {
        "owner": owner,
        "repo": repo,
        "branch": branch
    }

    response = requests.post(url, json={"query": query, "variables": variables}, headers=headers)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(response.json())
        return []

    # Parse the response
    data = response.json()
    entries = data.get("data", {}).get("repository", {}).get("object", {}).get("entries", [])

    if not entries:
        print("No files found in the repository.")
        return []

    return entries

# Usage
owner = "cciliayang"  # Repository owner
repo = "api-testing"  # Repository name
token = "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"  # Replace with your GitHub token

files = fetch_all_files(owner, repo, token)

if files:
    print("Found files:")
    for file in files:
        print(file)
else:
    print("No files found in the repository.")


No files found in the repository.
No files found in the repository.
