In [1]:
import requests
import logging
import time
import os
import pandas as pd

# Route HTTPS traffic through the proxy. An HTTPS_PROXY value already present in
# the environment wins; the hard-coded host is only the fallback.
os.environ['https_proxy'] = os.getenv('HTTPS_PROXY', 'http://wwwproxy.ms.com:8080')

# Pause (in seconds) between consecutive page requests.
SLEEP_SECONDS = 5

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# GitHub repository search endpoint
BASE_URL = "https://api.github.com/search/repositories"


# License qualifiers used when building search queries.
LICENSE_QUERY = "license:apache-2.0 license:mit license:0bsd license:cc"

def run_github_query(query: str, timeout: float = 30.0) -> pd.DataFrame:
    """
    Executes a GitHub search query and collects every page of matching repositories.

    Pages are requested one at a time (100 results per page, sorted by stars in
    descending order). Paging continues while the response advertises a "next"
    link, sleeping `SLEEP_SECONDS` between requests.

    Args:
        query (str): The search query string to be used for fetching repositories from GitHub.
        timeout (float): Seconds to wait for each HTTP request before giving up.
            Without a timeout a stalled connection would block the notebook indefinitely.

    Returns:
        pd.DataFrame: One row per repository item returned by the API. If a request
        returns a non-200 status, paging stops and only the items collected so far
        are returned (an empty DataFrame when the first request fails).

    Raises:
        Exceptions raised by `requests` (connection errors, timeouts) or by JSON
        decoding are not caught here and propagate to the caller.

    Note:
        - The function uses the `requests` library to make HTTP requests to the GitHub API.
        - The function uses the `logging` library to log information and errors.
        - The function assumes the existence of constants `BASE_URL` and `SLEEP_SECONDS` for the GitHub API base URL and sleep duration between requests, respectively.
    """

    results = []

    params = {
        "q": query,
        "sort": "stars",
        "order": "desc",
        "per_page": 100,  # Number of results per page
        "page": 1  # Start from the first page
    }

    while True:
        logging.info(f"Fetching page {params['page']}")
        # Bound every request so a dead proxy or connection cannot hang the cell forever.
        response = requests.get(BASE_URL, params=params, timeout=timeout)
        if response.status_code != 200:
            logging.error(f"Failed to fetch data: {response.status_code}")
            break

        # Parse the page once and reuse the item list for both storing and logging.
        items = response.json().get("items", [])
        results.extend(items)
        logging.info(f"Fetched {len(items)} repositories")

        # Check if there is a next page
        if "next" in response.links:
            params["page"] += 1
            # Space out requests before asking for the following page.
            time.sleep(SLEEP_SECONDS)
        else:
            logging.info("No more pages to fetch")
            break

    # Process the collected results
    logging.info(f"Total repositories found: {len(results)}")

    return pd.DataFrame(results)



In [2]:
import pandas as pd
import logging
import os
from typing import Optional

# Proxy for outbound HTTPS: take HTTPS_PROXY when it is set, otherwise the default host.
os.environ['https_proxy'] = os.environ.get('HTTPS_PROXY', 'http://wwwproxy.ms.com:8080')

# Search qualifiers to run, one query per entry.
QUERY_TYPES = ["language", "topic"]
# License filter appended to each query.
LICENSE_QUERY = "license:apache-2.0 license:mit license:0bsd license:cc"
# Folder that receives the parquet output files.
PARQUET_DIR = "parquet"

# Create the output folder up front; a pre-existing folder is fine.
os.makedirs(PARQUET_DIR, exist_ok=True)

def run_and_save_query(query_type: str, license_query: str) -> Optional[pd.DataFrame]:
    """
    Fetch repositories for one query type and persist them as a parquet file.

    The search string is built as "<query_type>:q <license_query>" and handed to
    `run_github_query`. The result is written to
    "<PARQUET_DIR>/q-repo-list_<query_type>.parquet".

    Args:
        query_type (str): The search qualifier to query on (e.g., "language", "topic").
        license_query (str): The license filter appended to the search string.

    Returns:
        Optional[pd.DataFrame]: The fetched repositories, or None when the query
        itself raised. A failure while writing the parquet file is only logged;
        the DataFrame is still returned in that case.
    """
    search_string = f"{query_type}:q {license_query}"
    logging.info(f"Running query: {search_string}")

    try:
        repos = run_github_query(search_string)
    except Exception as query_error:
        # Best-effort: report the failure and let the caller skip this query type.
        logging.error(f"Error running query {search_string}: {query_error}")
        return None

    file_name = f"q-repo-list_{query_type}.parquet"
    target = os.path.join(PARQUET_DIR, file_name)
    try:
        repos.to_parquet(target, index=False)
        logging.info(f"Saved data to {target}")
    except Exception as save_error:
        # A failed save must not discard the data that was already fetched.
        logging.error(f"Error saving data to {target}: {save_error}")

    return repos

dfs = []

# Run the query for each query type.
for query_type in QUERY_TYPES:
    df = run_and_save_query(query_type, LICENSE_QUERY)
    # Skip failed queries (None) and empty results. An empty frame has no
    # 'full_name' column, so drop_duplicates below would raise a KeyError
    # if only empty frames were collected.
    if df is not None and not df.empty:
        dfs.append(df)

# Combine the DataFrames
if dfs:
    # Keep one row per repository, most-starred first.
    combined_df = (
        pd.concat(dfs)
        .drop_duplicates(subset='full_name')
        .sort_values('stargazers_count', ascending=False)
        .reset_index(drop=True)
    )

    logging.info(f"Total repositories combined: {len(combined_df)}")

    combined_output_path = os.path.join(PARQUET_DIR, 'q-repo-list_combined.parquet')
    try:
        combined_df.to_parquet(combined_output_path, index=False)
        logging.info(f"Saved combined data to {combined_output_path}")
    except Exception as e:
        # Saving is best-effort; combined_df stays available in the kernel.
        logging.error(f"Error saving combined data to {combined_output_path}: {e}")
else:
    logging.info("No data to combine")


2024-11-05 16:38:46,211 - INFO - Running query: language:q license:apache-2.0 license:mit license:0bsd license:cc
2024-11-05 16:38:46,212 - INFO - Fetching page 1
2024-11-05 16:38:48,335 - INFO - Fetched 100 repositories
2024-11-05 16:38:53,340 - INFO - Fetching page 2
2024-11-05 16:38:54,949 - INFO - Fetched 100 repositories
2024-11-05 16:38:59,957 - INFO - Fetching page 3
2024-11-05 16:39:00,678 - INFO - Fetched 30 repositories
2024-11-05 16:39:00,679 - INFO - No more pages to fetch
2024-11-05 16:39:00,680 - INFO - Total repositories found: 230
2024-11-05 16:39:00,834 - INFO - Saved data to parquet/q-repo-list_language.parquet
2024-11-05 16:39:00,835 - INFO - Running query: topic:q license:apache-2.0 license:mit license:0bsd license:cc
2024-11-05 16:39:00,835 - INFO - Fetching page 1
2024-11-05 16:39:01,842 - INFO - Fetched 55 repositories
2024-11-05 16:39:01,842 - INFO - No more pages to fetch
2024-11-05 16:39:01,843 - INFO - Total repositories found: 55
2024-11-05 16:39:01,937 - IN