In [None]:
import os
import time
from datetime import datetime, timedelta

import pandas as pd
import requests

# GitHub API token. Read from the GITHUB_TOKEN environment variable so the
# secret never has to be committed with this file; when the variable is not
# set, fall back to the placeholder (replace it with your actual token).
TOKEN = os.environ.get("GITHUB_TOKEN", "placeholder")

# Headers for authentication
HEADERS = {"Authorization": f"token {TOKEN}"}

# API base URL (repository search endpoint)
BASE_URL = "https://api.github.com/search/repositories"

# Fields to keep from each repository record (only the relevant ones)
FIELDS_TO_KEEP = [
    "id", "name", "full_name", "html_url", "description",
    "created_at", "updated_at", "pushed_at", "size",
    "stargazers_count", "watchers_count", "forks_count", "open_issues_count",
    "language", "topics",
]

# Function to fetch repositories for a given search query and page
def fetch_repos(search_query, page, max_retries=5):
    """Fetch one page of repository search results from the GitHub API.

    Args:
        search_query: Search string sent as the ``q`` parameter.
        page: 1-based page number to request (100 results per page).
        max_retries: Maximum number of times to wait and retry after a
            rate-limit response (HTTP 403 or 429) before giving up.

    Returns:
        The list found under the ``"items"`` key of the JSON response, or
        an empty list on any error (the error is printed, not raised).
    """
    params = {
        "q": search_query,
        "sort": "stars",
        "order": "desc",
        "per_page": 100,  # Max per request
        "page": page,
    }

    # Bounded loop instead of unbounded recursion: a 403 that is not a rate
    # limit (e.g. a forbidden request) must not retry forever.
    for attempt in range(max_retries + 1):
        try:
            # A timeout keeps a stalled connection from hanging the whole run.
            response = requests.get(
                BASE_URL, headers=HEADERS, params=params, timeout=30
            )
        except requests.RequestException as exc:
            print(f" Error: request failed - {exc}")
            return []

        status = response.status_code

        if status == 200:
            return response.json().get("items", [])

        if status in (403, 429):  # Rate limit exceeded
            if attempt == max_retries:
                break
            retry_after = response.headers.get("Retry-After")
            if retry_after is not None and retry_after.isdigit():
                # Retry-After gives the number of seconds to wait.
                wait_time = max(int(retry_after), 60)
            else:
                reset_time = int(
                    response.headers.get("X-RateLimit-Reset", time.time() + 60)
                )
                # Ensure at least 60s wait
                wait_time = max(reset_time - int(time.time()), 60)
            print(f" Rate limit exceeded! Waiting {wait_time} seconds before retrying...")
            time.sleep(wait_time)
            continue

        if status == 422:
            print("GitHub only allows fetching the first 1,000 results per query!")
            return []

        # Any other status: report it. The body is not guaranteed to be JSON,
        # so fall back to the raw text rather than raising while reporting.
        try:
            detail = response.json()
        except ValueError:
            detail = response.text
        print(f" Error: {status} - {detail}")
        return []

    print(f" Giving up on page {page} after {max_retries} rate-limit retries.")
    return []

# Function to fetch all repositories for a search query
def fetch_all_repos(search_query):
    """Collect every available result page for ``search_query``.

    Requests pages 1 through 10 in order (at most 1000 results) and stops
    early at the first page that comes back empty.

    Returns:
        A single list with the items of all fetched pages, in page order.
    """
    collected = []
    page = 1
    while page <= 10:  # Max 10 pages (1000 results)
        print(f" Fetching page {page} for query: {search_query}")
        batch = fetch_repos(search_query, page)
        if not batch:
            return collected
        collected += batch
        page += 1
    return collected

# Function to split date range into **daily** queries
def generate_daily_queries(start_date, end_date):
    """Build one search query per calendar day from start_date to end_date.

    GitHub treats ``created:A..B`` as inclusive at both ends, so a
    ``day..next_day`` range spans two days and consecutive ranges overlap,
    returning the same repositories twice. Each query therefore targets a
    single day with ``created:YYYY-MM-DD``.

    Args:
        start_date: First day to cover (``date`` or ``datetime``).
        end_date: Last day to cover, inclusive.

    Returns:
        A list of query strings in chronological order; empty when
        ``start_date`` is after ``end_date``.
    """
    queries = []
    current_date = start_date
    while current_date <= end_date:
        day = current_date.strftime('%Y-%m-%d')
        queries.append(f"cloud OR aws OR gcp OR azure created:{day}")
        current_date += timedelta(days=1)
    return queries

# Generate queries for **January 2024 (Daily)**
daily_queries = generate_daily_queries(datetime(2024, 1, 1), datetime(2024, 1, 31))

# Fetch data for all queries
all_repos = []
for query in daily_queries:
    print(f"\n Fetching data for query: {query}")
    all_repos.extend(fetch_all_repos(query))

# Convert to DataFrame and keep only the required fields.
# reindex (rather than df[FIELDS_TO_KEEP]) avoids a KeyError when nothing was
# fetched or a field is absent; missing columns are created and left empty.
df = pd.DataFrame(all_repos).reindex(columns=FIELDS_TO_KEEP)

# The same repository can be returned by more than one query or page, so keep
# a single row per repository id.
df = df.drop_duplicates(subset="id").reset_index(drop=True)

# Save to CSV
df.to_csv("github_repos_2024_Jan.csv", index=False)

print(f"\n Data saved! Total repositories fetched: {len(df)}")


🔍 Fetching data for query: cloud OR aws OR gcp OR azure OR kubernetes OR docker created:2024-01-01..2024-01-02
📄 Fetching page 1 for query: cloud OR aws OR gcp OR azure OR kubernetes OR docker created:2024-01-01..2024-01-02
📄 Fetching page 2 for query: cloud OR aws OR gcp OR azure OR kubernetes OR docker created:2024-01-01..2024-01-02
📄 Fetching page 3 for query: cloud OR aws OR gcp OR azure OR kubernetes OR docker created:2024-01-01..2024-01-02
📄 Fetching page 4 for query: cloud OR aws OR gcp OR azure OR kubernetes OR docker created:2024-01-01..2024-01-02
📄 Fetching page 5 for query: cloud OR aws OR gcp OR azure OR kubernetes OR docker created:2024-01-01..2024-01-02
📄 Fetching page 6 for query: cloud OR aws OR gcp OR azure OR kubernetes OR docker created:2024-01-01..2024-01-02
📄 Fetching page 7 for query: cloud OR aws OR gcp OR azure OR kubernetes OR docker created:2024-01-01..2024-01-02
📄 Fetching page 8 for query: cloud OR aws OR gcp OR azure OR kubernetes OR docker created:2024-01