In [None]:
import os
import time

import requests

# Personal access token sent in the Authorization header of every GitHub API
# request. It is read from the GITHUB_TOKEN environment variable so a real
# credential never has to be written into (and committed with) the notebook;
# the original placeholder is kept as the fallback when the variable is unset.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "SECRET-TODO")

def get_new_java_repos(created_after="2025-01-01", max_results=20, per_page=10):
    """Search GitHub for Java repositories created after a given date.

    Pages through the repository search endpoint, newest first, until a page
    comes back empty, the collection threshold is reached, or a request fails.
    Whatever was collected up to that point is returned.

    Args:
        created_after: Date string (YYYY-MM-DD) placed in the ``created:>``
            search qualifier.
        max_results: Paging stops once at least this many repositories have
            been collected. The check runs after a whole page is added, so
            the returned list can be longer than this value.
        per_page: Number of results requested per page. GitHub caps this at
            100.

    Returns:
        A list of repository records (the ``items`` entries of each search
        response), in the order they were received.
    """
    api_url = "https://api.github.com/search/repositories"
    headers = {
        "Authorization": f"token {GITHUB_TOKEN}",
        "Accept": "application/vnd.github.v3+json"
    }

    params = {
        "q": f"language:Java created:>{created_after}",
        "sort": "created",
        "order": "desc",
        "per_page": per_page,
    }

    repositories = []
    page = 1

    while True:
        params["page"] = page
        try:
            # A timeout keeps a stalled connection from hanging the notebook.
            response = requests.get(
                api_url, headers=headers, params=params, timeout=30
            )
        except requests.RequestException as exc:
            # Treat transport failures like HTTP errors: report, stop paging,
            # and keep what has been collected so far.
            print(f"Error: request failed - {exc}")
            break

        if response.status_code != 200:
            print(f"Error: {response.status_code} - {response.text}")
            break

        items = response.json().get("items", [])
        if not items:
            break

        repositories.extend(items)
        print(f"Page {page}: Found {len(items)} repos...")

        # Stop at the caller's threshold. (The search API itself serves at
        # most 1000 results per query; narrower date ranges are needed to
        # go beyond that.)
        if len(repositories) >= max_results:
            print("Reached result limit. Use date-slicing for more.")
            break

        page += 1
        time.sleep(2)  # Respect the search endpoint's rate limit

    return repositories

# Run the search with the default cutoff date. This performs live GitHub API
# requests as soon as the cell executes and keeps the result for inspection.
repos = get_new_java_repos()

In [None]:
# Dump the raw repository records collected by get_new_java_repos().
print(repos)

In [None]:
import requests
import re
import os
import base64
import time
from langdetect import detect, DetectorFactory

# Fix langdetect's seed so language detection gives consistent results
# from one run to the next.
DetectorFactory.seed = 0

# --- CONFIGURATION ---
# Only repositories created after this date are searched.
# NOTE(review): chosen to avoid overlap with model training data; a creation
# date alone does not prove the code is new, so treat this as a heuristic.
CREATED_AFTER = "2025-01-01"
# Directory the accepted .java files are written to.
SAVE_DIR = "../input/GitHubScrape/"
# Inclusive bounds on a file's line count; files outside the range are rejected.
MIN_LOC = 50
MAX_LOC = 250

# Headers sent with every GitHub API request in the download functions.
# GITHUB_TOKEN is not defined in this cell; it comes from an earlier cell,
# which must have been run first.
HEADERS = {
    "Authorization": f"token {GITHUB_TOKEN}",
    "Accept": "application/vnd.github.v3+json"
}

def is_english(text):
    """Return True when the comments found in ``text`` are detected as English.

    Collects the bodies of ``/* ... */`` comments and the text following every
    ``//``, removes comment decoration, and passes the combined text to
    langdetect's ``detect``.

    Args:
        text: Source code to inspect.

    Returns:
        True if the detector reports ``'en'``. False if fewer than 20
        characters of comment text remain after cleaning, or if detection
        raises.
    """
    # Bodies of /* ... */ comments (non-greedy, may span lines)
    block_comments = re.findall(r'/\*([\s\S]*?)\*/', text)
    # Everything after // up to the end of the line
    line_comments = re.findall(r'//(.*)', text)

    all_comments = " ".join(block_comments + line_comments)

    # Replace common Java comment artifacts with spaces, then collapse runs of
    # whitespace. Without the collapse, the padding left behind by stripped
    # symbols would count towards the minimum-length check below.
    without_symbols = re.sub(r'[*@\n\r\t]', ' ', all_comments)
    clean_comments = " ".join(without_symbols.split())

    # We need a decent amount of real text to detect language accurately
    if len(clean_comments) < 20:
        return False

    try:
        return detect(clean_comments) == 'en'
    except Exception:
        # Detection failure means "not confirmed English". Catching Exception
        # rather than using a bare except lets KeyboardInterrupt/SystemExit
        # propagate so a long scrape can still be interrupted.
        return False
    

def filter_java_content(content):
    """Decide whether a Java source string passes all dataset filters.

    A file passes when its line count lies within [MIN_LOC, MAX_LOC], it
    contains at least one /* ... */ comment and at least one // comment, and
    is_english() accepts its comments.
    """
    line_count = len(content.splitlines())
    if line_count < MIN_LOC or line_count > MAX_LOC:
        return False

    # Both comment styles must be present somewhere in the file
    block_comment = re.search(r'/\*[\s\S]*?\*/', content)
    line_comment = re.search(r'//.*', content)
    if block_comment is None or line_comment is None:
        return False

    # Last gate: the comments themselves have to be English
    return is_english(content)


def download_java_files():
    """Create SAVE_DIR if needed and fetch one page of Java repo search results.

    The search result and the counter initialised here are not used any
    further within this function as it stands.
    """
    target_missing = not os.path.exists(SAVE_DIR)
    if target_missing:
        os.makedirs(SAVE_DIR)

    # 1. Search for Java repos created after CREATED_AFTER, sorted by stars
    query = f"language:Java+created:>{CREATED_AFTER}"
    search_url = f"https://api.github.com/search/repositories?q={query}&sort=stars"
    payload = requests.get(search_url, headers=HEADERS).json()
    repos = payload.get('items', [])
    saved_files_overall = 0


In [None]:
# License key inserted into the `license:` qualifier of the repository search
# in download_bulk_java().
LICENSE = "mit" # Options: mit, apache-2.0, gpl-3.0, bsd-3-clause

def download_bulk_java(max_pages=10, max_files_per_repo=5, max_files_total=100):
    """Download Java files that pass the filters from searched GitHub repos.

    Pages through a star-sorted repository search (language Java, created
    after CREATED_AFTER, licensed under LICENSE). For each repository it walks
    the recursive file tree of the default branch, fetches every ``.java``
    blob, and writes those accepted by filter_java_content() into SAVE_DIR.

    Args:
        max_pages: Number of search result pages to request (100 repos each).
        max_files_per_repo: Stop scanning a repository once this many of its
            files have been saved.
        max_files_total: Stop the whole run once this many files have been
            saved across all repositories and pages.

    Returns:
        None. Progress is reported with print().
    """
    os.makedirs(SAVE_DIR, exist_ok=True)

    # Counts saves across ALL pages and repos. It must live outside the page
    # loop, otherwise the total limit restarts on every page.
    saved_files_overall = 0

    for page in range(1, max_pages + 1):
        print(f"\n--- FETCHING PAGE {page} ---")
        search_url = f"https://api.github.com/search/repositories?q=language:Java+created:>{CREATED_AFTER}+license:{LICENSE}&sort=stars&per_page=100&page={page}"

        response = requests.get(search_url, headers=HEADERS)
        if response.status_code != 200:
            print(f"Reached Rate Limit or Error: {response.text}")
            break

        repos = response.json().get('items', [])
        if not repos:
            break

        for repo in repos:
            repo_name = repo['full_name']
            print(f"Checking {repo_name}...")

            saved_files_repo = 0

            # Get the file tree of the default branch (recursive)
            tree_url = f"https://api.github.com/repos/{repo_name}/git/trees/{repo['default_branch']}?recursive=1"
            tree_res = requests.get(tree_url, headers=HEADERS).json()

            # Error payloads carry no 'tree' key
            if 'tree' not in tree_res:
                continue

            for file in tree_res['tree']:
                # The per-repo cap uses the per-repo counter; the total cap is
                # checked here too so no further blobs are fetched once the
                # run is complete.
                if saved_files_repo >= max_files_per_repo:
                    break
                if saved_files_overall >= max_files_total:
                    break

                if not file['path'].endswith('.java'):
                    continue

                # Get file content
                blob = requests.get(file['url'], headers=HEADERS).json()
                if 'content' not in blob:
                    continue

                # Decode from Base64; skip blobs that are not valid Base64 or
                # not valid UTF-8 (both failures are ValueError subclasses).
                try:
                    raw_content = base64.b64decode(blob['content']).decode('utf-8')
                except (ValueError, TypeError):
                    continue

                # Apply filters
                if not filter_java_content(raw_content):
                    continue

                # NOTE(review): only the base name is used, so two files with
                # the same name in different directories of one repo overwrite
                # each other. Kept as-is to preserve the output naming scheme.
                filename = f"{repo_name.replace('/', '_')}_{file['path'].split('/')[-1]}"
                with open(os.path.join(SAVE_DIR, filename), "w", encoding="utf-8") as f:
                    f.write(raw_content)
                print(f"  [SAVED] {file['path']} ({len(raw_content.splitlines())} LOC)")
                saved_files_overall += 1
                saved_files_repo += 1

            # Total reached: end the whole run, not just this page
            if saved_files_overall >= max_files_total:
                return

            time.sleep(1)  # Be kind to the API

        # Be careful with the Search Rate Limit (30 requests/min for authenticated users)
        time.sleep(2)

In [None]:
# Start the bulk download. This performs live GitHub API requests and writes
# accepted files into SAVE_DIR.
download_bulk_java()