In [3]:
import os
from dotenv import load_dotenv

load_dotenv()  # Loads .env file into environment
token = os.getenv("GITHUB_TOKEN")
headers = {"Authorization": f"token {token}"}


In [50]:
# get base repositories for Java projects on GitHub

import requests
import json

repos = {'layer1': [], 'layer2': [], 'layer3': [], 'layer4': [], 'layer5': [], 'layer6': [], 'layer7': []}

base_url = "https://api.github.com/search/repositories"

# Define different star ranges to get up to 10,000 repositories
star_ranges = [
    "stars:>10000",
    "stars:5000..10000", 
    "stars:2000..4999",
    "stars:1000..1999",
    "stars:500..999",
    "stars:200..499",
    "stars:100..199",
    "stars:50..99",
    "stars:25..49",
    "stars:10..24"
]

for star_range in star_ranges:
    print(f"\nSearching repositories with {star_range}")
    
    params = {
        "q": f"language:Java {star_range} size:<256",
        "sort": "stars",
        "order": "desc",
        "per_page": 100,
        "page": 1
    }
    
    # Get up to x pages per star range
    for page in range(1, 3):
        params["page"] = page
        r = requests.get(base_url, headers=headers, params=params)
        
        if r.status_code == 200:
            page_items = r.json().get("items", [])
            repos["layer1"].extend(page_items)
            print(f"  Page {page}: Found {len(page_items)} repos - Total: {len(repos["layer1"])}")
            
            # If we get less than 100 results, we've reached the end
            if len(page_items) < 100:
                print(f"  Reached end of results for {star_range}")
                break
        else:
            print(f"  Page {page}: Status {r.status_code} - Failed")
            print(f"    Response: {r.text}")
            break
    
    print(f"Total repositories collected so far: {len(repos["layer1"])}")

# Remove duplicates based on full_name
# seen_repos = set()
# unique_repos = []
# for repo in repos["layer1"]:
#     if repo["full_name"] not in seen_repos:
#         seen_repos.add(repo["full_name"])
#         unique_repos.append(repo)

# repos = unique_repos
print(f"\nFinal count after removing duplicates: {len(repos['layer1'])}")

# Save the repositories to a json file
with open("/tmp/java_repos.json", "w") as f:
    json.dump(repos, f, indent=2)

print(f"Saved {len(repos['layer1'])} unique Java repositories to /tmp/java_repos.json")


Searching repositories with stars:>10000
  Page 1: Found 2 repos - Total: 2
  Reached end of results for stars:>10000
Total repositories collected so far: 2

Searching repositories with stars:5000..10000
  Page 1: Found 2 repos - Total: 2
  Reached end of results for stars:>10000
Total repositories collected so far: 2

Searching repositories with stars:5000..10000
  Page 1: Found 8 repos - Total: 10
  Reached end of results for stars:5000..10000
Total repositories collected so far: 10

Searching repositories with stars:2000..4999
  Page 1: Found 8 repos - Total: 10
  Reached end of results for stars:5000..10000
Total repositories collected so far: 10

Searching repositories with stars:2000..4999
  Page 1: Found 34 repos - Total: 44
  Reached end of results for stars:2000..4999
Total repositories collected so far: 44

Searching repositories with stars:1000..1999
  Page 1: Found 34 repos - Total: 44
  Reached end of results for stars:2000..4999
Total repositories collected so far: 44

S

In [24]:
len(repos)  # Output the number of repositories found


1000

In [None]:
java_dependencies = {'layer1':{}, 'layer2':{}, 'layer3':{}, 'layer4':{}, 'layer5':{}, 'layer6':{}, 'layer7': {}}
repo_relationships = {'layer1': {}, 'layer2': {}, 'layer3': {}, 'layer4': {}, 'layer5': {}, 'layer6': {}, 'layer7': {}}


def already_exists(repo):
    # check if it is already in repos layers
    for layer in repos:
        if repo in repos[layer]:
            return True
    return False

In [65]:
import re
import xml.etree.ElementTree as ET
import time

def extract_maven_dependencies(pom_content):
    """Extract dependencies from pom.xml content"""
    dependencies = []
    try:
        root = ET.fromstring(pom_content)
        # Handle namespaces
        ns = {'maven': 'http://maven.apache.org/POM/4.0.0'}
        
        # Find dependencies
        deps = root.findall('.//maven:dependency', ns) or root.findall('.//dependency')
        
        for dep in deps:
            group_id = dep.find('maven:groupId', ns) or dep.find('groupId')
            artifact_id = dep.find('maven:artifactId', ns) or dep.find('artifactId')
            version = dep.find('maven:version', ns) or dep.find('version')
            
            if group_id is not None and artifact_id is not None:
                dependencies.append({
                    'groupId': group_id.text,
                    'artifactId': artifact_id.text,
                    'version': version.text if version is not None else 'unknown'
                })
    except ET.ParseError:
        pass
    return dependencies

def extract_gradle_dependencies(gradle_content):
    """Extract dependencies from build.gradle content"""
    dependencies = []
    # Pattern to match various gradle dependency formats
    patterns = [
        r"implementation\s+['\"]([^:]+):([^:]+):([^'\"]+)['\"]",
        r"compile\s+['\"]([^:]+):([^:]+):([^'\"]+)['\"]",
        r"api\s+['\"]([^:]+):([^:]+):([^'\"]+)['\"]",
        r"testImplementation\s+['\"]([^:]+):([^:]+):([^'\"]+)['\"]"
    ]
    
    for pattern in patterns:
        matches = re.findall(pattern, gradle_content)
        for match in matches:
            dependencies.append({
                'groupId': match[0],
                'artifactId': match[1],
                'version': match[2]
            })
    return dependencies

def get_repository_dependencies(repo_full_name):
    """Get dependencies from a repository's build files without searching for GitHub repos"""
    dependencies = []
    
    try:
        # Get repository contents to find build files
        contents_url = f"https://api.github.com/repos/{repo_full_name}/contents"
        response = requests.get(contents_url, headers=headers)
        
        if response.status_code != 200:
            print(f"Failed to get contents for {repo_full_name}: HTTP {response.status_code}")
            return dependencies
        
        contents = response.json()
        
        # Look for pom.xml
        pom_file = next((item for item in contents if item["name"] == "pom.xml"), None)
        if pom_file:
            pom_response = requests.get(pom_file["download_url"])
            if pom_response.status_code == 200:
                dependencies.extend(extract_maven_dependencies(pom_response.text))
            else:
                print(f"Failed to download pom.xml for {repo_full_name}: HTTP {pom_response.status_code}")
        
        # Look for build.gradle
        gradle_file = next((item for item in contents if item["name"] == "build.gradle"), None)
        if gradle_file:
            gradle_response = requests.get(gradle_file["download_url"])
            if gradle_response.status_code == 200:
                dependencies.extend(extract_gradle_dependencies(gradle_response.text))
            else:
                print(f"Failed to download build.gradle for {repo_full_name}: HTTP {gradle_response.status_code}")
                    
    except Exception as e:
        print(f"Error getting dependencies for {repo_full_name}: {e}")
    
    return dependencies

# Simple array to store all dependencies
all_dependencies = []

# Simple dependency extraction for repositories
SRC_LAYER = 'layer1'

print(f"Extracting dependencies from {SRC_LAYER} repositories...")

try:
    for i, repo in enumerate(repos[SRC_LAYER]):
        if SRC_LAYER == 'layer1':
            repo_full_name = repo["full_name"]
        else:
            if isinstance(repo, str):
                repo_full_name = repo
            else:
                repo_full_name = repo[0] if repo else None
                
        if not repo_full_name:
            continue
            
        print(f"[{i+1}/{len(repos[SRC_LAYER])}] Analyzing {repo_full_name}... Total found so far: {len(all_dependencies)}")
        
        try:
            dependencies = get_repository_dependencies(repo_full_name)
            
            # Add each dependency to the simple array
            for dep in dependencies:
                dependency_entry = {
                    'repository': repo_full_name,
                    'groupId': dep['groupId'],
                    'artifactId': dep['artifactId'],
                    'version': dep['version']
                }
                all_dependencies.append(dependency_entry)
            
            print(f"  Found {len(dependencies)} dependencies")
            
            # Rate limiting
            time.sleep(0.1)
            
        except Exception as e:
            print(f"  Error analyzing {repo_full_name}: {e}")
            continue

    print("Dependency extraction complete!")
    print(f"Total dependencies found: {len(all_dependencies)}")
    
except Exception as e:
    print(f"An error occurred: {e}")
    
finally:
    # Save the simple array of dependencies
    with open("/tmp/all_dependencies_simple.json", "w") as f:
        json.dump(all_dependencies, f, indent=2)
    
    print(f"Saved {len(all_dependencies)} dependencies to /tmp/all_dependencies_simple.json")
    
    # Also save a summary of unique dependencies
    unique_deps = set()
    for dep in all_dependencies:
        unique_deps.add(f"{dep['groupId']}:{dep['artifactId']}")
    
    unique_deps_list = sorted(list(unique_deps))
    with open("/tmp/unique_dependencies.json", "w") as f:
        json.dump(unique_deps_list, f, indent=2)
    
    print(f"Saved {len(unique_deps_list)} unique dependencies to /tmp/unique_dependencies.json")

Extracting dependencies from layer1 repositories...
[1/1351] Analyzing winterbe/java8-tutorial... Total found so far: 0
  Found 0 dependencies
[2/1351] Analyzing ashishps1/awesome-leetcode-resources... Total found so far: 0
  Found 0 dependencies
[2/1351] Analyzing ashishps1/awesome-leetcode-resources... Total found so far: 0
  Found 0 dependencies
[3/1351] Analyzing lihengming/spring-boot-api-project-seed... Total found so far: 0
  Found 0 dependencies
[3/1351] Analyzing lihengming/spring-boot-api-project-seed... Total found so far: 0


  group_id = dep.find('maven:groupId', ns) or dep.find('groupId')
  artifact_id = dep.find('maven:artifactId', ns) or dep.find('artifactId')
  version = dep.find('maven:version', ns) or dep.find('version')


  Found 0 dependencies
[4/1351] Analyzing Freelander/Android_Data... Total found so far: 0
  Found 0 dependencies
[5/1351] Analyzing JakeWharton/hugo... Total found so far: 0
  Found 0 dependencies
[5/1351] Analyzing JakeWharton/hugo... Total found so far: 0
  Found 0 dependencies
[6/1351] Analyzing Meituan-Dianping/Leaf... Total found so far: 0
  Found 0 dependencies
[6/1351] Analyzing Meituan-Dianping/Leaf... Total found so far: 0
  Found 0 dependencies
[7/1351] Analyzing ityouknow/spring-cloud-examples... Total found so far: 0
  Found 0 dependencies
[7/1351] Analyzing ityouknow/spring-cloud-examples... Total found so far: 0
  Found 0 dependencies
[8/1351] Analyzing daimajia/NumberProgressBar... Total found so far: 0
  Found 0 dependencies
[8/1351] Analyzing daimajia/NumberProgressBar... Total found so far: 0
  Found 0 dependencies
[9/1351] Analyzing LandGrey/SpringBootVulExploit... Total found so far: 0
  Found 0 dependencies
[9/1351] Analyzing LandGrey/SpringBootVulExploit... Total

In [66]:
unique_deps


{'${project.inspectGroup}:${project.inspectName}',
 'androidx.annotation:annotation',
 'androidx.core:core',
 'biz.aQute.bnd:biz.aQute.bndlib',
 'com.afollestad:material-dialogs',
 'com.alibaba:fastjson',
 'com.android.support.test.espresso:espresso-idling-resource',
 'com.android.support:appcompat-v7',
 'com.android.support:cardview-v7',
 'com.android.support:design',
 'com.android.support:gridlayout-v7',
 'com.android.support:percent',
 'com.android.support:recyclerview-v7',
 'com.android.support:support-annotations',
 'com.android.support:support-v4',
 'com.badlogicgames.gdx:gdx',
 'com.balysv:material-ripple',
 'com.codahale.metrics:metrics-core',
 'com.daimajia.androidanimations:library',
 'com.daimajia.swipelayout:library',
 'com.eventsourcing:eventsourcing-core',
 'com.extracraftx.minecraft:TemplateMakerFabric',
 'com.facebook.fresco:fresco',
 'com.facebook.rebound:rebound',
 'com.fasterxml.jackson.core:jackson-annotations',
 'com.fasterxml.jackson.core:jackson-core',
 'com.fast

In [63]:
import re
import xml.etree.ElementTree as ET
from urllib.parse import urlparse
import time

def extract_maven_dependencies(pom_content):
    """Extract dependencies from pom.xml content"""
    dependencies = []
    try:
        root = ET.fromstring(pom_content)
        # Handle namespaces
        ns = {'maven': 'http://maven.apache.org/POM/4.0.0'}
        
        # Find dependencies
        deps = root.findall('.//maven:dependency', ns) or root.findall('.//dependency')
        
        for dep in deps:
            group_id = dep.find('maven:groupId', ns) or dep.find('groupId')
            artifact_id = dep.find('maven:artifactId', ns) or dep.find('artifactId')
            version = dep.find('maven:version', ns) or dep.find('version')
            
            if group_id is not None and artifact_id is not None:
                dependencies.append({
                    'groupId': group_id.text,
                    'artifactId': artifact_id.text,
                    'version': version.text if version is not None else 'unknown'
                })
    except ET.ParseError:
        pass
    return dependencies

def extract_gradle_dependencies(gradle_content):
    """Extract dependencies from build.gradle content"""
    dependencies = []
    # Pattern to match various gradle dependency formats
    patterns = [
        r"implementation\s+['\"]([^:]+):([^:]+):([^'\"]+)['\"]",
        r"compile\s+['\"]([^:]+):([^:]+):([^'\"]+)['\"]",
        r"api\s+['\"]([^:]+):([^:]+):([^'\"]+)['\"]",
        r"testImplementation\s+['\"]([^:]+):([^:]+):([^'\"]+)['\"]"
    ]
    
    for pattern in patterns:
        matches = re.findall(pattern, gradle_content)
        for match in matches:
            dependencies.append({
                'groupId': match[0],
                'artifactId': match[1],
                'version': match[2]
            })
    return dependencies

def search_github_for_dependency(group_id, artifact_id):
    """Search GitHub for repositories that might contain this dependency"""
    search_query = f"language:Java {group_id} {artifact_id}"
    search_params = {
        "q": search_query,
        "sort": "stars",
        "order": "desc",
        "per_page": 3
    }
    try:
        response = requests.get(base_url, headers=headers, params=search_params)
        if response.status_code == 200:
            results = response.json().get("items", [])
            return [repo["full_name"] for repo in results]
    except:
        pass
    return []

def find_dependents(repo_full_name):
    """Find repositories that depend on this repository"""
    dependents = []
    # Search for repositories that mention this repo in their dependencies
    search_queries = [
        f"language:Java {repo_full_name.split('/')[1]} in:file filename:pom.xml",
        f"language:Java {repo_full_name.split('/')[1]} in:file filename:build.gradle"
    ]
    
    for query in search_queries:
        search_params = {
            "q": query,
            "sort": "stars",
            "order": "desc",
            "per_page": 10
        }
        try:
            response = requests.get("https://api.github.com/search/code", headers=headers, params=search_params)
            if response.status_code == 200:
                results = response.json().get("items", [])
                for item in results:
                    repo_name = item["repository"]["full_name"]
                    if repo_name != repo_full_name:
                        dependents.append(repo_name)
            time.sleep(0.1)  # Rate limiting
        except:
            pass
    
    return list(set(dependents))  # Remove duplicates

# Analyze dependencies and relationships for each repository


SRC_LAYER = 'layer1'
DST_LAYER = 'layer2'
try:
    for repo in repos[SRC_LAYER][580:]:
        if SRC_LAYER == 'layer1':
            repo_full_name = repo["full_name"]
        else:
            #checl if repo is a string
            if isinstance(repo, str):
                repo_full_name = repo
            else:
                repo_full_name = repo[0] # get just the first element if it's a list

        print(f"Analyzing {repo_full_name}...")
        
        try:
            # Get repository contents to find build files
            contents_url = f"https://api.github.com/repos/{repo_full_name}/contents"
            response = requests.get(contents_url, headers=headers)
            
            dependencies = []
            dependents = []
            
            if response.status_code == 200:
                contents = response.json()
                
                # Look for pom.xml
                pom_file = next((item for item in contents if item["name"] == "pom.xml"), None)
                if pom_file:
                    pom_response = requests.get(pom_file["download_url"])
                    if pom_response.status_code == 200:
                        dependencies.extend(extract_maven_dependencies(pom_response.text))
                
                # Look for build.gradle
                gradle_file = next((item for item in contents if item["name"] == "build.gradle"), None)
                if gradle_file:
                    gradle_response = requests.get(gradle_file["download_url"])
                    if gradle_response.status_code == 200:
                        dependencies.extend(extract_gradle_dependencies(gradle_response.text))
            
            # Find GitHub repositories for dependencies
            # for dep in dependencies:
            #     github_repos = search_github_for_dependency(dep['groupId'], dep['artifactId'])
            #     dep['potential_github_repos'] = github_repos
            #     time.sleep(0.1)  # Rate limiting
            
            # Find repositories that depend on this one
            dependents = find_dependents(repo_full_name)
            
            java_dependencies[DST_LAYER][repo_full_name] = dependencies
            repo_relationships[DST_LAYER][repo_full_name] = {
                'dependencies': dependencies,
                'dependents': dependents,
                'dependency_count': len(dependencies),
                'dependent_count': len(dependents)
            }

            # for dep in dependencies:
            #     if not already_exists(dep['potential_github_repos']):
            #         repos[DST_LAYER].append(dep['potential_github_repos'])
            
            for dep in dependents:
                if not already_exists(dep):
                    repos[DST_LAYER].append(dep)
            
            print(f"  Found {len(dependencies)} dependencies and {len(dependents)} dependents")
            time.sleep(.1)  # Rate limiting between repos
        except Exception as e:
            print(f"  Error analyzing {repo_full_name}: {e}")
            continue

    print("Analysis complete!")
except Exception as e:
    print(f"An error occurred: {e}")
finally:

    # Save the analysis results to a JSON file
    with open("/tmp/java_repo_analysis.json", "w") as f:
        json.dump({
            "java_dependencies": java_dependencies,
            "repo_relationships": repo_relationships
        }, f, indent=2)

    # save also repos
    with open("/tmp/java_repos_with_dependencies.json", "w") as f:
        json.dump(repos, f, indent=2)

Analyzing mimicmobile/okhttp-oauth2-client...
  Found 0 dependencies and 0 dependents
Analyzing pbakondy/cordova-plugin-speechrecognition...
  Found 0 dependencies and 0 dependents
Analyzing pbakondy/cordova-plugin-speechrecognition...
  Found 0 dependencies and 0 dependents
Analyzing LinkedBear/spring-framework-learning-code...
  Found 0 dependencies and 0 dependents
Analyzing LinkedBear/spring-framework-learning-code...
  Found 0 dependencies and 0 dependents
Analyzing alphamu/android-widget-fanmenu...
  Found 0 dependencies and 0 dependents
Analyzing alphamu/android-widget-fanmenu...
  Found 0 dependencies and 0 dependents
Analyzing gwenn/sqlite-dialect...
  Found 0 dependencies and 0 dependents
Analyzing gwenn/sqlite-dialect...


  group_id = dep.find('maven:groupId', ns) or dep.find('groupId')
  artifact_id = dep.find('maven:artifactId', ns) or dep.find('artifactId')
  version = dep.find('maven:version', ns) or dep.find('version')


  Found 0 dependencies and 0 dependents
Analyzing jMavarez/MaterialCalendar...
  Found 0 dependencies and 0 dependents
Analyzing pyloque/rpckids...
  Found 0 dependencies and 0 dependents
Analyzing pyloque/rpckids...
  Found 0 dependencies and 0 dependents
Analyzing AnotherJack/AvoidOnResult...
  Found 0 dependencies and 0 dependents
Analyzing AnotherJack/AvoidOnResult...
  Found 0 dependencies and 0 dependents
Analyzing wangchenyan/html-text...
  Found 0 dependencies and 0 dependents
Analyzing wangchenyan/html-text...
  Found 0 dependencies and 0 dependents
Analyzing Lovelcp/spring-boot-mybatis-with-redis...
  Found 0 dependencies and 0 dependents
Analyzing Lovelcp/spring-boot-mybatis-with-redis...
  Found 0 dependencies and 0 dependents
Analyzing itbaima-study/SpringBoot-Vue-Template-Session...
  Found 0 dependencies and 0 dependents
Analyzing itbaima-study/SpringBoot-Vue-Template-Session...
  Found 0 dependencies and 0 dependents
Analyzing sockeqwe/AnnotatedAdapter...
  Found 0 depe

KeyboardInterrupt: 

In [26]:
repos['layer7'] = []
java_dependencies['layer7'] = {}
repo_relationships['layer7'] = {}

In [62]:
print(len(repos['layer2']))  # Output the number of repositories found in layer6

i = 0
while repos['layer1'][i]['full_name'] != 'mimicmobile/okhttp-oauth2-client':
    i += 1
print(f"Found 'apache/commons-lang' at index {i} in layer1")

16
Found 'apache/commons-lang' at index 580 in layer1


In [38]:
# Merge all repositories from all layers into one array
final_repos = []

for layer_repos in repos.keys():
    for repo in repos[layer_repos]:
        if layer_repos == 'layer1':
                repo_full_name = repo["full_name"]
        else:
            #checl if repo is a string
            if isinstance(repo, str):
                repo_full_name = repo
            elif len(repo) > 0:
                repo_full_name = repo[0]
            else:
                continue
            
        if repo_full_name not in final_repos:
            # Add only unique repository full names
            final_repos.append(repo_full_name)

# Save to JSON file
with open("/tmp/final_java_repos.json", "w") as f:
    json.dump(final_repos, f, indent=2)

print(f"Saved {len(final_repos)} repositories to /tmp/final_java_repos.json")

Saved 6908 repositories to /tmp/final_java_repos.json


In [49]:
import os
import subprocess

# Directory to save data
output_dir = "/tmp/java_repo_details"
os.makedirs(output_dir, exist_ok=True)

def get_github_api(url, params=None):
    r = requests.get(url, headers=headers, params=params)
    if r.status_code == 200:
        return r.json()
    else:
        print(f"Failed to fetch {url}: {r.status_code}")
        return None

repo_details = {}

for repo_full_name in final_repos[::-1]:
    print(f"Processing {repo_full_name}...")
    repo_data = {}

    # 1. Download files (list of files in the repo)
    # Also download the content of each file (if it's not a directory)
    # Clone the repository into a subdirectory if not already cloned
    repo_dir = os.path.join(output_dir, repo_full_name.replace('/', '__'))
    if not os.path.exists(repo_dir):
        clone_url = f"https://github.com/{repo_full_name}.git"
        try:
            subprocess.run(
                ["git", "clone", "--depth", "500", clone_url, repo_dir],
                check=True,
                timeout=10
            )
        except Exception as e:
            print(f"  Failed to clone {repo_full_name}: {e}")

    # List the project tree
    project_files = []
    for root, dirs, files in os.walk(repo_dir):
        for file in files:
            rel_path = os.path.relpath(os.path.join(root, file), repo_dir)
            project_files.append(rel_path)

    repo_data['files'] = project_files

    # Add file contents for text-like files
    file_contents = {}
    for rel_path in project_files:
        abs_path = os.path.join(repo_dir, rel_path)
        try:
            # Read only small files (e.g., <1MB) and skip binaries
            if os.path.getsize(abs_path) < 1024 * 1024:
                with open(abs_path, "rb") as f:
                    raw = f.read()
                    try:
                        text = raw.decode("utf-8")
                    except UnicodeDecodeError:
                        try:
                            text = raw.decode("latin1")
                        except Exception:
                            continue  # skip non-text files
                    # Heuristic: skip if too many non-printable characters
                    if sum(c < " " and c not in "\n\r\t" for c in text) < 0.05 * len(text):
                        file_contents[rel_path] = text
        except Exception:
            continue  # skip unreadable files

    repo_data['file_contents'] = file_contents


    print(f"  Found {len(repo_data['files'])} files in {repo_full_name}")

    # 2. Contributors
    contributors_url = f"https://api.github.com/repos/{repo_full_name}/contributors"
    contributors = get_github_api(contributors_url)
    repo_data['contributors'] = contributors if contributors is not None else []

    print(f"  Found {len(repo_data['contributors'])} contributors in {repo_full_name}")

    # 3. Commits with statistics
    # Get commit statistics from the local git repository
    commits = []
    try:
        # Get the list of commit hashes (limit to last 100 for performance)
        result = subprocess.run(
            ["git", "-C", repo_dir, "rev-list", "--max-count=100", "HEAD"],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=True
        )
        commit_hashes = result.stdout.strip().split('\n')
        for sha in commit_hashes:
            # Get commit info and stats
            show_cmd = [
                "git", "-C", repo_dir, "show", "--quiet", "--format=%H|%an|%ae|%ad|%s", sha
            ]
            stat_cmd = [
                "git", "-C", repo_dir, "show", "--stat", "--oneline", "--format=", sha
            ]
            show_result = subprocess.run(show_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
            stat_result = subprocess.run(stat_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
            if show_result.returncode == 0:
                parts = show_result.stdout.strip().split('|')
                if len(parts) == 5:
                    commit_info = {
                        "sha": parts[0],
                        "author_name": parts[1],
                        "author_email": parts[2],
                        "date": parts[3],
                        "message": parts[4],
                        "stats": stat_result.stdout.strip()
                    }
                    commits.append(commit_info)
    except Exception as e:
        print(f"  Failed to get commits for {repo_full_name}: {e}")

    repo_data['commits'] = commits

    print(f"  Found {len(repo_data['commits'])} commits in {repo_full_name}")

    # 4. Issues
    issues_url = f"https://api.github.com/repos/{repo_full_name}/issues"
    issues = []
    params = {"state": "all", "per_page": 100, "page": 1}
    while True:
        issues_page = get_github_api(issues_url, params=params)
        if not issues_page:
            break
        issues.extend(issues_page)
        
        break
    repo_data['issues'] = issues

    print(f"  Found {len(repo_data['issues'])} issues in {repo_full_name}")

    # Save per repo
    with open(os.path.join(output_dir, f"{repo_full_name.replace('/', '__')}.json"), "w") as f:
        json.dump(repo_data, f, indent=2)

    repo_details[repo_full_name] = repo_data

    # To avoid hitting rate limits
    time.sleep(0.2)

# Save all details to a single file
with open("/tmp/all_repo_details.json", "w") as f:
    json.dump(repo_details, f, indent=2)

print(f"Saved details for {len(final_repos)} repositories in {output_dir}")

Processing kazakago/compose_multiplatform_with_gradle...


Cloning into '/tmp/java_repo_details/kazakago__compose_multiplatform_with_gradle'...


  Found 76 files in kazakago/compose_multiplatform_with_gradle
  Found 1 contributors in kazakago/compose_multiplatform_with_gradle
  Found 1 commits in kazakago/compose_multiplatform_with_gradle
  Found 0 issues in kazakago/compose_multiplatform_with_gradle
Processing frogobox/frogo-ui...


Cloning into '/tmp/java_repo_details/frogobox__frogo-ui'...


  Found 423 files in frogobox/frogo-ui
  Found 1 contributors in frogobox/frogo-ui
  Found 95 commits in frogobox/frogo-ui
  Found 3 issues in frogobox/frogo-ui
Processing tpakis/codebreak...


Cloning into '/tmp/java_repo_details/tpakis__codebreak'...


  Found 142 files in tpakis/codebreak
  Found 1 contributors in tpakis/codebreak
  Found 37 commits in tpakis/codebreak
  Found 0 issues in tpakis/codebreak
Processing hoc081098/Compose-Multiplatform-KmpViewModel-Unsplash-Sample...


Cloning into '/tmp/java_repo_details/hoc081098__Compose-Multiplatform-KmpViewModel-Unsplash-Sample'...


  Found 174 files in hoc081098/Compose-Multiplatform-KmpViewModel-Unsplash-Sample
  Found 2 contributors in hoc081098/Compose-Multiplatform-KmpViewModel-Unsplash-Sample
  Found 100 commits in hoc081098/Compose-Multiplatform-KmpViewModel-Unsplash-Sample
  Found 100 issues in hoc081098/Compose-Multiplatform-KmpViewModel-Unsplash-Sample
Processing CharLEE-X/neumorph-ui...


Cloning into '/tmp/java_repo_details/CharLEE-X__neumorph-ui'...


  Found 294 files in CharLEE-X/neumorph-ui
  Found 1 contributors in CharLEE-X/neumorph-ui
  Found 45 commits in CharLEE-X/neumorph-ui
  Found 24 issues in CharLEE-X/neumorph-ui
Processing smarttuner/kaffeeverde...


Cloning into '/tmp/java_repo_details/smarttuner__kaffeeverde'...


  Found 191 files in smarttuner/kaffeeverde
  Found 1 contributors in smarttuner/kaffeeverde
  Found 35 commits in smarttuner/kaffeeverde
  Found 2 issues in smarttuner/kaffeeverde
Processing amirisback/nutrition-framework...


Cloning into '/tmp/java_repo_details/amirisback__nutrition-framework'...


  Failed to clone amirisback/nutrition-framework: Command '['git', 'clone', '--depth', '500', 'https://github.com/amirisback/nutrition-framework.git', '/tmp/java_repo_details/amirisback__nutrition-framework']' timed out after 9.999979791995429 seconds
  Found 20 files in amirisback/nutrition-framework


fatal: early EOF


  Found 1 contributors in amirisback/nutrition-framework
  Failed to get commits for amirisback/nutrition-framework: Command '['git', '-C', '/tmp/java_repo_details/amirisback__nutrition-framework', 'rev-list', '--max-count=100', 'HEAD']' returned non-zero exit status 128.
  Found 0 commits in amirisback/nutrition-framework
  Found 0 issues in amirisback/nutrition-framework
Processing AlexBurdu/brazel...


Cloning into '/tmp/java_repo_details/AlexBurdu__brazel'...


  Found 973 files in AlexBurdu/brazel
  Found 1 contributors in AlexBurdu/brazel
  Found 1 commits in AlexBurdu/brazel
  Found 0 issues in AlexBurdu/brazel
Processing BetterAndroid/BetterAndroid...


Cloning into '/tmp/java_repo_details/BetterAndroid__BetterAndroid'...


  Found 268 files in BetterAndroid/BetterAndroid
  Found 1 contributors in BetterAndroid/BetterAndroid
  Found 100 commits in BetterAndroid/BetterAndroid
  Found 0 issues in BetterAndroid/BetterAndroid
Processing actions-marketplace-validations/frogobox_frogo-ui...


Cloning into '/tmp/java_repo_details/actions-marketplace-validations__frogobox_frogo-ui'...


  Found 386 files in actions-marketplace-validations/frogobox_frogo-ui
  Found 1 contributors in actions-marketplace-validations/frogobox_frogo-ui
  Found 57 commits in actions-marketplace-validations/frogobox_frogo-ui
  Found 0 issues in actions-marketplace-validations/frogobox_frogo-ui
Processing Iridium-Development/IridiumSkyblock...


Cloning into '/tmp/java_repo_details/Iridium-Development__IridiumSkyblock'...


  Found 177 files in Iridium-Development/IridiumSkyblock
  Found 28 contributors in Iridium-Development/IridiumSkyblock
  Found 100 commits in Iridium-Development/IridiumSkyblock
  Found 100 issues in Iridium-Development/IridiumSkyblock
Processing romychab/effects-hilt-plugin...


Cloning into '/tmp/java_repo_details/romychab__effects-hilt-plugin'...


  Found 847 files in romychab/effects-hilt-plugin
  Found 1 contributors in romychab/effects-hilt-plugin
  Found 100 commits in romychab/effects-hilt-plugin
  Found 11 issues in romychab/effects-hilt-plugin
Processing EssentialsX/Essentials...


Cloning into '/tmp/java_repo_details/EssentialsX__Essentials'...


  Found 751 files in EssentialsX/Essentials
  Found 30 contributors in EssentialsX/Essentials
  Found 100 commits in EssentialsX/Essentials
  Found 100 issues in EssentialsX/Essentials
Processing md-5/Deadbolt...


Cloning into '/tmp/java_repo_details/md-5__Deadbolt'...


  Found 72 files in md-5/Deadbolt
  Found 8 contributors in md-5/Deadbolt
  Found 100 commits in md-5/Deadbolt
  Found 100 issues in md-5/Deadbolt
Processing LeonMangler/SuperVanish...


Cloning into '/tmp/java_repo_details/LeonMangler__SuperVanish'...
fetch-pack: unexpected disconnect while reading sideband packet


KeyboardInterrupt: 

In [69]:
# Nuovo approccio: sfruttare la dependency graph API di GitHub

import requests
import json
import time
from collections import defaultdict

def get_dependency_graph(repo_full_name):
    """
    Ottiene le dipendenze usando l'API dependency graph di GitHub
    """
    # L'API dependency graph richiede un token con scope 'repo' o 'public_repo'
    url = f"https://api.github.com/repos/{repo_full_name}/dependency-graph/manifests"
    
    try:
        response = requests.get(url, headers=headers)
        print(response.status_code, response.text)
        
        if response.status_code == 200:
            manifests = response.json()
            dependencies = []
            
            for manifest in manifests:
                # Ogni manifest può avere dipendenze
                if 'dependencies' in manifest:
                    for dep_type, deps in manifest['dependencies'].items():
                        for dep in deps:
                            dependency_info = {
                                'package_url': dep.get('package_url', ''),
                                'metadata': dep.get('metadata', {}),
                                'relationship': dep.get('relationship', dep_type),
                                'scope': dep.get('scope', 'runtime'),
                                'requirements': dep.get('requirements', '')
                            }
                            dependencies.append(dependency_info)
            
            return dependencies
            
        elif response.status_code == 404:
            print(f"  Dependency graph not available for {repo_full_name}")
            return []
        elif response.status_code == 403:
            print(f"  Access forbidden for {repo_full_name} - might need different token permissions")
            return []
        else:
            print(f"  Error {response.status_code} for {repo_full_name}: {response.text}")
            return []
            
    except Exception as e:
        print(f"  Exception getting dependencies for {repo_full_name}: {e}")
        return []

def parse_package_url(package_url):
    """
    Estrae informazioni dal package URL (purl format)
    """
    if not package_url:
        return None
    
    # Format: pkg:maven/group.id/artifact.id@version
    try:
        parts = package_url.split('/')
        if len(parts) >= 3 and parts[0] == 'pkg:maven':
            group_id = parts[1]
            artifact_version = parts[2]
            
            if '@' in artifact_version:
                artifact_id, version = artifact_version.split('@', 1)
            else:
                artifact_id = artifact_version
                version = 'unknown'
                
            return {
                'groupId': group_id,
                'artifactId': artifact_id,
                'version': version,
                'ecosystem': 'maven'
            }
    except:
        pass
    
    return None

# Struttura per raccogliere tutte le dipendenze
all_dependencies_graph = []
dependency_stats = defaultdict(int)

print("Scraping dependencies usando GitHub Dependency Graph API...")
print(f"Processando {len(repos['layer1'])} repositories...")

processed_count = 0
failed_count = 0

for i, repo in enumerate(repos['layer1']):
    repo_full_name = repo["full_name"]
    
    print(f"[{i+1}/{len(repos['layer1'])}] Analyzing {repo_full_name}...")
    
    try:
        dependencies = get_dependency_graph(repo_full_name)
        
        # Processa ogni dipendenza
        for dep in dependencies:
            parsed = parse_package_url(dep.get('package_url', ''))
            if parsed:
                dependency_entry = {
                    'repository': repo_full_name,
                    'groupId': parsed['groupId'],
                    'artifactId': parsed['artifactId'],
                    'version': parsed['version'],
                    'ecosystem': parsed['ecosystem'],
                    'relationship': dep.get('relationship', 'unknown'),
                    'scope': dep.get('scope', 'runtime'),
                    'requirements': dep.get('requirements', ''),
                    'metadata': dep.get('metadata', {})
                }
                all_dependencies_graph.append(dependency_entry)
                
                # Statistiche
                dep_key = f"{parsed['groupId']}:{parsed['artifactId']}"
                dependency_stats[dep_key] += 1
        
        print(f"  Found {len(dependencies)} dependencies")
        processed_count += 1
        
        # Rate limiting - API ha limiti più stretti
        time.sleep(0.5)
        
    except Exception as e:
        print(f"  Error processing {repo_full_name}: {e}")
        failed_count += 1
        continue

print(f"\nScraping completato!")
print(f"Repositories processati: {processed_count}")
print(f"Repositories falliti: {failed_count}")
print(f"Totale dipendenze trovate: {len(all_dependencies_graph)}")
print(f"Dipendenze uniche: {len(dependency_stats)}")

# Salva i risultati
with open("/tmp/dependencies_graph_api.json", "w") as f:
    json.dump(all_dependencies_graph, f, indent=2)

# Salva le statistiche delle dipendenze più popolari
popular_deps = sorted(dependency_stats.items(), key=lambda x: x[1], reverse=True)
with open("/tmp/popular_dependencies.json", "w") as f:
    json.dump(popular_deps, f, indent=2)

print(f"\nSalvato {len(all_dependencies_graph)} dipendenze in /tmp/dependencies_graph_api.json")
print(f"Salvate statistiche dipendenze in /tmp/popular_dependencies.json")

# Mostra le top 10 dipendenze più usate
print("\nTop 10 dipendenze più utilizzate:")
for dep, count in popular_deps[:10]:
    print(f"  {dep}: {count} repositories")

Scraping dependencies usando GitHub Dependency Graph API...
Processando 1351 repositories...
[1/1351] Analyzing winterbe/java8-tutorial...
404 {"message":"Not Found","documentation_url":"https://docs.github.com/rest","status":"404"}
  Dependency graph not available for winterbe/java8-tutorial
  Found 0 dependencies
404 {"message":"Not Found","documentation_url":"https://docs.github.com/rest","status":"404"}
  Dependency graph not available for winterbe/java8-tutorial
  Found 0 dependencies
[2/1351] Analyzing ashishps1/awesome-leetcode-resources...
404 {"message":"Not Found","documentation_url":"https://docs.github.com/rest","status":"404"}
  Dependency graph not available for ashishps1/awesome-leetcode-resources
  Found 0 dependencies
[2/1351] Analyzing ashishps1/awesome-leetcode-resources...
404 {"message":"Not Found","documentation_url":"https://docs.github.com/rest","status":"404"}
  Dependency graph not available for ashishps1/awesome-leetcode-resources
  Found 0 dependencies
[3/13

KeyboardInterrupt: 

In [68]:
print(all_dependencies_graph)

[]
