# GitLab Java Repository Scraping

This notebook performs comprehensive scraping of Java repositories from GitLab, similar to the GitHub scraping functionality. It includes:

1. **Repository Search**: Finding Java projects using GitLab API
2. **Detailed Analysis**: Extracting build files, dependencies, and project structure
3. **Dependency Extraction**: Parsing Maven and Gradle dependencies
4. **Repository Cloning**: Downloading and analyzing repository contents
5. **Enhanced Data Collection**: Getting contributors, issues, and commits

## Prerequisites

Make sure you have a GitLab token set in your `.env` file:
```
GITLAB_TOKEN=your_gitlab_token_here
```

## Output Files

The notebook generates several JSON files:
- `gitlab_java_repos.json` - Raw repository data
- `gitlab_analyzed_repos.json` - Analyzed repositories with build files
- `gitlab_dependencies.json` - All dependencies found
- `gitlab_unique_dependencies.json` - Unique dependencies list
- `gitlab_detailed_analysis.json` - Detailed file analysis
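- `gitlab_enhanced_analysis.json` - Complete data with contributors, issues, and commits
- `gitlab_scraping_summary.json` - Summary statistics (counts, star distribution, top dependencies)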

In [None]:
import os
from dotenv import load_dotenv
import requests
import json
import time
from datetime import datetime

# Load environment variables
load_dotenv()

# GitLab API configuration
GITLAB_TOKEN = os.getenv("GITLAB_TOKEN")  # Add this to your .env file
GITLAB_API_URL = "https://gitlab.com/api/v4"

# Shared request headers used by every API helper in this notebook.
headers = {"Content-Type": "application/json"}

if GITLAB_TOKEN:
    headers["Authorization"] = f"Bearer {GITLAB_TOKEN}"
    print("GitLab API configured successfully")
else:
    # Sending "Bearer None" would turn every request into an authentication
    # failure, so the Authorization header is simply left out instead.
    print("Warning: GITLAB_TOKEN is not set; requests will be sent unauthenticated")

print(f"Using GitLab API URL: {GITLAB_API_URL}")

In [None]:
# Search for Java repositories on GitLab
def search_gitlab_repositories(language="Java", per_page=100, max_pages=10):
    """
    Search public GitLab projects by programming language, most-starred first.

    Args:
        language: Language name sent as the ``with_programming_language``
            filter of ``GET /projects``.
        per_page: Number of projects requested per page.
        max_pages: Maximum number of result pages to fetch.

    Returns:
        A list of project dicts as returned by the API. The list can be
        partial: a page whose request fails is skipped, and the search stops
        early on authentication errors, other HTTP errors, persistent rate
        limiting, or an empty page.
    """
    all_projects = []
    url = f"{GITLAB_API_URL}/projects"
    max_attempts = 3  # per page, so throttling can never stall the run forever

    for page in range(1, max_pages + 1):
        print(f"Fetching page {page}...")

        params = {
            'with_programming_language': language,
            'order_by': 'star_count',
            'sort': 'desc',
            'per_page': per_page,
            'page': page,
            'visibility': 'public',
            'simple': 'false'  # Get full project details
        }

        projects = None   # stays None when the page could not be fetched
        stop_search = False

        for attempt in range(1, max_attempts + 1):
            try:
                response = requests.get(url, headers=headers, params=params, timeout=30)
            except requests.RequestException as e:
                print(f"Error fetching page {page}: {e}")
                break  # give up on this page only

            status = response.status_code

            if status == 200:
                try:
                    projects = response.json()
                except ValueError as e:
                    print(f"Error fetching page {page}: {e}")
                break

            if status == 401:
                print("Authentication failed. Check your GitLab token.")
                stop_search = True
                break

            # 429 is the throttling status; 403 is kept because the original
            # code treated it as a rate-limit signal as well.
            if status in (403, 429):
                if attempt == max_attempts:
                    print(f"Still rate limited after {max_attempts} attempts. Stopping.")
                    stop_search = True
                    break
                retry_after = response.headers.get('Retry-After', '')
                wait_seconds = int(retry_after) if retry_after.isdigit() else 60
                print("Rate limit exceeded. Waiting...")
                time.sleep(wait_seconds)
                continue  # retry the SAME page rather than skipping it

            print(f"Error {status}: {response.text}")
            stop_search = True
            break

        if stop_search:
            break
        if projects is None:
            continue  # request or decoding failed; move on to the next page
        if not projects:  # Empty response means no more pages
            print(f"No more projects found on page {page}")
            break

        all_projects.extend(projects)
        print(f"Found {len(projects)} projects on page {page}. Total: {len(all_projects)}")

        # Rate limiting - stay well below GitLab's request limits
        time.sleep(0.5)

    return all_projects

# Fetch up to five pages of Java projects and keep the raw API payload on disk
gitlab_java_repos = search_gitlab_repositories(max_pages=5, per_page=100, language="Java")
repo_total = len(gitlab_java_repos)

print(f"\nTotal GitLab Java repositories found: {repo_total}")

# Persist the untouched project list so later cells can be re-run offline
with open("gitlab_java_repos.json", "w") as repos_file:
    json.dump(gitlab_java_repos, repos_file, indent=2)

print(f"Saved {repo_total} GitLab repositories to gitlab_java_repos.json")

In [None]:
# Get detailed project information
def get_gitlab_project_details(project_id):
    """
    Get detailed information about a specific GitLab project.

    Args:
        project_id: Numeric ID (or URL-encoded path) of the project.

    Returns:
        The decoded JSON project dict, or None when the request fails,
        times out, returns a non-200 status, or the body is not valid JSON.
    """
    url = f"{GITLAB_API_URL}/projects/{project_id}"

    try:
        # A timeout keeps one stalled connection from hanging the notebook.
        response = requests.get(url, headers=headers, timeout=30)

        if response.status_code == 200:
            return response.json()

        print(f"Error getting project {project_id}: {response.status_code}")
        return None

    except (requests.RequestException, ValueError) as e:
        print(f"Exception getting project {project_id}: {e}")
        return None

def get_gitlab_project_files(project_id, ref='main', path='', max_pages=50):
    """
    Get the recursive file listing of a GitLab project repository.

    Args:
        project_id: Numeric ID (or URL-encoded path) of the project.
        ref: Branch, tag or commit to list. Pass None to omit the parameter
            and let GitLab use the project's default branch.
        path: Sub-directory to list; empty string means the repository root.
        max_pages: Upper bound on the number of 100-entry pages fetched, so a
            huge repository cannot trigger an unbounded number of requests.

    Returns:
        A list of tree entry dicts. If the first page fails the list is
        empty; if a later page fails the entries collected so far are
        returned.
    """
    url = f"{GITLAB_API_URL}/projects/{project_id}/repository/tree"

    params = {
        'path': path,
        'recursive': True,
        'per_page': 100
    }
    if ref:
        params['ref'] = ref

    entries = []
    page = 1

    for _ in range(max_pages):
        params['page'] = page

        try:
            response = requests.get(url, headers=headers, params=params, timeout=30)

            if response.status_code != 200:
                print(f"Error getting files for project {project_id}: {response.status_code}")
                break

            entries.extend(response.json())

        except (requests.RequestException, ValueError) as e:
            print(f"Exception getting files for project {project_id}: {e}")
            break

        # Follow the pagination header; it is empty or absent on the last page.
        next_page = response.headers.get('X-Next-Page')
        if not next_page:
            break
        page = next_page

    return entries

def get_gitlab_file_content(project_id, file_path, ref='main'):
    """
    Get the raw text content of a specific file from GitLab.

    Args:
        project_id: Numeric ID (or URL-encoded path) of the project.
        file_path: Repository-relative path of the file.
        ref: Branch, tag or commit to read from. Pass None to omit the
            parameter and let GitLab use the project's default branch.

    Returns:
        The file content as text, or None when the request fails, times out,
        or returns a non-200 status.
    """
    import urllib.parse

    # The path is a single URL segment, so '/' must be percent-encoded too.
    encoded_path = urllib.parse.quote(file_path, safe='')

    url = f"{GITLAB_API_URL}/projects/{project_id}/repository/files/{encoded_path}/raw"

    params = {'ref': ref} if ref else {}

    try:
        # A timeout keeps one stalled download from hanging the notebook.
        response = requests.get(url, headers=headers, params=params, timeout=30)

        if response.status_code == 200:
            return response.text

        print(f"Error getting file {file_path} from project {project_id}: {response.status_code}")
        return None

    except requests.RequestException as e:
        print(f"Exception getting file {file_path} from project {project_id}: {e}")
        return None

# Analyze first few repositories to get detailed data
analyzed_repos = []
repos_to_analyze = gitlab_java_repos[:10]  # Start with first 10

for i, repo in enumerate(repos_to_analyze):
    print(f"\nAnalyzing repository {i+1}/{len(repos_to_analyze)}: {repo['name']}")

    project_id = repo['id']

    # Query the project's own default branch: many repositories still use
    # 'master' (or something else), and asking for 'main' there yields no files.
    branch = repo.get('default_branch') or 'main'

    # Get project files
    files = get_gitlab_project_files(project_id, ref=branch)

    # Look for build files (pom.xml, build.gradle) and count Java sources
    build_files = []
    java_files = []

    for file_info in files:
        if file_info['type'] != 'blob':  # Only regular files are of interest
            continue

        file_name = file_info['name']
        file_path = file_info['path']

        if file_name in ('pom.xml', 'build.gradle', 'build.gradle.kts'):
            build_files.append(file_path)
        elif file_name.endswith('.java'):
            java_files.append(file_path)

    # Get content of build files
    build_file_contents = {}
    for build_file in build_files[:3]:  # Limit to first 3 build files
        content = get_gitlab_file_content(project_id, build_file, ref=branch)
        if content:
            build_file_contents[build_file] = content

    # Store analyzed data
    repo_data = {
        'id': repo['id'],
        'name': repo['name'],
        'path_with_namespace': repo['path_with_namespace'],
        'description': repo.get('description', ''),
        'web_url': repo['web_url'],
        'star_count': repo.get('star_count', 0),
        'forks_count': repo.get('forks_count', 0),
        'created_at': repo.get('created_at', ''),
        'last_activity_at': repo.get('last_activity_at', ''),
        'default_branch': repo.get('default_branch', 'main'),
        'build_files': build_files,
        'java_files_count': len(java_files),
        'build_file_contents': build_file_contents
    }

    analyzed_repos.append(repo_data)

    # Rate limiting
    time.sleep(1)

print(f"\nAnalyzed {len(analyzed_repos)} repositories")

# Save analyzed data
with open("gitlab_analyzed_repos.json", "w") as f:
    json.dump(analyzed_repos, f, indent=2)

print(f"Saved analyzed data to gitlab_analyzed_repos.json")

In [None]:
# Extract dependencies from build files
import re
import xml.etree.ElementTree as ET

def extract_maven_dependencies_gitlab(pom_content):
    """
    Extract dependencies from pom.xml content.

    Every ``<dependency>`` element in the document is considered, whether or
    not the POM declares the Maven XML namespace. Elements are matched by
    local tag name, and presence is never tested through Element truthiness
    (an Element without children is falsy, which previously caused every
    dependency of a namespaced POM to be dropped).

    Args:
        pom_content: The pom.xml document as text.

    Returns:
        A list of dicts with 'groupId', 'artifactId' and 'version' keys.
        'version' is 'unknown' when the element is missing or empty.
        Dependencies lacking a groupId or artifactId are skipped. Empty or
        unparsable input yields an empty list.
    """
    if not pom_content:
        return []

    # NOTE: the content comes from scraped repositories; xml.etree is not
    # hardened against maliciously constructed XML.
    try:
        root = ET.fromstring(pom_content)
    except ET.ParseError as e:
        print(f"Skipping unparsable pom.xml: {e}")
        return []

    def local_name(element):
        # '{namespace}tag' -> 'tag'; plain 'tag' is returned unchanged.
        tag = element.tag
        return tag.rsplit('}', 1)[-1] if isinstance(tag, str) else ''

    def child_text(parent, name):
        # Stripped text of the first direct child called `name`, or ''.
        for child in parent:
            if local_name(child) == name:
                return (child.text or '').strip()
        return ''

    dependencies = []
    for element in root.iter():
        if local_name(element) != 'dependency':
            continue

        group_id = child_text(element, 'groupId')
        artifact_id = child_text(element, 'artifactId')

        if group_id and artifact_id:
            dependencies.append({
                'groupId': group_id,
                'artifactId': artifact_id,
                'version': child_text(element, 'version') or 'unknown'
            })

    return dependencies

def extract_gradle_dependencies_gitlab(gradle_content):
    """
    Extract dependencies from build.gradle / build.gradle.kts content.

    Recognises string-notation dependencies declared with the
    ``implementation``, ``compile``, ``api`` and ``testImplementation``
    configurations, in both the Groovy form (``implementation 'g:a:v'``) and
    the parenthesised form (``implementation("g:a:v")``).

    Args:
        gradle_content: The build script as text.

    Returns:
        A list of dicts with 'groupId', 'artifactId' and 'version' keys,
        grouped by configuration in the order listed above. 'version' is
        'unknown' for coordinates written without one (``'g:a'``).
    """
    if not gradle_content:
        return []

    configurations = ('implementation', 'compile', 'api', 'testImplementation')

    # Coordinate parts may not contain quotes, whitespace, ':' or '/', so a
    # match can never run past the closing quote or onto the next line.
    coordinate = r"['\"]([^:'\"\s/]+):([^:'\"\s/]+)(?::([^'\"\s]+))?['\"]"

    dependencies = []
    for configuration in configurations:
        # \b stops e.g. 'api' from matching the tail of another identifier.
        pattern = r"\b" + configuration + r"\s*\(?\s*" + coordinate
        for group_id, artifact_id, version in re.findall(pattern, gradle_content):
            dependencies.append({
                'groupId': group_id,
                'artifactId': artifact_id,
                'version': version or 'unknown'
            })
    return dependencies

# Extract dependencies from all analyzed repositories
all_gitlab_dependencies = []

for repo_data in analyzed_repos:
    repo_dependencies = []

    # Process build files
    for file_path, content in repo_data['build_file_contents'].items():
        if file_path.endswith('pom.xml'):
            repo_dependencies.extend(extract_maven_dependencies_gitlab(content))
        elif file_path.endswith(('.gradle', '.gradle.kts')):
            # build.gradle.kts files are collected by the analysis step, so
            # they must be routed to the Gradle extractor as well.
            repo_dependencies.extend(extract_gradle_dependencies_gitlab(content))

    # Store dependencies for this repository (later cells read this key)
    repo_data['dependencies'] = repo_dependencies

    # Add each dependency to the global list, tagged with its repository
    for dep in repo_dependencies:
        all_gitlab_dependencies.append({
            'repository': repo_data['path_with_namespace'],
            'repository_id': repo_data['id'],
            'groupId': dep['groupId'],
            'artifactId': dep['artifactId'],
            'version': dep['version']
        })

    print(f"Found {len(repo_dependencies)} dependencies in {repo_data['name']}")

print(f"\nTotal GitLab dependencies found: {len(all_gitlab_dependencies)}")

# Save all dependencies
with open("gitlab_dependencies.json", "w") as f:
    json.dump(all_gitlab_dependencies, f, indent=2)

# Save unique dependencies (groupId:artifactId, version ignored)
unique_deps = {
    f"{dep['groupId']}:{dep['artifactId']}" for dep in all_gitlab_dependencies
}

unique_deps_list = sorted(unique_deps)
with open("gitlab_unique_dependencies.json", "w") as f:
    json.dump(unique_deps_list, f, indent=2)

print(f"Saved {len(all_gitlab_dependencies)} dependencies to gitlab_dependencies.json")
print(f"Saved {len(unique_deps_list)} unique dependencies to gitlab_unique_dependencies.json")

In [None]:
# GitLab Repository Clone and Detailed Analysis
import subprocess
import tempfile
import shutil

def clone_gitlab_repo(repo_data, max_size_mb=50):
    """
    Shallow-clone a GitLab repository into a temporary directory and analyze it.

    Args:
        repo_data: Project dict providing 'http_url_to_repo' and
            'path_with_namespace'.
        max_size_mb: Clones whose files total more than this many megabytes
            (including the .git directory) are skipped.

    Returns:
        A dict with 'repository', 'size_mb' and 'files' (the result of
        analyze_repository_structure), or None when the clone fails, times
        out, git cannot be started, or the clone is too large. The temporary
        clone is removed before returning.
    """
    clone_url = repo_data['http_url_to_repo']
    repo_name = repo_data['path_with_namespace']

    # Never let git wait for credentials on a private or deleted project;
    # fail immediately instead of blocking until the timeout.
    git_env = dict(os.environ, GIT_TERMINAL_PROMPT='0')

    # Create temporary directory
    with tempfile.TemporaryDirectory() as temp_dir:
        repo_dir = os.path.join(temp_dir, repo_name.replace('/', '_'))

        try:
            # Clone with depth limit for efficiency. '--' ends option parsing
            # so a URL from the API can never be read as a git option.
            subprocess.run([
                'git', 'clone', '--depth', '10',
                '--', clone_url, repo_dir
            ], check=True, timeout=60, capture_output=True, env=git_env)

        except subprocess.CalledProcessError as e:
            detail = (e.stderr or b'').decode('utf-8', errors='replace').strip()
            print(f"Failed to clone {repo_name}: {e}")
            if detail:
                print(f"  git said: {detail}")
            return None
        except (subprocess.TimeoutExpired, OSError) as e:
            # OSError covers a missing or non-executable git binary.
            print(f"Failed to clone {repo_name}: {e}")
            return None

        # Check repository size. lstat() does not follow symlinks, so a
        # dangling link inside the clone cannot abort the scan.
        total_bytes = 0
        for dirpath, _dirnames, filenames in os.walk(repo_dir):
            for filename in filenames:
                try:
                    total_bytes += os.lstat(os.path.join(dirpath, filename)).st_size
                except OSError:
                    continue
        repo_size = total_bytes / (1024 * 1024)  # Convert to MB

        if repo_size > max_size_mb:
            print(f"Skipping {repo_name}: too large ({repo_size:.1f} MB)")
            return None

        # Analyze repository structure while the temporary clone still exists
        files_analysis = analyze_repository_structure(repo_dir)

        return {
            'repository': repo_name,
            'size_mb': repo_size,
            'files': files_analysis
        }

def analyze_repository_structure(repo_dir):
    """
    Collect Java, build and configuration files from a cloned repository.

    Walks `repo_dir` (skipping any `.git` directory) and records every file
    ending in .java, .xml, .gradle, .gradle.kts, .yml or .yaml that is
    smaller than 100,000 bytes. Symbolic links are ignored so that a link
    inside a scraped repository can neither crash the scan (dangling link)
    nor pull in files from outside the clone.

    Args:
        repo_dir: Path of the checked-out repository.

    Returns:
        A list of dicts with 'path' (relative to `repo_dir`), 'size' in
        bytes, 'lines' (line count) and 'content' (first 10,000 characters,
        decoded as UTF-8 with undecodable bytes dropped). Entries follow a
        sorted, top-down directory traversal.
    """
    wanted_suffixes = ('.java', '.xml', '.gradle', '.gradle.kts', '.yml', '.yaml')
    max_file_bytes = 100_000
    max_content_chars = 10000

    files_data = []

    for root, dirs, files in os.walk(repo_dir):
        # Prune .git in place so os.walk never descends into it, and sort so
        # the output order does not depend on the file system.
        dirs[:] = sorted(d for d in dirs if d != '.git')

        for file in sorted(files):
            # Focus on Java and build files
            if not file.endswith(wanted_suffixes):
                continue

            file_path = os.path.join(root, file)
            if os.path.islink(file_path):
                continue

            try:
                file_size = os.path.getsize(file_path)
                if file_size >= max_file_bytes:  # Skip large files
                    continue
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()
            except OSError:
                # Unreadable or vanished file: best effort, keep scanning.
                continue

            files_data.append({
                'path': os.path.relpath(file_path, repo_dir),
                'size': file_size,
                'lines': len(content.splitlines()),
                'content': content[:max_content_chars]
            })

    return files_data

# Analyze a subset of GitLab repositories
detailed_analysis = []
repos_to_clone = gitlab_java_repos[:5]  # Analyze first 5 repositories

for i, repo in enumerate(repos_to_clone):
    # The denominator reflects how many repositories are actually available,
    # which can be fewer than five.
    print(f"\nDetailed analysis {i+1}/{len(repos_to_clone)}: {repo['path_with_namespace']}")

    analysis = clone_gitlab_repo(repo)
    if analysis:
        detailed_analysis.append(analysis)
        print(f"  Analyzed {len(analysis['files'])} files ({analysis['size_mb']:.1f} MB)")

    # Rate limiting
    time.sleep(2)

# Save detailed analysis
with open("gitlab_detailed_analysis.json", "w") as f:
    json.dump(detailed_analysis, f, indent=2)

print(f"\nCompleted detailed analysis of {len(detailed_analysis)} repositories")
print(f"Saved to gitlab_detailed_analysis.json")

In [None]:
# GitLab Contributors and Issues Analysis
def get_gitlab_contributors(project_id, per_page=100):
    """
    Get contributors for a GitLab project.

    Args:
        project_id: Numeric ID (or URL-encoded path) of the project.
        per_page: Number of contributors to request. Only the first page is
            fetched, matching the issue and commit helpers.

    Returns:
        A list of contributor dicts, or an empty list when the request
        fails, times out, returns a non-200 status, or the body is not JSON.
    """
    url = f"{GITLAB_API_URL}/projects/{project_id}/repository/contributors"

    # Without an explicit page size the API falls back to its much smaller
    # default, which under-reports contributors compared with the 100-item
    # pages the sibling helpers request.
    params = {
        'per_page': per_page,
        'page': 1
    }

    try:
        response = requests.get(url, headers=headers, params=params, timeout=30)

        if response.status_code == 200:
            return response.json()

        print(f"Error getting contributors for project {project_id}: {response.status_code}")
        return []

    except (requests.RequestException, ValueError) as e:
        print(f"Exception getting contributors for project {project_id}: {e}")
        return []

def get_gitlab_issues(project_id, per_page=100):
    """
    Get the first page of issues (open and closed) for a GitLab project.

    Args:
        project_id: Numeric ID (or URL-encoded path) of the project.
        per_page: Number of issues to request; only page 1 is fetched.

    Returns:
        A list of issue dicts, or an empty list when the request fails,
        times out, returns a non-200 status, or the body is not JSON.
    """
    url = f"{GITLAB_API_URL}/projects/{project_id}/issues"

    params = {
        'state': 'all',
        'per_page': per_page,
        'page': 1
    }

    try:
        # A timeout keeps one stalled connection from hanging the notebook.
        response = requests.get(url, headers=headers, params=params, timeout=30)

        if response.status_code == 200:
            return response.json()

        print(f"Error getting issues for project {project_id}: {response.status_code}")
        return []

    except (requests.RequestException, ValueError) as e:
        print(f"Exception getting issues for project {project_id}: {e}")
        return []

def get_gitlab_commits(project_id, per_page=100):
    """
    Get the first page of commits for a GitLab project.

    Args:
        project_id: Numeric ID (or URL-encoded path) of the project.
        per_page: Number of commits to request; only page 1 is fetched.

    Returns:
        A list of commit dicts, or an empty list when the request fails,
        times out, returns a non-200 status, or the body is not JSON.
    """
    url = f"{GITLAB_API_URL}/projects/{project_id}/repository/commits"

    params = {
        'per_page': per_page,
        'page': 1
    }

    try:
        # A timeout keeps one stalled connection from hanging the notebook.
        response = requests.get(url, headers=headers, params=params, timeout=30)

        if response.status_code == 200:
            return response.json()

        print(f"Error getting commits for project {project_id}: {response.status_code}")
        return []

    except (requests.RequestException, ValueError) as e:
        print(f"Exception getting commits for project {project_id}: {e}")
        return []

# Attach contributor, issue and commit data to every analyzed repository
enhanced_repos = []
total_to_enhance = len(analyzed_repos)

for position, base_record in enumerate(analyzed_repos, start=1):
    print(f"\nEnhanced analysis {position}/{total_to_enhance}: {base_record['name']}")

    pid = base_record['id']

    # Fetch each collection in turn, reporting how much came back
    people = get_gitlab_contributors(pid)
    print(f"  Found {len(people)} contributors")

    tickets = get_gitlab_issues(pid)
    print(f"  Found {len(tickets)} issues")

    history = get_gitlab_commits(pid)
    print(f"  Found {len(history)} commits")

    # Shallow copy of the analyzed record plus the three new collections
    enhanced_repos.append({
        **base_record,
        'contributors': people,
        'issues': tickets,
        'commits': history,
    })

    # Pause between projects to stay within the API rate limits
    time.sleep(1)

# Write the combined records to disk
with open("gitlab_enhanced_analysis.json", "w") as enhanced_file:
    json.dump(enhanced_repos, enhanced_file, indent=2)

print(f"\nCompleted enhanced analysis of {len(enhanced_repos)} repositories")
print("Saved to gitlab_enhanced_analysis.json")

In [None]:
# Final Data Processing and Summary
from collections import Counter

print("=" * 50)
print("GitLab Scraping Summary")
print("=" * 50)

# Display summary statistics
print(f"Total GitLab repositories scraped: {len(gitlab_java_repos)}")
print(f"Repositories analyzed in detail: {len(analyzed_repos)}")
print(f"Total dependencies found: {len(all_gitlab_dependencies)}")
print(f"Unique dependencies: {len(unique_deps_list)}")

# Group repositories by star count
star_ranges = {
    "0-10 stars": 0,
    "11-50 stars": 0,
    "51-100 stars": 0,
    "101-500 stars": 0,
    "500+ stars": 0
}

for repo in gitlab_java_repos:
    # `or 0` also covers a star_count that is present but null.
    stars = repo.get('star_count') or 0
    if stars <= 10:
        star_ranges["0-10 stars"] += 1
    elif stars <= 50:
        star_ranges["11-50 stars"] += 1
    elif stars <= 100:
        star_ranges["51-100 stars"] += 1
    elif stars <= 500:
        star_ranges["101-500 stars"] += 1
    else:
        star_ranges["500+ stars"] += 1

print("\nRepository distribution by star count:")
for range_name, count in star_ranges.items():
    print(f"  {range_name}: {count}")

# Count, for every dependency, how many distinct repositories use it. A
# repository can declare the same dependency in several build files, so each
# (repository, dependency) pair is counted once; otherwise the "projects"
# figure below would be inflated.
dep_counter = Counter()
seen_pairs = set()
for dep in all_gitlab_dependencies:
    dep_name = f"{dep['groupId']}:{dep['artifactId']}"
    pair = (dep['repository_id'], dep_name)
    if pair not in seen_pairs:
        seen_pairs.add(pair)
        dep_counter[dep_name] += 1

# Show top dependencies
if dep_counter:
    print("\nTop 10 most used dependencies:")
    for i, (dep_name, count) in enumerate(dep_counter.most_common(10), 1):
        print(f"  {i}. {dep_name}: {count} projects")

# Save a final summary
summary_data = {
    'total_repositories': len(gitlab_java_repos),
    'analyzed_repositories': len(analyzed_repos),
    'total_dependencies': len(all_gitlab_dependencies),
    'unique_dependencies': len(unique_deps_list),
    'star_distribution': star_ranges,
    'top_dependencies': dict(dep_counter.most_common(20))
}

with open("gitlab_scraping_summary.json", "w") as f:
    json.dump(summary_data, f, indent=2)

print(f"\nScraping complete! Summary saved to gitlab_scraping_summary.json")
print("\nFiles generated:")
print("- gitlab_java_repos.json")
print("- gitlab_analyzed_repos.json")
print("- gitlab_dependencies.json")
print("- gitlab_unique_dependencies.json")
print("- gitlab_detailed_analysis.json")
print("- gitlab_enhanced_analysis.json")
print("- gitlab_scraping_summary.json")