## 1. Environment Setup

In [45]:
# Install GreenMining from PyPI
!pip install greenmining python-dotenv tqdm --quiet
print("[OK] Installation complete")

[OK] Installation complete


In [46]:
# Verify installation
!pip show greenmining | grep -E "^(Name|Version)"

Name: greenmining
Version: 1.0.3


In [47]:
# Import required libraries
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
from pathlib import Path
from collections import Counter
from tqdm.notebook import tqdm

# Set up plotting style for academic papers (black & white)
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['font.family'] = 'serif'
# Use fonts commonly available on Linux (fallback chain)
plt.rcParams['font.serif'] = ['DejaVu Serif', 'Liberation Serif', 'FreeSerif', 'serif']
plt.rcParams['font.size'] = 11
plt.rcParams['axes.labelsize'] = 12
plt.rcParams['axes.titlesize'] = 14
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['figure.dpi'] = 150

# Create output directories
DATA_DIR = Path('data')
FIGURES_DIR = Path('figures')
DATA_DIR.mkdir(exist_ok=True)
FIGURES_DIR.mkdir(exist_ok=True)

print("Environment setup complete.")
print(f"Data directory: {DATA_DIR.absolute()}")
print(f"Figures directory: {FIGURES_DIR.absolute()}")

Environment setup complete.
Data directory: /home/neo/Documents/greenmining/experiment/data
Figures directory: /home/neo/Documents/greenmining/experiment/figures


## 2. Experiment Configuration

Define the experimental parameters for reproducibility.

In [48]:
# Experiment Configuration
CONFIG = {
    # Repository Selection
    "max_repos": 100,           # Reduced for practical runtime (~1-2 hours total)
    "min_stars": 100,          # Higher threshold = more active/maintained repos
    "languages": [ 
        "Python",
        "Java", 
        "Go",
        "JavaScript",
        "TypeScript", 
        "C#",
        "Rust",
        "Kotlin",
        "Ruby",
        "C++",
    ],
    # Note: Keywords are now defined as OR-groups in the fetch cell
    # to work properly with GitHub's search API
    "search_keyword_groups": [
        "microservices OR kubernetes OR docker",
        "cloud-native OR serverless OR containerization",
        "energy-efficient OR green-software OR carbon-aware",
        "sustainable OR eco-friendly",
        "performance-optimization OR resource-optimization",
        "memory-efficient OR cpu-efficient",
        "event-driven OR distributed-systems",
        "scalable OR high-performance",
    ],
    
    # Commit Extraction - PRACTICAL VALUES
    # 100 commits × 50 repos = 5,000 commits (manageable in ~1-2 hours)
    "commits_per_repo": 500,
    "date_from": "2023-01-01",
    "date_to": "2025-01-01",
    
    # Analysis Features
    "enable_nlp": True,
    "enable_ml_features": True,
    "enable_enhanced_stats": True,
    "enable_temporal": True,
    "temporal_granularity": "quarter",
    
    # Pattern Database
    "pattern_source": "GSF Catalog",
    "pattern_count": 122,
    "category_count": 15
}

# Calculate estimated runtime
estimated_commits = CONFIG["max_repos"] * CONFIG["commits_per_repo"]
estimated_time_mins = (CONFIG["max_repos"] * 1.5) + (estimated_commits * 0.001)  # ~1.5 min/repo clone + analysis

# Display configuration as table
print("=" * 60)
print("EXPERIMENT CONFIGURATION")
print("=" * 60)
for key, value in CONFIG.items():
    if isinstance(value, list):
        print(f"{key:25} : {len(value)} items")
    else:
        print(f"{key:25} : {value}")
print("=" * 60)
print(f"\nTotal Languages: {len(CONFIG['languages'])}")
print(f"Total Search Keyword Groups: {len(CONFIG['search_keyword_groups'])}")
print(f"\n-- ESTIMATED SCALE:")
print(f"   Max commits: {estimated_commits:,}")
print(f"   Est. runtime: ~{estimated_time_mins:.0f}-{estimated_time_mins*1.5:.0f} minutes")
print("\nNote: Tip: For faster testing, reduce max_repos to 10-20 first")

EXPERIMENT CONFIGURATION
max_repos                 : 100
min_stars                 : 100
languages                 : 10 items
search_keyword_groups     : 8 items
commits_per_repo          : 500
date_from                 : 2023-01-01
date_to                   : 2025-01-01
enable_nlp                : True
enable_ml_features        : True
enable_enhanced_stats     : True
enable_temporal           : True
temporal_granularity      : quarter
pattern_source            : GSF Catalog
pattern_count             : 122
category_count            : 15

Total Languages: 10
Total Search Keyword Groups: 8

-- ESTIMATED SCALE:
   Max commits: 50,000
   Est. runtime: ~200-300 minutes

Note: Tip: For faster testing, reduce max_repos to 10-20 first


In [49]:
# Set GitHub Token (required for API access)
# Option 1: Set environment variable
# os.environ['GITHUB_TOKEN'] = 'your_github_token_here'

# Option 2: Load from .env file
from dotenv import load_dotenv
load_dotenv('./.env')

# Verify token is set
if os.getenv('GITHUB_TOKEN'):
    print("[OK] GitHub token configured")
else:
    print("WARNING: GitHub token not set. Set GITHUB_TOKEN environment variable.")

[OK] GitHub token configured


## 3. Pipeline Execution

Execute the GreenMining pipeline in 5 stages:
1. **FETCH** - Discover repositories from GitHub
2. **EXTRACT** - Extract commit messages using PyDriller
3. **ANALYZE** - Detect green patterns in commits
4. **AGGREGATE** - Compute statistics and metrics
5. **REPORT** - Generate analysis report

### 3.1 Stage 1: Fetch Repositories

In [50]:
%%time
# Stage 1: Fetch repositories using MULTI-KEYWORD search strategy
# 
# GitHub's search API doesn't handle comma-separated keywords well.
# Solution: Run multiple searches (one per keyword group) and merge unique results.
# This ensures comprehensive coverage for software engineering mining research.

from greenmining.config import Config
from greenmining.controllers.repository_controller import RepositoryController
from github import Github
import time

# Initialize
gm_config = Config()
github_client = Github(os.getenv('GITHUB_TOKEN'))

# Define keyword groups for comprehensive SE mining
# Group by related topics to avoid duplicates within groups
KEYWORD_GROUPS = [
    # Cloud & Infrastructure (most common in SE)
    "microservices OR kubernetes OR docker",
    "cloud-native OR serverless OR containerization",
    # Green Software & Sustainability  
    "energy-efficient OR green-software OR carbon-aware",
    "sustainable OR eco-friendly",
    # Performance & Optimization
    "performance-optimization OR resource-optimization",
    "memory-efficient OR cpu-efficient",
    # Architecture Patterns
    "event-driven OR distributed-systems",
    "scalable OR high-performance",
]

print("=" * 60)
print("MULTI-KEYWORD REPOSITORY SEARCH")
print("=" * 60)
print(f"Keyword groups: {len(KEYWORD_GROUPS)}")
print(f"Target repos: {CONFIG['max_repos']}")
print(f"Min stars: {CONFIG['min_stars']}")
print(f"Languages: {CONFIG['languages']}")
print("=" * 60)

# Collect unique repositories across all keyword searches
all_repos = {}  # Use dict to deduplicate by full_name
repos_per_keyword = CONFIG['max_repos'] // len(KEYWORD_GROUPS) + 5  # Extra buffer

for i, keyword_query in enumerate(KEYWORD_GROUPS, 1):
    print(f"\n[{i}/{len(KEYWORD_GROUPS)}] Searching: {keyword_query}")
    
    # Build GitHub search query
    query_parts = [
        keyword_query,
        f"stars:>={CONFIG['min_stars']}"
    ]
    query = " ".join(query_parts)
    
    try:
        results = github_client.search_repositories(query=query, sort="stars", order="desc")
        found = results.totalCount
        print(f"    Found: {found} repositories")
        
        # Collect repos from this search
        added = 0
        for repo in results:
            if added >= repos_per_keyword:
                break
            
            # Check language filter
            if repo.language and repo.language in CONFIG['languages']:
                full_name = repo.full_name
                if full_name not in all_repos:
                    all_repos[full_name] = {
                        'id': repo.id,
                        'name': repo.name,
                        'owner': repo.owner.login,
                        'full_name': full_name,
                        'description': repo.description,
                        'url': repo.html_url,
                        'clone_url': repo.clone_url,
                        'language': repo.language,
                        'stars': repo.stargazers_count,
                        'forks': repo.forks_count,
                        'created_at': str(repo.created_at),
                        'updated_at': str(repo.updated_at),
                        'search_keyword': keyword_query.split(' OR ')[0]  # Primary keyword
                    }
                    added += 1
        
        print(f"    Added: {added} new unique repos (total: {len(all_repos)})")
        
        # Rate limiting - GitHub allows 30 search requests/min
        time.sleep(2)
        
    except Exception as e:
        print(f"   Error: {e}")
        continue
    
    # Stop if we have enough
    if len(all_repos) >= CONFIG['max_repos']:
        print(f"\n[OK] Reached target of {CONFIG['max_repos']} repositories")
        break

# Convert to list and limit to max_repos
repositories = list(all_repos.values())[:CONFIG['max_repos']]

# Save to our data directory
with open(DATA_DIR / 'repositories.json', 'w') as f:
    json.dump(repositories, f, indent=2, default=str)

print("\n" + "=" * 60)
print(f"[OK] TOTAL: {len(repositories)} unique repositories fetched")
print(f"[OK] Saved to: {DATA_DIR / 'repositories.json'}")

# Show language distribution
lang_dist = {}
for r in repositories:
    lang = r.get('language', 'Unknown')
    lang_dist[lang] = lang_dist.get(lang, 0) + 1

print("\nLanguage Distribution:")
for lang, count in sorted(lang_dist.items(), key=lambda x: -x[1])[:10]:
    print(f"  {lang}: {count}")
print("=" * 60)



MULTI-KEYWORD REPOSITORY SEARCH
Keyword groups: 8
Target repos: 100
Min stars: 100
Languages: ['Python', 'Java', 'Go', 'JavaScript', 'TypeScript', 'C#', 'Rust', 'Kotlin', 'Ruby', 'C++']

[1/8] Searching: microservices OR kubernetes OR docker
    Found: 1000 repositories
    Added: 17 new unique repos (total: 17)

[2/8] Searching: cloud-native OR serverless OR containerization
    Found: 1000 repositories
    Added: 17 new unique repos (total: 34)

[3/8] Searching: energy-efficient OR green-software OR carbon-aware
    Found: 32 repositories
    Added: 17 new unique repos (total: 51)

[4/8] Searching: sustainable OR eco-friendly
    Found: 62 repositories
    Added: 17 new unique repos (total: 68)

[5/8] Searching: performance-optimization OR resource-optimization
    Found: 217 repositories
    Added: 17 new unique repos (total: 85)

[6/8] Searching: memory-efficient OR cpu-efficient
    Found: 172 repositories
    Added: 17 new unique repos (total: 102)

[OK] Reached target of 100 rep

In [51]:
# Load and display fetched repositories
with open(DATA_DIR / 'repositories.json', 'r') as f:
    repositories = json.load(f)

print(f"Fetched {len(repositories)} repositories")
print("\nSample repositories:")
repos_df = pd.DataFrame(repositories[:10])[['name', 'owner', 'stars', 'language', 'url']]
display(repos_df)

Fetched 100 repositories

Sample repositories:


Unnamed: 0,name,owner,stars,language,url
0,kubernetes,kubernetes,119680,Go,https://github.com/kubernetes/kubernetes
1,gin,gin-gonic,87626,Go,https://github.com/gin-gonic/gin
2,mall,macrozheng,82635,Java,https://github.com/macrozheng/mall
3,uptime-kuma,louislam,80931,JavaScript,https://github.com/louislam/uptime-kuma
4,devops-exercises,bregman-arie,80525,Python,https://github.com/bregman-arie/devops-exercises
5,nest,nestjs,74164,TypeScript,https://github.com/nestjs/nest
6,Stirling-PDF,Stirling-Tools,72811,TypeScript,https://github.com/Stirling-Tools/Stirling-PDF
7,moby,moby,71318,Go,https://github.com/moby/moby
8,traefik,traefik,60923,Go,https://github.com/traefik/traefik
9,minio,minio,59622,Go,https://github.com/minio/minio


### 3.2 Stage 2: Extract Commits

In [52]:
%%time
# Stage 2: Extract commits using Python API
from greenmining.services.commit_extractor import CommitExtractor

# Calculate days back from date range
date_from = datetime.strptime(CONFIG["date_from"], "%Y-%m-%d")
date_to = datetime.strptime(CONFIG["date_to"], "%Y-%m-%d")
days_back = (datetime.now() - date_from).days

# Initialize extractor
extractor = CommitExtractor(
    max_commits=CONFIG["commits_per_repo"],
    skip_merges=True,
    days_back=days_back,
    github_token=os.getenv('GITHUB_TOKEN'),
    timeout=120  # 2 minutes per repo
)

# Extract commits from all repositories
commits = extractor.extract_from_repositories(repositories)

# Save commits
with open(DATA_DIR / 'commits.json', 'w') as f:
    json.dump(commits, f, indent=2, default=str)

print(f"\n[OK] Extracted {len(commits)} commits from {len(repositories)} repositories")
print(f"[OK] Saved to: {DATA_DIR / 'commits.json'}")


Extracting commits from 100 repositories...
Settings: max_commits=500, skip_merges=True, days_back=1102


Processing repositories:  11%|█         | 11/100 [1:05:32<9:00:24, 364.32s/repo, commits=5500, failed=0]Request GET /repos/wagoodman/dive/commits/50d776e84592b01d65732c9f17db4d9f30a115e7 failed with 403: Forbidden
Setting next backoff to 206.710302s
Request GET /repos/wagoodman/dive/commits/d2c661eaf7e321aa992396552f54dc8c3db358a8 failed with 403: Forbidden
Setting next backoff to 200.536644s
Processing repositories:  21%|██        | 21/100 [2:07:13<7:57:21, 362.55s/repo, commits=10402, failed=0]Request GET /repos/ClickHouse/ClickHouse/commits/f81ccc56a808a0245695977a4718bf6ba32db436 failed with 403: Forbidden
Setting next backoff to 190.499281s
Request GET /repos/ClickHouse/ClickHouse/commits/54392bc2a8583600210e7a6d6033b792435f5138 failed with 403: Forbidden
Setting next backoff to 100.755894s
Processing repositories:  30%|███       | 30/100 [3:06:29<7:26:04, 382.36s/repo, commits=14902, failed=0]Request GET /repos/envoyproxy/envoy/commits/7644b22d2dfc522105e0017de465ca8ddfa6bff2 fai

Error extracting commits from chaoss/augur: Repository extraction timeout
Attempt 1/3 failed: Repository extraction timeout
Retrying in 5.0 seconds...


Processing repositories:  56%|█████▌    | 56/100 [5:04:33<4:28:43, 366.43s/repo, commits=24254, failed=0]Request GET /repos/pogopaule/awesome-sustainability-jobs/commits/88c6431cf634a81cbdc21648ebf21b583f1fd8ba failed with 403: Forbidden
Setting next backoff to 173.179069s
Processing repositories:  71%|███████   | 71/100 [6:08:40<2:14:22, 278.03s/repo, commits=29195, failed=0]Request GET /repos/fastruby/fast-ruby/commits/8d9e752f5cdfa644d94da5c3b66b5ddbe60e98cf failed with 403: Forbidden
Setting next backoff to 97.911763s
Request GET /repos/fastruby/fast-ruby/commits/d6e1ac0ef58931479e5c89478992a78c0271acd9 failed with 403: Forbidden
Setting next backoff to 17.900002s
Processing repositories:  85%|████████▌ | 85/100 [7:06:42<57:15, 229.03s/repo, commits=34005, failed=0]  Request GET /repos/vllm-project/vllm/commits/1d9e9ae8a4498782de0dd51627ab1fddac4692ef failed with 403: Forbidden
Setting next backoff to 251.509298s
Request GET /repos/vllm-project/vllm/commits/b7036c87a13bd94fabf9e464


[OK] Extracted 39664 commits from 100 repositories
[OK] Saved to: data/commits.json
CPU times: user 1min 16s, sys: 3.56 s, total: 1min 19s
Wall time: 8h 21min 10s


In [53]:
# Load and summarize extracted commits
with open(DATA_DIR / 'commits.json', 'r') as f:
    commits = json.load(f)

print(f"Extracted {len(commits)} commits")
print("\nSample commit messages:")
for i, commit in enumerate(commits[:5]):
    msg = commit.get('message', '')[:80]
    print(f"  {i+1}. {msg}...")

Extracted 39664 commits

Sample commit messages:
  1. Move dummy testing to subpackage

Change-Id: I52863cf256fc52b863c182932eb9520f36...
  2. run codegen...
  3. feat: skip validation for types that are Lists in GetTargets function...
  4. Enable nomaps rule for Kube API Linter (#134852)

* tested how many errors

* ad...
  5. leasecandidate: Improve goroutine management

Make sure all goroutines are termi...


### 3.3 Stage 3: Analyze Commits

In [54]:
%%time
# Stage 3: Analyze commits for green patterns using Python API
from greenmining.services.data_analyzer import DataAnalyzer

# Initialize analyzer with configured options
analyzer = DataAnalyzer(
    enable_nlp=CONFIG["enable_nlp"],
    enable_ml_features=CONFIG["enable_ml_features"]
)

# Analyze all commits
analysis_results = analyzer.analyze_commits(commits)

# Save results
with open(DATA_DIR / 'analysis_results.json', 'w') as f:
    json.dump(analysis_results, f, indent=2, default=str)

# Summary
green_commits = [r for r in analysis_results if r.get('is_green_aware', False)]
print(f"\n[OK] Analyzed {len(analysis_results)} commits")
print(f"[OK] Green-aware commits: {len(green_commits)} ({len(green_commits)/len(analysis_results)*100:.2f}%)")
print(f"[OK] Saved to: {DATA_DIR / 'analysis_results.json'}")

NLP analysis enabled (morphological variants + synonyms)
ML feature extraction enabled

Analyzing 39664 commits for green practices...


Analyzing commits: 100%|██████████| 39664/39664 [00:28<00:00, 1368.18commit/s]



[OK] Analyzed 39664 commits
[OK] Green-aware commits: 0 (0.00%)
[OK] Saved to: data/analysis_results.json
CPU times: user 30.3 s, sys: 126 ms, total: 30.4 s
Wall time: 30.5 s


In [55]:
# Load analysis results
with open(DATA_DIR / 'analysis_results.json', 'r') as f:
    analysis_results = json.load(f)

# Count green-aware commits
green_commits = [r for r in analysis_results if r.get('is_green_aware', False)]
print(f"Total commits analyzed: {len(analysis_results)}")
print(f"Green-aware commits: {len(green_commits)}")
print(f"Green awareness rate: {len(green_commits)/len(analysis_results)*100:.2f}%")

Total commits analyzed: 39664
Green-aware commits: 0
Green awareness rate: 0.00%


### 3.4 Stage 4: Aggregate Statistics

In [56]:
%%time
# Stage 4: Aggregate statistics using Python API
from greenmining.services.data_aggregator import DataAggregator

# Initialize aggregator with configured options
aggregator = DataAggregator(
    enable_enhanced_stats=CONFIG["enable_enhanced_stats"],
    enable_temporal=CONFIG["enable_temporal"],
    temporal_granularity=CONFIG["temporal_granularity"]
)

# Aggregate results
stats = aggregator.aggregate(analysis_results, repositories)

# Save aggregated statistics
with open(DATA_DIR / 'aggregated_statistics.json', 'w') as f:
    json.dump(stats, f, indent=2, default=str)

print(f"\n[OK] Aggregation complete")
print(f"[OK] Saved to: {DATA_DIR / 'aggregated_statistics.json'}")
print("\nAvailable data sections:")
for key in stats.keys():
    print(f"  - {key}")

Enhanced statistical analysis enabled
Temporal analysis enabled (granularity: quarter)

Aggregating analysis results...
[DONE] Enhanced statistical analysis complete
[DONE] Temporal trend analysis complete

[OK] Aggregation complete
[OK] Saved to: data/aggregated_statistics.json

Available data sections:
  - summary
  - known_patterns
  - emergent_patterns
  - per_repo_stats
  - per_language_stats
  - enhanced_statistics
  - temporal_analysis
CPU times: user 769 ms, sys: 3.82 ms, total: 773 ms
Wall time: 773 ms


In [57]:
# Load aggregated statistics
with open(DATA_DIR / 'aggregated_statistics.json', 'r') as f:
    stats = json.load(f)

print("Aggregation complete. Available data sections:")
for key in stats.keys():
    print(f"  - {key}")

Aggregation complete. Available data sections:
  - summary
  - known_patterns
  - emergent_patterns
  - per_repo_stats
  - per_language_stats
  - enhanced_statistics
  - temporal_analysis


### 3.5 Stage 5: Generate Report

In [58]:
# Stage 5: Generate report using Python API
from greenmining.services.reports import ReportGenerator

# Initialize report generator
report_gen = ReportGenerator()

# Wrap data in expected format with metadata
# The ReportGenerator expects dict with 'metadata' keys
repos_data_wrapped = {
    "metadata": {
        "languages": CONFIG["languages"],
        "search_keywords": [kw.split(" OR ")[0] for kw in CONFIG["search_keyword_groups"]],
        "min_stars": CONFIG["min_stars"],
        "total_repos": len(repositories),
    },
    "repositories": repositories
}

analysis_data_wrapped = {
    "metadata": {
        "total_commits": len(analysis_results),
        "date_from": CONFIG["date_from"],
        "date_to": CONFIG["date_to"],
    },
    "results": analysis_results
}

# Generate markdown report
report_content = report_gen.generate_report(
    aggregated_data=stats,
    analysis_data=analysis_data_wrapped,
    repos_data=repos_data_wrapped
)

# Save report
report_path = DATA_DIR / 'green_software_analysis_report.md'
with open(report_path, 'w') as f:
    f.write(report_content)

print(f"[OK] Report generated: {report_path}")
print(f"\nReport preview (first 500 chars):\n")
print(report_content[:500] + "...")

[OK] Report generated: data/green_software_analysis_report.md

Report preview (first 500 chars):

# Mining Software Repositories for Green Microservices
## Comprehensive Analysis Report

**Report Generated:** 2026-01-07 21:29:05
**Analysis Type:** Keyword and Heuristic-Based Pattern Detection

---

### Executive Summary

This report presents findings from analyzing **39,664 commits** across **100 microservice-based repositories** to identify green software engineering practices.

**Key Findings:**

- **46.0%** of commits (18,253) explicitly mention energy efficiency, performance optimization...


In [59]:
# Display Summary Statistics
summary = stats.get('summary', {})

print("=" * 60)
print("SUMMARY STATISTICS")
print("=" * 60)
print(f"Total Commits Analyzed:      {summary.get('total_commits', 'N/A'):,}")
print(f"Green-Aware Commits:         {summary.get('green_aware_count', 'N/A'):,}")
print(f"Green Awareness Rate:        {summary.get('green_aware_percentage', 'N/A'):.2f}%")
print(f"Total Repositories:          {summary.get('total_repos', 'N/A')}")
print(f"Repos with Green Commits:    {summary.get('repos_with_green_commits', 'N/A')}")
print("=" * 60)

SUMMARY STATISTICS
Total Commits Analyzed:      39,664
Green-Aware Commits:         18,253
Green Awareness Rate:        46.02%
Total Repositories:          100
Repos with Green Commits:    98
