This notebook mines the desired data from a list of repository URLs. The global settings and imported libraries are located at the top.

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
import requests
from urllib.parse import urlparse
from tqdm import tqdm
from dateutil.relativedelta import relativedelta
from datetime import datetime, timezone, timedelta
import os
import glob
from pydriller import Repository

In [None]:
# API key for the activity check on projects. It is read from the GITHUB_TOKEN
# environment variable so the secret is never saved inside the notebook; when
# the variable is unset the token stays empty, exactly as before.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
TIME_LIMIT_MONTHS = 30  # data collection limit for commits, in months
NUMBER_OF_PROJECTS = 40  # number of projects drawn for each sample set

This function checks each project's activity over the last 3 months. The GitHub API rate limit may be hit. The output file has an additional column, `last90`, holding the number of commits in the last 90 days.

In [None]:
# Length, in days, of the window over which recent commits are counted.
DAYS_TO_ANALYZE = 90

# Commits endpoint template; the placeholder takes an 'owner/repo' name.
API_BASE = "https://api.github.com/repos/{}/commits"

# Attach the token only when one is configured; otherwise send no extra headers.
HEADERS = {}
if GITHUB_TOKEN:
    HEADERS["Authorization"] = f"token {GITHUB_TOKEN}"

def extract_repo_name(url):
    """Convert a GitHub repository URL to 'owner/repo' format.

    Args:
        url: Repository URL such as ``https://github.com/owner/repo``. Extra
            path segments after the repository are ignored and a trailing
            ``.git`` on the repository segment is dropped.

    Returns:
        The ``'owner/repo'`` string, or ``None`` when ``url`` is not a string
        (for example a NaN read from a CSV) or its path has fewer than two
        non-empty segments.
    """
    # Missing CSV cells arrive as float NaN, which urlparse cannot handle.
    if not isinstance(url, str):
        return None

    parsed_url = urlparse(url.strip())

    # Skip empty segments so doubled or trailing slashes cannot yield '/repo'.
    path_parts = [part for part in parsed_url.path.split("/") if part]
    if len(path_parts) < 2:
        return None

    owner, repo = path_parts[0], path_parts[1]
    # Clone-style URLs end in '.git'; keep only the bare repository name.
    if repo.endswith(".git"):
        repo = repo[: -len(".git")]
    if not repo:
        return None

    return f"{owner}/{repo}"

def get_commit_count(repo):
    """Count the commits a repository received in the last DAYS_TO_ANALYZE days.

    Pages through the GitHub commits endpoint, 100 commits per page, until a
    page with fewer than 100 entries is returned.

    Args:
        repo: Repository in 'owner/repo' form; a falsy value yields ``None``.

    Returns:
        The number of commits as an int, or ``None`` when there are none or
        when the count could not be determined (request failure, a non-200
        status, or a payload that is not a JSON list). A warning is issued in
        the "could not be determined" cases.
    """
    import warnings

    if not repo:
        return None

    # Timezone-aware replacement for the deprecated datetime.utcnow(),
    # formatted as a whole-second UTC timestamp with a 'Z' suffix.
    window_start = datetime.now(timezone.utc) - timedelta(days=DAYS_TO_ANALYZE)
    since_date = window_start.strftime("%Y-%m-%dT%H:%M:%SZ")
    url = API_BASE.format(repo)
    params = {"since": since_date, "per_page": 100}

    commit_count = 0
    page = 1

    while True:
        try:
            # The timeout keeps one stalled connection from hanging the run.
            response = requests.get(
                url, headers=HEADERS, params={**params, "page": page}, timeout=30
            )
        except requests.RequestException as exc:
            warnings.warn(f"{repo}: request failed ({exc}); commit count unknown")
            return None

        # Error responses carry a JSON object rather than a list; counting its
        # keys as commits would record a bogus activity figure.
        if response.status_code != 200:
            warnings.warn(
                f"{repo}: HTTP {response.status_code} from GitHub; commit count unknown"
            )
            return None

        try:
            commits = response.json()
        except ValueError:
            commits = None
        if not isinstance(commits, list):
            warnings.warn(f"{repo}: unexpected response payload; commit count unknown")
            return None

        commit_count += len(commits)

        # A short page means this was the last one.
        if len(commits) < 100:
            break

        page += 1

    return commit_count if commit_count > 0 else None

# Load the candidate repositories; the activity check needs their URLs.
csv_file_path = "output.csv"
df = pd.read_csv(csv_file_path)

if "url" not in df.columns:
    raise ValueError("The CSV file must contain a 'url' column with GitHub repository links.")

# Derive 'owner/repo' names, then look up the recent commit count for each.
df = df.assign(repo_name=df["url"].apply(extract_repo_name))
df = df.assign(last90=df["repo_name"].apply(get_commit_count))

# Keep only repositories with a count, most active first, and save the result.
df = df.dropna(subset=["last90"]).sort_values(by="last90", ascending=False)
df.to_csv("17k_active_90days.csv", index=False)


This function uses the existing features to draw a sample of projects with relatively low activity.

In [None]:
def sample(
    df: pd.DataFrame,
    primary_cols: list[str],
    other_cols: list[str],
    n_samples: int = NUMBER_OF_PROJECTS,
    oversample_factor: int = 5,
) -> pd.DataFrame:
    """Pick rows near the median on the primary columns, spread out on the others.

    Rows with a missing value in any used column are dropped. The remaining
    rows are ranked by Euclidean distance, on standardized primary columns, to
    the per-column median, and the closest ``n_samples * oversample_factor``
    rows form the candidate set. Starting from the closest candidate, rows are
    added greedily: each step takes the candidate whose smallest distance, on
    standardized other columns, to the rows chosen so far is largest.

    Args:
        df: Input table containing every column named below.
        primary_cols: Numeric columns that define closeness to the median.
        other_cols: Numeric columns on which the selection is diversified.
        n_samples: Number of rows to select; must be at least 1.
        oversample_factor: Candidate-set size as a multiple of ``n_samples``;
            must be at least 1.

    Returns:
        The selected rows in selection order, indexed by their position in the
        candidate set. When fewer than ``n_samples`` candidates exist, all of
        them are returned.

    Raises:
        ValueError: If ``n_samples`` or ``oversample_factor`` is below 1, or
            if no row has values for all requested columns.
    """
    if n_samples < 1 or oversample_factor < 1:
        raise ValueError("n_samples and oversample_factor must both be at least 1.")

    pool = df.dropna(subset=primary_cols + other_cols).copy()
    if pool.empty:
        raise ValueError("No rows have values for all primary and other columns.")

    Xp = pool[primary_cols].astype(float).values
    Xp_z = StandardScaler().fit_transform(Xp)
    median_p = np.median(Xp_z, axis=0).reshape(1, -1)
    pool["_d_primary"] = pairwise_distances(Xp_z, median_p, metric="euclidean").ravel()

    K = min(len(pool), n_samples * oversample_factor)

    topK = pool.nsmallest(K, "_d_primary").reset_index(drop=True)
    Xo = topK[other_cols].astype(float).values
    Xo_z = StandardScaler().fit_transform(Xo)

    # With fewer candidates than requested, return them all rather than
    # failing on an argmax over an empty list.
    n_pick = min(n_samples, len(topK))

    # start by the one *closest* to primary median
    first = int(topK["_d_primary"].idxmin())
    chosen_idxs = [first]

    # min_dist[i] is the distance from candidate i to its nearest chosen row;
    # chosen rows are pinned to -inf so argmax can never pick them again.
    min_dist = np.linalg.norm(Xo_z - Xo_z[first], axis=1)
    min_dist[first] = -np.inf

    # greedy max-min selection, updating the distances incrementally
    while len(chosen_idxs) < n_pick:
        # argmax returns the first maximum, so ties go to the lowest index
        best = int(np.argmax(min_dist))
        chosen_idxs.append(best)
        min_dist = np.minimum(min_dist, np.linalg.norm(Xo_z - Xo_z[best], axis=1))
        min_dist[best] = -np.inf

    return topK.loc[chosen_idxs].drop(columns=["_d_primary"])

# Columns that define closeness to the median, and columns used for spread.
primary = ["lines", "last90", "commit_count"]
others = [
    "committer_count",
    "author_count",
    "dominant_domain_committers",
    "dominant_domain_authors",
]

# Draw the sample from the activity-checked projects and store it.
df = pd.read_csv("17k_active_90days.csv")
sample_set = sample(
    df,
    primary_cols=primary,
    other_cols=others,
    n_samples=NUMBER_OF_PROJECTS,
)
sample_set.to_csv("average_projects.csv", index=False)


This part samples the high-activity set: the most active projects among those with at most 200 commits in the last 90 days.

In [None]:
df = pd.read_csv('17k_active_90days.csv')

# Keep projects with at most 200 commits in the 90-day window.
df_filtered = df.loc[df['last90'] <= 200].copy()

# Order by activity, highest first, and take the leading NUMBER_OF_PROJECTS rows.
df_top = df_filtered.sort_values(by='last90', ascending=False)
sample_set = df_top.head(NUMBER_OF_PROJECTS)

In [None]:
# Directory that receives one CSV per mined repository.
OUT_DIR = "data"
os.makedirs(OUT_DIR, exist_ok=True)

# Earliest commit date to collect: TIME_LIMIT_MONTHS months before now, in UTC.
cutoff_date = datetime.now(tz=timezone.utc) - relativedelta(months=TIME_LIMIT_MONTHS)

def sanitize_name(url: str) -> str:
    """Build a file-name stem of the form 'owner-repo' from a repository URL.

    Args:
        url: Repository URL; trailing slashes and a trailing '.git' are ignored.

    Returns:
        The last two '/'-separated segments of the URL joined by '-'.
    """
    parts = url.rstrip("/").split("/")[-2:]
    # Strip '.git' only as a suffix: removing it anywhere would mangle names
    # such as 'user.github.io' into 'userhub.io'.
    return "-".join(parts).removesuffix(".git")

def collect_commits_with_dmm(repo_url: str, since: datetime, ) -> pd.DataFrame:
    """Collect per-commit churn and DMM metrics for one repository.

    Args:
        repo_url: Repository location handed to PyDriller's ``Repository``.
        since: Passed to ``Repository`` as its ``since`` bound.

    Returns:
        One row per traversed commit with the columns url, developer,
        commit_hash, commit_date, churn and the three dmm_unit_* metrics.
        commit_date is converted to UTC and then made timezone-naive. The
        frame is empty when no commit is traversed.
    """
    def to_record(commit) -> dict:
        # Identify the developer by e-mail, falling back to name, then a placeholder.
        who = commit.author.email or commit.author.name or "<unknown>"
        return {
            'url': repo_url,
            'developer': who,
            'commit_hash': commit.hash,
            'commit_date': commit.committer_date,
            'churn': commit.lines,
            'dmm_unit_size': commit.dmm_unit_size,
            'dmm_unit_complexity': commit.dmm_unit_complexity,
            'dmm_unit_interfacing': commit.dmm_unit_interfacing,
        }

    history = Repository(path_to_repo=repo_url, since=since)
    frame = pd.DataFrame([to_record(commit) for commit in history.traverse_commits()])
    if frame.empty:
        return frame

    # Normalize every timestamp to UTC, then drop the timezone information.
    as_utc = pd.to_datetime(frame['commit_date'], utc=True)
    frame['commit_date'] = as_utc.dt.tz_localize(None)
    return frame

# Distinct, non-missing repository URLs of the current sample set.
repo_urls = sample_set['url'].dropna().unique().tolist()

with tqdm(repo_urls, desc="Repos", unit="repo") as pbar:
    for url in pbar:
        name = sanitize_name(url)
        out_path = os.path.join(OUT_DIR, name + ".csv")

        if not os.path.exists(out_path):
            df = collect_commits_with_dmm(url, cutoff_date)
            n = len(df)
            pbar.set_postfix_str(f"{n} commits since {cutoff_date.date()}")
            # Nothing is written for a repository without commits in the window.
            if n > 0:
                df.to_csv(out_path, index=False)
        else:
            # An existing file means this repository was mined already.
            pbar.set_postfix_str(f"skipping {name} (exists)")

# Sort the paths: glob returns them in arbitrary order, and a fixed order
# makes the combined CSV reproducible for the same set of per-repo files.
files = sorted(glob.glob(os.path.join(OUT_DIR, "*.csv")))
if not files:
    print("No per - repo data found—nothing to combine.")
else:
    # Stack every per-repo file into one table with a fresh 0..n-1 index.
    combined = pd.concat((pd.read_csv(f, parse_dates=['commit_date']) for f in files),
                         ignore_index=True)
    combined.to_csv('commits_with_churn_and_dmm_limited.csv', index=False)
    print(f"\nSaved {len(combined)} total commits from {len(files)} repos.")
