In [1]:
import pandas as pd
from typing import List, Tuple, Dict, Any, Optional
import subprocess

In [2]:
# Root folder for every file this notebook reads.
data_prefix: str = 'data'
# Folder under data_prefix that holds the repositories (one sub-folder per repository name).
repo_prefix: str = f'{data_prefix}/repos'

# File name of the dataset, relative to data_prefix.
data_name: str = 'original.parquet'

# Load the dataset with the pyarrow engine.
repo_df: pd.DataFrame = pd.read_parquet(f'{data_prefix}/{data_name}', engine = 'pyarrow')

In [3]:
# Shell-script templates for crawling data out of the local repositories.
# Every template is filled positionally with str.format: the first two
# placeholders form the repository folder (./<prefix>/<name>) the script
# cd's into, the remaining ones are the arguments of the git command.
# NOTE(review): placeholders are substituted verbatim (no quoting), so values
# containing spaces or shell metacharacters would alter the script if it is
# executed through a shell.

# Parent of a commit: (prefix, name, commit).
get_prev_commit_template: str = '''
cd ./{}/{}

git rev-parse {}^
'''

# Names of the files that differ between two commits: (prefix, name, commit, commit).
get_diff_2_commit_template: str = '''
cd ./{}/{}

git diff --name-only {} {}
'''

# Content of one file at one commit: (prefix, name, commit, file).
get_file_at_commit_template: str = '''
cd ./{}/{}

git show {}:{}
'''

# Diff of one file between two commits: (prefix, name, commit, commit, file).
get_diff_file_template: str = '''
cd ./{}/{}

git diff {}..{} -- {}
'''

# Column names (and column order) of the frames built in this notebook.
necessary_cols: List[str] = [
    'id',
    'fromLib',
    'toLib',
    'repoName',
    'repoOwner',
    'repoSplitName',
    'startCommit',
    'endCommit',
    'fileName',
    'startCode',
    'endCode',
    'diff',
    'startCommitChanges',
    'endCommitChanges',
]
# Blank record: every column mapped to None.
sample_template: Dict[str, Any] = {k: None for k in necessary_cols}
# Empty frame that already carries the expected columns.
final_df: pd.DataFrame = pd.DataFrame(columns = necessary_cols)
# final_df.set_index(id)

def get_prev_commit(repo_prefix: str, repo_name: str, changed_commit: str) -> str:
    """
    Resolve the parent of a commit with ``git rev-parse <commit>^``.

    git is started from an argument list with ``-C <repository folder>``
    rather than from a ``cd`` + command shell script. The values are therefore
    never interpreted by a shell, and git cannot end up running in the current
    working directory when the repository folder is missing.

    :param repo_prefix: Folder that contains the repositories.
    :param repo_name: Folder name of the repository inside ``repo_prefix``.
    :param changed_commit: Commit whose parent is requested.
    :return: git's stdout with surrounding whitespace removed. The exit status
        is not checked, so the value is whatever git printed (possibly empty).
    """
    repo_dir: str = f'./{repo_prefix}/{repo_name}'
    sub: subprocess.CompletedProcess = subprocess.run(
        ['git', '-C', repo_dir, 'rev-parse', f'{changed_commit}^'],
        capture_output = True, encoding = 'utf-8', errors = 'ignore',
    )

    return sub.stdout.strip()

def get_diff_2_commit(repo_prefix: str, repo_name: str, commit1: str, commit2: str) -> List[str]:
    """
    List the files that differ between two commits (``git diff --name-only``).

    git is started from an argument list with ``-C <repository folder>``, so no
    shell parses the values and git never runs outside the repository folder.

    :param repo_prefix: Folder that contains the repositories.
    :param repo_name: Folder name of the repository inside ``repo_prefix``.
    :param commit1: First commit passed to ``git diff``.
    :param commit2: Second commit passed to ``git diff``.
    :return: One entry per non-empty stdout line; an empty list when git
        printed nothing. The exit status is not checked.
    """
    repo_dir: str = f'./{repo_prefix}/{repo_name}'
    sub: subprocess.CompletedProcess = subprocess.run(
        ['git', '-C', repo_dir, 'diff', '--name-only', commit1, commit2],
        capture_output = True, encoding = 'utf-8', errors = 'ignore',
    )

    # The output ends with a newline, so a plain split('\n') yields a trailing
    # '' that would later be treated as a file name; drop blank entries.
    diff_files: List[str] = [name for name in sub.stdout.split('\n') if name]

    return diff_files

def get_start_end_commit_code(repo_prefix: str, repo_name: str, file_name: str, start_commit: str, end_commit: str) -> Tuple[str, str]:
    """
    Read one file's content at two commits with ``git show <commit>:<file>``.

    git is started from an argument list with ``-C <repository folder>``, so
    file names containing spaces or shell metacharacters are passed through
    unchanged and git never runs outside the repository folder.

    :param repo_prefix: Folder that contains the repositories.
    :param repo_name: Folder name of the repository inside ``repo_prefix``.
    :param file_name: Path of the file, as git knows it.
    :param start_commit: Commit for the first element of the result.
    :param end_commit: Commit for the second element of the result.
    :return: ``(content at start_commit, content at end_commit)``. Each element
        is git's stdout; it is ``''`` when git printed nothing or could not be
        started. The exit status is not checked.
    """
    repo_dir: str = f'./{repo_prefix}/{repo_name}'

    def read_at(commit: str) -> str:
        # Best effort: a failure to launch git yields '' instead of an error.
        try:
            sub: subprocess.CompletedProcess = subprocess.run(
                ['git', '-C', repo_dir, 'show', f'{commit}:{file_name}'],
                capture_output = True, encoding = 'utf-8', errors = 'ignore',
            )
            return sub.stdout
        except Exception:
            return ''

    start_commit_code: str = read_at(start_commit)
    end_commit_code: str = read_at(end_commit)

    return start_commit_code, end_commit_code

def get_diff_file(repo_prefix: str, repo_name: str, file_name: str, start_commit: str, end_commit: str) -> str:
    """
    Diff one file between two commits (``git diff <start>..<end> -- <file>``).

    git is started from an argument list with ``-C <repository folder>``, so
    file names containing spaces or shell metacharacters are passed through
    unchanged and git never runs outside the repository folder.

    :param repo_prefix: Folder that contains the repositories.
    :param repo_name: Folder name of the repository inside ``repo_prefix``.
    :param file_name: Path of the file, as git knows it.
    :param start_commit: Left side of the ``..`` range.
    :param end_commit: Right side of the ``..`` range.
    :return: git's stdout, unmodified. The exit status is not checked.
    """
    repo_dir: str = f'./{repo_prefix}/{repo_name}'
    sub: subprocess.CompletedProcess = subprocess.run(
        ['git', '-C', repo_dir, 'diff', f'{start_commit}..{end_commit}', '--', file_name],
        capture_output = True, encoding = 'utf-8', errors = 'ignore',
    )

    return sub.stdout

def str_normalize(x: str) -> str:
    """
    Map a missing or empty value to ``''`` and return anything else unchanged.

    :param x: Value to normalize; may be ``None``.
    :return: ``''`` when ``x`` is ``None`` or has length 0, otherwise ``x``.
    """
    # `or` short-circuits, so len() is never called on None.
    is_blank = x is None or len(x) == 0
    return '' if is_blank else x

def create_data_rows(samples: pd.DataFrame, repo_name: str, sample_template: Dict[str, Any] = sample_template, sample_cnt: int = 0) -> Optional[Tuple[int, pd.DataFrame]]:
    """
    Build one output row per file touched by each distinct ``startCommit`` of a repository.

    For every distinct ``startCommit`` in ``samples`` the parent commit is
    resolved, the files that differ between the two are listed, and for each
    file the content at both commits and the per-file diff are collected.

    :param samples: Rows of the dataset that belong to ``repo_name``.
    :param repo_name: Folder name of the repository inside ``repo_prefix``.
    :param sample_template: Blank record used as the base of every row. It is
        copied, never modified, so the shared default stays untouched.
    :param sample_cnt: Value of ``id`` for the first row produced.
    :return: ``(next free id, frame with the columns in necessary_cols)``, or
        ``None`` when collecting the data of a file raised an exception
        (details are printed). An empty ``samples`` yields
        ``(sample_cnt, empty frame)``.
    """
    # Nothing to describe: keep the counter and return an empty, correctly shaped frame.
    if len(samples) == 0:
        return sample_cnt, pd.DataFrame(columns = necessary_cols)

    first_sample = samples.iloc[0]

    # Work on a copy so the (module-level) default template is not mutated.
    # NOTE(review): the commit-related fields of every row come from the first
    # sample, even when `samples` holds several distinct startCommit values —
    # confirm this is intended.
    base_row: Dict[str, Any] = dict(sample_template)
    base_row.update({
        'repoName': repo_name,
        'fromLib': first_sample['fromLib'],
        'toLib': first_sample['toLib'],
        'repoOwner': first_sample['repoOwner'],
        'repoSplitName': first_sample['repoSplitName'],
        'startCommit': first_sample['startCommit'],
        'endCommit': first_sample['endCommit'],
        'startCode': '',
        'endCode': '',
        'startCommitChanges': first_sample['startCommitChanges'],
        'endCommitChanges': first_sample['endCommitChanges']
    })

    # get unique startCommit values for this repository's samples
    changed_commits: List[str] = samples['startCommit'].unique().tolist()

    # Rows are gathered in a list and turned into a frame once at the end;
    # concatenating inside the loop copies the whole frame on every row.
    records: List[Dict[str, Any]] = []

    # get the diff of each commit and its previous commit
    for changed_commit in changed_commits:
        # get the previous commit hash and the diff
        prev_commit: str = get_prev_commit(repo_prefix = repo_prefix, repo_name = repo_name, changed_commit = changed_commit)
        diff_files: List[str] = get_diff_2_commit(repo_prefix = repo_prefix, repo_name = repo_name,
                                                  commit1 = changed_commit, commit2 = prev_commit)

        for file_name in diff_files:
            # A blank entry (e.g. from a trailing newline in git's output) is not a file.
            if not file_name:
                continue

            # Pre-bind so the except handler below can always print them.
            start_code, end_code = '', ''
            try:
                start_code, end_code = get_start_end_commit_code(repo_prefix = repo_prefix, repo_name = repo_name, file_name = file_name,
                                                                start_commit = prev_commit, end_commit = changed_commit)
                diff = get_diff_file(repo_prefix = repo_prefix, repo_name = repo_name, file_name = file_name,
                                    start_commit = prev_commit, end_commit = changed_commit)
            except Exception as e:
                print(e)
                print(f'file: {file_name}')
                print(f'start: {prev_commit}, end: {changed_commit}')
                print(f'start code: {start_code}')
                print(f'end code: {end_code}')
                print('-' * 50)

                return None

            row: Dict[str, Any] = dict(base_row)
            row['id'] = sample_cnt
            row['fileName'] = file_name
            row['startCode'], row['endCode'] = str_normalize(start_code), str_normalize(end_code)
            row['diff'] = diff
            records.append(row)

            sample_cnt += 1

    res_df: pd.DataFrame = pd.DataFrame(records, columns = necessary_cols)

    return sample_cnt, res_df

In [4]:
# Display the loaded dataset as the cell's output.
repo_df

Unnamed: 0,id,fromLib,toLib,repoName,fileName,startCommit,endCommit,startCommitChanges,endCommitChanges,startCommitMessage,endCommitMessage,startCommitTime,endCommitTime,Category,repoOwner,repoSplitName
0,0,ant:ant,org.apache.ant:ant,bobmcwhirter_drools,pom.xml,0504ef4b2349dd7edc4d4991d29f688dc15da939,0504ef4b2349dd7edc4d4991d29f688dc15da939,+org.apache.ant:ant\n+org.apache.ant:ant-apach...,+org.apache.ant:ant\n+org.apache.ant:ant-apach...,JBRULES-2737 Build with maven 3 fails on Guvno...,JBRULES-2737 Build with maven 3 fails on Guvno...,2010-10-20,2010-10-20,0,bobmcwhirter,drools
1,1,ant:ant,org.apache.ant:ant,wocommunity_wolips,woproject-ant-tasks/pom.xml,1d02c267e787b0c97ef9a3fb3b63da33e8f3c6d9,1d02c267e787b0c97ef9a3fb3b63da33e8f3c6d9,+org.apache.ant:ant\n+org.apache.ant:ant-junit...,+org.apache.ant:ant\n+org.apache.ant:ant-junit...,Using Ant 1.7.1 as dependency for woproject-an...,Using Ant 1.7.1 as dependency for woproject-an...,2008-09-06,2008-09-06,0,wocommunity,wolips
2,2,ant:ant,org.apache.ant:ant,teiid_teiid,pom.xml,204ea9614df7b99c46cc054877c5bea4d145b460,204ea9614df7b99c46cc054877c5bea4d145b460,+org.apache.ant:ant\n-ant:ant,+org.apache.ant:ant\n-ant:ant,updating ant groupid\n,updating ant groupid\n,2009-01-19,2009-01-19,0,teiid,teiid
3,3,ant:ant,org.apache.ant:ant,apache_axis2-java,modules/parent/pom.xml,2233b385c9f07b726c11bd5e1e2c00a350622eb3,2233b385c9f07b726c11bd5e1e2c00a350622eb3,+org.apache.ant:ant\n-ant:ant,+org.apache.ant:ant\n-ant:ant,fix the group id for latest version of ant\n\n...,fix the group id for latest version of ant\n\n...,2007-02-05,2007-02-05,0,apache,axis2-java
4,4,ant:ant,org.apache.ant:ant,apache_axis2-java,modules/tool/axis2-idea-plugin/pom.xml,2233b385c9f07b726c11bd5e1e2c00a350622eb3,2233b385c9f07b726c11bd5e1e2c00a350622eb3,+org.apache.ant:ant\n-ant:ant,+org.apache.ant:ant\n-ant:ant,fix the group id for latest version of ant\n\n...,fix the group id for latest version of ant\n\n...,2007-02-05,2007-02-05,0,apache,axis2-java
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14329,14329,xstream:xstream,com.thoughtworks.xstream:xstream,kiegroup_kogito-runtimes,drools-core/pom.xml,ae5bf7e576bce9bc7c1871b323dc81bbded46b55,ae5bf7e576bce9bc7c1871b323dc81bbded46b55,+com.thoughtworks.xstream:xstream\n-xstream:xs...,+com.thoughtworks.xstream:xstream\n-xstream:xs...,JBRULES-1128 JBRULES-992: optional dependencie...,JBRULES-1128 JBRULES-992: optional dependencie...,2007-08-25,2007-08-25,30,kiegroup,kogito-runtimes
14330,14330,xstream:xstream,com.thoughtworks.xstream:xstream,sakaiproject_sakai,gradebook/service/impl/pom.xml,d32f1ecd4a80a1f4b1ccda1763c6d7f0f5462584,d32f1ecd4a80a1f4b1ccda1763c6d7f0f5462584,+com.thoughtworks.xstream:xstream\n-xstream:xs...,+com.thoughtworks.xstream:xstream\n-xstream:xs...,Fixed issue with xstream\n\n\ngit-svn-id: http...,Fixed issue with xstream\n\n\ngit-svn-id: http...,2007-01-30,2007-01-30,30,sakaiproject,sakai
14331,14331,xstream:xstream,com.thoughtworks.xstream:xstream,sakaiproject_sakai,gradebook/service/sakai-pack/pom.xml,d32f1ecd4a80a1f4b1ccda1763c6d7f0f5462584,d32f1ecd4a80a1f4b1ccda1763c6d7f0f5462584,+com.thoughtworks.xstream:xstream\n-xstream:xs...,+com.thoughtworks.xstream:xstream\n-xstream:xs...,Fixed issue with xstream\n\n\ngit-svn-id: http...,Fixed issue with xstream\n\n\ngit-svn-id: http...,2007-01-30,2007-01-30,30,sakaiproject,sakai
14332,14332,xstream:xstream,com.thoughtworks.xstream:xstream,apache_shindig,1.0.x-incubating/java/social-api/pom.xml,f6cabfb61fbbc6a6afbda13b39189c005c681967,f6cabfb61fbbc6a6afbda13b39189c005c681967,+com.thoughtworks.xstream:xstream\n-xstream:xs...,+com.thoughtworks.xstream:xstream\n-xstream:xs...,SHINDIG-837 | Patch from Vincent Siveton | Del...,SHINDIG-837 | Patch from Vincent Siveton | Del...,2009-01-12,2009-01-12,30,apache,shindig


In [3]:
# Load errors.parquet from data_prefix with the pyarrow engine.
errors_df: pd.DataFrame = pd.read_parquet(f'{data_prefix}/errors.parquet', engine = 'pyarrow')

In [6]:
# Distinct repository names in the full dataset.
# NOTE(review): unique_repos is also assigned from errors_df in another cell, and the
# execution counts are out of order — confirm which assignment later cells should see.
unique_repos = repo_df['repoName'].unique()

In [4]:
# Distinct repository names in the errors dataset.
# NOTE(review): unique_repos is also assigned from repo_df in another cell, and the
# execution counts are out of order — confirm which assignment later cells should see.
unique_repos = errors_df['repoName'].unique()

In [5]:
# Number of distinct repository names currently bound to unique_repos.
len(unique_repos)

278

In [11]:
# Per-repository check results, keyed by the repository's position in unique_repos;
# each value holds 'repoName', 'status' and 'details'.
check_dict: Dict[int, Dict[str, Any]] = {}

def get_main_branch(repo_path: str) -> Optional[str]:
    """
    Determines the main branch of a Git repository (either 'main' or 'master').

    Remote branch names are compared exactly, so branches that merely contain
    the text (e.g. 'origin/maintenance', 'origin/master-backup') are not
    mistaken for the main branch. If 'origin/HEAD' points at 'origin/main' or
    'origin/master', that target wins; otherwise 'main' is preferred over
    'master' when both exist.

    :param repo_path: Path to the local Git repository.
    :return: The name of the main branch ('main' or 'master') if found, or None if neither exists
        or the remote branches could not be listed.
    """
    try:
        # List the remote-tracking branches, one per line
        result = subprocess.run(
            ["git", "-C", repo_path, "branch", "-r"],
            capture_output=True,
            text=True,
            check=True
        )
    except subprocess.CalledProcessError as e:
        print(f"Error determining main branch for repository at {repo_path}: {e}")
        return None

    remote_branches = set()
    head_target: Optional[str] = None
    for line in result.stdout.splitlines():
        # Symbolic refs are printed as "<name> -> <target>"
        name, arrow, target = line.strip().partition(" -> ")
        if arrow:
            if name == "origin/HEAD":
                head_target = target.strip()
            continue
        if name:
            remote_branches.add(name)

    # The branch origin/HEAD points at takes precedence when it is one of the two candidates
    for candidate in ("main", "master"):
        if head_target == f"origin/{candidate}":
            return candidate

    for candidate in ("main", "master"):
        if f"origin/{candidate}" in remote_branches:
            return candidate

    print(f"No 'main' or 'master' branch found in repository at {repo_path}")
    return None

def is_repo_up_to_date(repo_path: str, branch: str) -> Tuple[bool, str]:
    """
    Compare the commit at HEAD with the commit at ``origin/<branch>`` after fetching from origin.

    :param repo_path: Path to the local Git repository.
    :param branch: Name of the remote branch to compare against.
    :return: ``(True when both commits are equal, text with both commit ids and the branch)``;
        ``(False, error text)`` when any git command exits with a non-zero status.
    """
    def run_git(*git_args: str) -> subprocess.CompletedProcess:
        # Every git call here uses the same options; a non-zero exit raises CalledProcessError.
        return subprocess.run(
            ["git", "-C", repo_path, *git_args],
            check=True,
            capture_output=True,
            text=True
        )

    try:
        # Refresh the remote-tracking refs first
        run_git("fetch", "origin")

        # Commit currently checked out
        local_commit = run_git("rev-parse", "HEAD").stdout.strip()

        # Tip of the remote-tracking branch
        remote_commit = run_git("rev-parse", f"origin/{branch}").stdout.strip()
    except subprocess.CalledProcessError as e:
        return False, f"Error checking update status for branch '{branch}' in repository at {repo_path}: {e}"

    details = f"Local commit: {local_commit}\nRemote commit: {remote_commit}\nBranch: {branch}"
    return local_commit == remote_commit, details

# Check every repository in unique_repos and record the outcome in check_dict,
# keyed by the repository's position in unique_repos.
# NOTE(review): tqdm is not imported in any cell visible here — confirm it is in scope.
for idx, repo_name in tqdm(enumerate(unique_repos), total=len(unique_repos), desc="Checking repositories"):
    repo_path = f"{repo_prefix}/{repo_name}"
    branch = get_main_branch(repo_path)

    if branch is not None:
        # A main branch was found: compare HEAD against its remote counterpart
        is_up_to_date, details = is_repo_up_to_date(repo_path, branch)
        status = 'up-to-date' if is_up_to_date else 'outdated'

        check_dict[idx] = {
            'repoName': repo_name,
            'status': status,
            'details': details
        }
    else:
        # No main/master branch could be determined for this repository
        check_dict[idx] = {
            'repoName': repo_name,
            'status': 'unknown',
            'details': "Could not determine main branch (main or master)"
        }

Checking repositories:   0%|          | 0/278 [00:00<?, ?it/s]

Error determining main branch for repository at data/repos/apache_axis2-java: Command '['git', '-C', 'data/repos/apache_axis2-java', 'branch', '-r']' returned non-zero exit status 128.
No 'main' or 'master' branch found in repository at data/repos/apache_shindig
No 'main' or 'master' branch found in repository at data/repos/apache_geronimo
No 'main' or 'master' branch found in repository at data/repos/apache_aries
No 'main' or 'master' branch found in repository at data/repos/apache_felix
No 'main' or 'master' branch found in repository at data/repos/SeleniumHQ_selenium


Checking repositories:   3%|▎         | 8/278 [00:07<04:59,  1.11s/it]

Error determining main branch for repository at data/repos/Alfresco_community-edition-old: Command '['git', '-C', 'data/repos/Alfresco_community-edition-old', 'branch', '-r']' returned non-zero exit status 128.
Error determining main branch for repository at data/repos/maxirosson_jdroid-android: Command '['git', '-C', 'data/repos/maxirosson_jdroid-android', 'branch', '-r']' returned non-zero exit status 128.


Checking repositories:   4%|▍         | 11/278 [00:10<04:53,  1.10s/it]

Error determining main branch for repository at data/repos/fracturedatlas_ATHENA: Command '['git', '-C', 'data/repos/fracturedatlas_ATHENA', 'branch', '-r']' returned non-zero exit status 128.
No 'main' or 'master' branch found in repository at data/repos/apache_cocoon
Error determining main branch for repository at data/repos/hsiafan_requests: Command '['git', '-C', 'data/repos/hsiafan_requests', 'branch', '-r']' returned non-zero exit status 128.


Checking repositories:   6%|▋         | 18/278 [00:26<09:23,  2.17s/it]

Error determining main branch for repository at data/repos/semuxproject_semux: Command '['git', '-C', 'data/repos/semuxproject_semux', 'branch', '-r']' returned non-zero exit status 128.
No 'main' or 'master' branch found in repository at data/repos/torodb_server
No 'main' or 'master' branch found in repository at data/repos/apache_directmemory


Checking repositories:  10%|▉         | 27/278 [00:46<11:34,  2.77s/it]

No 'main' or 'master' branch found in repository at data/repos/apache_tomcat-maven-plugin


Checking repositories:  10%|█         | 29/278 [00:49<09:35,  2.31s/it]

Error determining main branch for repository at data/repos/qwazr_QWAZR: Command '['git', '-C', 'data/repos/qwazr_QWAZR', 'branch', '-r']' returned non-zero exit status 128.


Checking repositories:  12%|█▏        | 32/278 [00:55<09:29,  2.32s/it]

No 'main' or 'master' branch found in repository at data/repos/apache_dubbo-admin
Error determining main branch for repository at data/repos/yangqifang_QuickStart-admin-Cloud: Command '['git', '-C', 'data/repos/yangqifang_QuickStart-admin-Cloud', 'branch', '-r']' returned non-zero exit status 128.
Error determining main branch for repository at data/repos/JacksonTu_hdw-dubbo: Command '['git', '-C', 'data/repos/JacksonTu_hdw-dubbo', 'branch', '-r']' returned non-zero exit status 128.


Checking repositories:  13%|█▎        | 36/278 [00:59<06:04,  1.51s/it]

No 'main' or 'master' branch found in repository at data/repos/apache_dubbo


Checking repositories:  14%|█▎        | 38/278 [01:02<06:12,  1.55s/it]

Error determining main branch for repository at data/repos/bazaarvoice_ostrich: Command '['git', '-C', 'data/repos/bazaarvoice_ostrich', 'branch', '-r']' returned non-zero exit status 128.
No 'main' or 'master' branch found in repository at data/repos/dropwizard_metrics
No 'main' or 'master' branch found in repository at data/repos/dswarm_dswarm


Checking repositories:  15%|█▌        | 42/278 [01:06<05:04,  1.29s/it]

Error determining main branch for repository at data/repos/Talend_components: Command '['git', '-C', 'data/repos/Talend_components', 'branch', '-r']' returned non-zero exit status 128.
No 'main' or 'master' branch found in repository at data/repos/neo4j_neo4j
Error determining main branch for repository at data/repos/mhlx_mblog: Command '['git', '-C', 'data/repos/mhlx_mblog', 'branch', '-r']' returned non-zero exit status 128.


Checking repositories:  17%|█▋        | 47/278 [01:13<05:34,  1.45s/it]

Error determining main branch for repository at data/repos/rackerlabs_atlas-lb: Command '['git', '-C', 'data/repos/rackerlabs_atlas-lb', 'branch', '-r']' returned non-zero exit status 128.
Error determining main branch for repository at data/repos/att_XACML: Command '['git', '-C', 'data/repos/att_XACML', 'branch', '-r']' returned non-zero exit status 128.


Checking repositories:  18%|█▊        | 50/278 [01:17<05:02,  1.33s/it]

No 'main' or 'master' branch found in repository at data/repos/Activiti_Activiti


Checking repositories:  19%|█▉        | 53/278 [01:23<06:25,  1.71s/it]

No 'main' or 'master' branch found in repository at data/repos/joinfaces_joinfaces
No 'main' or 'master' branch found in repository at data/repos/jboss-fuse_fabric8
No 'main' or 'master' branch found in repository at data/repos/apache_giraph
Error determining main branch for repository at data/repos/deeplearning4j_DataVec: Command '['git', '-C', 'data/repos/deeplearning4j_DataVec', 'branch', '-r']' returned non-zero exit status 128.


Checking repositories:  22%|██▏       | 61/278 [01:37<07:24,  2.05s/it]

No 'main' or 'master' branch found in repository at data/repos/ninjaframework_ninja


Checking repositories:  24%|██▎       | 66/278 [01:51<09:43,  2.75s/it]

No 'main' or 'master' branch found in repository at data/repos/square_retrofit


Checking repositories:  25%|██▌       | 70/278 [02:03<10:38,  3.07s/it]

Error determining main branch for repository at data/repos/perfectsense_brightspot-cms: Command '['git', '-C', 'data/repos/perfectsense_brightspot-cms', 'branch', '-r']' returned non-zero exit status 128.


Checking repositories:  26%|██▌       | 72/278 [02:06<08:52,  2.59s/it]

No 'main' or 'master' branch found in repository at data/repos/syndesisio_syndesis
Error determining main branch for repository at data/repos/perfectsense_dari: Command '['git', '-C', 'data/repos/perfectsense_dari', 'branch', '-r']' returned non-zero exit status 128.
No 'main' or 'master' branch found in repository at data/repos/bonitasoft_bonita-engine


Checking repositories:  27%|██▋       | 76/278 [02:10<05:39,  1.68s/it]

Error determining main branch for repository at data/repos/hawkular_hawkular-services: Command '['git', '-C', 'data/repos/hawkular_hawkular-services', 'branch', '-r']' returned non-zero exit status 128.


Checking repositories:  29%|██▉       | 82/278 [02:27<08:48,  2.70s/it]

Error determining main branch for repository at data/repos/cloudera_search: Command '['git', '-C', 'data/repos/cloudera_search', 'branch', '-r']' returned non-zero exit status 128.


Checking repositories:  31%|███       | 85/278 [02:34<08:19,  2.59s/it]

Error determining main branch for repository at data/repos/jasonish_jetty-springmvc-thymeleaf-template: Command '['git', '-C', 'data/repos/jasonish_jetty-springmvc-thymeleaf-template', 'branch', '-r']' returned non-zero exit status 128.


Checking repositories:  31%|███▏      | 87/278 [02:37<07:14,  2.28s/it]

Error determining main branch for repository at data/repos/baomidou_mybatisplus-spring-boot: Command '['git', '-C', 'data/repos/baomidou_mybatisplus-spring-boot', 'branch', '-r']' returned non-zero exit status 128.
No 'main' or 'master' branch found in repository at data/repos/apache_stanbol


Checking repositories:  33%|███▎      | 91/278 [02:44<06:26,  2.07s/it]

No 'main' or 'master' branch found in repository at data/repos/apache_jackrabbit-oak


Checking repositories:  34%|███▍      | 94/278 [02:52<07:11,  2.35s/it]

No 'main' or 'master' branch found in repository at data/repos/Intel-bigdata_SSM


Checking repositories:  35%|███▍      | 97/278 [02:59<07:11,  2.38s/it]

No 'main' or 'master' branch found in repository at data/repos/jogetworkflow_jw-community
No 'main' or 'master' branch found in repository at data/repos/apache_manifoldcf
Error determining main branch for repository at data/repos/Talend_data-prep: Command '['git', '-C', 'data/repos/Talend_data-prep', 'branch', '-r']' returned non-zero exit status 128.


Checking repositories:  36%|███▋      | 101/278 [03:03<04:56,  1.68s/it]

Error determining main branch for repository at data/repos/cloudera_sentry: Command '['git', '-C', 'data/repos/cloudera_sentry', 'branch', '-r']' returned non-zero exit status 128.


Checking repositories:  37%|███▋      | 103/278 [03:06<04:52,  1.67s/it]

No 'main' or 'master' branch found in repository at data/repos/uwolfer_gerrit-intellij-plugin


Checking repositories:  39%|███▉      | 109/278 [03:24<07:46,  2.76s/it]

No 'main' or 'master' branch found in repository at data/repos/myabc_appfuse
No 'main' or 'master' branch found in repository at data/repos/paypal_SeLion
No 'main' or 'master' branch found in repository at data/repos/DbMaintain_dbmaintain
No 'main' or 'master' branch found in repository at data/repos/europeana_corelib


Checking repositories:  42%|████▏     | 116/278 [03:35<05:55,  2.19s/it]

No 'main' or 'master' branch found in repository at data/repos/Netcentric_accesscontroltool


Checking repositories:  42%|████▏     | 118/278 [03:38<05:20,  2.01s/it]

No 'main' or 'master' branch found in repository at data/repos/cloudera_mahout


Checking repositories:  45%|████▍     | 124/278 [03:55<07:20,  2.86s/it]

No 'main' or 'master' branch found in repository at data/repos/apache_wink
No 'main' or 'master' branch found in repository at data/repos/apache_chemistry-opencmis
Error determining main branch for repository at data/repos/cloudera_crunch: Command '['git', '-C', 'data/repos/cloudera_crunch', 'branch', '-r']' returned non-zero exit status 128.
No 'main' or 'master' branch found in repository at data/repos/apache_servicemix4-features
Error determining main branch for repository at data/repos/apache_wss4j: Command '['git', '-C', 'data/repos/apache_wss4j', 'branch', '-r']' returned non-zero exit status 128.
Error determining main branch for repository at data/repos/xap_xap: Command '['git', '-C', 'data/repos/xap_xap', 'branch', '-r']' returned non-zero exit status 128.
No 'main' or 'master' branch found in repository at data/repos/apache_activemq-apollo
No 'main' or 'master' branch found in repository at data/repos/apache_ftpserver
No 'main' or 'master' branch found in repository at data/r

Checking repositories:  49%|████▉     | 137/278 [04:09<04:12,  1.79s/it]

No 'main' or 'master' branch found in repository at data/repos/intendia-oss_rxjava-gwt
Error determining main branch for repository at data/repos/liuyadu_open-cloud: Command '['git', '-C', 'data/repos/liuyadu_open-cloud', 'branch', '-r']' returned non-zero exit status 128.


Checking repositories:  51%|█████▏    | 143/278 [04:23<05:32,  2.47s/it]

No 'main' or 'master' branch found in repository at data/repos/eclipse-ee4j_jersey


Checking repositories:  53%|█████▎    | 146/278 [04:30<05:16,  2.40s/it]

No 'main' or 'master' branch found in repository at data/repos/looly_hutool
Error determining main branch for repository at data/repos/yonyou-iuap_iuap-pap-baseservice: Command '['git', '-C', 'data/repos/yonyou-iuap_iuap-pap-baseservice', 'branch', '-r']' returned non-zero exit status 128.


Checking repositories:  54%|█████▎    | 149/278 [04:34<03:54,  1.82s/it]

No 'main' or 'master' branch found in repository at data/repos/sdl_dxa-web-application-java


Checking repositories:  55%|█████▌    | 154/278 [04:47<05:15,  2.55s/it]

No 'main' or 'master' branch found in repository at data/repos/sialcasa_mvvmFX
No 'main' or 'master' branch found in repository at data/repos/neo4j_neo4j-java-driver


Checking repositories:  58%|█████▊    | 160/278 [05:00<05:06,  2.60s/it]

Error determining main branch for repository at data/repos/jcustenborder_kafka-connect-cdc: Command '['git', '-C', 'data/repos/jcustenborder_kafka-connect-cdc', 'branch', '-r']' returned non-zero exit status 128.
Error determining main branch for repository at data/repos/automenta_narchy: Command '['git', '-C', 'data/repos/automenta_narchy', 'branch', '-r']' returned non-zero exit status 128.
No 'main' or 'master' branch found in repository at data/repos/AlmasB_FXGL
Error determining main branch for repository at data/repos/Mercateo_factcast: Command '['git', '-C', 'data/repos/Mercateo_factcast', 'branch', '-r']' returned non-zero exit status 128.
No 'main' or 'master' branch found in repository at data/repos/Johnnei_JavaTorrent
Error determining main branch for repository at data/repos/cscannerio_opensource: Command '['git', '-C', 'data/repos/cscannerio_opensource', 'branch', '-r']' returned non-zero exit status 128.


Checking repositories:  60%|██████    | 167/278 [05:04<02:07,  1.14s/it]

No 'main' or 'master' branch found in repository at data/repos/apache_sqoop


Checking repositories:  61%|██████    | 170/278 [05:11<02:55,  1.62s/it]

No 'main' or 'master' branch found in repository at data/repos/cloudera_sqoop2


Checking repositories:  63%|██████▎   | 175/278 [05:25<04:12,  2.45s/it]

Error determining main branch for repository at data/repos/Berico-Technologies_CLAVIN: Command '['git', '-C', 'data/repos/Berico-Technologies_CLAVIN', 'branch', '-r']' returned non-zero exit status 128.


Checking repositories:  64%|██████▎   | 177/278 [05:28<03:36,  2.15s/it]

No 'main' or 'master' branch found in repository at data/repos/pnowy_NativeCriteria


Checking repositories:  64%|██████▍   | 179/278 [05:31<03:16,  1.98s/it]

Error determining main branch for repository at data/repos/boundlessgeo_GeoGig: Command '['git', '-C', 'data/repos/boundlessgeo_GeoGig', 'branch', '-r']' returned non-zero exit status 128.


Checking repositories:  65%|██████▌   | 182/278 [05:38<03:25,  2.14s/it]

Error determining main branch for repository at data/repos/apache_webservices-axiom: Command '['git', '-C', 'data/repos/apache_webservices-axiom', 'branch', '-r']' returned non-zero exit status 128.
No 'main' or 'master' branch found in repository at data/repos/apache_flume
Error determining main branch for repository at data/repos/cloudera_flume-ng: Command '['git', '-C', 'data/repos/cloudera_flume-ng', 'branch', '-r']' returned non-zero exit status 128.
No 'main' or 'master' branch found in repository at data/repos/apache_maven-2


Checking repositories:  67%|██████▋   | 187/278 [05:41<01:59,  1.31s/it]

No 'main' or 'master' branch found in repository at data/repos/CONNECT-Solution_CONNECT


Checking repositories:  68%|██████▊   | 189/278 [05:45<02:03,  1.39s/it]

No 'main' or 'master' branch found in repository at data/repos/apache_qpid


Checking repositories:  69%|██████▊   | 191/278 [05:48<02:06,  1.45s/it]

No 'main' or 'master' branch found in repository at data/repos/apache_jackrabbit


Checking repositories:  69%|██████▉   | 193/278 [05:51<02:07,  1.50s/it]

No 'main' or 'master' branch found in repository at data/repos/datastax_java-driver


Checking repositories:  71%|███████   | 196/278 [05:58<02:30,  1.84s/it]

No 'main' or 'master' branch found in repository at data/repos/owlcs_owlapi
No 'main' or 'master' branch found in repository at data/repos/IBMStreams_streamsx.messaging


Checking repositories:  72%|███████▏  | 199/278 [06:01<02:00,  1.53s/it]

No 'main' or 'master' branch found in repository at data/repos/pxb1988_dex2jar


Checking repositories:  73%|███████▎  | 203/278 [06:11<02:45,  2.21s/it]

No 'main' or 'master' branch found in repository at data/repos/apache_ambari
No 'main' or 'master' branch found in repository at data/repos/data-integrations_wrangler


Checking repositories:  74%|███████▍  | 206/278 [06:15<02:07,  1.78s/it]

Error determining main branch for repository at data/repos/hotelbeds-sdk_hotel-api-sdk-java: Command '['git', '-C', 'data/repos/hotelbeds-sdk_hotel-api-sdk-java', 'branch', '-r']' returned non-zero exit status 128.


Checking repositories:  75%|███████▍  | 208/278 [06:18<02:01,  1.73s/it]

Error determining main branch for repository at data/repos/Talend_data-quality: Command '['git', '-C', 'data/repos/Talend_data-quality', 'branch', '-r']' returned non-zero exit status 128.


Checking repositories:  76%|███████▌  | 211/278 [06:25<02:13,  1.99s/it]

No 'main' or 'master' branch found in repository at data/repos/cdapio_cdap


Checking repositories:  78%|███████▊  | 217/278 [06:43<02:54,  2.87s/it]

Error determining main branch for repository at data/repos/evrythng_evrythng-java-sdk: Command '['git', '-C', 'data/repos/evrythng_evrythng-java-sdk', 'branch', '-r']' returned non-zero exit status 128.
No 'main' or 'master' branch found in repository at data/repos/apache_oltu


Checking repositories:  79%|███████▉  | 220/278 [06:46<01:55,  1.99s/it]

Error determining main branch for repository at data/repos/Talend_tesb-rt-se: Command '['git', '-C', 'data/repos/Talend_tesb-rt-se', 'branch', '-r']' returned non-zero exit status 128.


Checking repositories:  80%|███████▉  | 222/278 [06:50<01:47,  1.91s/it]

Error determining main branch for repository at data/repos/pveentjer_Hazelblast: Command '['git', '-C', 'data/repos/pveentjer_Hazelblast', 'branch', '-r']' returned non-zero exit status 128.
No 'main' or 'master' branch found in repository at data/repos/apache_mina


Checking repositories:  81%|████████▏ | 226/278 [06:56<01:37,  1.88s/it]

Error determining main branch for repository at data/repos/dianping_pigeon: Command '['git', '-C', 'data/repos/dianping_pigeon', 'branch', '-r']' returned non-zero exit status 128.
No 'main' or 'master' branch found in repository at data/repos/jooby-project_jooby
No 'main' or 'master' branch found in repository at data/repos/dadoonet_legacy-search
Error determining main branch for repository at data/repos/syhily_gossip: Command '['git', '-C', 'data/repos/syhily_gossip', 'branch', '-r']' returned non-zero exit status 128.
Error determining main branch for repository at data/repos/automatictester_lightning: Command '['git', '-C', 'data/repos/automatictester_lightning', 'branch', '-r']' returned non-zero exit status 128.
No 'main' or 'master' branch found in repository at data/repos/tylanbin_platform-ng


Checking repositories:  84%|████████▍ | 233/278 [07:00<00:44,  1.00it/s]

No 'main' or 'master' branch found in repository at data/repos/jboss-developer_jboss-jdg-quickstarts
No 'main' or 'master' branch found in repository at data/repos/ekoontz_hadoop-common
Error determining main branch for repository at data/repos/adaikiss_xun: Command '['git', '-C', 'data/repos/adaikiss_xun', 'branch', '-r']' returned non-zero exit status 128.
No 'main' or 'master' branch found in repository at data/repos/apache_hadoop-common
Error determining main branch for repository at data/repos/klose911_payment: Command '['git', '-C', 'data/repos/klose911_payment', 'branch', '-r']' returned non-zero exit status 128.


Checking repositories:  86%|████████▌ | 239/278 [07:03<00:31,  1.24it/s]

Error determining main branch for repository at data/repos/Dytanic_CloudNet: Command '['git', '-C', 'data/repos/Dytanic_CloudNet', 'branch', '-r']' returned non-zero exit status 128.


Checking repositories:  87%|████████▋ | 242/278 [07:10<00:43,  1.21s/it]

No 'main' or 'master' branch found in repository at data/repos/headwirecom_peregrine-cms
No 'main' or 'master' branch found in repository at data/repos/apache_plc4x


Checking repositories:  88%|████████▊ | 245/278 [07:13<00:38,  1.17s/it]

No 'main' or 'master' branch found in repository at data/repos/powermock_powermock
Error determining main branch for repository at data/repos/boundlessgeo_suite: Command '['git', '-C', 'data/repos/boundlessgeo_suite', 'branch', '-r']' returned non-zero exit status 128.
No 'main' or 'master' branch found in repository at data/repos/terrier-org_terrier-core
Error determining main branch for repository at data/repos/jabox_jabox: Command '['git', '-C', 'data/repos/jabox_jabox', 'branch', '-r']' returned non-zero exit status 128.


Checking repositories:  90%|████████▉ | 250/278 [07:16<00:26,  1.05it/s]

Error determining main branch for repository at data/repos/wuyc_xpress: Command '['git', '-C', 'data/repos/wuyc_xpress', 'branch', '-r']' returned non-zero exit status 128.
Error determining main branch for repository at data/repos/lattebank_rate-limiter: Command '['git', '-C', 'data/repos/lattebank_rate-limiter', 'branch', '-r']' returned non-zero exit status 128.
No 'main' or 'master' branch found in repository at data/repos/liquigraph_liquigraph


Checking repositories:  92%|█████████▏| 256/278 [07:26<00:31,  1.44s/it]

Error determining main branch for repository at data/repos/yqbjtu_KafkaTutorials: Command '['git', '-C', 'data/repos/yqbjtu_KafkaTutorials', 'branch', '-r']' returned non-zero exit status 128.


Checking repositories:  93%|█████████▎| 259/278 [07:33<00:33,  1.78s/it]

No 'main' or 'master' branch found in repository at data/repos/jboss-developer_jboss-brms-quickstarts


Checking repositories:  95%|█████████▍| 263/278 [07:42<00:34,  2.29s/it]

Error determining main branch for repository at data/repos/PocketServer_PocketServer-Ref: Command '['git', '-C', 'data/repos/PocketServer_PocketServer-Ref', 'branch', '-r']' returned non-zero exit status 128.


Checking repositories:  96%|█████████▋| 268/278 [07:56<00:27,  2.73s/it]

Error determining main branch for repository at data/repos/shopping24_geoip-api: Command '['git', '-C', 'data/repos/shopping24_geoip-api', 'branch', '-r']' returned non-zero exit status 128.


Checking repositories:  97%|█████████▋| 270/278 [07:59<00:18,  2.27s/it]

Error determining main branch for repository at data/repos/yahoojapan_multiple-dimension-spread: Command '['git', '-C', 'data/repos/yahoojapan_multiple-dimension-spread', 'branch', '-r']' returned non-zero exit status 128.


Checking repositories:  98%|█████████▊| 272/278 [08:02<00:12,  2.05s/it]

Error determining main branch for repository at data/repos/aipocom_aipo: Command '['git', '-C', 'data/repos/aipocom_aipo', 'branch', '-r']' returned non-zero exit status 128.


Checking repositories:  99%|█████████▊| 274/278 [08:06<00:07,  1.91s/it]

Error determining main branch for repository at data/repos/eclipse_rt.equinox.framework: Command '['git', '-C', 'data/repos/eclipse_rt.equinox.framework', 'branch', '-r']' returned non-zero exit status 128.
No 'main' or 'master' branch found in repository at data/repos/apache_abdera
No 'main' or 'master' branch found in repository at data/repos/apache_ace


Checking repositories: 100%|██████████| 278/278 [08:10<00:00,  1.76s/it]


In [9]:
# Turn the per-repository check results into a table: with orient='index'
# every key of check_dict becomes a row label and its value supplies the row.
status_df: pd.DataFrame = pd.DataFrame.from_dict(check_dict, orient = 'index')

In [10]:
# Show the status table (pandas elides the middle rows of long frames).
status_df

Unnamed: 0,repoName,status,details
5,SeleniumHQ_selenium,outdated,Error: Could not determine main or master branch
0,apache_axis2-java,outdated,Error: Could not determine main or master branch
8,Alfresco_community-edition-old,outdated,Error: Could not determine main or master branch
12,apache_cocoon,outdated,Error: Could not determine main or master branch
2,apache_geronimo,outdated,Error: Could not determine main or master branch
...,...,...,...
266,groovy_gmaven,outdated,Local commit: b5fa664ecea354548d039d39e04011a4...
269,openpnp_openpnp,outdated,Local commit: 5dd12c73972c74a5f3fa1810fc9167ce...
273,terasolunaorg_terasoluna-tourreservation-mybatis3,outdated,Local commit: a1f3d93f0e1fc36b65ebf461e2ecbde1...
271,terasolunaorg_terasoluna-tourreservation,outdated,Local commit: ff7979ec5eeb305cf9936d2e0f16ddc7...


In [None]:
# Print the check result of every repository, in index order.
# The loop variable is deliberately not called `id`: at notebook top level it
# would become a global and shadow the builtin `id()` for the whole kernel session.
for repo_idx in range(len(unique_repos)):
    print(repo_idx, check_dict[repo_idx])

In [None]:
# Print the positional index (0-based row number, not the frame's label) of
# every row whose status is 'outdated'.
# Iterating the `status` column directly avoids building a full row Series via
# `.iloc` on each step, and the loop variable no longer shadows the builtin `id()`.
for row_pos, repo_status in enumerate(status_df['status']):
    if repo_status == 'outdated':
        print(row_pos)

In [25]:
# Keep only the repositories whose check reported the 'outdated' status.
is_outdated: pd.Series = status_df['status'].eq('outdated')
errors: pd.DataFrame = status_df.loc[is_outdated]

In [26]:
# Show the outdated rows; long `details` strings are truncated in this view.
errors

Unnamed: 0,repoName,status,details
0,apache_axis2-java,outdated,/bin/sh: 2: cd: can't cd to ./data/repos/apach...
4,apache_shindig,outdated,fatal: ambiguous argument 'origin/': unknown r...
9,apache_geronimo,outdated,fatal: ambiguous argument 'origin/': unknown r...
11,apache_aries,outdated,fatal: ambiguous argument 'origin/': unknown r...
8,apache_felix,outdated,fatal: ambiguous argument 'origin/': unknown r...
...,...,...,...
350,terasolunaorg_terasoluna-tourreservation-mybatis3,outdated,Local commit: a1f3d93f0e1fc36b65ebf461e2ecbde1...
351,eclipse_rt.equinox.framework,outdated,/bin/sh: 2: cd: can't cd to ./data/repos/eclip...
353,apache_abdera,outdated,fatal: ambiguous argument 'origin/': unknown r...
354,apache_ace,outdated,fatal: ambiguous argument 'origin/': unknown r...


In [28]:
# Write the outdated rows to <data_prefix>/errors.parquet using the pyarrow engine.
errors.to_parquet(f'{data_prefix}/errors.parquet', engine = 'pyarrow')

In [29]:
# Display the frame again; this is the same `errors` object that was written to parquet.
errors

Unnamed: 0,repoName,status,details
0,apache_axis2-java,outdated,/bin/sh: 2: cd: can't cd to ./data/repos/apach...
4,apache_shindig,outdated,fatal: ambiguous argument 'origin/': unknown r...
9,apache_geronimo,outdated,fatal: ambiguous argument 'origin/': unknown r...
11,apache_aries,outdated,fatal: ambiguous argument 'origin/': unknown r...
8,apache_felix,outdated,fatal: ambiguous argument 'origin/': unknown r...
...,...,...,...
350,terasolunaorg_terasoluna-tourreservation-mybatis3,outdated,Local commit: a1f3d93f0e1fc36b65ebf461e2ecbde1...
351,eclipse_rt.equinox.framework,outdated,/bin/sh: 2: cd: can't cd to ./data/repos/eclip...
353,apache_abdera,outdated,fatal: ambiguous argument 'origin/': unknown r...
354,apache_ace,outdated,fatal: ambiguous argument 'origin/': unknown r...


In [32]:
# The table view cuts `details` short, so print the complete message of the
# second row (positional index 1) to read the whole git error.
print(errors.iloc[1]['details'])

fatal: ambiguous argument 'origin/': unknown revision or path not in the working tree.
Use '--' to separate paths from revisions, like this:
'git <command> [<revision>...] -- [<file>...]'


In [18]:
import shlex

rm_template = 'rm -rf ./{}/{}'

# Dry run: build and print the removal command for every outdated repository.
# Nothing is deleted here because the subprocess call below stays commented out.
for repo_name in errors['repoName']:
    # An empty or missing name would produce `rm -rf ./<repo_prefix>/`, which
    # targets the whole repository directory, so such rows are skipped.
    if not isinstance(repo_name, str) or not repo_name.strip():
        continue

    # Quote both path components so a name containing spaces or shell
    # metacharacters cannot alter the command; plain names are left unchanged.
    rm_script: str = rm_template.format(shlex.quote(repo_prefix), shlex.quote(repo_name))

    # Run the script
    # sub: subprocess.CompletedProcess = subprocess.run(rm_script, shell=True, capture_output=True, encoding='utf-8', errors='ignore')

    print(rm_script)

rm -rf ./data/repos/apache_axis2-java
rm -rf ./data/repos/SeleniumHQ_selenium
rm -rf ./data/repos/codehaus-cargo_cargo
rm -rf ./data/repos/mulesoft_mule
rm -rf ./data/repos/apache_shindig
rm -rf ./data/repos/apache_cayenne
rm -rf ./data/repos/openl-tablets_openl-tablets
rm -rf ./data/repos/apache_cxf
rm -rf ./data/repos/apache_felix
rm -rf ./data/repos/apache_geronimo
rm -rf ./data/repos/nuxeo_nuxeo
rm -rf ./data/repos/apache_aries
rm -rf ./data/repos/Alfresco_community-edition-old
rm -rf ./data/repos/maxirosson_jdroid-android
rm -rf ./data/repos/OpenWiseSolutions_openhub-framework
rm -rf ./data/repos/OpenNMS_opennms
rm -rf ./data/repos/dhis2_dhis2-core
rm -rf ./data/repos/apache_cocoon
rm -rf ./data/repos/apache_tapestry-5
rm -rf ./data/repos/sakaiproject_sakai
rm -rf ./data/repos/fracturedatlas_ATHENA
rm -rf ./data/repos/apache_flink
rm -rf ./data/repos/apache_camel
rm -rf ./data/repos/hsiafan_requests
rm -rf ./data/repos/semuxproject_semux
rm -rf ./data/repos/torodb_server
rm -rf ./