In [67]:
import os
from dotenv import load_dotenv
from mongoengine import connect, disconnect
import pandas as pd
from pycoshark.mongomodels import Issue, Project, Commit, FileAction, VCSSystem, PullRequest, PullRequestCommit
import re
from datetime import datetime, timedelta
import tqdm
import random

In [68]:
# Read the database settings from the local .env file so that no credentials
# are hard-coded in the notebook.
load_dotenv('.env')

# Connection settings, each taken from one environment variable.
user = os.getenv('DB_USER')
password = os.getenv('DB_PW')
db_host = os.getenv('DB_HOST')
db_port = os.getenv('DB_PORT')
db_name = os.getenv('DB_NAME')

# Fail fast with a readable message: os.getenv returns None for an unset
# variable, and the f-string below would otherwise embed the literal text
# "None" in the connection URI.
missing_settings = [
    env_name
    for env_name, env_value in (
        ('DB_USER', user),
        ('DB_PW', password),
        ('DB_HOST', db_host),
        ('DB_PORT', db_port),
        ('DB_NAME', db_name),
    )
    if env_value is None
]
if missing_settings:
    raise RuntimeError(
        'Missing database settings (check .env): ' + ', '.join(missing_settings)
    )

# Drop the default connection first so this cell can be re-run in the same kernel.
disconnect()

# NOTE(review): the values are interpolated verbatim, so a user name or
# password containing URI-reserved characters (e.g. '@', ':', '/') presumably
# has to be stored percent-encoded in .env -- confirm against the driver docs.
connect(host=f'mongodb://{user}:{password}@{db_host}:{db_port}/{db_name}?authSource=admin')

project = "elasticsearch"

# Sanity check that the connection works: look the project up by name and let
# the notebook display the returned document.
Project.objects.get(name=project)

<Project: Project object>

In [None]:
# Tab-separated list of bug-inducing commits; the sampling loop below reads the
# 'issue_id' and 'bic_revision_hash' columns from it.
es = pd.read_csv(
    'elasticsearch_bic.csv',
    delimiter='\t',
)

In [None]:
# NOTE(review): these two patterns are not referenced anywhere in this cell;
# they are kept unchanged in case another cell relies on them.
regex_change_id = re.compile(r"Change-Id: ?(\w+)", re.IGNORECASE)
regex_bp_id = re.compile(r"(?:bp|blueprint)(?::? |/)((?:\w+-?)+)", re.IGNORECASE)

project = Project.objects.get(name="elasticsearch")
vcs = VCSSystem.objects.get(project_id=project.id)

# Control commits are drawn from +/- SEARCH_WINDOW around the committer date of
# each bug-inducing commit (BIC).
SEARCH_WINDOW = timedelta(days=30)

# Dedicated, seeded generator so that re-running this cell shuffles the
# candidates the same way (assuming the database returns them in a stable
# order) without touching the global `random` state.
SAMPLING_SEED = 42
sampling_rng = random.Random(SAMPLING_SEED)


def _induces_nothing(commit):
    """Return True if no file action of ``commit`` has a non-empty ``induces`` list."""
    return not any(
        len(file_action.induces)
        for file_action in FileAction.objects(commit_id=commit.id)
    )


def _linked_issues_and_reviews(commit):
    """Collect the issues linked to ``commit`` and the pull requests related to it.

    Pull requests are gathered in two ways: those whose ``external_id`` equals
    the ``external_id`` of a linked issue, and those that contain the commit
    according to ``PullRequestCommit.commit_sha``.

    Returns:
        A ``(linked_issues, related_reviews)`` tuple of lists; the second list
        may contain the same pull request more than once.
    """
    linked_issues = list(Issue.objects(id__in=commit.linked_issue_ids))

    related_reviews = list(
        PullRequest.objects(external_id__in=[issue.external_id for issue in linked_issues])
    )
    pr_commits = PullRequestCommit.objects(commit_sha=commit.revision_hash)
    if len(pr_commits):
        related_reviews += PullRequest.objects(
            id__in=[pr_commit.pull_request_id for pr_commit in pr_commits]
        )
    return linked_issues, related_reviews


records = []
bics_without_commit = []   # hash prefixes that match no commit of this VCS
bics_without_control = []  # BICs for which no control commit was found

# `total` lets tqdm show a real progress fraction; iterrows() alone has no length.
for _, row in tqdm.tqdm(es.iterrows(), total=len(es)):
    issue_id = row['issue_id']
    bic_revision_hash = row['bic_revision_hash']

    # The CSV value is used as a prefix of the full revision hash; it is escaped
    # so that it is always matched literally.
    bic: Commit = Commit.objects(
        revision_hash=re.compile(rf'^{re.escape(str(bic_revision_hash))}'),
        vcs_system_id=vcs.id,
    ).first()
    if bic is None:
        # first() yields None when nothing matches; skip the row instead of
        # crashing on `bic.committer_date`.
        bics_without_commit.append(bic_revision_hash)
        continue

    # Candidates: commits with exactly one parent on a branch matching
    # origin/main, committed inside the search window.
    candidates = list(Commit.objects(
        committer_date__gte=bic.committer_date - SEARCH_WINDOW,
        committer_date__lte=bic.committer_date + SEARCH_WINDOW,
        vcs_system_id=vcs.id,
        parents__size=1,
        branches=re.compile(r'origin\/main$'),
    ))
    sampling_rng.shuffle(candidates)

    # Take the first shuffled candidate that induces nothing; the generator is
    # lazy, so file actions are only queried until a match is found.
    non_bic = next(
        (candidate for candidate in candidates if _induces_nothing(candidate)),
        None,
    )
    if non_bic is None:
        bics_without_control.append(bic_revision_hash)
        continue

    linked_issues, related_reviews = _linked_issues_and_reviews(non_bic)

    records.append({
        'issue_id': issue_id,
        'bic_revision_hash': bic_revision_hash,
        'bic_committer_date': bic.committer_date,
        'non_bic_revision_hash': non_bic.revision_hash,
        'non_bic_committer_date': non_bic.committer_date,
        'parents': len(non_bic.parents),
        # De-duplicated and sorted so the output does not depend on set ordering.
        'non_bic_linked_issues': sorted({issue.external_id for issue in linked_issues}, key=str),
        'non_bic_related_reviews': sorted({review.external_id for review in related_reviews}, key=str),
    })

if bics_without_commit or bics_without_control:
    print(
        f'Skipped {len(bics_without_commit)} row(s) with no matching commit '
        f'and {len(bics_without_control)} row(s) with no control commit.'
    )

result = pd.DataFrame(records)
result

In [None]:
# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing the sampled control group.
os.makedirs('output', exist_ok=True)
result.to_csv('output/sampling_output_elasticsearch_cg.csv')