In [1]:
import os
from dotenv import load_dotenv
from mongoengine import connect, disconnect, DoesNotExist
import pandas as pd
from pycoshark.mongomodels import Issue, Project, People, CodeReview, CodeReviewSystem, IssueSystem, CodeReviewChangeLog, CodeReviewRevision, Commit
import re
from datetime import datetime, timedelta
import progressbar
import random
import sys; sys.stdout.flush()

In [12]:
# Load environment variables from .env file
load_dotenv('../.env', override=True)

# Retrieve the password from the environment variable
user = os.getenv('DB_USER')
password = os.getenv('DB_PW')
db_host = os.getenv('DB_HOST')
db_port = os.getenv('DB_PORT')
db_name = os.getenv('DB_NAME')

disconnect()

host = f'mongodb://{user}:{password}@{db_host}:{db_port}/{db_name}?authSource=admin'
print(host)

connect(host=host)

projects = Project.objects().only('name')

random.seed(42)

for project in projects:
    print(f'Project: {project.name}')

mongodb://root:upinndonau@localhost:27017/smartshark?authSource=admin
Project: nova
Project: elasticsearch


In [None]:
df = pd.read_csv('../input/all_information.csv', sep="\t", keep_default_na=False)
df_nova = df[df['Project'] == 'nova']

In [6]:
def str_to_list(s: str) -> list:
    l = s.replace('\n', '').replace('\'', '').strip('[]').split(',')
    l = [x.strip(' ') for x in l]
    return [x for x in l if x != '']

In [7]:
systems =  {
    "Jenkins".lower(): {
        'success': r"build +(?:successful|succeeded)",
        'fail': r"build failed",
    },
    "DB Datasets CI".lower(): {
        'success': r"database migration testing successful",
        'fail': None,
    },
    "IBM PowerKVM CI".lower(): {
        'success': r"(?:build +succeeded|testing +completed +SUCCESS)",
        'fail': r"(?:(?:build|merge) +failed|testing +completed +failure)",
    },
    "Citrix XenServer CI".lower(): {
        'success': r"passed",
        'fail': r"failed",
    },
    "Microsoft Hyper-V CI".lower(): {
        'success': r"build +succeeded",
        'fail': r"(?:build +failed|for +rechecking +failure)",
    },
    "SmokeStack CI".lower(): {
        'success': r"works +for +me",
        'fail': None,
    },
    "VMware NSX CI".lower(): {
        'success': r"(:?passed|works +for +me|build +successful)",
        'fail': r"(:?build +failed|recheck-vmware)",
    },
    "Intel PCI CI".lower(): {
        'success': r"build +successful",
        'fail': r"build +failed",
    },
}

In [19]:
project = Project.objects.get(name="nova")
its = IssueSystem.objects.get(project_id=project.id)
crs = CodeReviewSystem.objects.get(project_id=project.id)

people_cache = {}

def get_review_status(review: CodeReview, bug_id):
    status = {}
    for key in systems.keys():
        status[key] = None

    for revision in CodeReviewRevision.objects(code_review_id=review.id).order_by("-revision_number"):
        if all(status.values()):
            break

        change_logs = CodeReviewChangeLog.objects(revision_id=revision.id, author_id__exists=True).order_by("-created_at")


        for log in change_logs:
            author = people_cache[log.author_id] if log.author_id in people_cache else People.objects.get(id=log.author_id)
            people_cache[log.author_id] = author
            author_name = author.name.lower()

            if author_name in systems and status[author_name] is None:

                re_success = systems[author_name]['success']
                re_fail = systems[author_name]['fail']
                
                if re_success and re.findall(re_success, log.message, re.IGNORECASE):
                    status[author_name] = True
                elif re_fail and re.findall(re_fail, log.message, re.IGNORECASE):
                    status[author_name] = False

    return status

def get_item(bug_id, review_id, current_status = {}, prev_review_id = None, prev_status= {}):
    res = []
    for key in systems.keys():
        res.append({
            "bug": bug_id,
            "review": review_id,
            "prev_review": prev_review_id,
            "build_id": key,
            "build_status": current_status.get(key, None),
            "build_status_changed": current_status.get(key, None) != prev_status.get(key, None),
            "build_status_prev": prev_status.get(key, None)
        })

    return res

results = []

for _, row in progressbar.progressbar(df_nova.iterrows()):
    bug_id = row['Bug_ID']

    reviews = str_to_list(row['CG_CodeReview'])

    for review_id in reviews:

        try: 
            review = CodeReview.objects.get(code_review_system_ids=crs.id, external_number=review_id)

            current_status = get_review_status(review, bug_id)

        except DoesNotExist:
            print(f"review {review_id} for bug {bug_id} does not exist")
            results.extend(get_item(bug_id, review_id))
            continue


        review_commit_parents = Commit.objects.get(revision_hash=review.current_revision_commit_hash).parents
        prev_reviews = CodeReview.objects(current_revision_commit_hash__in=review_commit_parents)

        if len(list(prev_reviews)):
            for i, prev_review in enumerate(prev_reviews):
                change = {}
                prev_status = get_review_status(prev_review, bug_id)

                for key in systems.keys():
                    results.extend(get_item(bug_id, review.external_number, current_status, prev_review.external_number, prev_status))
        else:
            for key in systems.keys():
                results.extend(get_item(bug_id, review.external_number, current_status))


/ |#                                                  | 0 Elapsed Time: 0:00:00
- |                                         #         | 1 Elapsed Time: 0:00:06
\ |      #                                            | 2 Elapsed Time: 0:00:09
| |                           #                       | 3 Elapsed Time: 0:00:12
/ |                                                 # | 4 Elapsed Time: 0:00:15
- |                  #                                | 5 Elapsed Time: 0:00:22
\ |                            #                      | 6 Elapsed Time: 0:00:27
| |               #                                   | 7 Elapsed Time: 0:00:32
/ |        #                                          | 8 Elapsed Time: 0:00:41
- |                    #                              | 9 Elapsed Time: 0:00:42
\ |                                      #           | 10 Elapsed Time: 0:00:46
| |                 #                                | 11 Elapsed Time: 0:00:51
/ |                                     

In [20]:
pd.DataFrame(results).to_csv('review_logs_ci_status_nova_cg.csv')

len(results)

2112