In [1]:
import os
import re
from urllib.parse import quote_plus

import pandas as pd
from dotenv import load_dotenv
from mongoengine import connect, disconnect
from pycoshark.mongomodels import Issue, Project, Commit, PullRequestCommit

In [2]:
# Load environment variables from .env file
load_dotenv('.env')

# Read the connection settings from the environment and fail early with a
# clear message if one is missing, instead of building a URI that contains
# the literal text "None".
db_settings = {name: os.getenv(name) for name in ('DB_USER', 'DB_PW', 'DB_HOST', 'DB_PORT', 'DB_NAME')}
missing_settings = [name for name, value in db_settings.items() if value is None]
if missing_settings:
    raise RuntimeError(f"Missing environment variables: {', '.join(missing_settings)}")

user = db_settings['DB_USER']
password = db_settings['DB_PW']
db_host = db_settings['DB_HOST']
db_port = db_settings['DB_PORT']
db_name = db_settings['DB_NAME']

# Close the default mongoengine connection before connecting again
# (presumably so this cell can be re-run in the same kernel).
disconnect()

# User name and password are percent-encoded: reserved characters such as
# '@', ':', '/' or '%' would otherwise corrupt the connection URI.
# NOTE: connect() is the last expression of the cell, so the returned client
# (including the host address) is shown as cell output; clear it before sharing.
connect(host=f'mongodb://{quote_plus(user)}:{quote_plus(password)}@{db_host}:{db_port}/{db_name}?authSource=admin')


MongoClient(host=['37.114.42.132:9001'], document_class=dict, tz_aware=False, connect=True, authsource='admin', read_preference=Primary())

In [3]:
# Name of the project under study; the CSV file names used in this notebook
# are derived from it.
project = "elasticsearch"

# Sanity check: the lookup raises unless exactly one Project has this name.
Project.objects(name=project).get()

<Project: Project object>

In [4]:
# (issue_id, revision_hash) pairs, in file order, read from the
# tab-separated '<project>.csv'.
issue_bfc = [
    tuple(row[column] for column in ('issue_id', 'revision_hash'))
    for _, row in pd.read_csv(f'{project}.csv', sep='\t').iterrows()
]

In [5]:
# Collects one record per issue id; it is filled by the loop over issue_bfc
# further down in this cell.
result = dict()

def flatten(xss):
    """Concatenate the items of every iterable in ``xss`` into one flat list.

    Only one level of nesting is removed and the original order is kept.
    """
    flat = []
    for xs in xss:
        flat.extend(xs)
    return flat

def get_data(commits: [Commit]):
    """Summarise each commit in ``commits`` as a plain dict.

    Every dict has the keys ``revision_hash``, ``author_date``, ``is_merge``
    (the commit has more than one parent), ``is_rebase`` (at least one parent
    found in the database has a later ``author_date`` than the commit itself)
    and ``is_review_commit`` (the *number* of ``PullRequestCommit`` documents
    whose ``commit_sha`` equals the commit's hash - a count, not a boolean).

    When ``commits`` is empty, a single placeholder dict with every value set
    to ``None`` is returned, so the result always has at least one element.
    """
    if not len(commits):
        return [{"revision_hash": None, "author_date": None, "is_merge": None, "is_rebase": None, "is_review_commit": None}]

    summaries = []
    for commit in commits:
        # One extra query per commit: only parents that exist in the database
        # take part in the date comparison.
        known_parents = Commit.objects(revision_hash__in=commit.parents)
        parent_is_newer = [parent.author_date > commit.author_date for parent in known_parents]
        summaries.append({
            "revision_hash": commit.revision_hash,
            "author_date": commit.author_date,
            "is_merge": len(commit.parents) > 1,
            "is_rebase": any(parent_is_newer),
            "is_review_commit": PullRequestCommit.objects(commit_sha=commit.revision_hash).count(),
        })
    return summaries

# For every (issue, revision hash) pair, look up the commit and its direct
# neighbourhood in the commit graph and store one record per issue id.
# NOTE(review): records are keyed by issue_id alone, so if the CSV lists
# several hashes for the same issue, later pairs overwrite earlier ones -
# confirm this is intended.
for issue_id, rev_hash in issue_bfc:

    # pandas reads an empty CSV field as NaN, which is truthy; a plain
    # `not rev_hash` check would let it through and the regex build would fail.
    if pd.isna(rev_hash) or not rev_hash:
        print(issue_id, "No root found")
        continue

    # Match the hash as an escaped prefix (full or abbreviated hash) instead
    # of an unanchored pattern, which would also match in the middle of an
    # unrelated hash.
    regex = re.compile('^' + re.escape(rev_hash))
    root = Commit.objects(revision_hash=regex).first()

    # .first() returns None when no commit matches; skip instead of failing
    # on root.parents below.
    if root is None:
        print(issue_id, "No commit found for", rev_hash)
        continue

    parents = Commit.objects(revision_hash__in=root.parents)
    grandparents = Commit.objects(revision_hash__in=flatten([parent.parents for parent in parents]))
    # Commits that list the root among their parents.
    children = Commit.objects(parents=root.revision_hash)
    # All parents of those children; this set includes the root itself.
    children_parents = Commit.objects(revision_hash__in=flatten([child.parents for child in children]))

    result[f'{issue_id}'] = {
        'root': rev_hash,
        'true_root': get_data([root])[0],
        'parents': get_data(parents),
        'grandparents': get_data(grandparents),
        'children': get_data(children),
        'children_parents': get_data(children_parents)
    }

# One row per issue id, one column per key of the records built above.
result = pd.DataFrame(result).T

In [6]:
# Unnest the four list-valued relation columns one after another: every list
# element gets its own row, and the index label is repeated for those rows.
for relation in ('parents', 'grandparents', 'children', 'children_parents'):
    result = result.explode(relation)

In [7]:
def make_list(prefix):
    """Return the five ``<prefix>_<field>`` column names for one commit relation."""
    fields = ('revision_hash', 'author_date', 'is_merge', 'is_rebase', 'is_review_commit')
    names = []
    for field in fields:
        names.append('{}_{}'.format(prefix, field))
    return names

# Turn each relation's dict column into five prefixed scalar columns, then
# remove the dict column itself.
for relation in ('true_root', 'parents', 'grandparents', 'children', 'children_parents'):
    result[make_list(relation)] = result[relation].apply(pd.Series)
    result = result.drop(columns=relation)


In [8]:
# Preview the first five rows of the expanded frame.
result.head(n=5)

Unnamed: 0,root,true_root_revision_hash,true_root_author_date,true_root_is_merge,true_root_is_rebase,true_root_is_review_commit,parents_revision_hash,parents_author_date,parents_is_merge,parents_is_rebase,...,children_revision_hash,children_author_date,children_is_merge,children_is_rebase,children_is_review_commit,children_parents_revision_hash,children_parents_author_date,children_parents_is_merge,children_parents_is_rebase,children_parents_is_review_commit
Elastic Search 1134,10660d390d760cde78fb115107c04e1cf104a4da,10660d390d760cde78fb115107c04e1cf104a4da,2011-07-18 19:15:22,False,False,0,04086985f415896c245a3ea2fc853042f022822c,2011-07-18 18:29:29,False,False,...,d905e6220970898c44b43d56f936e56b53fa3f4f,2011-07-18 19:56:28,False,False,0,10660d390d760cde78fb115107c04e1cf104a4da,2011-07-18 19:15:22,False,False,0
Elastic Search 1154,7a38e384c9ff783f17e9db92a730bdf654dbda6b,7a38e384c9ff783f17e9db92a730bdf654dbda6b,2010-03-04 23:48:56,False,False,0,7bf0f1ffca589df6e626d61182689bde005ce649,2010-03-04 23:39:04,False,False,...,45489ed1af13c6090486c9357e8745c4929d4a83,2010-03-04 23:50:40,False,False,0,7a38e384c9ff783f17e9db92a730bdf654dbda6b,2010-03-04 23:48:56,False,False,0
Elastic Search 1162,e5b041c8efd408fdc71fd2f2c84439e5a4985244,e5b041c8efd408fdc71fd2f2c84439e5a4985244,2010-08-26 21:41:55,False,False,0,19ddee5ec9544cf93a1db48e58b7b4b7d841c370,2010-08-26 15:12:51,False,False,...,e85c47db744334bf049a04656e7d8b19ac312af0,2010-08-27 10:52:58,False,False,0,e5b041c8efd408fdc71fd2f2c84439e5a4985244,2010-08-26 21:41:55,False,False,0
Elastic Search 12193,e88535a67e9594f3135465b5021ba9b502fef950,e88535a67e9594f3135465b5021ba9b502fef950,2015-06-13 09:28:05,False,False,0,364cbbd28232a2a111158de6993692875f7fe1d8,2015-06-13 08:34:36,False,False,...,58ccb39deea9afb136d8ad9ce6defc7c67b52cad,2015-06-13 18:29:18,False,False,0,e88535a67e9594f3135465b5021ba9b502fef950,2015-06-13 09:28:05,False,False,0
Elastic Search 1380,16a046f686c26309ee0041df8987a4d2ffedd956,16a046f686c26309ee0041df8987a4d2ffedd956,2011-04-07 11:30:12,False,False,0,7decb701d9c3cbdeafdd1abe1fd3bd2c3d995d8c,2011-04-07 11:16:21,False,False,...,7fcf82af807608a3e05a388ae16e713b55982cd0,2011-04-07 11:44:34,False,False,0,16a046f686c26309ee0041df8987a4d2ffedd956,2011-04-07 11:30:12,False,False,0


In [9]:
# Module-level state read by my_function(): the column prefix currently being
# de-duplicated and the set of revision hashes already seen for it.
prefix, u = "", set()

def my_function(row):
    """Blank out a commit's columns when its hash was already seen in an earlier row.

    Relies on two module-level names: ``prefix`` selects which ``<prefix>_*``
    columns are inspected, and ``u`` is the set of revision hashes already
    encountered for that prefix.

    Args:
        row: One row (a ``pd.Series``) as passed by ``DataFrame.apply(..., axis=1)``.

    Returns:
        A copy of ``row``. If ``row[f'{prefix}_revision_hash']`` is already in
        ``u``, the five ``<prefix>_*`` fields of the copy are set to ``None``;
        otherwise the copy is unchanged and the hash is added to ``u``.
    """
    # Work on a copy: pandas does not support apply() callbacks that mutate
    # the object they are handed.
    row = row.copy()

    hash_column = f'{prefix}_revision_hash'
    if row[hash_column] in u:
        for suffix in ('revision_hash', 'author_date', 'is_merge', 'is_rebase', 'is_review_commit'):
            row[f'{prefix}_{suffix}'] = None
    else:
        u.add(row[hash_column])

    return row

# De-duplicate each relation independently. `prefix` and `u` are the globals
# that my_function() reads, so the loop variable must keep this name and the
# set of seen hashes is reset for every relation.
relation_prefixes = ('true_root', 'parents', 'grandparents', 'children', 'children_parents')
for prefix in relation_prefixes:
    u = set()
    result = result.apply(my_function, axis=1)


In [10]:
# Display the frame after de-duplication (rendered as this cell's output).
result

Unnamed: 0,root,true_root_revision_hash,true_root_author_date,true_root_is_merge,true_root_is_rebase,true_root_is_review_commit,parents_revision_hash,parents_author_date,parents_is_merge,parents_is_rebase,...,children_revision_hash,children_author_date,children_is_merge,children_is_rebase,children_is_review_commit,children_parents_revision_hash,children_parents_author_date,children_parents_is_merge,children_parents_is_rebase,children_parents_is_review_commit
Elastic Search 1134,10660d390d760cde78fb115107c04e1cf104a4da,10660d390d760cde78fb115107c04e1cf104a4da,2011-07-18 19:15:22,False,False,0.0,04086985f415896c245a3ea2fc853042f022822c,2011-07-18 18:29:29,False,False,...,d905e6220970898c44b43d56f936e56b53fa3f4f,2011-07-18 19:56:28,False,False,0.0,10660d390d760cde78fb115107c04e1cf104a4da,2011-07-18 19:15:22,False,False,0.0
Elastic Search 1154,7a38e384c9ff783f17e9db92a730bdf654dbda6b,7a38e384c9ff783f17e9db92a730bdf654dbda6b,2010-03-04 23:48:56,False,False,0.0,7bf0f1ffca589df6e626d61182689bde005ce649,2010-03-04 23:39:04,False,False,...,45489ed1af13c6090486c9357e8745c4929d4a83,2010-03-04 23:50:40,False,False,0.0,7a38e384c9ff783f17e9db92a730bdf654dbda6b,2010-03-04 23:48:56,False,False,0.0
Elastic Search 1162,e5b041c8efd408fdc71fd2f2c84439e5a4985244,e5b041c8efd408fdc71fd2f2c84439e5a4985244,2010-08-26 21:41:55,False,False,0.0,19ddee5ec9544cf93a1db48e58b7b4b7d841c370,2010-08-26 15:12:51,False,False,...,e85c47db744334bf049a04656e7d8b19ac312af0,2010-08-27 10:52:58,False,False,0.0,e5b041c8efd408fdc71fd2f2c84439e5a4985244,2010-08-26 21:41:55,False,False,0.0
Elastic Search 12193,e88535a67e9594f3135465b5021ba9b502fef950,e88535a67e9594f3135465b5021ba9b502fef950,2015-06-13 09:28:05,False,False,0.0,364cbbd28232a2a111158de6993692875f7fe1d8,2015-06-13 08:34:36,False,False,...,58ccb39deea9afb136d8ad9ce6defc7c67b52cad,2015-06-13 18:29:18,False,False,0.0,e88535a67e9594f3135465b5021ba9b502fef950,2015-06-13 09:28:05,False,False,0.0
Elastic Search 1380,16a046f686c26309ee0041df8987a4d2ffedd956,16a046f686c26309ee0041df8987a4d2ffedd956,2011-04-07 11:30:12,False,False,0.0,7decb701d9c3cbdeafdd1abe1fd3bd2c3d995d8c,2011-04-07 11:16:21,False,False,...,7fcf82af807608a3e05a388ae16e713b55982cd0,2011-04-07 11:44:34,False,False,0.0,16a046f686c26309ee0041df8987a4d2ffedd956,2011-04-07 11:30:12,False,False,0.0
Elastic Search 14782,a75adaaaaaa632c036464da8fa24770796f77852,a75adaaaaaa632c036464da8fa24770796f77852,2015-08-17 13:52:36,False,False,0.0,d1c93fb57354d5e0b176ac4501db1167f0af4549,2015-08-17 09:43:47,False,True,...,c908c582c280a2f39b896797e683ea93b7ec902b,2015-08-17 14:14:20,True,False,0.0,a75adaaaaaa632c036464da8fa24770796f77852,2015-08-17 13:52:36,False,False,0.0
Elastic Search 14782,a75adaaaaaa632c036464da8fa24770796f77852,,NaT,,,,,NaT,,,...,,NaT,,,,a9b2b033e3eb8badbd6ffed04722d299dbfcbf6f,2015-08-15 15:53:43,False,False,1.0
Elastic Search 15858,4088236cf75b793c7a2a80f5f7be5b326b607378,4088236cf75b793c7a2a80f5f7be5b326b607378,2011-09-27 10:47:59,False,False,1.0,f63727e3c6c16b995099ff5a9c1f29635a23c7ed,2011-09-27 10:21:20,False,False,...,de8644d95a216c0251ca483f3c7fdcffc7d94547,2011-09-27 18:45:10,False,False,1.0,4088236cf75b793c7a2a80f5f7be5b326b607378,2011-09-27 10:47:59,False,False,1.0
Elastic Search 16246,7caee2fa4d918d4c6978bcf986a158d03bf7db2a,7caee2fa4d918d4c6978bcf986a158d03bf7db2a,2015-11-25 17:54:48,False,False,2.0,c3f97e7642f6f5208656b35629af08cccaaaeb22,2015-11-25 17:04:09,False,False,...,9aa9447016755884836c33909ec720cb79324247,2015-11-25 19:31:17,False,False,2.0,7caee2fa4d918d4c6978bcf986a158d03bf7db2a,2015-11-25 17:54:48,False,False,2.0
Elastic Search 1626,1f30681a84fcafe9cf6e6b8a2801ea1791b9edc0,1f30681a84fcafe9cf6e6b8a2801ea1791b9edc0,2011-08-27 14:40:00,False,True,0.0,ba352e0dc6f3078344a3bf1bec2c1e4233357848,2011-08-29 14:46:01,False,False,...,7420e99e7df6a33296850b5b25674ff06bf28959,2011-08-28 21:34:38,False,False,0.0,1f30681a84fcafe9cf6e6b8a2801ea1791b9edc0,2011-08-27 14:40:00,False,True,0.0


In [11]:
# Write the final frame to '<project>_family.csv'; the index (issue id) is
# written as the first column.
result.to_csv(path_or_buf=f'{project}_family.csv')