In [1]:
import pandas as pd
import numpy as np
import os.path as osp
from datetime import datetime
from bs4 import BeautifulSoup
import requests as rq
import ast
import utils.helpers as hpr
import re

In [2]:
# Build the change table through the project helper; every later cell reads it as `df`.
df = hpr.combine_openstack_data()

In [None]:
# Churn of a change = inserted lines + deleted lines (read by count_changes_ptg).
df['code_churn'] = df['insertions'].add(df['deletions'])

In [5]:
# `messages` is stored as text; parse each cell back into Python objects.
# ast.literal_eval only evaluates literals, it does not execute code.
df["messages"] = df["messages"].map(ast.literal_eval)

In [79]:
# Parse the stringified `reviewers` cells into Python objects (literals only).
df["reviewers"] = df["reviewers"].apply(ast.literal_eval)

In [260]:
# Parse the stringified `revisions` cells into Python objects (literals only).
df["revisions"] = df["revisions"].apply(ast.literal_eval)

In [80]:
def _untagged_reviewer_ids(reviewers):
    """Collect `_account_id` from every reviewer entry that has no "tags" key."""
    return [reviewer["_account_id"] for reviewer in reviewers if "tags" not in reviewer]


df["reviewers_ids"] = df["reviewers"].map(_untagged_reviewer_ids)

### Functions

In [10]:
def extract_date(nbr):
    """Return the `created` value of the first `df` row whose `number` equals `nbr`.

    Returns None when no row matches.
    """
    matches = df.loc[df['number'] == nbr, 'created'].values
    if len(matches) == 0:
        return None
    return matches[0]
    
def reorder_source_target(row):
    """Swap the source and target sides of `row` when the source was created later.

    Parameters
    ----------
    row : mapping-like (dict or pandas.Series)
        Must expose `Source`, `Target` and, for each prefix in `created_`,
        `project_`, `owner_account_id_`, `is_owner_bot_`, `status_`, the keys
        `<prefix>source` and `<prefix>target`.

    Returns
    -------
    The same `row` object, modified in place when a swap was needed.
    """
    if row['created_source'] > row['created_target']:
        for c in ['created_', 'project_', 'owner_account_id_', 'is_owner_bot_', 'status_']:
            row[f'{c}source'], row[f'{c}target'] = row[f'{c}target'], row[f'{c}source']
        # Swap the change numbers exactly once. This used to sit inside the loop
        # above and only ended up swapped because the prefix list has an odd length.
        row['Source'], row['Target'] = row['Target'], row['Source']
    return row


def is_cross_team(row):
    """Classify a dependency by the teams of its two endpoints.

    Returns 'None' when either team is the string "None", "Cross" when the two
    teams differ, and "Within" otherwise.
    """
    source_team = row["Source_team"]
    target_team = row["Target_team"]
    if source_team == "None" or target_team == "None":
        return 'None'
    return "Within" if source_team == target_team else "Cross"

def retrieve_reviewers(number):
    """Return `reviewers_ids` of the first `df` row whose `number` equals `number`.

    Raises IndexError when no row matches.
    """
    matching_rows = df["number"] == number
    reviewer_lists = df.loc[matching_rows, "reviewers_ids"].values
    return reviewer_lists[0]

def is_same_developer(row):
    """Return "Same" when both endpoints have the same developer, else "Different"."""
    if row["Source_developer"] == row["Target_developer"]:
        return "Same"
    return "Different"

def count_changes_ptg(row):
    """Return one developer's share (in percent) of a project's code churn up to a date.

    Parameters
    ----------
    row : sequence or pandas.Series
        Read by position as ``(owner_account_id, created, project)``; the
        callers pass three-column slices such as
        ``[Source_developer, Source_datetime, Source_repo]``.

    Returns
    -------
    ``100 * owner_churn / total_churn`` over the rows of the global ``df`` that
    belong to that project and have ``created`` less than or equal to the given
    date; ``0`` when the total churn is zero.
    """
    # Unpack by iteration instead of `row[0]`: integer keys on a Series with
    # string labels rely on pandas' deprecated positional fallback.
    owner_id, created, project = tuple(row)[:3]
    project_history = df[(df["project"] == project) & (df["created"] <= created)]
    total_churn = project_history['code_churn'].sum()
    owner_churn = project_history.loc[project_history["owner_account_id"] == owner_id, 'code_churn'].sum()
    return 100 * (owner_churn / total_churn) if total_churn != 0 else 0

def identify_change_size(row):
    """Return the churn ratio of the more experienced side over the other side.

    The side with the strictly larger `*_exp` is the numerator; otherwise the
    target side is. When the denominator churn is 0 the numerator churn is
    returned unchanged.
    """
    if row["Source_exp"] > row["Target_exp"]:
        numerator, denominator = row["Source_churn"], row["Target_churn"]
    else:
        numerator, denominator = row["Target_churn"], row["Source_churn"]
    if denominator == 0:
        return numerator
    return numerator / denominator

def identify_merge_duration(row):
    """Return the duration ratio of the more experienced side over the other side.

    The side with the strictly larger `*_exp` is the numerator; otherwise the
    target side is. When the denominator duration is 0 the numerator duration
    is returned unchanged.
    """
    source_leads = row["Source_exp"] > row["Target_exp"]
    leading = row["Source_duration"] if source_leads else row["Target_duration"]
    trailing = row["Target_duration"] if source_leads else row["Source_duration"]
    if trailing != 0:
        return leading / trailing
    return leading

def time_diff(start, end):
    """Return the absolute gap in hours, rounded to two decimals, between two timestamps.

    Both arguments are strings in "%Y-%m-%d %H:%M:%S" format; they may be given
    in either order.
    """
    earlier, later = (end, start) if start > end else (start, end)
    layout = "%Y-%m-%d %H:%M:%S"
    elapsed = datetime.strptime(later, layout) - datetime.strptime(earlier, layout)
    hours = elapsed.total_seconds() / 3600
    return float("{:.2f}".format(hours))

def compute_review_duration(nbr):
    """Return the hours between the first and last message of change `nbr`.

    The message list stored in `df` is sorted by "date" in place. Returns 0
    when the change has at most one message.
    """
    thread = df.loc[df.number == nbr, "messages"].values[0]
    thread.sort(key=lambda message: message["date"])
    if len(thread) <= 1:
        return 0
    first_date = thread[0]["date"][:-10]
    last_date = thread[-1]["date"][:-10]
    return time_diff(first_date, last_date)

def compute_time_diff(revisions):
    """Return the hours between the earliest and latest revision of a change.

    Parameters
    ----------
    revisions : list of dict
        Each entry has a "created" timestamp string; its last 10 characters
        are cut off before the value is handed to `time_diff`.

    Returns
    -------
    0 for an empty or single-revision list, otherwise the result of `time_diff`.
    """
    # An empty list used to fall through to `dates[0]` and raise IndexError.
    if len(revisions) <= 1:
        return 0
    dates = [rev["created"][:-10] for rev in revisions]
    # Only the two extremes are needed, so a full sort is unnecessary.
    return time_diff(min(dates), max(dates))

def compute_intersection(row):
    """Return the percentage of reviewers shared by both endpoints.

    Computed as shared reviewers * 100 / distinct reviewers of either side;
    0 when neither side has a reviewer.
    """
    source_reviewers = set(row["Source_reviewers"])
    target_reviewers = set(row["Target_reviewers"])
    everyone = source_reviewers | target_reviewers
    if len(everyone) == 0:
        return 0
    shared = source_reviewers & target_reviewers
    return len(shared) * 100 / len(everyone)

def identify_review_kind(row, review_type):
    """Tell whether an account left a message of a given kind on a change.

    Parameters
    ----------
    row : sequence or pandas.Series
        Read by position as ``(account_id, change_number)``.
    review_type : str
        "UPLOAD", "COMMENT" or "RATE"; any other value yields False.

    Returns
    -------
    bool
        True when at least one message of the change (looked up in the global
        ``df``) was written by ``account_id`` and matches the requested kind.

    Raises
    ------
    IndexError
        When ``change_number`` is not present in ``df``.
    """
    # Unpack by iteration instead of `row[0]`/`row[1]`: integer keys on a
    # label-indexed Series rely on pandas' deprecated positional fallback.
    account_id, number = tuple(row)[:2]
    messages = df.loc[df["number"] == number, "messages"].values[0]

    if review_type == "UPLOAD":
        # NOTE(review): Gerrit's usual wording is "Uploaded patch set N." which
        # this prefix would not match - confirm against the dataset.
        def is_kind(content):
            return content.startswith("Upload patch set")
    elif review_type == "COMMENT":
        # Accept both "(1 comment)" and "(N comments)"; only the singular form
        # was recognised before.
        def is_kind(content):
            return content.startswith("Patch Set") and content.endswith(("comment)", "comments)"))
    elif review_type == "RATE":
        # `\d+` allows multi-digit patch-set numbers (the old `[\d+]` was a
        # one-character class), and the labels are grouped so a bare
        # "Verified"/"Workflow" elsewhere in the text no longer counts as a vote.
        rating_pattern = r"Patch Set \d+: (?:Code-Review|Verified|Workflow|Review-Priority)"

        def is_kind(content):
            return re.search(rating_pattern, content) is not None
    else:
        return False

    return any(msg["account_id"] == account_id and is_kind(msg["content"]) for msg in messages)

def identify_less_more_exp_metrics(row, metric_type):
    """Pick the metric of the less (or more) experienced side of a dependency.

    Parameters
    ----------
    row : pandas.Series
        Must carry `Source_exp` and `Target_exp`; the source and target metric
        are read from positions 2 and 3 because their labels differ per caller
        (churn, duration, revisions).
    metric_type : str
        "LESS" selects the side with the smaller experience; any other value
        selects the side with the larger experience.

    Returns
    -------
    The selected metric, or None when the two experiences are equal or either
    one is missing.
    """
    source_exp, target_exp = row["Source_exp"], row["Target_exp"]
    # A NaN experience compares False both ways, so it used to return the target
    # metric for LESS and MORE alike; treat it as "not comparable" instead.
    if pd.isna(source_exp) or pd.isna(target_exp) or source_exp == target_exp:
        return None

    # `.iloc` makes the positional access explicit; `row[2]` on a label-indexed
    # Series relies on pandas' deprecated positional fallback.
    source_metric, target_metric = row.iloc[2], row.iloc[3]
    source_is_less = source_exp < target_exp
    if metric_type == "LESS":
        return source_metric if source_is_less else target_metric
    return target_metric if source_is_less else source_metric

In [261]:
# Hours between the earliest and latest revision of every change (see compute_time_diff).
df["merge_duration"] = df["revisions"].map(compute_time_diff)

### How many cross- and single-component dependencies are there?

In [4]:
# Load the dependency pairs from disk. NOTE: a later cell writes the enriched
# table back to this same path, so the file may already contain columns that
# the cells below compute again.
# Disabled alternative that regenerated the pairs (generate_deps is not defined in this notebook):
# df_all_dependencies = generate_deps()
df_all_dependencies = pd.read_csv("./Files/all_dependencies.csv")
# Disabled: dropped the first 10 characters of each repo name.
# df_all_dependencies['Source_repo'] = df_all_dependencies['Source_repo'].map(lambda x: x[10:])
# df_all_dependencies['Target_repo'] = df_all_dependencies['Target_repo'].map(lambda x: x[10:])

In [42]:
# Boolean flag: True when the two endpoints live in different repositories.
# A direct column comparison replaces the row-wise apply and gives the same values.
df_all_dependencies["is_cross"] = df_all_dependencies["Source_repo"] != df_all_dependencies["Target_repo"]

In [70]:
# Scrape the release table of releases.openstack.org into
# {initial release date (first 10 characters of the third cell): lower-cased release name}.
# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops it from parsing an HTTP error page as if it were the table.
request = rq.get("https://releases.openstack.org/", timeout=30)
request.raise_for_status()
soup = BeautifulSoup(request.text, 'html.parser')

tbody = soup.find("tbody")
openstack_releases = {}

tr_list = tbody.find_all("tr")
for tr in tr_list:
    td_list = tr.find_all("td")
    initial_release = td_list[2].text[:10]
    # NOTE(review): the second positional argument of select_one is a namespace
    # map, not an attribute filter, so this selects the first <span> regardless
    # of its class - confirm that is what was intended.
    release_name = td_list[0].select_one('span', {"class": "doc"}).text.lower()
    openstack_releases[initial_release] = release_name

# Re-key in ascending date order; retrieve_release relies on iterating the
# dictionary from the oldest release date to the newest.
openstack_releases_keys = sorted(openstack_releases)
openstack_releases = {key: openstack_releases[key] for key in openstack_releases_keys}

# Map every `main_project` to its parsed `related_projects` value.
_components = pd.read_csv("./all_os_components.csv")
online_repositories = {
    main_project: ast.literal_eval(related_projects)
    for main_project, related_projects in zip(
        _components["main_project"].values, _components["related_projects"].values
    )
}

def retrieve_release(nbr):
    """Return the release name associated with change `nbr`.

    A "stable/<name>" branch whose <name> is a known release wins. Otherwise
    the creation day of the change is compared with the release dates in
    `openstack_releases`, in iteration order, and the first release whose date
    is on or after that day is returned. Returns None when no release qualifies.
    """
    change = df.loc[df["number"] == nbr, ["created", "branch"]]
    branch = change["branch"].values[0]
    if branch.startswith("stable/"):
        series_name = branch.split("/")[1]
        if series_name in openstack_releases.values():
            return series_name

    # Cut the last 19 characters of `created` before comparing with the date keys.
    created_day = change["created"].values[0][:-19]
    for release_date, release_name in openstack_releases.items():
        if created_day <= release_date:
            return release_name
    return None

In [7]:
# Project-to-team lookup table; later cells read its `project` and `team` columns.
inv_pro_team = pd.read_csv(osp.join('.', 'RQs', 'PQ', 'Files', 'inv_pro_team.csv'))

In [94]:
# Attach the owning team of each endpoint with two left joins against inv_pro_team.
# First join: keyed on the source repository.
df_all_dependencies = pd.merge(
    left=df_all_dependencies, 
    right=inv_pro_team, 
    left_on='Source_repo', 
    right_on='project', 
    how='left',
    suffixes=('_source', '_target')
)

# Second join: keyed on the target repository. pandas applies the suffixes only
# to column names present on both sides, so the columns brought in by the first
# join become `*_source` and the ones added here become `*_target`.
# NOTE(review): this assumes the table loaded from CSV does not already hold
# `project`/`team` columns - confirm, since the CSV is rewritten by this notebook.
df_all_dependencies = pd.merge(
    left=df_all_dependencies, 
    right=inv_pro_team, 
    left_on='Target_repo', 
    right_on='project', 
    how='left',
    suffixes=('_source', '_target')
)

# Drop the service columns and the join keys that are no longer needed.
for c in ['Source_service', 'Target_service', 'project_source', 'project_target']:
    del df_all_dependencies[c]

df_all_dependencies = df_all_dependencies.rename(columns={'team_source': 'Source_team', 'team_target': 'Target_team'})

In [103]:
# Repositories without a team get the literal string 'None', which is what
# is_cross_team checks for.
for team_column in ('Source_team', 'Target_team'):
    team_missing = df_all_dependencies[team_column].isnull()
    df_all_dependencies.loc[team_missing, team_column] = 'None'

In [104]:
# Label each dependency 'None', "Cross" or "Within" from its two team columns.
df_all_dependencies["is_cross_team"] = df_all_dependencies[["Source_team", "Target_team"]].apply(is_cross_team, axis=1)

In [52]:
# Index `df` by change number once instead of scanning the whole frame for every
# dependency row. drop_duplicates keeps the first occurrence (what `.values[0]`
# returned), and `.loc` raises KeyError for a number missing from `df`, so
# unknown changes still fail loudly.
status_by_number = df.drop_duplicates(subset="number").set_index("number")["status"]
for side in ("Source", "Target"):
    df_all_dependencies[f"{side}_status"] = status_by_number.loc[df_all_dependencies[side].to_numpy()].to_numpy()

In [53]:
# One indexed lookup per endpoint instead of a full scan of `df` per dependency
# row. First occurrence wins (as `.values[0]` did); a number missing from `df`
# raises KeyError.
created_by_number = df.drop_duplicates(subset="number").set_index("number")["created"]
for side in ("Source", "Target"):
    df_all_dependencies[f"{side}_datetime"] = created_by_number.loc[df_all_dependencies[side].to_numpy()].to_numpy()

In [74]:
# Hours between the creation of the two endpoints. The last 10 characters of
# each timestamp are dropped so the remainder parses with time_diff's format.
# Columns are read by label: integer keys on a label-indexed row rely on pandas'
# deprecated positional fallback.
df_all_dependencies["lag"] = df_all_dependencies[["Source_datetime", "Target_datetime"]].apply(
    lambda row: time_diff(row["Source_datetime"][:-10], row["Target_datetime"][:-10]),
    axis=1,
)

In [75]:
# Resolve the release of each endpoint with retrieve_release.
for side in ("Source", "Target"):
    df_all_dependencies[f"{side}_release"] = df_all_dependencies[side].map(retrieve_release)

In [76]:
# One indexed lookup per endpoint instead of a full scan of `df` per dependency
# row. First occurrence wins (as `.values[0]` did); a number missing from `df`
# raises KeyError.
is_bot_by_number = df.drop_duplicates(subset="number").set_index("number")["is_owner_bot"]
df_all_dependencies["is_source_bot"] = is_bot_by_number.loc[df_all_dependencies["Source"].to_numpy()].to_numpy()
df_all_dependencies["is_target_bot"] = is_bot_by_number.loc[df_all_dependencies["Target"].to_numpy()].to_numpy()

In [77]:
# One indexed lookup per endpoint instead of a full scan of `df` per dependency
# row. First occurrence wins (as `.values[0]` did); a number missing from `df`
# raises KeyError.
owner_by_number = df.drop_duplicates(subset="number").set_index("number")["owner_account_id"]
for side in ("Source", "Target"):
    df_all_dependencies[f"{side}_developer"] = owner_by_number.loc[df_all_dependencies[side].to_numpy()].to_numpy()

In [81]:
# Attach the reviewer-id list of each endpoint.
for side in ("Source", "Target"):
    df_all_dependencies[f"{side}_reviewers"] = df_all_dependencies[side].map(retrieve_reviewers)

In [82]:
# Did the owner of one endpoint appear among the reviewers of the other endpoint?
df_all_dependencies["Source_rev_target"] = df_all_dependencies[["Source_developer", "Target_reviewers"]].apply(
    lambda row: row["Source_developer"] in row["Target_reviewers"], axis=1
)
df_all_dependencies["Target_rev_source"] = df_all_dependencies[["Target_developer", "Source_reviewers"]].apply(
    lambda row: row["Target_developer"] in row["Source_reviewers"], axis=1
)

In [85]:
# "Same" when both endpoints share an owner, "Different" otherwise (string labels, not booleans).
df_all_dependencies["is_same_dev"] = df_all_dependencies.apply(is_same_developer, axis=1)

In [301]:
# Churn (insertions + deletions) of each endpoint, resolved through one indexed
# lookup instead of a full scan of `df` per dependency row. First occurrence
# wins (as `.values[0]` did); a number missing from `df` raises KeyError.
changes_by_number = df.drop_duplicates(subset="number").set_index("number")
churn_by_number = changes_by_number["insertions"] + changes_by_number["deletions"]
for side in ("Source", "Target"):
    df_all_dependencies[f"{side}_churn"] = churn_by_number.loc[df_all_dependencies[side].to_numpy()].to_numpy()

In [11]:
# Experience is computed only for merged, cross-project, same-developer pairs.
# `is_cross` is boolean when freshly computed and "Cross"/"Within" once a later
# cell has relabelled it (or when reloaded from the saved CSV). Accept both, so
# the filter does not silently select nothing.
is_cross_column = df_all_dependencies["is_cross"]
exp_rows = (
    (df_all_dependencies["Source_status"] == "MERGED")
    & (df_all_dependencies["Target_status"] == "MERGED")
    & ((is_cross_column == True) | (is_cross_column == "Cross"))
    & (df_all_dependencies["is_same_dev"] == "Same")
)

# Rows outside the filter get NaN through index alignment on assignment.
df_all_dependencies["Source_exp"] = df_all_dependencies.loc[
    exp_rows, ["Source_developer", "Source_datetime", "Source_repo"]
].apply(count_changes_ptg, axis=1)
df_all_dependencies["Target_exp"] = df_all_dependencies.loc[
    exp_rows, ["Target_developer", "Target_datetime", "Target_repo"]
].apply(count_changes_ptg, axis=1)

In [26]:
# Inspect the source experience of cross-project, same-developer pairs.
# `is_cross` may be boolean or already relabelled to "Cross"/"Within"; accept both.
inspect_is_cross = df_all_dependencies["is_cross"]
inspect_rows = ((inspect_is_cross == True) | (inspect_is_cross == "Cross")) & (df_all_dependencies["is_same_dev"] == "Same")
df_all_dependencies.loc[inspect_rows, ["Source", "Source_repo", "Source_exp"]]

Unnamed: 0,Source,Source_repo,Source_exp
5,864479,openstack/os-brick,
9,865185,openstack/manila-tempest-plugin,
10,863139,openstack/puppet-tripleo,
11,863140,openstack/tripleo-common,
12,864299,openstack/openstack-helm-infra,
...,...,...,...
197389,850182,openstack/charm-glance,0.078864
197635,787780,openstack/neutron,0.005588
197638,787963,openstack/neutron,0.005647
197640,787964,openstack/neutron,0.005708


In [22]:
# Spot check: os-brick changes created before one fixed timestamp.
os_brick_cutoff = "2022-11-15 14:26:55.000000000"
is_os_brick = df["project"] == "openstack/os-brick"
created_before_cutoff = df["created"] < os_brick_cutoff
df[is_os_brick & created_before_cutoff]

Unnamed: 0,id,project,branch,change_id,subject,status,created,updated,submitted,insertions,...,discussion_messages_count,reviewers_count,revisions_count,owner_account_id,owner_name,owner_username,is_owner_bot,commit_message,changed_files,files_count
102,openstack%2Fos-brick~master~Iba31b709ef0009891...,openstack/os-brick,master,Iba31b709ef0009891d0098600387e3fe6b1af26d,nit: correct spelling of Rescanning in debug log,MERGED,2022-09-24 18:48:34.000000000,2022-11-18 15:45:17.000000000,2022-11-18 15:44:14.000000000,1,...,18,4,1,27615,Rajat Dhasmana,whoami-rajat,0,nit: correct spelling of Rescanning in debug l...,['os_brick/initiator/linuxfc.py'],1
517,openstack%2Fos-brick~master~I9ab95a274f79bbcd8...,openstack/os-brick,master,I9ab95a274f79bbcd8a706662e9e03d0bdcb45654,Update metadata in setup.cfg,MERGED,2022-08-29 11:20:21.000000000,2022-11-09 11:50:30.000000000,2022-11-09 11:49:30.000000000,4,...,39,9,3,30615,Tushar Trambak Gite,tushargite96,0,Update metadata in setup.cfg\n\nwe are using s...,[],0
1074,openstack%2Fos-brick~stable%2Fwallaby~Ie373ab0...,openstack/os-brick,stable/wallaby,Ie373ab050dcc0a35c749d9a53b6cf5ca060bcb58,Fix encryption symlink issues,MERGED,2022-09-09 05:58:26.000000000,2022-10-28 16:27:06.000000000,2022-10-28 16:25:42.000000000,567,...,14,3,2,5689,Masayuki Igawa,igawa,0,Fix encryption symlink issues\n\nThis patch fi...,[],0
5526,openstack%2Fos-brick~master~I27a565b8ca250f679...,openstack/os-brick,master,I27a565b8ca250f6790061932ab000946dedd97c5,mypy: lvm.py,MERGED,2021-04-16 21:15:59.000000000,2022-08-27 22:51:46.000000000,2022-08-25 18:10:58.000000000,69,...,134,7,10,4523,Eric Harney,eharney,0,mypy: lvm.py\n\nChange-Id: I27a565b8ca250f6790...,[],0
5527,openstack%2Fos-brick~master~Id9a3bf8b1e8aea14c...,openstack/os-brick,master,Id9a3bf8b1e8aea14cd2cf1eae6907c9719aad3b0,mypy: initiator/linuxrbd,MERGED,2022-08-17 18:05:48.000000000,2022-08-27 20:56:09.000000000,2022-08-25 18:10:56.000000000,41,...,73,5,4,4523,Eric Harney,eharney,0,mypy: initiator/linuxrbd\n\nAdd type coverage ...,[],0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418659,openstack%2Fos-brick~master~I672497e819c3bffd7...,openstack/os-brick,master,I672497e819c3bffd71a0b7c6074c7a946eb74c91,Use target_portals/iqns/luns for alternative t...,MERGED,2015-02-23 21:36:48.000000000,2015-02-27 22:22:44.000000000,2015-02-27 22:22:42.000000000,33,...,10,4,2,9176,Tomoki Sekiyama,tsekiyama,0,Use target_portals/iqns/luns for alternative t...,[],0
419034,openstack%2Fos-brick~master~I1c714c03c162446b0...,openstack/os-brick,master,I1c714c03c162446b00f1df279df6962ddd95c507,Fix comments style according to Hacking rules,MERGED,2015-02-19 18:02:10.000000000,2015-02-26 00:33:35.000000000,2015-02-26 00:33:35.000000000,5,...,15,5,3,5997,Walt,walter-boring,0,Fix comments style according to Hacking rules\...,[],0
419666,openstack%2Fos-brick~master~Ic08d018d3b5f282ec...,openstack/os-brick,master,Ic08d018d3b5f282ece823971576043f76bad1b2d,Failover to alternative iSCSI portals on login...,MERGED,2015-02-18 23:10:42.000000000,2015-02-23 16:44:47.000000000,2015-02-23 16:44:46.000000000,107,...,7,3,1,9176,Tomoki Sekiyama,tsekiyama,0,Failover to alternative iSCSI portals on login...,"['os_brick/tests/initiator/test_connector.py',...",2
420119,openstack%2Fos-brick~master~Ia0fe0118207bbdc3c...,openstack/os-brick,master,Ia0fe0118207bbdc3cf698dfe09c0b71ebddd57f3,Update the documentation for os-brick,MERGED,2015-02-19 00:05:38.000000000,2015-02-19 22:53:59.000000000,2015-02-19 22:53:59.000000000,202,...,25,7,5,5997,Walt,walter-boring,0,Update the documentation for os-brick\n\nThis ...,[],0


In [7]:
# Larger and smaller of the two experience values of every pair.
exp_pair = df_all_dependencies[["Source_exp", "Target_exp"]]
df_all_dependencies["max"] = exp_pair.apply(lambda row: max(row.values), axis=1)
df_all_dependencies["min"] = exp_pair.apply(lambda row: min(row.values), axis=1)

# Ratio max/min for cross-project, same-developer pairs (0 when min is 0).
# `is_cross` may be boolean or already relabelled to "Cross"/"Within"; accept
# both so the filter does not silently select nothing.
ratio_is_cross = df_all_dependencies["is_cross"]
ratio_rows = ((ratio_is_cross == True) | (ratio_is_cross == 'Cross')) & (df_all_dependencies["is_same_dev"] == 'Same')
df_all_dependencies["max_min"] = df_all_dependencies.loc[ratio_rows, ["min", "max"]].apply(
    lambda row: row["max"] / row["min"] if row["min"] != 0 else 0, axis=1
)

In [93]:
# Churn ratio (see identify_change_size) for cross-project pairs owned by one developer.
change_size_rows = (df_all_dependencies["is_cross"] == True) & (df_all_dependencies["is_same_dev"] == "Same")
change_size_inputs = ["Source_exp", "Target_exp", "Source_churn", "Target_churn"]
df_all_dependencies["change_size"] = df_all_dependencies.loc[change_size_rows, change_size_inputs].apply(
    identify_change_size, axis=1
)

In [9]:
# Review duration of each endpoint, computed for cross-project pairs only;
# the other rows receive NaN through index alignment.
duration_rows = df_all_dependencies["is_cross"] == True
for side in ("Source", "Target"):
    df_all_dependencies[f"{side}_duration"] = df_all_dependencies.loc[duration_rows, side].map(compute_review_duration)

In [265]:
# Duration ratio (see identify_merge_duration) for cross-project pairs with different owners.
# NOTE(review): the experience cell of this notebook fills Source_exp/Target_exp
# only for same-developer rows, so they are presumably NaN here unless the
# loaded CSV already carried values - confirm.
duration_diff_rows = (df_all_dependencies["is_cross"] == True) & (df_all_dependencies["is_same_dev"] == "Different")
duration_diff_inputs = ["Source_exp", "Target_exp", "Source_duration", "Target_duration"]
df_all_dependencies["merge_duration_diff"] = df_all_dependencies.loc[duration_diff_rows, duration_diff_inputs].apply(
    identify_merge_duration, axis=1
)

In [285]:
# Revision count of each endpoint, for cross-project pairs only. One indexed
# lookup replaces a full scan of `df` per row: first occurrence wins (as
# `.values[0]` did) and a number missing from `df` raises KeyError. Rows outside
# the filter receive NaN through index alignment.
revisions_by_number = df.drop_duplicates(subset="number").set_index("number")["revisions_count"]
revision_rows = df_all_dependencies["is_cross"] == True
for side in ("Source", "Target"):
    endpoint_numbers = df_all_dependencies.loc[revision_rows, side]
    df_all_dependencies[f"{side}_revisions"] = pd.Series(
        revisions_by_number.loc[endpoint_numbers.to_numpy()].to_numpy(),
        index=endpoint_numbers.index,
    )

In [108]:
# Percentage of reviewers shared by both endpoints (see compute_intersection).
df_all_dependencies["intersect_rev"] = df_all_dependencies[["Source_reviewers", "Target_reviewers"]].apply(compute_intersection, axis=1)

In [171]:
# For each owner, flag whether they uploaded to, commented on, or rated the
# change at the OTHER end of the dependency.
for actor, other_change in (("source", "Target"), ("target", "Source")):
    owner_and_change = df_all_dependencies[[f"{actor.capitalize()}_developer", other_change]]
    for kind in ("UPLOAD", "COMMENT", "RATE"):
        df_all_dependencies[f"is_{actor}_{kind.lower()}"] = owner_and_change.apply(
            identify_review_kind, axis=1, args=(kind,)
        )

In [337]:
# For churn, duration and revisions, store the value belonging to the less and
# to the more experienced side of each pair.
for metric in ("churn", "duration", "revisions"):
    metric_inputs = ["Source_exp", "Target_exp", f"Source_{metric}", f"Target_{metric}"]
    for suffix, mode in (("less", "LESS"), ("more", "MORE")):
        df_all_dependencies[f"{metric.capitalize()}_{suffix}"] = df_all_dependencies[metric_inputs].apply(
            identify_less_more_exp_metrics, axis=1, args=(mode,)
        )

In [7]:
# Hours between the creation of the two endpoints (this repeats the earlier
# `lag` cell). The last 10 characters of each timestamp are dropped so the
# remainder parses with time_diff's format. Columns are read by label: integer
# keys on a label-indexed row rely on pandas' deprecated positional fallback.
df_all_dependencies["lag"] = df_all_dependencies[["Source_datetime", "Target_datetime"]].apply(
    lambda row: time_diff(row["Source_datetime"][:-10], row["Target_datetime"][:-10]),
    axis=1,
)

In [11]:
# Merged, human-owned, cross-project pairs with different owners where one owner
# is listed as reviewer of the other change but neither owner rated or commented.
cross_different_owner = (df_all_dependencies["is_cross"] == True) & (df_all_dependencies["is_same_dev"] == "Different")
both_merged = (df_all_dependencies["Source_status"] == "MERGED") & (df_all_dependencies["Target_status"] == "MERGED")
no_bot_owner = (df_all_dependencies["is_source_bot"] == False) & (df_all_dependencies["is_target_bot"] == False)
no_rating = (df_all_dependencies["is_source_rate"] == False) & (df_all_dependencies["is_target_rate"] == False)
no_comment = (df_all_dependencies["is_source_comment"] == False) & (df_all_dependencies["is_target_comment"] == False)
listed_as_reviewer = (df_all_dependencies["Source_rev_target"] == True) | (df_all_dependencies["Target_rev_source"] == True)

inspected_columns = [
    "Source", "Target", "Source_developer", "Target_developer",
    "Source_rev_target", "Target_rev_source",
    "is_source_rate", "is_target_rate", "is_source_comment", "is_target_comment",
]
df_all_dependencies.loc[
    cross_different_owner & both_merged & no_bot_owner & no_rating & no_comment & listed_as_reviewer,
    inspected_columns,
]

Unnamed: 0,Source,Target,Source_developer,Target_developer,Source_rev_target,Target_rev_source,is_source_rate,is_target_rate,is_source_comment,is_target_comment
1103,831935,844680,9708,29870,True,False,False,False,False,False
1415,835073,834176,16515,34647,True,False,False,False,False,False
1446,803761,823645,32926,16515,True,True,False,False,False,False
1683,835425,835249,29775,32926,True,False,False,False,False,False
1735,835475,835974,8367,14611,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
193684,98882,99345,1561,9656,True,False,False,False,False,False
193686,98882,99348,1561,9656,True,False,False,False,False,False
194883,415607,450203,9414,1004,True,False,False,False,False,False
194972,7889,9906,1561,1247,True,True,False,False,False,False


In [9]:
def _flag_to_label(value, when_true, when_false):
    """Turn a boolean flag into its text label; values that are already text pass through.

    `is_same_dev` and `is_cross_team` are created as strings earlier in this
    notebook, and all three columns are strings after a reload from the saved
    CSV. Mapping those through `x == True` rewrote every row to the negative
    label, and running the cell twice did the same to `is_cross`.
    """
    if isinstance(value, str):
        return value
    return when_true if value == True else when_false


df_all_dependencies["is_same_dev"] = df_all_dependencies["is_same_dev"].map(lambda x: _flag_to_label(x, "Same", "Different"))
df_all_dependencies["is_cross"] = df_all_dependencies["is_cross"].map(lambda x: _flag_to_label(x, "Cross", "Within"))
df_all_dependencies["is_cross_team"] = df_all_dependencies["is_cross_team"].map(lambda x: _flag_to_label(x, "Cross", "Within"))

In [10]:
# Rows kept in the summary: cross-project pairs whose two changes are merged and
# owned by humans. By this point `is_cross` has been relabelled to
# "Cross"/"Within", so comparing with True alone selects nothing; accept both encodings.
summary_is_cross = df_all_dependencies["is_cross"]
summary_rows = (
    ((summary_is_cross == True) | (summary_is_cross == "Cross"))
    & (df_all_dependencies["Source_status"] == "MERGED")
    & (df_all_dependencies["Target_status"] == "MERGED")
    & (df_all_dependencies["is_source_bot"] == False)
    & (df_all_dependencies["is_target_bot"] == False)
)


def _summarize_side(side):
    """Return the per-change summary of one endpoint ("Source" or "Target").

    NOTE(review): `Label` is not created anywhere in the visible notebook -
    presumably it comes from the loaded CSV; confirm.
    """
    columns = [side, "is_same_dev", f"{side}_churn", f"{side}_revisions", f"{side}_duration", "Label"]
    # Work on an explicit copy so adding a column never writes into a slice of
    # df_all_dependencies.
    summary = df_all_dependencies.loc[summary_rows, columns].copy()
    summary["Project_type"] = side
    return summary.rename(columns={
        side: "Number",
        f"{side}_churn": "Churn",
        f"{side}_revisions": "Revisions",
        f"{side}_duration": "Duration",
    })


df_all_dependencies_source = _summarize_side("Source")
df_all_dependencies_target = _summarize_side("Target")

In [8]:
# Persist the enriched table. NOTE: this overwrites the same file the notebook
# loads df_all_dependencies from, so a later run starts from the enriched columns.
df_all_dependencies.to_csv("./Files/all_dependencies.csv", index=False)

In [12]:
# Stack the source-side and target-side summaries into one long table
# (the original row index of each part is kept).
df_all_dependencies_summarized = pd.concat((df_all_dependencies_source, df_all_dependencies_target))

In [13]:
# Write the stacked summary to disk without the index column.
df_all_dependencies_summarized.to_csv("./Files/all_dependencies_summarized.csv", index=False)

In [318]:
# Median churn, duration and revisions of target-side changes whose pair does
# not have the "Same" developer label.
target_side = df_all_dependencies_summarized["Project_type"] == "Target"
not_same_developer = df_all_dependencies_summarized["is_same_dev"] != "Same"
df_all_dependencies_summarized.loc[target_side & not_same_developer, ["Churn", "Duration", "Revisions"]].median()

Churn        10.00
Duration      2.12
Revisions     2.00
dtype: float64

In [5]:
# Load the dependency chains; `Path` is stored as text and parsed back into
# Python objects (literals only).
extended_paths_number = pd.read_csv("./Files/Number/extended_merged.csv")
extended_paths_number["Path"] = extended_paths_number.Path.map(ast.literal_eval)

In [6]:
# Distinct values appearing in any `Path`, flattened with the project helper.
depended_changes = set(hpr.flatten_list(extended_paths_number.Path.values))
# Disabled: reload of the dependency table from disk.
# all_dependencies = pd.read_csv("./Files/all_dependencies.csv")

#### Number of cross/within-project changes

In [36]:
# Changes taking part in at least one "Cross" dependency, and the remaining
# depended changes. (Scratch note kept from the original cell: *100/ 165033)
cross_project_pairs = df_all_dependencies.loc[df_all_dependencies["is_cross"] == "Cross", ["Source", "Target"]].values
cross_project_changes = list(set(hpr.flatten_list(cross_project_pairs)))
single_project_changes = list(depended_changes.difference(cross_project_changes))

#### Number of cross- and single-service dependent changes

In [6]:
# Number of distinct changes that appear in at least one "Cross" dependency.
cross_endpoint_pairs = df_all_dependencies.loc[df_all_dependencies["is_cross"] == "Cross", ["Source", "Target"]].values
len(set(hpr.flatten_list(cross_endpoint_pairs)))

52069

In [121]:
# Changes in at least one cross-team dependency, and changes that only ever
# appear in within-team dependencies.
cross_team_pairs = df_all_dependencies.loc[df_all_dependencies['is_cross_team'] == 'Cross', ['Source', 'Target']]
within_team_pairs = df_all_dependencies.loc[df_all_dependencies['is_cross_team'] == 'Within', ['Source', 'Target']]
cross_team_changes = set(hpr.flatten_list(cross_team_pairs.values.tolist()))
within_team_changes = set(hpr.flatten_list(within_team_pairs.values.tolist()))
within_team_changes = within_team_changes - cross_team_changes

### How many changes are related to co-changing components?

In [136]:
def compute_array_length(row):
    """Return how many distinct entries `row["Path"]` contains."""
    distinct_entries = set(row["Path"])
    return len(distinct_entries)

def compute_teams(row):
    """Return the number of distinct teams behind the changes listed in `row["Path"]`.

    Change numbers are resolved to projects through `df`, then projects to
    teams through `inv_pro_team`.
    """
    on_path = df["number"].isin(row["Path"])
    path_projects = df.loc[on_path, "project"].unique()
    owned_by_team = inv_pro_team['project'].isin(path_projects)
    return inv_pro_team.loc[owned_by_team, 'team'].nunique()

In [134]:
def _distinct_projects(path):
    """Number of distinct `project` values among the `df` rows whose `number` is in `path`."""
    return df.loc[df["number"].isin(path), "project"].nunique()


# Reload the chains, parse `Path`, and count the projects each chain touches.
extended_paths_number = pd.read_csv("./Files/Number/extended_merged.csv")

extended_paths_number["Path"] = extended_paths_number["Path"].apply(ast.literal_eval)
extended_paths_number["length"] = extended_paths_number["Path"].map(_distinct_projects)

In [139]:
# Number of distinct teams behind each chain (see compute_teams).
extended_paths_number["team_size"] = extended_paths_number.apply(compute_teams, axis=1)

In [201]:
# Export only the per-chain project counts, without the index column.
extended_paths_number["length"].to_csv("./RQs/RQ1/Files/component_chains.csv", index=False)

In [157]:
# Export the chains that involve more than one team, without the index column.
extended_paths_number[extended_paths_number["team_size"]>1].to_csv("./RQs/RQ1/Files/team_chains.csv", index=False)