In [1]:
import pandas as pd
import numpy as np
import collections
from datetime import datetime
from bs4 import BeautifulSoup
import requests as rq
from collections import OrderedDict
import ast
import utils.helpers as hpr
import re

In [30]:
def combine_openstack_data():
    '''Combine generated csv files into a single DataFrame object
    '''
    df = pd.DataFrame([])
    data_path = "%sChanges/" % hpr.DIR
    changes_file_names = hpr.list_file(data_path)
    for f in changes_file_names:
        df_per_file = pd.read_csv("%s%s" % (data_path, f))
        df = pd.concat((df, df_per_file))

    df = df.drop_duplicates(subset=["number"])

    df = df.sort_values(by="updated", ascending=False).reset_index(drop=True)

    return df

In [31]:
df = combine_openstack_data()

In [5]:
df["messages"] = df["messages"].map(ast.literal_eval)

In [79]:
df["reviewers"] = df["reviewers"].apply(ast.literal_eval)

In [260]:
df["revisions"] = df["revisions"].apply(ast.literal_eval)

In [80]:
df["reviewers_ids"] = df["reviewers"].map(lambda revs: [rev["_account_id"] for rev in revs if "tags" not in rev.keys()])

### Functions

In [80]:
def is_cross_service(row):
    if (row["Source_service"] == "None") or (row["Target_service"] == "None"):
        return None
    if row["Source_service"] != row["Target_service"]:
        return "Cross"
    return "Within"

def retrieve_reviewers(number):
    return df.loc[df["number"]==number, "reviewers_ids"].values[0]

def is_same_developer(row):
    return "Same" if row["Source_developer"] == row["Target_developer"] else "Different"

def count_changes(row):
    return len(df[(df["owner_account_id"] == row[0])
       & (df["project"] == row[2]) &
       (df["created"] < row[1])])

def identify_change_size(row):
    if row["Source_exp"] > row["Target_exp"]:
        return row["Source_churn"] / row["Target_churn"] if row["Target_churn"] != 0 else row["Source_churn"]
    return row["Target_churn"] / row["Source_churn"] if row["Source_churn"] != 0 else row["Target_churn"]

def identify_merge_duration(row):
    if row["Source_exp"] > row["Target_exp"]:
        return row["Source_duration"] / row["Target_duration"] if row["Target_duration"] != 0 else row["Source_duration"]
    return row["Target_duration"] / row["Source_duration"] if row["Source_duration"] != 0 else row["Target_duration"]

def time_diff(start, end):
    if start > end:
        start, end = end, start
    current_date =  datetime.strptime(end, "%Y-%m-%d %H:%M:%S") 
    previous_date = datetime.strptime(start, "%Y-%m-%d %H:%M:%S") 
    diff = current_date - previous_date
    diff = float("{:.2f}".format(diff.total_seconds() / 3600))
    return diff

def compute_review_duration(nbr):
    messages = df.loc[df.number==nbr, "messages"].values[0]
    messages.sort(key=lambda x: x["date"])
    if len(messages) >1:
        return time_diff(messages[0]["date"][:-10], messages[-1]["date"][:-10])
    return 0

def compute_time_diff(revisions):
    if len(revisions) == 1:
        return 0
    dates = [rev["created"][:-10] for rev in revisions]
    dates.sort()
    return time_diff(dates[0], dates[-1])

def compute_intersection(row):
    all_devs = len(set(row["Source_reviewers"]).union(row["Target_reviewers"]))
    return 0 if all_devs == 0 else len(set(row["Source_reviewers"]).intersection(row["Target_reviewers"])) * 100 / all_devs

def identify_review_kind(row, review_type):
    messages = df.loc[df["number"]==row[1], "messages"].values[0]
    if review_type == "UPLOAD":
        for msg in messages:
            if msg["account_id"] == row[0] and msg["content"].startswith("Upload patch set"):
                return True
    elif review_type == "COMMENT":
        for msg in messages:
            if msg["account_id"] == row[0] and msg["content"].startswith("Patch Set") and msg["content"].endswith("comment)"):
                return True
    elif review_type == "RATE":
        rating_labels = ["Workflow-", "Code-Review-", "Verified-", "Review-Priority"]
        for msg in messages:
            if msg["account_id"] == row[0] and re.search(r"Patch Set [\d+]: Code\-Review|Verified|Workflow|Review\-Priority[\-|\+]*", msg["content"]):
                return True
    return False

def identify_less_more_exp_metrics(row, metric_type):
    if row["Source_exp"] == row["Target_exp"]:
        return None
    
    if metric_type == "LESS":
        if row["Source_exp"] < row["Target_exp"]:
            return row[2]
        return row[3]
    
    if row["Source_exp"] > row["Target_exp"]:
            return row[2]
    return row[3]

In [261]:
df["merge_duration"] = df["revisions"].map(compute_time_diff)

### Count the number of cross ans single component dependecies ?

In [4]:
df_depends_needed = pd.read_csv("./Files/source_target_evolution.csv")
df_related_bug = pd.read_csv("./Files/Number/related_bug.csv")
df_related_bug["Path"] = df_related_bug["Path"].map(ast.literal_eval)
df_change_id = pd.read_csv("./Files/Number/change_id.csv")
df_change_id["Path"] = df_change_id["Path"].map(ast.literal_eval)

#### Cross and single-component dependencies

In [12]:
def build_source_target(chains):
    result = []
    for chain in chains:
        for path in chain:
            treated_items = []
            path.sort()
            for i in range(len(path)):
                numberA = path[i]
                for j in range(len(path)):
                    numberB = path[j]
                    if (numberA != numberB) and (numberB not in treated_items):
                        itemA = df.loc[df["number"] == numberA, ["project", "created"]].values[0]
                        itemB = df.loc[df["number"] == numberB, ["project", "created"]].values[0]
                        if itemA[1] > itemB[1]:
                            itemA, itemB = itemB, itemA
                        source_repo = itemA[0]
                        target_repo = itemB[0]
                        result.append({"Source": numberA, "Target": numberB, "Source_repo": source_repo, "Target_repo": target_repo})
                treated_items.append(numberA)
    return result

In [19]:
source_target_related_bug = build_source_target([df_related_bug["Path"].values])
source_target_change_id = build_source_target([df_change_id["Path"].values])

In [34]:
df_depends_needed["Label"] = "D-O/N-B"
df_source_target_related_bug = pd.DataFrame(source_target_related_bug)
df_source_target_related_bug["Label"] = "R-B"
df_source_target_related_bug.drop_duplicates(inplace=True)
df_source_target_change_id = pd.DataFrame(source_target_change_id)
df_source_target_change_id["Label"] = "C-I"

In [2]:
df_all_dependencies = pd.concat((df_depends_needed, df_source_target_related_bug, df_source_target_change_id))

In [3]:
len(df_all_dependencies)

197754

In [41]:
df_all_dependencies.drop_duplicates(subset=["Source", "Target"], inplace=True, keep="first")
df_all_dependencies.reset_index(drop=True, inplace=True)

In [42]:
df_all_dependencies["is_cross"] = df_all_dependencies.apply(lambda row: True if row["Source_repo"]!=row["Target_repo"] else False, axis=1)

In [70]:
request = rq.get("https://releases.openstack.org/")
soup = BeautifulSoup(request.text, 'html.parser')

tbody = soup.find("tbody")
openstack_releases = {}

tr_list = tbody.find_all("tr")
for i in range(len(tr_list)):
    tr = tr_list[i]
    td_list = tr.find_all("td")
    initial_release = td_list[2].text[:10]
    release_name = td_list[0].select_one('span', {"class": "doc"}).text.lower()
    openstack_releases.update({initial_release: release_name})

openstack_releases_keys = list(openstack_releases.keys())
openstack_releases_keys.sort()
openstack_releases = {i: openstack_releases[i] for i in openstack_releases_keys}

online_repositories = pd.read_csv("./all_os_components.csv")
online_repositories["related_projects"] = online_repositories["related_projects"].apply(ast.literal_eval)
online_repositories = dict(zip(online_repositories["main_project"].values.reshape(-1), online_repositories["related_projects"].values))

def invert_projects_services():
    return {"openstack/%s" % p:service for service, projects in online_repositories.items() for p in projects}

def retrieve_release(nbr):
    change = df.loc[df["number"]==nbr, ["created", "branch"]]
    branch = change["branch"].values[0]
    if branch.startswith("stable/") and branch.split("/")[1] in openstack_releases.values():
        return branch.split("/")[1]
    
    date = change["created"].values[0][:-19]
    keys = openstack_releases.keys()
    for k in keys:
        if date <= k:
            return openstack_releases.get(k)
    return None

In [71]:
project_services_inv = invert_projects_services()

In [72]:
df_all_dependencies["Source_service"] = df_all_dependencies["Source_repo"].map(lambda src_repo: project_services_inv.get(src_repo))
df_all_dependencies["Target_service"] = df_all_dependencies["Target_repo"].map(lambda trgt_repo: project_services_inv.get(trgt_repo))

In [79]:
df_all_dependencies.loc[df_all_dependencies["Source_service"].isnull(), "Source_service"] = "None"
df_all_dependencies.loc[df_all_dependencies["Target_service"].isnull(), "Target_service"] = "None"

In [52]:
df_all_dependencies["Source_status"] = df_all_dependencies["Source"].map(lambda x: df.loc[df["number"]==x, "status"].values[0])
df_all_dependencies["Target_status"] = df_all_dependencies["Target"].map(lambda x: df.loc[df["number"]==x, "status"].values[0])

In [53]:
df_all_dependencies["Source_datetime"] = df_all_dependencies["Source"].map(lambda id: df.loc[df["number"]==id, "created"].values[0])
df_all_dependencies["Target_datetime"] = df_all_dependencies["Target"].map(lambda id: df.loc[df["number"]==id, "created"].values[0])

In [81]:
df_all_dependencies["is_cross_service"] = df_all_dependencies[["Source_service", "Target_service"]].apply(is_cross_service, axis=1)

In [74]:
df_all_dependencies["lag"] = df_all_dependencies[["Source_datetime", "Target_datetime"]].apply(lambda row: time_diff(row[0][0:-10], row[1][0:-10]), axis=1)

In [75]:
df_all_dependencies["Source_release"] = df_all_dependencies["Source"].map(retrieve_release)
df_all_dependencies["Target_release"] = df_all_dependencies["Target"].map(retrieve_release)

In [76]:
df_all_dependencies["is_source_bot"] = df_all_dependencies["Source"].map(lambda x: df.loc[df["number"]==x,"is_owner_bot"].values[0])
df_all_dependencies["is_target_bot"] = df_all_dependencies["Target"].map(lambda x: df.loc[df["number"]==x,"is_owner_bot"].values[0])

In [77]:
df_all_dependencies["Source_developer"] = df_all_dependencies["Source"].map(lambda x: df.loc[df["number"]==x,"owner_account_id"].values[0])
df_all_dependencies["Target_developer"] = df_all_dependencies["Target"].map(lambda x: df.loc[df["number"]==x,"owner_account_id"].values[0])

In [81]:
df_all_dependencies["Source_reviewers"] = df_all_dependencies["Source"].map(retrieve_reviewers)
df_all_dependencies["Target_reviewers"] = df_all_dependencies["Target"].map(retrieve_reviewers)

In [82]:
df_all_dependencies["Source_rev_target"] = df_all_dependencies[["Source_developer", "Target_reviewers"]].apply(lambda row: True if row["Source_developer"] in row["Target_reviewers"] else False, axis=1)
df_all_dependencies["Target_rev_source"] = df_all_dependencies[["Target_developer", "Source_reviewers"]].apply(lambda row: True if row["Target_developer"] in row["Source_reviewers"] else False, axis=1)

In [85]:
df_all_dependencies["is_same_dev"] = df_all_dependencies.apply(is_same_developer, axis=1)

In [301]:
df_all_dependencies["Source_churn"] = df_all_dependencies["Source"].map(lambda nbr: df.loc[df.number == nbr, ["insertions", "deletions"]].values[0].sum())
df_all_dependencies["Target_churn"] = df_all_dependencies["Target"].map(lambda nbr: df.loc[df.number == nbr, ["insertions", "deletions"]].values[0].sum())

In [90]:
df_all_dependencies["Source_exp"] = df_all_dependencies.loc[(df_all_dependencies["is_cross"]==True)&(df_all_dependencies["is_same_dev"]=="Same"), ["Source_developer", "Source_datetime", "Source_repo"]].apply(count_changes, axis=1)
df_all_dependencies["Target_exp"] = df_all_dependencies.loc[(df_all_dependencies["is_cross"]==True)&(df_all_dependencies["is_same_dev"]=="Same"), ["Target_developer", "Target_datetime", "Target_repo"]].apply(count_changes, axis=1)

In [322]:
df_all_dependencies.loc[(df_all_dependencies["is_cross"]==True)&(df_all_dependencies["is_same_dev"]=="Same"), ["Source_exp"]]

Unnamed: 0,Source_exp
5,0.0
9,26.0
10,60.0
11,29.0
12,2.0
...,...
197389,28.0
197635,114.0
197638,115.0
197640,116.0


In [92]:
df_all_dependencies["max"] = df_all_dependencies[["Source_exp", "Target_exp"]].apply(lambda row: max(row.values), axis=1)
df_all_dependencies["min"] = df_all_dependencies[["Source_exp", "Target_exp"]].apply(lambda row: min(row.values), axis=1)
df_all_dependencies["max_min"] = df_all_dependencies.loc[(df_all_dependencies["is_cross"]==True)&(df_all_dependencies["is_same_dev"]==True), ["min", "max"]].apply(lambda row: row["max"]/row["min"] if row["min"] != 0 else 0, axis=1)

In [93]:
df_all_dependencies["change_size"] = df_all_dependencies.loc[(df_all_dependencies["is_cross"]==True)&(df_all_dependencies["is_same_dev"]=="Same"),["Source_exp", "Target_exp", "Source_churn", "Target_churn"]].apply(identify_change_size, axis=1)

In [9]:
df_all_dependencies["Source_duration"] = df_all_dependencies.loc[(df_all_dependencies["is_cross"]==True), "Source"].map(compute_review_duration)
df_all_dependencies["Target_duration"] = df_all_dependencies.loc[(df_all_dependencies["is_cross"]==True), "Target"].map(compute_review_duration)

In [265]:
df_all_dependencies["merge_duration_diff"] = df_all_dependencies.loc[(df_all_dependencies["is_cross"]==True)&(df_all_dependencies["is_same_dev"]=="Different"),["Source_exp", "Target_exp", "Source_duration", "Target_duration"]].apply(identify_merge_duration, axis=1)

In [285]:
df_all_dependencies["Source_revisions"] = df_all_dependencies.loc[(df_all_dependencies["is_cross"]==True), "Source"].map(lambda nbr: df.loc[df["number"]==nbr,"revisions_count"].values[0])
df_all_dependencies["Target_revisions"] = df_all_dependencies.loc[(df_all_dependencies["is_cross"]==True), "Target"].map(lambda nbr: df.loc[df["number"]==nbr,"revisions_count"].values[0])

In [108]:
df_all_dependencies["intersect_rev"] = df_all_dependencies[["Source_reviewers", "Target_reviewers"]].apply(compute_intersection, axis=1)

In [171]:
df_all_dependencies["is_source_upload"] = df_all_dependencies[["Source_developer", "Target"]].apply(identify_review_kind, axis=1, args=("UPLOAD",))
df_all_dependencies["is_source_comment"] = df_all_dependencies[["Source_developer", "Target"]].apply(identify_review_kind, axis=1, args=("COMMENT",))
df_all_dependencies["is_source_rate"] = df_all_dependencies[["Source_developer", "Target"]].apply(identify_review_kind, axis=1, args=("RATE",))

df_all_dependencies["is_target_upload"] = df_all_dependencies[["Target_developer", "Source"]].apply(identify_review_kind, axis=1, args=("UPLOAD",))
df_all_dependencies["is_target_comment"] = df_all_dependencies[["Target_developer", "Source"]].apply(identify_review_kind, axis=1, args=("COMMENT",))
df_all_dependencies["is_target_rate"] = df_all_dependencies[["Target_developer", "Source"]].apply(identify_review_kind, axis=1, args=("RATE",))

In [337]:
df_all_dependencies["Churn_less"] = df_all_dependencies[["Source_exp", "Target_exp", "Source_churn", "Target_churn"]].apply(identify_less_more_exp_metrics, axis=1, args=("LESS",))
df_all_dependencies["Churn_more"] = df_all_dependencies[["Source_exp", "Target_exp", "Source_churn", "Target_churn"]].apply(identify_less_more_exp_metrics, axis=1, args=("MORE",))
df_all_dependencies["Duration_less"] = df_all_dependencies[["Source_exp", "Target_exp", "Source_duration", "Target_duration"]].apply(identify_less_more_exp_metrics, axis=1, args=("LESS",))
df_all_dependencies["Duration_more"] = df_all_dependencies[["Source_exp", "Target_exp", "Source_duration", "Target_duration"]].apply(identify_less_more_exp_metrics, axis=1, args=("MORE",))
df_all_dependencies["Revisions_less"] = df_all_dependencies[["Source_exp", "Target_exp", "Source_revisions", "Target_revisions"]].apply(identify_less_more_exp_metrics, axis=1, args=("LESS",))
df_all_dependencies["Revisions_more"] = df_all_dependencies[["Source_exp", "Target_exp", "Source_revisions", "Target_revisions"]].apply(identify_less_more_exp_metrics, axis=1, args=("MORE",))

In [7]:
df_all_dependencies["lag"] = df_all_dependencies[["Source_datetime", "Target_datetime"]].apply(lambda row: time_diff(row[0][0:-10], row[1][0:-10]), axis=1)

In [11]:
df_all_dependencies.loc[(df_all_dependencies["is_cross"] == True) &
                    (df_all_dependencies["is_same_dev"] == "Different") &
                    (df_all_dependencies["Source_status"] == "MERGED") &
                    (df_all_dependencies["Target_status"] == "MERGED") &
                    (df_all_dependencies["is_source_bot"] == False) &
                    (df_all_dependencies["is_target_bot"] == False) &
                    (df_all_dependencies["is_source_rate"] == False) &
                    (df_all_dependencies["is_target_rate"] == False) &
                    (df_all_dependencies["is_source_comment"] == False) &
                    (df_all_dependencies["is_target_comment"] == False) &
                    ((df_all_dependencies["Source_rev_target"] == True)| 
                     (df_all_dependencies["Target_rev_source"] == True)), ["Source", "Target", "Source_developer", "Target_developer", "Source_rev_target", "Target_rev_source", "is_source_rate", "is_target_rate", "is_source_comment", "is_target_comment"]]

Unnamed: 0,Source,Target,Source_developer,Target_developer,Source_rev_target,Target_rev_source,is_source_rate,is_target_rate,is_source_comment,is_target_comment
1103,831935,844680,9708,29870,True,False,False,False,False,False
1415,835073,834176,16515,34647,True,False,False,False,False,False
1446,803761,823645,32926,16515,True,True,False,False,False,False
1683,835425,835249,29775,32926,True,False,False,False,False,False
1735,835475,835974,8367,14611,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
193684,98882,99345,1561,9656,True,False,False,False,False,False
193686,98882,99348,1561,9656,True,False,False,False,False,False
194883,415607,450203,9414,1004,True,False,False,False,False,False
194972,7889,9906,1561,1247,True,True,False,False,False,False


In [9]:
df_all_dependencies["is_same_dev"] = df_all_dependencies["is_same_dev"].map(lambda x: "Same" if x == True else "Different")
df_all_dependencies["is_cross"] = df_all_dependencies["is_cross"].map(lambda x: "Cross" if x == True else "Within")
df_all_dependencies["is_cross_service"] = df_all_dependencies["is_cross_service"].map(lambda x: "Cross" if x == True else "Within")

In [10]:
df_all_dependencies_source = df_all_dependencies.loc[
    (df_all_dependencies["is_cross"] == True) &
    (df_all_dependencies["Source_status"] == "MERGED") &
    (df_all_dependencies["Target_status"] == "MERGED") &
    (df_all_dependencies["is_source_bot"] == False) &
    (df_all_dependencies["is_target_bot"] == False), ["Source", "is_same_dev", "Source_churn", "Source_revisions", "Source_duration", "Label"]]
df_all_dependencies_source["Project_type"] = "Source"
df_all_dependencies_source.rename(columns={"Source": "Number", "Source_churn": "Churn", "Source_revisions": "Revisions", "Source_duration": "Duration"}, inplace=True)

df_all_dependencies_target = df_all_dependencies.loc[
    (df_all_dependencies["is_cross"] == True) &
    (df_all_dependencies["Source_status"] == "MERGED") &
    (df_all_dependencies["Target_status"] == "MERGED") &
    (df_all_dependencies["is_source_bot"] == False) &
    (df_all_dependencies["is_target_bot"] == False) , ["Target", "is_same_dev", "Target_churn", "Target_revisions", "Target_duration", "Label"]]
df_all_dependencies_target["Project_type"] = "Target"
df_all_dependencies_target.rename(columns={"Target": "Number", "Target_churn": "Churn", "Target_revisions": "Revisions", "Target_duration": "Duration"}, inplace=True)

In [11]:
df_all_dependencies.to_csv("./Files/all_dependencies.csv", index=False)

In [12]:
df_all_dependencies_summarized = pd.concat((df_all_dependencies_source, df_all_dependencies_target))

In [13]:
df_all_dependencies_summarized.to_csv("./Files/all_dependencies_summarized.csv", index=False)

In [318]:
df_all_dependencies_summarized.loc[(df_all_dependencies_summarized["Project_type"]=="Target")&(df_all_dependencies_summarized["is_same_dev"]!="Same"), ["Churn", "Duration", "Revisions"]].median()

Churn        10.00
Duration      2.12
Revisions     2.00
dtype: float64

In [5]:
extended_paths_number = pd.read_csv("./Files/Number/extended_merged.csv")
extended_paths_number["Path"] = extended_paths_number.Path.map(ast.literal_eval)

In [6]:
depended_changes = set(hpr.flatten_list(extended_paths_number.Path.values))
# all_dependencies = pd.read_csv("./Files/all_dependencies.csv")

#### Number of cross/within-project changes

In [36]:
cross_project_changes = list(set(hpr.flatten_list(df_all_dependencies.loc[df_all_dependencies["is_cross"]=="Cross", ["Source", "Target"]].values)))# *100/ 165033
single_project_changes = list(depended_changes.difference(cross_project_changes))

#### Number of cross/within-service changes

In [105]:
cross_service_changes = list(set(hpr.flatten_list(df_all_dependencies.loc[df_all_dependencies["is_cross_service"]=="Cross", ["Source", "Target"]].values)))# *100/ 165033
single_service_changes = list(set(hpr.flatten_list(df_all_dependencies.loc[df_all_dependencies["is_cross_service"]=="Within", ["Source", "Target"]].values)).difference(cross_project_changes))

In [107]:
len(cross_service_changes)

15628

In [101]:
cross_project_change_abandoned = 0
for change in cross_project_changes:
    status = df.loc[df["number"]==change, "status"].values[0]
    if status == "ABANDONED":
        cross_project_change_abandoned += 1

single_project_change_abandoned = 0
for change in single_project_changes:
    status = df.loc[df["number"]==change, "status"].values[0]
    if status == "ABANDONED":
        single_project_change_abandoned += 1

In [108]:
cross_service_change_abandoned = 0
for change in cross_service_changes:
    status = df.loc[df["number"]==change, "status"].values[0]
    if status == "ABANDONED":
        cross_service_change_abandoned += 1

single_service_change_abandoned = 0
for change in single_service_changes:
    status = df.loc[df["number"]==change, "status"].values[0]
    if status == "ABANDONED":
        single_service_change_abandoned += 1

### % of abndoned cross-project/service dependent changes

##### Within/cross-project

In [110]:
cross_project_change_abandoned/len(cross_project_changes)

0.2079356238836928

In [None]:
single_project_change_abandoned/len(single_project_changes)

##### Within/cross-service

In [109]:
cross_service_change_abandoned/len(cross_service_changes)

0.2595341694394676

In [112]:
single_service_change_abandoned/len(single_service_changes)

0.07796035739960974

##### Ordinary changes

In [None]:
abandoned_ordinary_change_numbers = df.loc[(~df["number"].isin(depended_changes))&(df["status"]=="ABANDONED"), "number"].values
ordinary_change_numbers = df.loc[(~df["number"].isin(depended_changes)), "number"].values

len(abandoned_ordinary_change_numbers)/len(ordinary_change_numbers)

#### the # of cross and single-service dependent changes

In [12]:
cross_single_service_changes_count = {"cross":0, "single": 0}

In [None]:
for dc in depended_changes:
    services = set(
        hpr.flatten_list(
            df_all_dependencies.loc[(df_all_dependencies["is_cross_service"].notnull()) &
                                 ((df_all_dependencies["Source"] == dc) |
                                  (df_all_dependencies["Target"] == dc)),
                                 ["Source_service", "Target_service"]].values))
    if len(services) == 1:
        cross_single_service_changes_count["single"] += 1
    elif len(services) > 1:
        cross_single_service_changes_count["cross"] += 1

In [27]:
len(set(hpr.flatten_list(
            df_all_dependencies.loc[df_all_dependencies["is_cross"]=="Cross",
                                 ["Source", "Target"]].values)))

52069

In [420]:
cross_single_service_changes_count

{'cross': 15617, 'single': 123849}

In [4]:
15617/(15617+123849)

0.11197711270130355

In [429]:
len(all_dependencies[all_dependencies["Service_source"].notnull()&all_dependencies["Service_target"].notnull()&(all_dependencies["Service_source"]!=all_dependencies["Service_target"])])

28092

#### Cross and single-service dependencies

In [63]:
len(null_services_dependencies)*100/(len(services_dependencies)+len(null_services_dependencies))

7.8160025984122

In [64]:
services_dependencies["is_cross"] = services_dependencies.apply(lambda row: True if row["Service_source"]!=row["Service_target"] else False, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  services_dependencies["is_cross"] = services_dependencies.apply(lambda row: True if row["Service_source"]!=row["Service_target"] else False, axis=1)


In [67]:
services_dependencies_count = {"cross": 0, "single": 0}
services_dependencies_count["cross"] = len(services_dependencies[services_dependencies["is_cross"]==True])
services_dependencies_count["single"] = len(services_dependencies[services_dependencies["is_cross"]==False])

In [68]:
print("Cross-service dependencies: %.2f " %(services_dependencies_count["cross"]*100/(services_dependencies_count["cross"]+services_dependencies_count["single"])))
print("Single-service dependencies: %.2f " %(services_dependencies_count["single"]*100/(services_dependencies_count["cross"]+services_dependencies_count["single"])))

Cross-service dependencies: 16.23 
Single-service dependencies: 83.77 


In [71]:
services_dependencies_count["cross"]*2

56184

### How many changes are related to co-changing components ?

In [26]:
online_repositories = pd.read_csv("./all_os_components.csv")

online_repositories["related_projects"] = online_repositories["related_projects"].apply(ast.literal_eval)

online_repositories = dict(zip(online_repositories["main_project"].values.reshape(-1), online_repositories["related_projects"].values))

def invert_projects_services():
    return {"openstack/%s" % p:service for service, projects in online_repositories.items() for p in projects}

project_services_inv = invert_projects_services()

In [30]:
def compute_array_length(row):
    return len(set(row["Path"]))

def compute_array_length_services(row):
    projects = df.loc[df["number"].isin(row["Path"]), "project"].unique()
    services = [project_services_inv.get(p) for p in projects]
    return len(set(services))

In [441]:
extended_paths_number = pd.read_csv("./Files/Number/extended_merged.csv")

extended_paths_number["Path"] = extended_paths_number["Path"].apply(ast.literal_eval)

extended_paths_number["length"] = extended_paths_number.apply(compute_array_length, axis=1)
# co_changing_components_changes = list(dict.fromkeys(hpr.flatten_list(co_changing_components_changes)))

In [31]:
extended_paths_number["length_service"] = extended_paths_number.apply(compute_array_length_services, axis=1)

In [442]:
extended_paths_repo = extended_paths_number.copy()

In [443]:
extended_paths_repo["Path"] = extended_paths_repo.Path.map(lambda path: df.loc[df["number"].isin(path), "project"].unique())

In [458]:
extended_paths_repo["length"] = extended_paths_repo.apply(compute_array_length, axis=1)
extended_paths_repo["length_service"] = extended_paths_repo.apply(compute_array_length_services, axis=1)
# co_changing_components_changes = extended_paths_number.loc[extended_paths_number["length"] > 1]
# co_changing_components_changes = list(dict.fromkeys(hpr.flatten_list(co_changing_components_changes["Path"].values)))

In [449]:
# len(
cross_project_deps = extended_paths_repo[extended_paths_repo["length"]>1]
# /len(extended_paths_repo)

In [450]:
cross_project_deps["service_length"] = cross_project_deps.apply(compute_array_length_services, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cross_project_deps["service_length"] = cross_project_deps.apply(compute_array_length_services, axis=1)


In [456]:
len(extended_paths_repo[extended_paths_repo["length"]>1]),len(extended_paths_repo)

(15467, 59375)

In [465]:
len(extended_paths_repo[extended_paths_repo["length_service"]>1]),len(extended_paths_repo)

(7377, 59375)

In [472]:
len(cross_project_deps[cross_project_deps["service_length"]==2])/len(cross_project_deps)

0.41449537725480057

In [201]:
cross_project_deps[["length"]].to_csv("./RQs/RQ1/Files/component_chains.csv", index=False)

In [464]:
cross_project_deps.sort_values("service_length")

Unnamed: 0,Path,length,service_length
59345,"[openstack/networking-ovn, openstack/neutron]",2,1
13820,"[openstack/api-site, openstack/openstack-manuals]",2,1
6926,"[openstack/project-config, openstack/openstack...",2,1
13821,"[openstack/puppet-cinder, openstack/puppet-ope...",2,1
13824,"[openstack/fuel-main, openstack/fuel-library, ...",3,1
...,...,...,...
15079,"[openstack/tacker-specs, openstack/cinder-spec...",55,28
14723,"[openstack/masakari-monitors, openstack/tap-as...",80,31
16101,"[openstack/searchlight, openstack/congress, op...",118,37
2185,"[openstack/nova, openstack/manila-test-image, ...",155,40


In [202]:
cross_project_deps[cross_project_deps["service_length"]>1].to_csv("./RQs/RQ1/Files/service_chains.csv", index=False)

In [212]:
longest_chain = extended_paths_number.sort_values("length").tail(1)["Path"].values[0]

In [217]:
# len(df.loc[df.number.isin(chain_224)&(df.status == "MERGED")]),len(df.loc[df.number.isin(chain_224)])

(199, 231)

### How many cochnages ended up being abandoned ?

#### How many abandoned changes not depending on other changes

In [10]:
def is_all_abandoned(row):
    result = df.loc[df["number"].isin(row["Path"]), "status"].unique()

    if len(result) == 1 and result[0] == "ABANDONED":
        return True
    return False

def is_cross_project(row):
    result = df.loc[df["number"].isin(row["Path"]), "project"].unique()

    if len(result) > 1:
        return True
    return False

In [594]:
extended_paths_number["all_abandoned"] = extended_paths_number.apply(is_all_abandoned, axis=1)

In [12]:
extended_paths_number = pd.read_csv("./Files/Number/extended_merged.csv")
extended_paths_number["Path"] = extended_paths_number["Path"].map(ast.literal_eval)

In [13]:
extended_paths_number["is_cross_project"] = extended_paths_number.apply(is_cross_project, axis=1)

In [23]:
len(extended_paths_number[(extended_paths_number["all_abandoned"]==True)&(extended_paths_number["is_cross_project"]==True)]),len(extended_paths_number[(extended_paths_number["is_cross_project"]==True)])

(1584, 15467)

In [8]:
len(extended_paths_number[extended_paths_number.all_abandoned==True])/len(extended_paths_number)

0.04205473684210526

In [None]:
# len(
numbers = extended_paths_number[extended_paths_number.all_abandoned==True].sort_values("length").tail(1)["Path"].values[0]
    # )/len(extended_paths_number)

In [44]:
len(extended_paths_number[(extended_paths_number["all_abandoned"]==True)&(extended_paths_number["is_cross_project"]==False)])/len(extended_paths_number[extended_paths_number["is_cross_project"]==False])

0.020793477270656828

In [45]:
len(extended_paths_number[(extended_paths_number["all_abandoned"]==True)&(extended_paths_number["length_service"]==1)])/len(extended_paths_number[extended_paths_number["length_service"]==1])
# len(extended_paths_number[extended_paths_number["length_service"]>1])/len(extended_paths_number)

0.034847494134389784

In [None]:
len(extended_paths_number[(extended_paths_number["all_abandoned"]==True)&(extended_paths_number["length_service"]==1)])/len(extended_paths_number[extended_paths_number["length_service"]>1])

### How many abandoned changes that are for not to be merged with DNM(Do Not Merge) or including keywords test

In [113]:
depended_abandoned_changes = []

In [115]:
treated_items = []
for idx,row in df_all_dependencies.iterrows():
    if row["Source_status"] == "ABANDONED":
        depended_abandoned_changes.append(row["Source"])
    if row["Target_status"] == "ABANDONED":
        depended_abandoned_changes.append(row["Target"])

In [116]:
depended_abandoned_changes = list(set(depended_abandoned_changes))

In [117]:
df_depended_abandoned_changes = pd.DataFrame({"change": depended_abandoned_changes})

In [118]:
df_depended_abandoned_changes["commit_message"] = df_depended_abandoned_changes["change"].map(lambda change: df.loc[df["number"]==change, "commit_message"].values[0])

In [119]:
df_depended_abandoned_changes["DNM"] = df_depended_abandoned_changes["commit_message"].map(lambda message: True if "DNM" in message else False)
df_depended_abandoned_changes["test"] = df_depended_abandoned_changes["commit_message"].map(lambda message: True if "test" in message else False)

In [120]:
len(df_depended_abandoned_changes[df_depended_abandoned_changes["DNM"]|df_depended_abandoned_changes["test"]])/len(df_depended_abandoned_changes)

0.35716812370508866

In [121]:
df["DNM"] = df["commit_message"].map(lambda message: True if "DNM" in message else False)
df["test"] = df["commit_message"].map(lambda message: True if "test" in message else False)

In [35]:
len(df[(df["status"]=="ABANDONED")&(~df["number"].isin(depended_abandoned_changes))&(df["DNM"]|df["test"])])/len(df[(df["status"]=="ABANDONED")&(~df["number"].isin(depended_abandoned_changes))])

0.23219455703532135