In [2]:
import pandas as pd
import ast
import utils.helpers as hpr
from datetime import datetime
import re

In [43]:
def combine_openstack_data():
    '''Combine generated csv files into a single DataFrame object
    '''
    df = pd.DataFrame([])
    data_path = "%sChanges/" % hpr.DIR
    changes_file_names = hpr.list_file(data_path)
    for f in changes_file_names:
        df_per_file = pd.read_csv("%s%s" % (data_path, f))
        df = pd.concat((df, df_per_file))

    df = df.drop_duplicates(subset=["number"])

    df = df.sort_values(by="updated", ascending=False).reset_index(drop=True)

    return df

In [42]:
df = combine_openstack_data()

#### link service to projects

In [92]:
online_repositories = pd.read_csv("./all_os_components.csv")

online_repositories["related_projects"] = online_repositories["related_projects"].apply(ast.literal_eval)

online_repositories = dict(zip(online_repositories["main_project"].values.reshape(-1), online_repositories["related_projects"].values))

switched_key_values = {"openstack/%s" % item: key for key, value in online_repositories.items() for item in value}

### Measure the duration and the number of revisions a developer need to work out until a dependency is identified

In [186]:
def time_diff(start, end):
    if start > end:
        start, end = end, start
    current_date =  datetime.strptime(end, "%Y-%m-%d %H:%M:%S") 
    previous_date = datetime.strptime(start, "%Y-%m-%d %H:%M:%S") 
    diff = current_date - previous_date
    diff = float("{:.2f}".format(diff.total_seconds() / 3600))
    return diff


def extract_attr(x, attr):
    '''Extracts the passed-on parameter values out of the commit message 
    '''
    rs = re.findall("%s:\s[a-zA-Z0-9/\.\:\+\-\#]{6,}" % (attr), x)
    result = []
    for row in rs:
        row = row[len(attr) + 2:]
        change_id_pattern = re.search(r"[a-zA-Z0-9]{41}", row)
        if change_id_pattern:
            result.append(change_id_pattern[0])
            continue
        number_pattern = re.search("#?https?[\:][/]{2}review[\.](opendev|openstack)[\.]org([a-z0-9A-Z\-\+/\.#]*)\d+", row)
        if number_pattern:
            result.append(int(re.search("\d+$", number_pattern[0][0:])[0]))
    return result if len(result) != 0 else None


def retrieve_revision_date(row, attr, return_revision_date=True):
    number = None
    second_number = None

    if attr == "Depends-On":
        number = row["Target"]
        second_number = row["Source"]
        change_id = row["Source_change_id"]
    else:
        number = row["Source"]
        second_number = row["Target"]
        change_id = row["Target_change_id"]

    df_row = df.loc[df["number"] == number]
    revisions = ast.literal_eval(df_row["revisions"].values[0])
    revisions = sorted(revisions, key=lambda x: x["created"])
    if  len(revisions) == 1:
        if return_revision_date:
            return revisions[0]["created"]
        else:
            return 1

    first_revision = revisions[0]
    first_message = first_revision["message"]

    results = extract_attr(first_message, attr)

    if results and ((change_id in results) or (second_number in results)):
        if return_revision_date:
            return first_revision["created"]
        else:
            return 1

    for i in range(1,len(revisions)):
        current_message = revisions[i]["message"]
        created = revisions[i]["created"]
        results = extract_attr(current_message, attr)
        
        if results and ((change_id in results) or (second_number in results)):

            if return_revision_date:
                return created
            else:
                return i + 1

def is_same_developer(row):
    return "Same" if row["Source_dev"] == row["Target_dev"] else "Different"

def identify_dependency(row):
    source_date = row["Source_date"] 
    target_date = row["Target_date"]
    link_date = row["link_date"]

    return min(time_diff(target_date[:-10], link_date[:-10]), time_diff(source_date[:-10], link_date[:-10]))

### Services

In [98]:
online_repositories = pd.read_csv("./all_os_components.csv")

online_repositories["related_projects"] = online_repositories["related_projects"].apply(ast.literal_eval)

online_repositories = dict(zip(online_repositories["main_project"].values.reshape(-1), online_repositories["related_projects"].values))

switched_key_values = {"openstack/%s" % item: key for key, value in online_repositories.items() for item in value}

### Depends-On

In [97]:
df_depends_on = pd.read_csv("./Files/source_target_depends.csv")

df_depends_on["Source_status"] = df_depends_on["Source"].map(lambda x: df.loc[df["number"]==x, "status"].values[0])
df_depends_on["Target_status"] = df_depends_on["Target"].map(lambda x: df.loc[df["number"]==x, "status"].values[0])

df_depends_on["revisions"] = df_depends_on["Target"].map(lambda x: df.loc[df["number"]==x, "revisions"].values[0])
df_depends_on["Source_change_id"] = df_depends_on["Source"].map(lambda x: df.loc[df["number"]==x, "change_id"].values[0])
df_depends_on["Target_change_id"] = df_depends_on["Target"].map(lambda x: df.loc[df["number"]==x, "change_id"].values[0])

df_depends_on["link_date"] = df_depends_on.apply(retrieve_revision_date, args=("Depends-On",), axis=1)

df_depends_on["worked_revisions"] = df_depends_on.apply(retrieve_revision_date, args=("Depends-On",False,), axis=1)

df_depends_on["is_cross"] = df_depends_on.apply(lambda row: "Cross" if row["Source_repo"]!=row["Target_repo"] else "Same", axis=1)

df_depends_on["is_source_bot"] = df_depends_on["Source"].map(lambda x: df.loc[df["number"]==x,"is_owner_bot"].values[0])
df_depends_on["is_target_bot"] = df_depends_on["Target"].map(lambda x: df.loc[df["number"]==x,"is_owner_bot"].values[0])

df_depends_on["Source_dev"] = df_depends_on["Source"].map(lambda x: df.loc[df["number"]==x,"owner_account_id"].values[0])
df_depends_on["Target_dev"] = df_depends_on["Target"].map(lambda x: df.loc[df["number"]==x,"owner_account_id"].values[0])

df_depends_on["same_dev"] = df_depends_on.apply(is_same_developer, axis=1)

df_depends_on["Source_service"] = df_depends_on["Source_repo"].map(lambda x: switched_key_values.get(x))
df_depends_on["Target_service"] = df_depends_on["Target_repo"].map(lambda x: switched_key_values.get(x))

df_depends_on["Source_date"] = df_depends_on["Source"].map(lambda x: df.loc[df["number"]==x, "created"].values[0])
df_depends_on["Target_date"] = df_depends_on["Target"].map(lambda x: df.loc[df["number"]==x, "created"].values[0])

df_depends_on["when_identified"] = df_depends_on[["Source_date", "Target_date", "link_date"]].apply(identify_dependency, axis=1)

### Needed-By

In [99]:
df_needed_by = pd.read_csv("./Files/source_target_needed.csv")

df_needed_by["Source_status"] = df_needed_by["Source"].map(lambda x: df.loc[df["number"]==x, "status"].values[0])
df_needed_by["Target_status"] = df_needed_by["Target"].map(lambda x: df.loc[df["number"]==x, "status"].values[0])

df_needed_by["revisions"] = df_needed_by["Source"].map(lambda x: df.loc[df["number"]==x, "revisions"].values[0])
df_needed_by["Source_change_id"] = df_needed_by["Source"].map(lambda x: df.loc[df["number"]==x, "change_id"].values[0])
df_needed_by["Target_change_id"] = df_needed_by["Target"].map(lambda x: df.loc[df["number"]==x, "change_id"].values[0])

df_needed_by["link_date"] = df_needed_by.apply(retrieve_revision_date, args=("Needed-By",), axis=1)
df_needed_by["worked_revisions"] = df_needed_by.apply(retrieve_revision_date, args=("Needed-By",False,), axis=1)

df_needed_by["is_cross"] = df_needed_by.apply(lambda row: "Cross" if row["Source_repo"]!=row["Target_repo"] else "Same", axis=1)

df_needed_by["is_source_bot"] = df_needed_by["Source"].map(lambda x: df.loc[df["number"]==x,"is_owner_bot"].values[0])
df_needed_by["is_target_bot"] = df_needed_by["Target"].map(lambda x: df.loc[df["number"]==x,"is_owner_bot"].values[0])

df_needed_by["Source_dev"] = df_needed_by["Source"].map(lambda x: df.loc[df["number"]==x,"owner_account_id"].values[0])
df_needed_by["Target_dev"] = df_needed_by["Target"].map(lambda x: df.loc[df["number"]==x,"owner_account_id"].values[0])

df_needed_by["same_dev"] = df_needed_by.apply(is_same_developer, axis=1)

df_needed_by["Source_service"] = df_needed_by["Source_repo"].map(lambda x: switched_key_values.get(x))
df_needed_by["Target_service"] = df_needed_by["Target_repo"].map(lambda x: switched_key_values.get(x))

df_needed_by["Source_date"] = df_needed_by["Source"].map(lambda x: df.loc[df["number"]==x, "created"].values[0])
df_needed_by["Target_date"] = df_needed_by["Target"].map(lambda x: df.loc[df["number"]==x, "created"].values[0])

df_needed_by["when_identified"] = df_needed_by[["Source_date", "Target_date", "link_date"]].apply(identify_dependency, axis=1)

### Combination of Depends-On and Needed-By

In [None]:
dependency_identification = pd.concat((df_depends_on, def_needed_by)).sort_values("when_identified")
dependency_identification = dependency_identification.drop_duplicates(subset=["Source", "Target"], keep="First")

dependency_identification.to_csv("./RQs/RQ5/Files/dependency_identification.csv", index=False)

### Lags of dependencies

In [3]:
df_all_dependencies = pd.read_csv("./Files/all_dependencies.csv")

In [6]:
df_all_dependencies.loc[(df_all_dependencies["Source_status"] == "MERGED") &
                      (df_all_dependencies["Target_status"] == "MERGED") &
                      (df_all_dependencies["is_source_bot"] == False) & 
                      (df_all_dependencies["is_target_bot"] == False),
                      ["Source", "Target", "Source_repo", "Target_repo", "is_cross", "is_cross_service", "Source_service", "Target_service", "lag", "is_same_dev"]].to_csv("./RQs/RQ5/Files/all_lags.csv", index=False)

In [3]:
all_lags = pd.read_csv("./RQs/RQ5/Files/all_lags.csv")

In [46]:
all_lags.loc[(all_lags.is_same_dev=="Same")&(all_lags.is_cross_service=="Cross")].median()#sort_values("lag").iloc[5600:]

  all_lags.loc[(all_lags.is_same_dev=="Same")&(all_lags.is_cross_service=="Cross")].median()#sort_values("lag").iloc[5600:]


Source    467725.00
Target    467972.00
lag            1.43
dtype: float64

In [21]:
len(all_lags.loc[(all_lags.lag<=1)&(all_lags.is_cross=="Cross"), ["lag"]])/len(all_lags.loc[(all_lags.is_cross=="Cross")])

0.3252418944152096

In [None]:
all_lags.loc[(all_lags["Source_status"] == "MERGED") &
                      (all_lags["Target_status"] == "MERGED") &
                      (all_lags["is_source_bot"] == False) &
                      (all_lags["is_target_bot"] == False) &
                      (all_lags["is_cross_service"] == "Cross") &
                      (all_lags["same_dev"] == "Same") &
                      (all_lags["lag"] > 0), [
                          "Source", "Target", "Source_repo", "Target_repo", "is_cross", "same_dev",
                          "lag", "Source_status", "Target_status"
                      ]].median()


#### Lag of chains

In [18]:
extended_paths_number = pd.read_csv("./Files/Number/extended_merged.csv")

extended_paths_number["Path"] = extended_paths_number["Path"].apply(ast.literal_eval)

In [19]:
extended_paths_number["Path_merged"] = extended_paths_number["Path"].map(lambda path: df.loc[df["number"].isin(path)&(df["status"]=="MERGED"), "number"].values.tolist())

In [2]:
# def merge_lists(list1):
#     merged = []
#     while len(list1) > 0:
#         current_list = list1.pop(0)
#         merged_list = current_list.copy()
#         i = 0
#         while i < len(list1):
#             if any(elem in list1[i] for elem in current_list):
#                 merged_list.extend(list1.pop(i))
#             else:
#                 i += 1
#         merged.append(list(set(merged_list)))
#     return merged

In [21]:
extended_paths_number["length_merged"] = extended_paths_number["Path_merged"].apply(len)

In [22]:
extended_paths_number["is_cross"] = extended_paths_number.loc[extended_paths_number["length_merged"]>1, "Path_merged"].map(lambda path: True if df.loc[df["number"].isin(path), "project"].nunique() > 1 else False)

In [23]:
extended_paths_number_merged = extended_paths_number[extended_paths_number["length_merged"]>1]

In [25]:
def compute_chain_lag(path):
    dates = df.loc[df["number"].isin(path), ["project", "created"]].sort_values("created")
    dates = dates["created"].values
    start_date = dates[0][:-10]
    end_date = dates[-1][:-10]
    return time_diff(start_date, end_date)

In [28]:
extended_paths_number_merged["lag"] = extended_paths_number_merged["Path_merged"].map(compute_chain_lag)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extended_paths_number_merged["lag"] = extended_paths_number_merged["Path_merged"].map(compute_chain_lag)


In [30]:
def is_chain_cross_service(path):
    services = [switched_key_values.get(p) for p in df.loc[df["number"].isin(path), "project"].unique()]
    if len(set(services)) > 1:
        return "Cross"
    elif len(set(services)) == 1:
        return "Same"
    return None

In [None]:
extended_paths_number_merged["is_cross_service"] = extended_paths_number_merged["Path_merged"].map(is_chain_cross_service)

In [90]:
extended_paths_number_merged.to_csv("./RQs/RQ5/Files/chains_lag.csv", index=False)

In [36]:
extended_paths_number_merged[(extended_paths_number_merged.is_cross_service==True)&(extended_paths_number_merged.lag>0)].median()

  extended_paths_number_merged[(extended_paths_number_merged.is_cross_service==True)&(extended_paths_number_merged.lag>0)].median()


length               3.00
all_abandoned        0.00
is_cross             1.00
lag                 90.46
is_cross_service     1.00
length_merged        2.00
dtype: float64

In [203]:
# df_depends_needed.to_csv("./RQs/RQ5/Files/dependency_identification.csv", index=False)

In [47]:
len(extended_paths_number_merged[(extended_paths_number_merged.is_cross=="Cross")&(extended_paths_number_merged.lag==0)])/len(extended_paths_number_merged[(extended_paths_number_merged.is_cross=="Cross")])

0.019747520288548242

### Analysis of when developers find a dependency

In [13]:
dependency_identification = pd.read_csv("./RQs/RQ5/Files/dependency_identification.csv")

In [15]:
dependency_identification["same_dev"] = dependency_identification["same_dev"].map(lambda x: "Different" if x == False else "Same")

In [126]:
dependency_identification[(dependency_identification["is_cross_service"]=="Cross")].median()#sort_values("when_identified")#.iloc[19000]

  df_depends_needed[(df_depends_needed["is_cross_service"]=="Cross")].median()#sort_values("when_identified")#.iloc[19000]


Source              573094.0
Target              573135.0
worked_revisions         1.0
is_cross                 1.0
is_source_bot            0.0
is_target_bot            0.0
Source_dev           10135.0
Target_dev           10135.0
same_dev                 1.0
when_identified          0.0
dtype: float64