In [2]:
import os.path as osp
import pandas as pd
from scipy.stats import wilcoxon
from datetime import datetime
import ast
import utils.helpers as hpr

In [3]:
path = osp.join('.', 'RQs', 'RQ4', 'Files')

### Functions

In [153]:
def time_diff(start, end):
    if start > end:
        start, end = end, start
    current_date =  datetime.strptime(end[:-11], "%Y-%m-%d %H:%M:%S") 
    previous_date = datetime.strptime(start[:-11], "%Y-%m-%d %H:%M:%S") 
    diff = current_date - previous_date
    diff = float("{:.2f}".format(diff.total_seconds() / 3600))
    return diff

def compute_time_diff(revisions):
    if len(revisions) == 1:
        return 0
    dates = [rev["created"] for rev in revisions]
    dates.sort()
    return time_diff(dates[0], dates[-1])

### Main dataframe

In [5]:
df = hpr.combine_openstack_data()

In [6]:
len(df)

516509

In [3]:
df["reviewers"] = df["reviewers"].apply(ast.literal_eval)

In [4]:
df["reviewers_ids"] = df["reviewers"].map(lambda revs: [rev["_account_id"] for rev in revs if "tags" not in rev.keys()])

In [160]:
df["revisions"] = df["revisions"].apply(ast.literal_eval)

In [161]:
df["merge_duration"] = df["revisions"].map(compute_time_diff)

In [4]:
df_all_dependencies = pd.read_csv("./Files/all_dependencies.csv")

In [83]:
df_all_dependencies = df_all_dependencies[
    (df_all_dependencies.is_cross=='Cross') &
    (df_all_dependencies.Source_status == 'MERGED') &
    (df_all_dependencies.Target_status == 'MERGED') &
    (df_all_dependencies.is_source_bot == 0) &
    (df_all_dependencies.is_target_bot == 0)
]

#### \# of cross-project dependencies made by the same developer

In [19]:
# Same
len(df_all_dependencies.loc[df_all_dependencies.is_same_dev == "Same"])

35556

In [20]:
# Different
len(df_all_dependencies.loc[df_all_dependencies.is_same_dev == "Different"])

11572

#### \# of cross-team dependencies made by the same developer

In [26]:
len(df_all_dependencies.loc[
    (df_all_dependencies.is_cross_team == "Cross")&
    (df_all_dependencies.is_same_dev == "Different")
])

2944

#### % of cross-project deps made by different developers

In [27]:
len(df_all_dependencies.loc[df_all_dependencies.is_same_dev == "Different"])/len(df_all_dependencies)

0.24554405024613818

#### % of cross-team deps made by different developers

In [28]:
len(df_all_dependencies.loc[
    (df_all_dependencies.is_cross_team == "Cross")&
    (df_all_dependencies.is_same_dev == "Different")])/len(df_all_dependencies.loc[df_all_dependencies.is_cross_team == "Cross"])

0.33246753246753247

In [29]:
len(df_all_dependencies[(df_all_dependencies.is_same_dev == "Different") &
                    ((df_all_dependencies.Source_rev_target == True) &
                     (df_all_dependencies.Target_rev_source == True))
                     ])/len(df_all_dependencies[(df_all_dependencies.is_same_dev == "Different")])


0.2744555824403733

#### The analysis concerning developer working on project they inexperienced with

##### Cross-project

In [5]:
len(df_all_dependencies.loc[
    (df_all_dependencies.is_same_dev == 'Same') &
    (df_all_dependencies['Source_exp'] < 5)|
    (df_all_dependencies['Target_exp'] < 5), 
    ['Source_exp', 'Target_exp']
])/len(df_all_dependencies)

0.6065184179256493

In [27]:
len(df_all_dependencies.loc[
    (df_all_dependencies.is_same_dev == 'Same') &
    (df_all_dependencies['Source_exp'] < 5)&
    (df_all_dependencies['Target_exp'] < 5), 
    ['Source_exp', 'Target_exp']
])/len(df_all_dependencies)

0.41457307757596334

In [58]:
df_all_dependencies.to_csv('./Files/all_dependencies.csv', index=False)

In [35]:
df_all_dependencies.loc[
     (df_all_dependencies.is_cross == 'Cross') &
    (df_all_dependencies.is_same_dev == 'Same'), ['min', 'max', 'max_min']].to_csv(f'{path}/same_dev_data.csv', index=None)

In [93]:
df_all_dependencies.loc[
    # (df_all_dependencies.is_cross_team == 'Cross')&
    (df_all_dependencies.is_same_dev == 'Same'),
    'max'
].median()

3.5860333438188112

#### Wilcoxon test

In [57]:
wilcoxon(df_all_dependencies.loc[df_all_dependencies['is_same_dev']=='Same', 'min'].tolist(), df_all_dependencies.loc[df_all_dependencies['is_same_dev']=='Same', 'max'].tolist())

WilcoxonResult(statistic=0.0, pvalue=0.0)

#### Churn

In [59]:
df_all_dependencies_churn_less = df_all_dependencies.loc[
    (df_all_dependencies["is_same_dev"] == "Same"), ["Churn_less"]]
df_all_dependencies_churn_less["Label"] = "Less familiar"
df_all_dependencies_churn_less.rename(columns={"Churn_less": "Churn"}, inplace=True)

df_all_dependencies_churn_more = df_all_dependencies.loc[
    (df_all_dependencies["is_same_dev"] == "Same"), ["Churn_more"]]
df_all_dependencies_churn_more["Label"] = "More familiar"
df_all_dependencies_churn_more.rename(columns={"Churn_more": "Churn"}, inplace=True)

df_all_dependencies_churn = pd.concat((df_all_dependencies_churn_less, df_all_dependencies_churn_more))
df_all_dependencies_churn[df_all_dependencies_churn.Churn.notnull()].to_csv(f"{path}/same_dev_churn.csv", index=False)

In [80]:
# df_all_dependencies_churn_less.Churn.median()
# df_all_dependencies_duration_less.Duration.median()
df_all_dependencies_revisions_more.Revisions.median()

2.0

#### Duration

In [64]:
df_all_dependencies_duration_less = df_all_dependencies.loc[
    (df_all_dependencies["is_same_dev"] == "Same"), ["Duration_less"]]
df_all_dependencies_duration_less["Label"] = "Less familiar"
df_all_dependencies_duration_less.rename(columns={"Duration_less": "Duration"}, inplace=True)

df_all_dependencies_duration_more = df_all_dependencies.loc[
    (df_all_dependencies["is_same_dev"] == "Same"), ["Duration_more"]]
df_all_dependencies_duration_more["Label"] = "More familiar"
df_all_dependencies_duration_more.rename(columns={"Duration_more": "Duration"}, inplace=True)

df_all_dependencies_duration = pd.concat((df_all_dependencies_duration_less, df_all_dependencies_duration_more))
df_all_dependencies_duration[df_all_dependencies_duration.Duration.notnull()].to_csv(f"{path}/same_dev_duration.csv", index=False)

#### Revisions

In [66]:
df_all_dependencies_revisions_less = df_all_dependencies.loc[
    (df_all_dependencies["is_same_dev"] == "Same"), ["Revisions_less"]]
df_all_dependencies_revisions_less["Label"] = "Less familiar"
df_all_dependencies_revisions_less.rename(columns={"Revisions_less": "Revisions"}, inplace=True)

df_all_dependencies_revisions_more = df_all_dependencies.loc[
    (df_all_dependencies["is_same_dev"] == "Same"), ["Revisions_more"]]
df_all_dependencies_revisions_more["Label"] = "More familiar"
df_all_dependencies_revisions_more.rename(columns={"Revisions_more": "Revisions"}, inplace=True)

df_all_dependencies_revisions = pd.concat((df_all_dependencies_revisions_less, df_all_dependencies_revisions_more))
df_all_dependencies_revisions[df_all_dependencies_revisions.Revisions.notnull()].to_csv(f"{path}/same_dev_revisions.csv", index=False)

##### Cross-team

In [30]:
len(df_all_dependencies.loc[
    (df_all_dependencies.is_same_dev == 'Same') &
    (df_all_dependencies['is_cross_team']=='Cross')&
    ((df_all_dependencies['Source_exp'] < 5)|
    (df_all_dependencies['Target_exp'] < 5)), 
    ['Source_exp', 'Target_exp']
])/len(df_all_dependencies[(df_all_dependencies['is_cross_team']=='Cross')])

0.6428006775832863

##### Code-churn

In [73]:
df_deps_same_dev = df_all_dependencies.loc[
    (df_all_dependencies.is_same_dev == 'Same') &
    (df_all_dependencies['Source_exp'] < 5)|
    (df_all_dependencies['Target_exp'] < 5), 
    ['Source', 'Target', 'Source_exp', 'Target_exp', 'Source_churn', 'Target_churn', 'Source_revisions', 'Target_revisions', 'Source_duration', 'Target_duration']
]

In [74]:
def identify_min_value(row, attribute):
    if row['Source_exp'] < row['Target_exp']:
        return row[f'Source_{attribute}']
    return row[f'Target_{attribute}']

In [75]:
for attr in ['churn', 'revisions', 'duration']:
    df_deps_same_dev[f'{attr.capitalize()}'] = df_deps_same_dev.apply(identify_min_value, args=(attr,), axis=1)
df_deps_same_dev['id'] = list(range(1, len(df_deps_same_dev) + 1))

In [69]:
df_deps_same_dev[['id', 'Churn', 'Revisions', 'Duration']].to_csv(f'{path}/deps_same_dev.csv', index=None)

### The cross-project/team reviewed by how many shared reviewers ?

##### Cross-project

In [78]:
df_all_dependencies[["intersect_rev"]].to_csv(f"{path}/cross_project_changes_reviewers_intersect.csv",index=False)

In [84]:
# No reviewer
len(df_all_dependencies[df_all_dependencies.intersect_rev==0])/len(df_all_dependencies)

0.08659395688338142

In [88]:
df_all_dependencies['intersect_rev'].median()

33.333333333333336

In [93]:
len(df_all_dependencies[
    (df_all_dependencies.intersect_rev==0)&
    (df_all_dependencies['Source_release']==df_all_dependencies['Target_release'])
    ])/len(df_all_dependencies[
        (df_all_dependencies.intersect_rev==0)
        ])

0.49056603773584906

##### Cross-team

In [79]:
df_all_dependencies.loc[(df_all_dependencies["is_cross_team"]=='Cross'), ["intersect_rev"]].to_csv(f"{path}/cross_service_changes_reviewers_intersect.csv",index=False)

In [87]:
# No reviewer
len(df_all_dependencies[
    (df_all_dependencies.is_cross_team=='Cross')&
    (df_all_dependencies.intersect_rev==0)])/len(df_all_dependencies[df_all_dependencies.is_cross_team=='Cross'])

0.12038396386222473

In [90]:
# Median
df_all_dependencies.loc[(df_all_dependencies.is_cross_team=='Cross'), 'intersect_rev'].median()

16.666666666666668

In [78]:
df_all_dependencies.loc[
    # (df_all_dependencies["intersect_rev"]) &
                            (df_all_dependencies["is_cross_service"] == True),
                            "intersect_rev"].median()

20.0

In [74]:
len(df_all_dependencies.loc[(df_all_dependencies["intersect_rev"] == 0),"intersect_rev"]) / len(df_all_dependencies)


0.08659395688338142

In [16]:
df_all_dependencies.loc[(df_all_dependencies["intersect_rev"] > 0) &
                        (df_all_dependencies["is_cross_team"] == 'Cross'),
"intersect_rev"].median()

22.22222222222222

#### Does it take much time to merge changes in projects with less experience vs. changes in projects with familiar with

In [None]:
df_all_dependencies["Source_duration"] = df_all_dependencies.loc[(df_all_dependencies["is_cross"]==True)&(df_all_dependencies["same_dev"]==True), "Source"].map(lambda nbr: df.loc[df.number==nbr,"merge_duration"].values[0])
df_all_dependencies["Target_duration"] = df_all_dependencies.loc[(df_all_dependencies["is_cross"]==True)&(df_all_dependencies["same_dev"]==True), "Target"].map(lambda nbr: df.loc[df.number==nbr,"merge_duration"].values[0])
df_all_dependencies["merge_duration_diff"] = df_all_dependencies.loc[(df_all_dependencies["is_cross"]==True)&(df_all_dependencies["same_dev"]==True),["Source_exp", "Target_exp", "Source_duration", "Target_duration"]].apply(identify_merge_duration, axis=1)

In [211]:
len(df_all_dependencies.loc[
    (df_all_dependencies["is_cross"] == True) &
    (df_all_dependencies["status_source"] == "MERGED") &
    (df_all_dependencies["status_target"] == "MERGED") &
    (df_all_dependencies["is_source_bot"] == False) &
    ((df_all_dependencies["Source_exp"] != 0) |
     (df_all_dependencies["Target_exp"] != 0)) &
    (df_all_dependencies["same_dev"] == True) &
    (df_all_dependencies["merge_duration_diff"] > 1), ["min"]]) / len(
        df_all_dependencies.loc[
            (df_all_dependencies["is_cross"] == True) &
            ((df_all_dependencies["Source_exp"] != 0) & 
            (df_all_dependencies["Target_exp"] != 0)) &
            (df_all_dependencies["status_source"] == "MERGED") &
            (df_all_dependencies["status_target"] == "MERGED") &
            (df_all_dependencies["is_source_bot"] == False) &
            (df_all_dependencies["is_target_bot"] == False) &
            (df_all_dependencies["same_dev"] == True)])


0.4403501703234314

In [None]:
df_all_dependencies.loc[
    (df_all_dependencies["is_cross"] == True) &
    (df_all_dependencies["status_source"] == "MERGED") &
    (df_all_dependencies["status_target"] == "MERGED") &
    (df_all_dependencies["is_source_bot"] == False) &
    ((df_all_dependencies["Source_exp"] != 0) |
     (df_all_dependencies["Target_exp"] != 0)) &
    (df_all_dependencies["same_dev"] == True) &
    (df_all_dependencies["merge_duration_diff"] < 1), ["min"]]

In [136]:
df_all_dependencies.loc[(df_all_dependencies["status_source"]=="MERGED")&(df_all_dependencies["status_target"]=="MERGED")&(df_all_dependencies["is_cross"]==True)&(df_all_dependencies["same_dev"]==True), ["min", "max", "max_min"]].to_csv("./RQs/RQ4/Files/same_dev_experience.csv", index=False)

### Analysis

In [None]:
df_all_dependencies["Source_revisions"] = df_all_dependencies.loc[(df_all_dependencies["is_cross"]==True)&(df_all_dependencies["same_dev"]==True), "Source"].map(lambda nbr: df.loc[df["number"]==nbr,"revisions_count"].values[0])
df_all_dependencies["Target_revisions"] = df_all_dependencies.loc[(df_all_dependencies["is_cross"]==True)&(df_all_dependencies["same_dev"]==True), "Target"].map(lambda nbr: df.loc[df["number"]==nbr,"revisions_count"].values[0])

In [215]:
len(df_all_dependencies.loc[(df_all_dependencies["intersect_rev"]==0)])/len(df_all_dependencies)

0.08718834644997547

#### Does it get quickly reviewed ?

In [305]:
def compute_review_duration(nbr):
    messages = df.loc[df.number==nbr, "messages"].values[0]
    messages.sort(key=lambda x: x["date"])
    if len(messages) >1:
        return time_diff(messages[0]["date"], messages[-1]["date"])
    return 0

In [5]:
df_all_dependencies_duration_source_reviewers = df_all_dependencies.loc[
    # (df_all_dependencies["Duration_source"].notnull()) &
    (df_all_dependencies["intersect_rev"]>0), ["Source_duration"]]
df_all_dependencies_duration_source_reviewers.rename(columns={"Source_duration": "Duration"}, inplace=True)
df_all_dependencies_duration_source_reviewers["Reviewer_sharing"] = "At least one"
df_all_dependencies_duration_source_reviewers["Project_type"] = "Source"

df_all_dependencies_duration_target_reviewers = df_all_dependencies.loc[
    # (df_all_dependencies["Duration_target"].notnull())&
    (df_all_dependencies["intersect_rev"]>0), ["Target_duration"]]
df_all_dependencies_duration_target_reviewers.rename(columns={"Target_duration": "Duration"}, inplace=True)
df_all_dependencies_duration_target_reviewers["Reviewer_sharing"] = "At least one"
df_all_dependencies_duration_target_reviewers["Project_type"] = "Target"

df_all_dependencies_duration_source_no_reviewers = df_all_dependencies.loc[
    # (df_all_dependencies["Duration_source"].notnull()) &
    (df_all_dependencies["intersect_rev"]==0), ["Source_duration"]]
df_all_dependencies_duration_source_no_reviewers.rename(columns={"Source_duration": "Duration"}, inplace=True)
df_all_dependencies_duration_source_no_reviewers["Reviewer_sharing"] = "Different"
df_all_dependencies_duration_source_no_reviewers["Project_type"] = "Source"

df_all_dependencies_duration_target_no_reviewers = df_all_dependencies.loc[
    # (df_all_dependencies["Duration_target"].notnull()) &
    (df_all_dependencies["intersect_rev"]==0), ["Target_duration"]]
df_all_dependencies_duration_target_no_reviewers.rename(columns={"Target_duration": "Duration"}, inplace=True)
df_all_dependencies_duration_target_no_reviewers["Reviewer_sharing"] = "Different"
df_all_dependencies_duration_target_no_reviewers["Project_type"] = "Target"


In [7]:
df_all_dependencies_duration_reviewers = pd.concat((df_all_dependencies_duration_source_reviewers,df_all_dependencies_duration_target_reviewers,df_all_dependencies_duration_source_no_reviewers,df_all_dependencies_duration_target_no_reviewers))

In [8]:
df_all_dependencies_duration_reviewers.to_csv("./RQs/RQ4/Files/Duration_reviewers.csv", index=False)