In [1]:
import pandas as pd
import numpy as np
import os.path as osp
import ast
from datetime import datetime
from bs4 import BeautifulSoup
import requests as rq
import collections
import utils.helpers as hpr
from itertools import combinations
import re

In [2]:
path = osp.join('.', 'RQs', 'PQ', 'Files')

In [11]:
df = hpr.combine_openstack_data()

In [28]:
# Parse the serialized reviewer lists. Values that are no longer strings are left
# untouched, so re-running this cell on an already-parsed column does not raise.
df["reviewers"] = df["reviewers"].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [233]:
os_projects = df["project"].unique()

### Percentage of developers contributing to more than one project

In [14]:
# NOTE(review): this reads 'developers.csv' from the working directory, whereas the
# other cells read/write f'{path}/developers.csv'. The next cell needs a 'projects'
# column in this frame — confirm which file is intended.
developers = pd.read_csv('developers.csv')

In [7]:
len(developers[developers['projects']>1])/(len(developers))

0.6016635859519408

### Core developers

In [24]:
# Load the core-team file, parse each serialized "developers" value, and keep the
# entries with duplicates removed (first occurrence wins).
# `hpr.flatten_list` presumably flattens the per-row lists into one — TODO confirm.
core_team_frame = pd.read_csv(f'{path}/os_core_team.csv')
core_team_lists = core_team_frame["developers"].map(ast.literal_eval)
os_core_team_devs = list(dict.fromkeys(hpr.flatten_list(core_team_lists.values)))

### Casual contributors

In [41]:
def count_age(id):
    """Return the number of days between a developer's first and last change.

    Parameters
    ----------
    id
        Value matched against ``df.owner_account_id``.

    Returns
    -------
    int
        Whole days between the earliest and the latest ``created`` timestamp of
        the matching rows; ``0`` when fewer than two rows match.
    """
    created = df.loc[df.owner_account_id == id, 'created'].values
    # Zero or one change: there is no span to measure.
    if len(created) < 2:
        return 0

    # Use min/max so the result does not depend on how the rows of `df` are ordered.
    timestamps = [datetime.strptime(value, "%Y-%m-%d %H:%M:%S") for value in created]
    return (max(timestamps) - min(timestamps)).days

# One row per change owner, with the span of their activity and their number of changes.
df_developers = pd.DataFrame({'id': df['owner_account_id'].unique()})
df_developers['age'] = df_developers['id'].map(count_age)
# Count the rows per owner in a single pass instead of re-filtering `df` for every id.
df_developers['changes'] = df_developers['id'].map(df['owner_account_id'].value_counts())

In [90]:
df_developers.to_csv(f'{path}/developers.csv', index=None)

In [17]:
df_developers = pd.read_csv(f'{path}/developers.csv')

In [25]:
# Casual contributors: developers with exactly one change that are not in the core team list.
single_change_mask = df_developers["changes"] == 1
core_member_mask = df_developers["id"].isin(os_core_team_devs)
casual_developers = df_developers.loc[single_change_mask & ~core_member_mask, ['id']]
casual_developers.to_csv(f'{path}/casual_contributors.csv', index=None)
# From here on only the plain list of ids is kept under this name.
casual_developers = casual_developers.id.to_list()

# df_developers = df_developers[
#     ((df_developers.age>=15)&(df_developers.changes>=3))|
#     (df_developers.id.isin(os_core_team_devs))
# ]

### How many changes were made by core and non-core developers?

#### Developers

In [237]:
df_developers["status"] = df_developers["id"].map(lambda x: "Core" if x in os_core_team_devs else "Non core")

In [96]:
df_developers[['id', "status", "changes"]].to_csv("./RQs/PQ/Files/core_non_core_changes.csv", index=False)

#### Reviewers

In [238]:
# Count how many reviewer entries each account has across all changes.
# Entries carrying a "tags" key and accounts listed as casual developers are excluded.
# Sets make every membership test O(1) instead of scanning a list.
casual_id_set = set(casual_developers)
core_id_set = set(os_core_team_devs)

# `hpr.flatten_list` presumably turns the per-change reviewer lists into one flat list — TODO confirm.
all_reviewer_entries = hpr.flatten_list(df['reviewers'].values.tolist())
reviewer_account_ids = [
    rev['_account_id']
    for rev in all_reviewer_entries
    if "tags" not in rev and rev['_account_id'] not in casual_id_set
]
reviews_per_account = collections.Counter(reviewer_account_ids)

df_reviewers = pd.DataFrame({
    'id': list(reviews_per_account.keys()),
    'changes': list(reviews_per_account.values()),
})
df_reviewers["status"] = df_reviewers["id"].map(
    lambda rev_id: "Core" if rev_id in core_id_set else "Non core"
)

In [128]:
df_reviewers.to_csv("./RQs/PQ/Files/core_non_core_reviewers_changes.csv", index=False)

### Analysis of OS teams release notes' projects overlap

In [239]:
# Scrape the releases site: for every team linked from the "teams" section of the
# index page, collect the project names listed in that team's table of contents.
# A timeout keeps a stalled connection from hanging the cell, and HTTP errors are
# raised immediately instead of surfacing later as a failed lookup on a parsed error page.
request_teams = rq.get("https://releases.openstack.org/index.html#teams", timeout=60)
request_teams.raise_for_status()
soup_teams = BeautifulSoup(request_teams.text, 'html.parser')

teams_projects = {}
section_teams = soup_teams.find("section", attrs={"id": "teams"})
li_teams = section_teams.find_all("li", "toctree-l1")

# Compiled once; used to strip everything but ASCII letters from the team key.
non_letters = re.compile('[^a-zA-Z]')

for li_team in li_teams:
    link = li_team.find("a")
    link_suffix = link["href"]

    request_projects = rq.get("https://releases.openstack.org/%s" % (link_suffix), timeout=60)
    request_projects.raise_for_status()
    soup_projects = BeautifulSoup(request_projects.text, 'html.parser')
    table_of_contents = soup_projects.find("div", attrs={"id": "table-of-contents"})
    # The entries of interest live in the second-level list of the table of contents.
    ul_projects = table_of_contents.find("ul")
    ul_projects = ul_projects.find("ul")
    li_releases = ul_projects.find_all("li")
    projects = []
    for li_rel in li_releases:
        # Entries whose text is exactly one of these release names are skipped.
        if li_rel.text in ['Antelope', 'Bobcat', 'Caracal']:
            continue
        # Only nested <li> items are collected; their text is the project name.
        for href_project in li_rel.find_all("li"):
            projects.append(href_project.text.replace(" (EOL)", ""))

    # Team key: the href without its first 6 and last 5 characters, lower-cased,
    # letters only. Projects are de-duplicated while keeping their first-seen order.
    teams_projects[non_letters.sub('',  link_suffix[6:-5].lower())] = list(dict.fromkeys(projects))

In [240]:
def get_teams_deliverables():
    """Scrape the governance site and return the deliverables of each project team.

    Returns
    -------
    dict
        Maps a team key (the team page's href without its last five characters,
        lower-cased, non-letters removed) to the list of link texts collected
        from that team's page.
    """
    letters_only = re.compile('[^a-zA-Z]')

    index_page = rq.get("https://governance.openstack.org/tc/reference/projects/index.html")
    index_soup = BeautifulSoup(index_page.text, 'html.parser')
    team_anchors = index_soup.find_all("a", "reference internal")

    deliverables_by_team = {}
    # The first anchor and the last two anchors of the index page are skipped.
    for team_anchor in team_anchors[1:-2]:
        page_name = team_anchor["href"]

        team_page = rq.get("https://governance.openstack.org/tc/reference/projects/%s" % (page_name))
        team_soup = BeautifulSoup(team_page.text, 'html.parser')
        project_anchors = team_soup.find_all("a", "reference internal")

        # The first three anchors of a team page are skipped.
        team_key = letters_only.sub('',  page_name[:-5].lower())
        deliverables_by_team[team_key] = [anchor.text for anchor in project_anchors[3:]]

    return deliverables_by_team

additional_teams = get_teams_deliverables()

# Combine the two team listings: a team already present in `teams_projects`
# keeps its current project list, other teams are added.
team_names = teams_projects.keys()
for key, val in additional_teams.items():
    teams_projects.setdefault(key, val)

# Filter out projects with no changes. A set gives O(1) membership tests instead
# of scanning the `os_projects` array for every project.
known_projects = set(os_projects)
for team, projects in teams_projects.items():
    teams_projects[team] = [p for p in projects if p in known_projects]

# One row per team, dropping the teams left without any project.
df_teams = pd.DataFrame({'team': list(teams_projects.keys()), 'projects': list(teams_projects.values())})
df_teams['size'] = df_teams['projects'].map(len)
df_teams = df_teams[df_teams['size']!=0]

In [246]:
df_team_pairs = list(combinations(df_teams.team.tolist(), 2))
df_team_pairs = np.array(df_team_pairs)

In [247]:
df_team_pairs = pd.DataFrame({"teamA": df_team_pairs[:, 0], "teamB": df_team_pairs[:, 1]})

In [248]:
def calc_intersection(row):
    """Return the percentage of projects shared by the two teams of ``row``.

    The value is ``100 * len(A & B) / len(A | B)`` where A and B are the project
    collections stored in ``teams_projects`` for ``row['teamA']`` and ``row['teamB']``.

    A team that is missing from ``teams_projects`` is treated as having no
    projects, and ``0.0`` is returned when neither team has any project (the
    ratio would otherwise divide by zero).
    """
    projects_a = set(teams_projects.get(row['teamA']) or ())
    projects_b = set(teams_projects.get(row['teamB']) or ())

    union_team = projects_a | projects_b
    if not union_team:
        return 0.0
    return 100 * len(projects_a & projects_b) / len(union_team)

In [250]:
df_team_pairs['intersection'] = df_team_pairs.apply(calc_intersection, axis=1)

In [7]:
df_team_pairs = pd.read_csv("./RQs/PQ/Files/teams_metrics.csv")

#### Map projects to their respective teams

In [272]:
def map_projects(row):
    """Remove, from a team's project list, the projects excluded for that team.

    Only the teams 'heat', 'neutron', 'oslo' and 'manila' have exclusions; a row
    of any other team is returned untouched. ``row['projects']`` is replaced on
    the given ``row``, which is also the return value.
    """
    excluded_by_team = {
        'heat': ['heat-translator', 'tosca-parser', 'os-apply-config', 'os-collect-config', 'os-refresh-config'],
        'neutron': ['neutron-lbaas', 'octavia'],
        'oslo': ['openstack-doc-tools', 'openstackdocstheme'],
        'manila': ['manila-ui'],
    }

    excluded = excluded_by_team.get(row['team'])
    if excluded is None:
        return row

    row['projects'] = [name for name in row['projects'] if name not in excluded]
    return row

In [273]:
df_teams = df_teams.apply(map_projects, axis=1)

### Compute the percentage of shared developers between pairs of projects within the same team 

In [36]:
def retrieve_devs(projects):
    """Return the distinct change owners of ``projects``, casual developers excluded.

    ``projects`` is a list-like of values matched against ``df["project"]``.
    The order of the returned list is not meaningful.
    """
    owned_by_casual = df["owner_account_id"].isin(casual_developers)
    in_projects = df["project"].isin(projects)
    owners = df.loc[(~owned_by_casual) & in_projects, "owner_account_id"].unique()
    return list(set(owners))


def retrieve_reviewers(projects, dev_type):
    """Return the distinct reviewer account ids of ``projects`` for one reviewer group.

    Parameters
    ----------
    projects : list-like
        Values matched against ``df["project"]``.
    dev_type : str
        ``"Core"`` selects the reviewers listed in ``os_core_team_devs``; any
        other value selects the reviewers that are neither in that list nor in
        ``casual_developers``.

    Returns
    -------
    list
        Unique ``_account_id`` values (order not meaningful).

    Only changes whose owner is not a casual developer are considered, and
    reviewer entries carrying a ``"tags"`` key are skipped.
    """
    # Sets give O(1) membership tests inside the loop.
    core_ids = set(os_core_team_devs)
    casual_ids = set(casual_developers)

    reviewer_lists = df.loc[(df["project"].isin(projects)) &
                            (~df["owner_account_id"].isin(casual_developers)),
                            "reviewers"].values.tolist()

    result = set()
    # `hpr.flatten_list` presumably flattens the per-change lists — TODO confirm.
    for rev in hpr.flatten_list(reviewer_lists):
        if "tags" in rev:
            continue

        account_id = rev['_account_id']
        if dev_type == "Core":
            # A "Core" request keeps core reviewers only; it must not fall
            # through to the non-core branch below.
            if account_id in core_ids:
                result.add(account_id)
        elif account_id not in core_ids and account_id not in casual_ids:
            result.add(account_id)
    return list(result)

def retrieve_intersect(list1, list2):
    """Return the elements found in both inputs as a list (order not meaningful)."""
    shared = set(list1)
    shared.intersection_update(list2)
    return list(shared)

def retrieve_union(list1, list2):
    """Return the distinct elements found in either input as a list (order not meaningful)."""
    merged = set(list1)
    merged.update(list2)
    return list(merged)

def compute_inter_within_team(row):
    """Compute the shared-developer percentage for every pair of a team's projects.

    For each unordered pair (A, B) taken from ``row['projects']`` the value is
    ``100 * len(devs(A) & devs(B)) / len(devs(A) | devs(B))``, or ``0`` when
    neither project has developers. Developers come from ``retrieve_devs``.

    Returns
    -------
    list
        One percentage per pair, in ``itertools.combinations`` order.
    """
    # Each project takes part in many pairs, so its developers are looked up
    # only once and reused instead of calling `retrieve_devs` for every pair.
    devs_by_project = {}

    def devs_of(project):
        if project not in devs_by_project:
            devs_by_project[project] = set(retrieve_devs([project]))
        return devs_by_project[project]

    result = []
    for project_a, project_b in combinations(row['projects'], 2):
        dev_a = devs_of(project_a)
        dev_b = devs_of(project_b)

        union_dev = len(dev_a | dev_b)
        ptge_dev = 0 if union_dev == 0 else 100 * len(dev_a & dev_b) / union_dev
        result.append(ptge_dev)

    return result
    

In [37]:
df_teams['intersection'] = df_teams[df_teams['size']!=1].apply(compute_inter_within_team, axis=1)
# df_teams

In [379]:
# NOTE(review): `compute_inter_within_team` returns a plain list of percentages, so
# indexing each 'intersection' value with 'Rev' looks stale — presumably left over
# from an earlier version that returned a dict. Confirm before re-running this cell.
df_teams['Rev'] = df_teams['intersection'].map(lambda x: x['Rev'])

In [3]:
df_team_devs = pd.read_csv(f'{path}/dev_with_team.csv')

In [6]:
df_team_devs#[df_teams['size']==1]

Unnamed: 0,team,intersection
0,adjutant,40.000000
1,adjutant,24.137931
2,adjutant,31.578947
3,barbican,16.666667
4,barbican,50.000000
...,...,...
13727,openstackhelm,46.212121
13728,openstackhelm,8.444444
13729,openstackhelm,28.947368
13730,openstackhelm,15.730337


In [42]:
# Keep the teams whose size is not 1 and turn each team's list of per-pair
# percentages into one row per value before saving.
not_single_project = df_teams['size'] != 1
df_team_devs = (
    df_teams.loc[not_single_project, ['team', 'intersection']]
    .copy()
    .explode(column='intersection')
)
df_team_devs.to_csv(f'{path}/dev_with_team.csv', index=False)

In [70]:
df_team_devs.groupby(by='team').median().sort_values("intersection")

Unnamed: 0_level_0,intersection
team,Unnamed: 1_level_1
glance,2.008032
vitrage,2.898551
kolla,6.024096
swift,6.382979
openstackansible,6.802721
barbican,7.142857
rally,7.19603
solum,8.382506
nova,8.860759
heat,9.045226


In [142]:
del df_teams['intersection']

In [26]:
df_team_revs = pd.read_csv(f'{path}/rev_with_team.csv')

In [137]:
# df_team_revs = df_teams.copy()
# df_team_revs['status'] = [['Core', 'Non-core']for _ in range(len(df_team_revs))]
# df_team_revs = df_team_revs.explode(column='status')
# df_team_revs['intersection'] = df_team_revs.apply(lambda row: row['Rev'][row['status']], axis=1)
# df_team_revs = df_team_revs.explode(column='intersection')
# df_team_revs = df_team_revs[df_team_revs['size']!=1]
# df_team_revs.reset_index(drop=True, inplace=True)
# for c in ['projects', 'size', 'Rev']:
#     del df_team_revs[c]
df_team_revs.to_csv(f'{path}/rev_with_team.csv', index=False)

In [143]:
df_teams.to_csv(f'{path}/os_teams.csv', index=None)

In [8]:
df_teams = pd.read_csv(f'{path}/os_teams.csv')
df_teams['projects'] = df_teams['projects'].map(ast.literal_eval)

### Compute developers' intersection across OpenStack teams' projects

In [16]:
def combine_projects_pairs_metrics(status='Dev'):
    '''Combine the generated CSV files of one category into a single DataFrame.

    Parameters
    ----------
    status : str
        Sub-directory of ``./RQs/PQ/Files/project_metrics`` to read (default ``'Dev'``).

    Returns
    -------
    pandas.DataFrame
        The rows of every file listed by ``hpr.list_file`` for that directory,
        in listing order (original per-file indexes are kept); an empty
        DataFrame when nothing is listed.
    '''
    data_path = osp.join('.', 'RQs', 'PQ', 'Files', 'project_metrics', status)
    # `hpr.list_file` presumably returns the file names found in `data_path` — TODO confirm.
    frames = [pd.read_csv(osp.join(data_path, f)) for f in hpr.list_file(data_path)]
    if not frames:
        # `pd.concat` raises on an empty sequence.
        return pd.DataFrame([])
    # Concatenate once: growing a DataFrame inside a loop re-copies all rows each time.
    return pd.concat(frames)

In [11]:
projects_pairs_dev = combine_projects_pairs_metrics()

In [14]:
projects_pairs_dev.to_csv(f'{path}/inter_team_dev.csv', index=False)

In [17]:
projects_pairs_rev = combine_projects_pairs_metrics('Rev')

In [18]:
projects_pairs_rev.to_csv(f'{path}/inter_team_rev.csv', index=False)

In [9]:
projects_pairs_rev = pd.read_csv(f'{path}/inter_team_rev.csv')

In [15]:
projects_pairs_rev.groupby('status')['intersection'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Core,152444.0,3.876762,3.691422,0.0,0.961538,3.003003,5.844156,50.0
Non-core,152444.0,3.397173,3.615398,0.0,0.0,2.494331,5.2,58.823529


### # of developers per # of projects

In [19]:
casual_contributors = pd.read_csv(f"{path}/casual_contributors.csv")["id"].values

In [139]:
# Number of distinct projects in which each non-casual developer owns changes.
non_casual_changes = df.loc[~df["owner_account_id"].isin(casual_contributors)]
df_dev_per_projects = (
    non_casual_changes
    .groupby(by=["owner_account_id"])
    .agg({"project": pd.Series.nunique})
    .reset_index(level=0)
)
df_dev_per_projects["status"] = df_dev_per_projects["owner_account_id"].map(
    lambda rev_id: "Core" if rev_id in os_core_team_devs else "Non-core"
)
df_dev_per_projects['individual'] = 'Developer'

### # of reviewers per # of projects

In [137]:
# Number of distinct projects in which each non-casual reviewer appears.
# Sets give O(1) membership tests inside the per-row lambdas below.
casual_reviewer_filter = set(casual_contributors)
core_reviewer_filter = set(os_core_team_devs)

df_rev_per_projects = df.loc[(~df["owner_account_id"].isin(casual_contributors))].copy()
# Keep the account ids of reviewer entries without a "tags" key that are not casual contributors.
df_rev_per_projects["reviewers"] = df_rev_per_projects["reviewers"].map(
    lambda revs: [
        rev["_account_id"]
        for rev in revs
        if ("tags" not in rev) and (rev["_account_id"] not in casual_reviewer_filter)
    ]
)
# One row per (change, reviewer).
df_rev_per_projects = df_rev_per_projects[["project", "number", "reviewers"]].explode(column="reviewers")

df_rev_per_projects = (
    df_rev_per_projects
    .groupby(by=["reviewers"])
    .agg({"project": pd.Series.nunique})
    .reset_index(level=0)
    .rename(columns={"reviewers": "owner_account_id"})
)
df_rev_per_projects["status"] = df_rev_per_projects["owner_account_id"].map(
    lambda rev_id: "Core" if rev_id in core_reviewer_filter else "Non-core"
)

df_rev_per_projects['individual'] = 'Reviewer'
# attrs = ['projects_nbr', 'status', 'individual']

In [141]:
pd.concat((df_dev_per_projects, df_rev_per_projects)).to_csv(f'{path}/dev_rev_pro.csv', index=None)

### # of developers per # of teams

In [8]:
online_repositories = dict(zip(df_teams['team'].values.reshape(-1), df_teams["projects"].values))

def invert_projects_teams():
    """Return a project -> team mapping built from ``online_repositories``.

    When a project is listed under several teams, the team iterated last wins.
    """
    team_of_project = {}
    for team_name, team_projects in online_repositories.items():
        for project_name in team_projects:
            team_of_project[project_name] = team_name
    return team_of_project

project_teams_inv = invert_projects_teams()

In [11]:
pd.DataFrame({'project': project_teams_inv.keys(), 'team': project_teams_inv. values()}).to_csv(f'{path}/inv_pro_team.csv', index=None)

In [91]:
# Number of distinct teams in which each non-casual developer owns changes.
# `.copy()` makes the selection an independent frame, so adding the "team" column
# below does not write into a slice of `df`.
df_dev_per_teams = df.loc[
    (~df["owner_account_id"].isin(casual_contributors)), ["project", "owner_account_id"]
].copy()
df_dev_per_teams["team"] = df_dev_per_teams["project"].map(lambda p: project_teams_inv.get(p))

# Changes of projects that map to no team are dropped before counting.
df_dev_per_teams = (
    df_dev_per_teams
    .dropna(subset=["team"])
    .groupby(by=["owner_account_id"])
    .agg({"team": pd.Series.nunique})
    .reset_index(level=0)
)
# A set gives O(1) membership tests when labelling each developer.
core_developer_filter = set(os_core_team_devs)
df_dev_per_teams["status"] = df_dev_per_teams["owner_account_id"].map(
    lambda rev_id: "Core" if rev_id in core_developer_filter else "Non-core"
)
df_dev_per_teams['individual'] = 'Developer'

### # of reviewers per # of teams

In [93]:
# Number of distinct teams in which each non-casual reviewer appears.
# Sets give O(1) membership tests inside the per-row lambdas below.
casual_team_reviewer_filter = set(casual_contributors)
core_team_reviewer_filter = set(os_core_team_devs)

df_rev_per_teams = (
    df.loc[(~df["owner_account_id"].isin(casual_contributors))]
    # "team" holds a single value per row, so it needs no explode; rows whose
    # project maps to no team are dropped.
    .assign(team=lambda frame: frame["project"].map(lambda p: project_teams_inv.get(p)))
    .dropna(subset=["team"])
    # Replace the change owner by the list of reviewer ids (entries with a "tags"
    # key and casual contributors excluded), then emit one row per reviewer.
    .assign(owner_account_id=lambda frame: frame["reviewers"].map(
        lambda revs: [
            rev["_account_id"]
            for rev in revs
            if ("tags" not in rev) and (rev["_account_id"] not in casual_team_reviewer_filter)
        ]
    ))
    .explode(column="owner_account_id")
    .groupby(by=["owner_account_id"])
    .agg({"team": pd.Series.nunique})
    .reset_index(level=0)
)
df_rev_per_teams["status"] = df_rev_per_teams["owner_account_id"].map(
    lambda rev_id: "Core" if rev_id in core_team_reviewer_filter else "Non-core"
)
df_rev_per_teams['individual'] = 'Reviewer'

In [99]:
pd.concat((df_dev_per_teams, df_rev_per_teams)).to_csv(f'{path}/dev_rev_team.csv', index=None)