In [1]:
import ast
import os
import re
from datetime import datetime

import numpy as np
import pandas as pd
import requests as rq
from bs4 import BeautifulSoup
from github import Github

import utils.helpers as hpr

In [2]:
def combine_openstack_data():
    '''Combine the generated change CSV files into a single DataFrame.

    Every file listed by ``hpr.list_file`` under ``<hpr.DIR>Changes/`` is read
    with ``pd.read_csv``. The frames are concatenated in one call,
    de-duplicated on the ``number`` column (first occurrence kept), sorted by
    ``updated`` in descending order and re-indexed from 0.

    Returns:
        pd.DataFrame: the combined, de-duplicated and sorted changes.

    Raises:
        FileNotFoundError: if no file is listed under the data directory.
    '''
    data_path = "%sChanges/" % hpr.DIR
    # Read everything first and concatenate once: calling pd.concat inside the
    # loop re-copies the accumulated frame on every iteration.
    frames = [pd.read_csv("%s%s" % (data_path, f)) for f in hpr.list_file(data_path)]
    if not frames:
        # Without this guard the failure surfaces later as a KeyError on "number".
        raise FileNotFoundError("No change files found in %s" % data_path)

    df = pd.concat(frames)
    df = df.drop_duplicates(subset=["number"])
    df = df.sort_values(by="updated", ascending=False).reset_index(drop=True)

    return df

In [3]:
# Build the combined changes frame once; later cells look changes up in `df` by their "number".
df = combine_openstack_data()

### Cross-project changes over time

In [2]:
# Load the dependency table (pairs of "Source"/"Target" change numbers plus metadata columns).
# NOTE(review): a later cell writes df_all_dependencies back to this same path, so the columns
# available here depend on what a previous session saved.
df_all_dependencies = pd.read_csv("./Files/all_dependencies.csv")

In [13]:
def is_cross_service(row):
    """Tell whether a dependency row links two different services.

    A row qualifies only when both ``Service_source`` and ``Service_target``
    hold plain ``str`` values (anything else, such as a missing value,
    disqualifies it) and the two values differ.

    Returns:
        bool: True for a cross-service row, False otherwise.
    """
    if type(row["Service_source"]) is not str:
        return False
    if type(row["Service_target"]) is not str:
        return False
    return row["Service_source"] != row["Service_target"]

In [14]:
# Flag every dependency row whose two endpoints belong to different services (row-wise apply).
df_all_dependencies["is_cross_service"] = df_all_dependencies.apply(is_cross_service, axis=1)

In [15]:
# Build the number -> creation-date lookup once instead of scanning the whole
# `df` frame for every dependency row.
# The last 19 characters of "created" are dropped — presumably the time part,
# leaving the calendar date; confirm against the data.
created_date_by_number = df.set_index("number")["created"].str[:-19]
df_all_dependencies["Source_date"] = df_all_dependencies["Source"].map(created_date_by_number)
df_all_dependencies["Target_date"] = df_all_dependencies["Target"].map(created_date_by_number)

# The per-row lookup used to fail loudly on a change number missing from `df`;
# keep that guarantee instead of silently producing missing dates.
assert df_all_dependencies[["Source_date", "Target_date"]].notna().all().all(), \
    "some Source/Target change numbers have no creation date in df"

In [23]:
# Zero-initialised counter keyed by every year (first four characters) that
# appears in either date column.
all_change_dates = hpr.flatten_list(df_all_dependencies[["Source_date", "Target_date"]].values)
observed_years = set([change_date[:4] for change_date in all_change_dates])
cross_project_changes_overtime = {year[:4]: 0 for year in observed_years}

In [25]:
# Count each change taking part in a cross-project dependency the first time it
# is seen, bucketed by the year prefix (first four characters) of its date.
# NOTE(review): the "is_cross" column is not created in any cell visible in this
# notebook — presumably it is already present in all_dependencies.csv; confirm.
# A set replaces the original list: membership tests on a growing list made the
# loop quadratic in the number of dependency rows.
visited_numbers = set()
for idx, row in df_all_dependencies[df_all_dependencies["is_cross"]==True].iterrows():
    source_number = row["Source"]
    target_number = row["Target"]
    if source_number not in visited_numbers:
        cross_project_changes_overtime[row["Source_date"][:4]] += 1

    if target_number not in visited_numbers:
        cross_project_changes_overtime[row["Target_date"][:4]] += 1

    # Both numbers are recorded only after both checks, exactly as before.
    visited_numbers.update((source_number, target_number))

In [32]:
# Turn the per-year counters into a (year, count) table ordered by year.
df_cross_project_changes_overtime = pd.DataFrame({"year": cross_project_changes_overtime.keys(), "count": cross_project_changes_overtime.values()}).sort_values("year")

In [87]:
# Persist the per-year cross-project counts.
# NOTE(review): this is the only output of this notebook written under RQ4; every other
# cell writes to ./RQs/RQ3/Files/ — confirm the target directory is intended.
df_cross_project_changes_overtime.to_csv("./RQs/RQ4/Files/cross_project_changes_overtime.csv", index=False)

In [24]:
# Zero-initialised counter keyed by every year (first four characters) found in the two columns.
# NOTE(review): "Source_datetime"/"Target_datetime" are not created in any cell visible in this
# notebook (only "Source_date"/"Target_date" are), while the counting loop further down indexes
# this dict with years taken from "Source_date"/"Target_date" — presumably the *_datetime
# columns come from the CSV; confirm both column pairs cover the same years.
cross_service_changes_overtime = hpr.flatten_list(df_all_dependencies[["Source_datetime", "Target_datetime"]].values)
cross_service_changes_overtime = {d[:4]: 0 for d in set([d[:4] for d in cross_service_changes_overtime])}

In [27]:
# Distinct release names found on either side of a dependency pair.
cross_project_changes_release = set(hpr.flatten_list(df_all_dependencies[["Source_release", "Target_release"]].values))
# "2023.1 antelope" is excluded from the release analysis (reason not stated in
# this notebook). discard() instead of remove(): the cell no longer raises
# KeyError when that release does not occur in the data.
cross_project_changes_release.discard("2023.1 antelope")

In [79]:
# Count each change taking part in a cross-service dependency the first time it
# is seen, bucketed by the year prefix (first four characters) of its date.
# A set replaces the original list: membership tests on a growing list made the
# loop quadratic in the number of dependency rows.
visited_service_numbers = set()
for idx, row in df_all_dependencies[df_all_dependencies["is_cross_service"]==True].iterrows():
    source_number = row["Source"]
    target_number = row["Target"]
    if source_number not in visited_service_numbers:
        cross_service_changes_overtime[row["Source_date"][:4]] += 1

    if target_number not in visited_service_numbers:
        cross_service_changes_overtime[row["Target_date"][:4]] += 1

    # Both numbers are recorded only after both checks, exactly as before.
    visited_service_numbers.update((source_number, target_number))

In [80]:
# Turn the per-year cross-service counters into a (year, count) table ordered by year.
df_cross_service_changes_overtime = pd.DataFrame({"year": cross_service_changes_overtime.keys(), "count": cross_service_changes_overtime.values()}).sort_values("year")

In [85]:
# Total of the per-year cross-service counters (displayed as the cell output).
np.sum(list(cross_service_changes_overtime.values()))

15628

In [16]:
# Repositories on either side of the cross-service dependency rows in which the source or the
# target date starts with "2017".
# NOTE(review): the year is hardcoded and `projects` is not read by any other visible cell —
# presumably an ad-hoc inspection; confirm whether it is still needed.
projects = set(hpr.flatten_list(df_all_dependencies.loc[(df_all_dependencies.Source_date.str.startswith("2017") | df_all_dependencies.Target_date.str.startswith("2017")) & (df_all_dependencies.is_cross_service==True), ["Source_repo", "Target_repo"]].values))

In [88]:
# Persist the per-year cross-service counts.
df_cross_service_changes_overtime.to_csv("./RQs/RQ3/Files/cross_service_changes_overtime.csv", index=False)

### OpenStack archived projects

In [14]:
def getGitRepos(owner, token):
    """List the names of all repositories of a GitHub organization.

    Args:
        owner: login of the GitHub organization to list.
        token: GitHub access token used to authenticate the API client.

    Returns:
        list[str]: the bare name of every repository of the organization.
    """
    organization = Github(token).get_organization(owner)
    # `repo.name` is the bare repository name. The previous
    # `ssh_url.split('/')[-1].replace('.git', '')` removed *every* ".git"
    # occurrence, so a name such as "foo.github.io" came out as "foohub.io".
    return [repo.name for repo in organization.get_repos()]

In [15]:
owner = 'openstack-archive'
# SECURITY: access tokens must never be committed. The token that used to be
# hardcoded in this cell is exposed in the notebook history and must be revoked;
# the token is now read from the GITHUB_TOKEN environment variable instead.
token = os.environ["GITHUB_TOKEN"]

archived_projects = getGitRepos(owner, token)

In [16]:
# Qualify the bare repository names with the "openstack/" namespace used by the
# dependency data. Names that already carry the prefix are left alone so that
# re-running this cell does not produce "openstack/openstack/<name>".
archived_projects = [p if p.startswith("openstack/") else "openstack/%s" % p for p in archived_projects]

In [33]:
# Overwrites the same CSV the dependency table is loaded from, including any columns added to
# df_all_dependencies since it was read.
df_all_dependencies.to_csv("./Files/all_dependencies.csv", index=False)

In [None]:
# Shows the cross-service dependency rows whose source or target repository is in `result`,
# ordered by descending source date, then descending target date.
# NOTE(review): `result` is not defined in any cell visible in this notebook, so this
# never-executed cell raises NameError on a fresh kernel. Presumably it is a collection of
# repository names (it is passed to `isin`) — confirm which one is meant.
df_all_dependencies.loc[(df_all_dependencies.Source_repo.isin(result) | df_all_dependencies.Target_repo.isin(result)) & (df_all_dependencies.is_cross_service==True)].sort_values(by=["Source_date", "Target_date"], ascending=[0,0])

In [11]:
# Load the extended paths table.
extended_paths_number = pd.read_csv("./Files/Number/extended_paths.csv")

# "Path" is read from the CSV as text; ast.literal_eval turns each cell back into the Python
# literal it represents (presumably a list of change numbers — confirm).
extended_paths_number["Path"] = extended_paths_number["Path"].apply(ast.literal_eval)

In [12]:
# Number of elements in each parsed path.
extended_paths_number["length"] = extended_paths_number["Path"].map(len)

### Cross-project changes across releases

In [8]:
# Scrape the release table of releases.openstack.org into a dict mapping the
# first 10 characters of the third column (the initial release date) to the
# lower-cased release name.
request = rq.get("https://releases.openstack.org/")
# Fail here on an HTTP error rather than later with an AttributeError while
# parsing an error page that has no <tbody>.
request.raise_for_status()
soup = BeautifulSoup(request.text, 'html.parser')

tbody = soup.find("tbody")
openstack_releases = {}

for tr in tbody.find_all("tr"):
    td_list = tr.find_all("td")
    initial_release = td_list[2].text[:10]
    # select_one takes a CSS selector. The {"class": "doc"} dict that used to be
    # passed as second argument was received as the `namespaces` parameter and
    # never filtered by class, so the first <span> of the cell was (and still
    # is) the element used. Use the selector "span.doc" if a class filter is
    # really wanted.
    release_name = td_list[0].select_one('span').text.lower()
    openstack_releases[initial_release] = release_name

In [9]:
# Rebuild the mapping in ascending key order: the release lookups below walk the
# keys in dict order and stop at the first key that is not smaller than the date.
openstack_releases_keys = sorted(openstack_releases)
openstack_releases = {release_day: openstack_releases[release_day] for release_day in openstack_releases_keys}

In [7]:
def retrieve_release(nbr):
    """Return the OpenStack release a change belongs to.

    The change is looked up in ``df`` by its ``number``. When its branch is
    ``stable/<name>`` and ``<name>`` is a known release name, that name is
    returned. Otherwise the creation date (``created`` without its last 19
    characters) is compared, as a string, against the keys of
    ``openstack_releases`` in dict order and the release of the first key that
    is greater than or equal to the date is returned.

    Returns:
        The release name, or None when no key is greater than or equal to the date.
    """
    change = df.loc[df["number"] == nbr, ["created", "branch"]]
    branch = change["branch"].values[0]
    if branch.startswith("stable/"):
        stable_name = branch.split("/")[1]
        if stable_name in openstack_releases.values():
            return stable_name

    created_day = change["created"].values[0][:-19]
    matching_releases = (
        release_name
        for release_day, release_name in openstack_releases.items()
        if created_day <= release_day
    )
    return next(matching_releases, None)

In [57]:
# Attach a release name to both endpoints of every dependency pair (one `df` lookup per row
# and per side, see retrieve_release).
df_all_dependencies["Source_release"] = df_all_dependencies["Source"].map(retrieve_release)
df_all_dependencies["Target_release"] = df_all_dependencies["Target"].map(retrieve_release)

In [None]:
# Zero-initialised counter with one entry per known release name.
cross_project_changes_across_releases = dict.fromkeys(openstack_releases.values(), 0)

In [84]:
# Count each change taking part in a cross-project dependency the first time it
# is seen, bucketed by its release.
# NOTE(review): the "is_cross" column is not created in any cell visible in this
# notebook — presumably it is already present in all_dependencies.csv; confirm.
# A set replaces the original list: membership tests on a growing list made the
# loop quadratic in the number of dependency rows.
visited_numbers = set()
for idx, row in df_all_dependencies[df_all_dependencies["is_cross"]==True].iterrows():
    source_number = row["Source"]
    target_number = row["Target"]
    if source_number not in visited_numbers:
        cross_project_changes_across_releases[row["Source_release"]] += 1

    if target_number not in visited_numbers:
        cross_project_changes_across_releases[row["Target_release"]] += 1

    # Both numbers are recorded only after both checks, exactly as before.
    visited_numbers.update((source_number, target_number))

In [88]:
# Build the (release, count) table first and save it in a second step:
# DataFrame.to_csv returns None when given a path, so chaining it onto the
# assignment left this variable bound to None instead of the table.
df_cross_project_changes_across_releases = pd.DataFrame({"release": cross_project_changes_across_releases.keys(), "count": cross_project_changes_across_releases.values()})
df_cross_project_changes_across_releases.to_csv("./RQs/RQ3/Files/cross_project_changes_across_releases.csv", index=False)

In [89]:
# Zero-initialised counter with one entry per known release name.
cross_service_changes_across_releases = dict.fromkeys(openstack_releases.values(), 0)

In [90]:
# Count each change taking part in a cross-service dependency the first time it
# is seen, bucketed by its release.
# A set replaces the original list: membership tests on a growing list made the
# loop quadratic in the number of dependency rows.
visited_service_numbers = set()
for idx, row in df_all_dependencies[df_all_dependencies["is_cross_service"]==True].iterrows():
    source_number = row["Source"]
    target_number = row["Target"]
    if source_number not in visited_service_numbers:
        cross_service_changes_across_releases[row["Source_release"]] += 1

    if target_number not in visited_service_numbers:
        cross_service_changes_across_releases[row["Target_release"]] += 1

    # Both numbers are recorded only after both checks, exactly as before.
    visited_service_numbers.update((source_number, target_number))

In [91]:
# Persist the per-release cross-service counts as a (release, count) table.
pd.DataFrame({"release": cross_service_changes_across_releases.keys(), "count": cross_service_changes_across_releases.values()}).to_csv("./RQs/RQ3/Files/cross_service_changes_across_releases.csv", index=False)

### Retrieve the projects of each release

In [74]:
# For every row of the release table on releases.openstack.org, open the page
# linked from the first column and collect the project names listed in its
# table of contents. Result: {first path segment of the link: [project names]}.
request = rq.get("https://releases.openstack.org/")
# Fail on HTTP errors instead of parsing an error page.
request.raise_for_status()
soup = BeautifulSoup(request.text, 'html.parser')

tbody = soup.find("tbody")
openstack_project_per_releases = {}

for tr in tbody.find_all("tr"):
    td_list = tr.find_all("td")
    release_segment = td_list[0].find("a").get("href")
    release_request = rq.get("https://releases.openstack.org/%s" % (release_segment))
    release_request.raise_for_status()

    soup_release = BeautifulSoup(release_request.text, 'html.parser')

    table_of_contents = soup_release.find("div", attrs={"id": "table-of-contents"})

    project_categories = table_of_contents.find("ul").find("li").find_all("ul")
    release_projects = []
    # The first <ul> is skipped, as in the original index loop — presumably it
    # is the enclosing list rather than a project category; confirm on the page.
    for category in project_categories[1:]:
        release_projects.extend(item.text for item in category.find_all("li"))

    # De-duplicate the names; the resulting order is arbitrary.
    openstack_project_per_releases[release_segment.split("/")[0]] = list(set(release_projects))

In [75]:
# One row per release, holding the list of project names scraped for it.
df_openstack_releases = pd.DataFrame({"release": openstack_project_per_releases.keys(), "projects": openstack_project_per_releases.values()})

In [76]:
# Number of distinct projects scraped for each release.
df_openstack_releases["count"] = df_openstack_releases["projects"].map(len)

In [84]:
# Save the (release, count) columns, ordered by release.
# NOTE(review): iloc[2:-4] drops the first 2 and the last 4 rows of the scraped table; the
# reason is not visible here and the slice depends on the live page's row order — confirm.
df_openstack_releases.iloc[2:-4, [0,2]].sort_values("release").to_csv("./RQs/RQ3/Files/projects_releases.csv", index=False)

### How many other projects does X depend on over time?

In [47]:
def identify_related_projects(period):
    """Collect the repositories involved in merged dependency pairs during ``period``.

    Only rows of ``df_all_dependencies`` whose source and target changes are
    both ``MERGED`` are considered. The source repository is kept when
    ``Source_date`` starts with ``period`` and the target repository when
    ``Target_date`` does.

    Returns:
        list: the distinct repository names, in no particular order.
    """
    date_and_repo_columns = (("Source_date", "Source_repo"), ("Target_date", "Target_repo"))
    related_repos = []
    for _, dependency in df_all_dependencies.iterrows():
        if dependency["status_source"] != "MERGED" or dependency["status_target"] != "MERGED":
            continue
        for date_column, repo_column in date_and_repo_columns:
            if dependency[date_column].startswith(period):
                related_repos.append(dependency[repo_column])
    return list(set(related_repos))

def identify_related_projects_releases(release):
    """Collect the repositories involved in merged dependency pairs of ``release``.

    Only rows of ``df_all_dependencies`` whose source and target changes are
    both ``MERGED`` are considered. The source repository is kept when
    ``Source_release`` equals ``release`` and the target repository when
    ``Target_release`` does.

    Returns:
        list: the distinct repository names, in no particular order.
    """
    release_and_repo_columns = (("Source_release", "Source_repo"), ("Target_release", "Target_repo"))
    related_repos = []
    for _, dependency in df_all_dependencies.iterrows():
        if dependency["status_source"] != "MERGED" or dependency["status_target"] != "MERGED":
            continue
        for release_column, repo_column in release_and_repo_columns:
            if dependency[release_column] == release:
                related_repos.append(dependency[repo_column])
    return list(set(related_repos))

In [30]:
# One row per year key of cross_service_changes_overtime, ordered by year ...
df_dependent_projects = pd.DataFrame({"year": cross_service_changes_overtime.keys()}).sort_values("year").reset_index(drop=True)
# ... each holding the list of repositories returned by identify_related_projects for that year.
df_dependent_projects["project"] = df_dependent_projects["year"].map(identify_related_projects)

In [31]:
# Unfold the per-year lists into one row per (year, project); the index keeps the original
# row labels, so rows of the same year share an index value.
df_dependent_projects = df_dependent_projects.explode(column="project")

In [51]:
def calc_depedent_projects_overtime(row):
    """Count the other projects linked to ``row["project"]`` within ``row["year"]``.

    A dependency row of ``df_all_dependencies`` is taken into account when it
    is flagged ``is_cross``, both of its changes are ``MERGED``, the project is
    its source or its target repository, and both ``Source_date`` and
    ``Target_date`` start with the year.

    Args:
        row: mapping with a ``"project"`` and a ``"year"`` entry.

    Returns:
        int: number of distinct repositories, other than the project itself,
        found in the selected rows; 0 when no row is selected. (This used to be
        ``len(...) - 1``, i.e. -1 when nothing matched.)
    """
    deps = df_all_dependencies
    project = row["project"]
    year = row["year"]

    involves_project = (deps["Source_repo"] == project) | (deps["Target_repo"] == project)
    # The original repeated the two date tests in both branches of the "or";
    # they are factored out here.
    selected = (
        (deps["is_cross"] == True)
        & (deps["status_source"] == "MERGED")
        & (deps["status_target"] == "MERGED")
        & involves_project
        & deps["Source_date"].str.startswith(year)
        & deps["Target_date"].str.startswith(year)
    )
    related_projects = set(hpr.flatten_list(deps.loc[selected, ["Source_repo", "Target_repo"]].values))
    # Drop the project itself instead of subtracting 1, so that an empty
    # selection yields 0 rather than -1.
    related_projects.discard(project)
    return len(related_projects)


def calc_depedent_projects_across_releases(row):
    """Count the other projects linked to ``row["project"]`` within ``row["release"]``.

    A dependency row of ``df_all_dependencies`` is taken into account when it
    is flagged ``is_cross``, both of its changes are ``MERGED``, the project is
    its source or its target repository, and both ``Source_release`` and
    ``Target_release`` equal the release.

    Args:
        row: mapping with a ``"project"`` and a ``"release"`` entry.

    Returns:
        int: number of distinct repositories, other than the project itself,
        found in the selected rows; 0 when no row is selected. (This used to be
        ``len(...) - 1``, so -1 was returned — and written to the release CSV —
        whenever nothing matched.)
    """
    deps = df_all_dependencies
    project = row["project"]
    release = row["release"]

    involves_project = (deps["Source_repo"] == project) | (deps["Target_repo"] == project)
    # The original repeated the two release tests in both branches of the "or";
    # they are factored out here.
    selected = (
        (deps["is_cross"] == True)
        & (deps["status_source"] == "MERGED")
        & (deps["status_target"] == "MERGED")
        & involves_project
        & (deps["Source_release"] == release)
        & (deps["Target_release"] == release)
    )
    related_projects = set(hpr.flatten_list(deps.loc[selected, ["Source_repo", "Target_repo"]].values))
    # Drop the project itself instead of subtracting 1, so that an empty
    # selection yields 0 rather than -1.
    related_projects.discard(project)
    return len(related_projects)

In [38]:
# Row-wise count of the other projects each (year, project) pair is linked to.
df_dependent_projects["dependent_projects"] = df_dependent_projects.apply(calc_depedent_projects_overtime, axis=1)

In [44]:
# Treat a -1 result (no matching dependency rows for that year/project) as zero dependent projects.
df_dependent_projects.loc[df_dependent_projects["dependent_projects"]==-1, "dependent_projects"] = 0

In [45]:
# Persist the per-year table with a fresh index, ordered by year.
df_dependent_projects.reset_index(drop=True).sort_values("year").to_csv("./RQs/RQ3/Files/dependent_projects_year.csv", index=False)

In [63]:
# Display the table ordered by number of dependent projects (smallest first).
df_dependent_projects.sort_values("dependent_projects")

Unnamed: 0,year,project,dependent_projects
0,2011,openstack/keystone,0
10,2021,openstack/python-magnumclient,0
5,2016,openstack/charm-nova-lxd,0
5,2016,openstack/charm-glance,0
5,2016,openstack/charm-cinder,0
...,...,...,...
7,2018,openstack/project-config,104
4,2015,openstack/project-config,113
5,2016,openstack/project-config,145
6,2017,openstack/project-config,196


#### Across releases

In [57]:
# Same computation as the per-year table, keyed by release:
# one row per release name in cross_project_changes_release (a set built in an earlier cell) ...
df_dependent_projects_releases = pd.DataFrame({"release": list(cross_project_changes_release)}).sort_values("release").reset_index(drop=True)
# ... the repositories involved in merged dependency pairs of that release ...
df_dependent_projects_releases["project"] = df_dependent_projects_releases["release"].map(identify_related_projects_releases)
# ... unfolded into one row per (release, project) ...
df_dependent_projects_releases = df_dependent_projects_releases.explode(column="project")
# ... with the number of other projects each pair is linked to.
df_dependent_projects_releases["dependent_projects"] = df_dependent_projects_releases.apply(calc_depedent_projects_across_releases, axis=1)

In [61]:
# Persist the per-release table.
df_dependent_projects_releases.to_csv("./RQs/RQ3/Files/dependent_projects_release.csv", index=False)

### Number of modified, non-modified and archived projects across releases

In [4]:
# Load the casual contributors table and keep only the values of its "dev" column as an array.
# NOTE(review): `casual_contributors` is not read by any other cell visible in this notebook —
# confirm whether it is still needed.
casual_contributors = pd.read_csv("./RQs/Files/casual_contributors.csv")
casual_contributors = casual_contributors["dev"].values

In [5]:
# Merged changes whose "is_owner_bot" flag is 0, restricted to the columns used below.
# .copy() so that columns can be added to df_subset without touching `df`.
df_subset = df.loc[(df["is_owner_bot"] == 0) & 
                   (df["status"] == "MERGED"),
                   ["number", "project", "created", "branch"]].copy()


In [10]:
# Release of each change (one `df` lookup per change, see retrieve_release).
df_subset["release"] = df_subset["number"].map(retrieve_release)
# Year = first four characters of the "created" string.
df_subset["year"] = df_subset["created"].map(lambda date: date[:4])

In [121]:
# Persist the subset together with its release and year columns.
df_subset.to_csv("./RQs/RQ3/Files/df_subset.csv", index=False)

In [11]:
# Set of projects with at least one change of df_subset in each release.
df_subset_dict_release = df_subset.groupby(by=["release"])["project"].apply(list).to_dict()
df_subset_dict_release = {k: set(v) for k,v in df_subset_dict_release.items()}
# "2023.1 antelope" is left out of the release analysis (reason not stated in
# this notebook). The default argument keeps the cell from raising KeyError
# when no change of that release is present in the data.
df_subset_dict_release.pop("2023.1 antelope", None)

# Same grouping, keyed by creation year.
df_subset_dict_year = df_subset.groupby(by=["year"])["project"].apply(list).to_dict()
df_subset_dict_year = {k: set(v) for k,v in df_subset_dict_year.items()}

In [12]:
def retrieve_archive_date(repo):
    """Scrape the archival date of a repository from its GitHub page.

    The page of the repository under the ``openstack-archive`` organization is
    fetched (``"openstack/"`` in ``repo`` is replaced by
    ``"openstack-archive/"``) and the text node containing "This repository has
    been archived by the owner" is searched for a date.

    Args:
        repo: repository name, e.g. ``"openstack/<name>"``.

    Returns:
        str: the first substring of the banner made of three letters, a one or
        two digit number, a comma and a four digit number (e.g. "Apr 3, 2019").

    Raises:
        requests.HTTPError: if GitHub answers with an error status.
        ValueError: if the archival banner, or a date inside it, is not found.
            (Both cases used to fail with an uninformative TypeError.)
    """
    url = "https://github.com/%s" % repo.replace("openstack/", "openstack-archive/")
    request = rq.get(url)
    request.raise_for_status()
    soup = BeautifulSoup(request.text, 'html.parser')
    archival_text = soup.find(string=re.compile("This repository has been archived by the owner"))
    if archival_text is None:
        raise ValueError("No archival banner found at %s" % url)
    archival_date = re.search(r"[a-zA-Z]{3} [0-9]{1,2}, [0-9]{4}", archival_text)
    if archival_date is None:
        raise ValueError("No archival date found in the banner at %s" % url)
    return archival_date[0]

In [17]:
# One row per archived project, with the archival date scraped from GitHub
# (one HTTP request per project, see retrieve_archive_date).
df_archived_projects = pd.DataFrame({"project": archived_projects})
df_archived_projects["archived_at"] = df_archived_projects["project"].map(retrieve_archive_date)

In [18]:
# Parse the scraped date strings first and derive the year from the parsed
# value. The original sliced the last four characters of the string *before*
# converting the column in place, so re-running the cell failed once
# "archived_at" held timestamps; pd.to_datetime on an already converted column
# is a no-op, which makes this cell safe to re-run.
df_archived_projects["archived_at"] = pd.to_datetime(df_archived_projects["archived_at"])
# Kept as a string, like the four-character slice it replaces.
df_archived_projects["year"] = df_archived_projects["archived_at"].dt.year.astype(str)

In [19]:
def retrieve_release_archived(archived_at):
    """Return the release during which a project was archived.

    The keys of ``openstack_releases`` are parsed as ``%Y-%m-%d`` dates and
    visited in dict order; the release of the first date that is on or after
    ``archived_at`` is returned.

    Returns:
        The release name, or None when every release date precedes ``archived_at``.
    """
    for release_day, release_name in openstack_releases.items():
        if archived_at <= datetime.strptime(release_day, "%Y-%m-%d"):
            return release_name
    return None

In [20]:
# Release during which each project was archived (None when no release date is on or after it).
df_archived_projects["release"] = df_archived_projects["archived_at"].map(retrieve_release_archived)

In [122]:
# Persist the archived projects with their archival date, year and release.
df_archived_projects.to_csv("./RQs/RQ3/Files/archived_projects.csv", index=False)

In [21]:
# Set of projects archived during each release.
df_archived_projects_dict_release = df_archived_projects.groupby(by=["release"])["project"].apply(list).to_dict()
df_archived_projects_dict_release = {k: set(v) for k,v in df_archived_projects_dict_release.items()}
# These two releases are left out of the release analysis (reason not stated in
# this notebook). The archive dates come from live GitHub pages, so a release
# may have no archived project at all: the default argument keeps the cell from
# raising KeyError in that case.
df_archived_projects_dict_release.pop("2023.1 antelope", None)
df_archived_projects_dict_release.pop("2023.2 bobcat", None)

# Same grouping, keyed by archival year.
df_archived_projects_dict_year = df_archived_projects.groupby(by=["year"])["project"].apply(list).to_dict()
df_archived_projects_dict_year = {k: set(v) for k,v in df_archived_projects_dict_year.items()}

In [75]:
def identify_project_types(release=True):
    """Count modified, non-modified and archived projects per release or per year.

    The keys (release names or years) are processed in ascending sorted order.
    The first key keeps its initial values: all of its projects count as
    modified, with no non-modified and no archived project. For every later key:

    * archived      = projects archived during that key;
    * modified      = projects changed during that key, minus every project
                      archived so far (current key included);
    * non_modified  = projects modified during any earlier key (archived ones
                      removed whenever the current key has archived projects),
                      minus the projects modified during the current key.

    Args:
        release: when True (default) the release-keyed dictionaries are used,
            otherwise the year-keyed ones.

    Returns:
        pd.DataFrame: long-format table with one row per (key, type) and the
        columns "release" or "year", "count" and "type" ("Modified",
        "Non modified" or "Archived").
    """
    # Select the release-keyed inputs by default, the year-keyed ones otherwise.
    main_key = "release"
    df_main = df_subset_dict_release
    main_archived_project = df_archived_projects_dict_release
    if release == False:
        main_key = "year"
        df_main = df_subset_dict_year
        main_archived_project = df_archived_projects_dict_year

    # Start with every project of a key counted as modified, then re-order the dict by key.
    openstack_project_types = {k:{"modified": list(df_main.get(k)), "non_modified": [], "archived": []} for k in df_main.keys()}
    osKeys = list(openstack_project_types.keys())
    osKeys.sort()
    openstack_project_types = {i: openstack_project_types[i] for i in osKeys}
    keys = list(openstack_project_types.keys())
    # prev/current walk over consecutive keys; prev is None on the first pass, so the
    # first key is never processed by the body below.
    prev = None
    current = keys[0]
    archived_keys = main_archived_project.keys()
    # Running totals carried from one key to the next.
    all_previous_archived_project = []
    all_previous_modified_project = list(df_main.get(current))
    for i in range(len(keys)):
        if prev and current:
            curr_modified = openstack_project_types[current]["modified"]
            archived = main_archived_project[current] if current in archived_keys else []
            all_previous_archived_project += list(archived)
            
            # Archived projects no longer count as previously modified.
            if len(archived) > 0:
                all_previous_modified_project = [p for p in all_previous_modified_project if p not in all_previous_archived_project]
            
            curr_modified = set(curr_modified).difference(all_previous_archived_project)
            non_modified = set(all_previous_modified_project).difference(curr_modified)

            openstack_project_types[current]["archived"] = list(archived)
            openstack_project_types[current]["non_modified"] = list(non_modified)
            openstack_project_types[current]["modified"] = list(curr_modified)

            # Projects modified now become "previously modified" for the next key.
            all_previous_modified_project = list(set(all_previous_modified_project + list(curr_modified)))

        # Stop once the last key has been handled; keys[i+1] below would be out of range.
        if current == keys[-1]:
            break

            
        prev, current = current, keys[i+1]

    # Replace the project lists by their sizes.
    for k in openstack_project_types:
        openstack_project_types[k]["archived"] = len(openstack_project_types[k]["archived"])
        openstack_project_types[k]["modified"] = len(openstack_project_types[k]["modified"])
        openstack_project_types[k]["non_modified"] = len(openstack_project_types[k]["non_modified"])

    modified = [openstack_project_types[r]["modified"] for r in openstack_project_types.keys()]
    non_modified = [openstack_project_types[r]["non_modified"] for r in openstack_project_types.keys()]
    archived = [openstack_project_types[r]["archived"] for r in openstack_project_types.keys()]

    # One frame per type, stacked into a single long-format table.
    df_modified = pd.DataFrame({main_key: keys, "count": modified, "type": ["Modified" for i in range(len(keys))]})
    df_non_modified = pd.DataFrame({main_key: keys, "count": non_modified, "type": ["Non modified" for i in range(len(keys))]})
    df_archived = pd.DataFrame({main_key: keys, "count": archived, "type": ["Archived" for i in range(len(keys))]})

    df_projects_evolution_keys = pd.concat((df_modified, df_non_modified, df_archived))
    df_projects_evolution_keys.reset_index(drop=True, inplace=True)

    return df_projects_evolution_keys


In [73]:
# Project counts per type, once keyed by release and once keyed by year.
os_projects_evolution_releases = identify_project_types()
os_projects_evolution_years = identify_project_types(release=False)
# Force the year column to strings.
os_projects_evolution_years["year"]=os_projects_evolution_years["year"].values.astype(str)

In [74]:
# Persist both evolution tables.
os_projects_evolution_releases.to_csv("./RQs/RQ3/Files/projects_across_releases.csv", index=False)
os_projects_evolution_years.to_csv("./RQs/RQ3/Files/projects_across_years.csv", index=False)

In [82]:
# Manual recomputation of the three counts for the "train" release — presumably a sanity check
# of identify_project_types; confirm.
# NOTE(review): iloc[:16] takes the first 16 "Modified" rows; the 16 is hardcoded and assumed to
# select the releases up to "train" in the sorted order — confirm.
releases = os_projects_evolution_releases[os_projects_evolution_releases["type"]=="Modified"].iloc[:16, 0].values
# Every project modified during any of those releases.
all_prev_modified = set(hpr.flatten_list([list(df_subset_dict_release.get(r)) for r in releases]))
# Projects modified in "train" that were not archived in "train".
modified = set(df_subset_dict_release["train"]).difference(df_archived_projects_dict_release["train"])
# Previously modified projects that were neither archived nor modified in "train".
non_modified = set(set(all_prev_modified).difference(df_archived_projects_dict_release["train"])).difference(modified)
len(modified), len(df_archived_projects_dict_release["train"]), len(non_modified)


(706, 601, 511)

### Extra

In [None]:
# path = "archived-projects"
# shutil.rmtree(path=path)
# # os.mkdir(path)
# for p in depended_archived_projects:
#     p = p.replace("openstack/", "openstack-archive/") + ".git"
#     os.system("cd archived-projects/ && git clone https://github.com/%s" % p)