In [1]:
import pandas as pd
import numpy as np
import re
import networkx as nx
import utils.helpers as hpr
import ast
from datetime import datetime

In [3]:
def combine_openstack_data():
    '''Combine generated csv files into a single DataFrame object
    '''
    df = pd.DataFrame([])
    data_path = "%sChanges/" % hpr.DIR
    changes_file_names = hpr.list_file(data_path)
    for f in changes_file_names:
        df_per_file = pd.read_csv("%s%s" % (data_path, f))
        df = pd.concat((df, df_per_file))

    df = df.drop_duplicates(subset=["number"])

    df = df.sort_values(by="updated", ascending=False).reset_index(drop=True)

    return df

In [4]:
df = combine_openstack_data()

In [23]:
def retrieve_attr(x, attr):
    rs = re.findall("%s:\s[a-zA-Z0-9/\.\:\+\-\#]{6,}" % (attr), x)
    result = []
    for row in rs:
        row = row[len(attr) + 2:]
        change_id_pattern = re.search(r"[a-zA-Z0-9]{41}", row)
        if change_id_pattern:
            result.append(change_id_pattern[0])
            continue
        number_pattern = re.search("#?https?[\:][/]{2}review[\.](opendev|openstack)[\.]org([a-z0-9A-Z\-\+/\.#]*)\d+", row)
        if number_pattern:
            result.append(re.search("\d+$", number_pattern[0][0:])[0])
    return result if len(result) != 0 else None


def retrieve_related_bug(x):
    result = re.search(r"(Related-Bug:\s#\d+)", x)
    return result[0][14:] if result else None

In [54]:
test_df = df[df["number"].isin([
    861110
    # 110705, 110704, 628498, 642605, 118521, 124181, 637664, 863165, 864948, 864993, 864946, 863158, 863581, 844850, 853589, 858606, 855017
])].copy().reset_index(drop=True)


In [56]:
test_df['depends_on']

0    [860795, 861116]
Name: depends_on, dtype: object

In [24]:
# del df["depends_on"]
# del df["needed_by"]
# del df["related_bug"]
df["depends_on"] = df["commit_message"].apply(retrieve_attr, args=("Depends-On",))
df["needed_by"] = df["commit_message"].apply(retrieve_attr, args=("Needed-By",))
df["related_bug"] = df["commit_message"].map(retrieve_related_bug)

In [27]:
df.loc[df["depends_on"].notnull(), ["depends_on"]].reset_index()

Unnamed: 0,index,depends_on
0,0,[865040]
1,2,[848198]
2,4,[812258]
3,17,[Ifbd990f15b9d85260ab20a2004fe79bafda929d9]
4,18,[865040]
...,...,...
27225,641351,[I92619a95bca2ae0c37e7fdd39da30119b43d1ad6]
27226,648973,[I01b2b7c78e7e7144280c98cdbbe29b012a8a8d93]
27227,653625,[Ic2acb568b55a05b84120b181383ea45ea190dcad]
27228,679909,[Ic01e0f16fe9e7634708fbb51499ccea3f4f40d63]


In [130]:
df.loc[df["needed_by"].notnull(), ["number", "needed_by"]].to_csv("test2.csv", index=False)

In [84]:
df_dep_new = pd.read_csv("test2.csv")
df_dep_new["depends_on"] = df_dep_new["depends_on"].apply(ast.literal_eval)

In [77]:
df_dep_old = pd.read_csv("test2.csv")
df_dep_old["depends_on"] = df_dep_old["depends_on"].apply(ast.literal_eval)

In [85]:
ddd = df_dep_old.loc[~df_dep_old["depends_on"].isin(df_dep_new["depends_on"].values)]

In [102]:
ddd[120:130]

Unnamed: 0,number,depends_on
13315,587834,[88]
14380,557694,"[Ib340376ee80ea42a732a51d0c195b048ca0440ac, 13..."
14506,563690,"[If73e7bd518a7bc60c2db08e2aa3a93dcfe79c0dd, 159]"
21312,369674,[294474]
22520,374019,[1104]
24500,232983,[401]
25303,261960,[14570]
25311,260600,[14570]
26204,218931,[61]
26371,211539,[10550]


In [31]:
df.loc[df["change_id"]== "I977fb6d72764d391b1c1b3b643968928792b34fd", "commit_message"].values[0]

'Switch off Nova V2 XML Support\n\nTempest has a switch to disable tests that exercise the\nNova V2 XML API support. We need a way to enable/disable\nthis flag in devstack. Also we disable these tests by default.\n\nDepends-On: Idd07587b0208f6ad4e34dc8fb112391c469c3ba2\nChange-Id: I977fb6d72764d391b1c1b3b643968928792b34fd\n'

In [17]:
df_temp = df.loc[df["updated"] > datetime(2022, 1, 1)]

In [19]:
df_temp = df_temp.explode(column="depends_on").reset_index(drop=True)
df_temp = df_temp.explode(column="needed_by").reset_index(drop=True)

In [20]:
len(df_temp)

39022

In [13]:
df = df.explode(column="depends_on").reset_index(drop=True)
df = df.explode(column="needed_by").reset_index(drop=True)

In [21]:
df_filter = df_temp.loc[(df_temp["depends_on"].notnull() | df_temp["related_bug"].notnull() | df_temp["needed_by"].notnull())]

In [14]:
def flatten_list(array):
    response = [item for sublist in array for item in sublist]
    return response

In [15]:
def build_depends_chain(row):
    obj = {}
    depends_on = row["depends_on"]
    obj["Target"] = row["number"]
    obj["Target_repo"] = row["project"]
    row_src = None
    if depends_on.isnumeric():
        row_src = df[df["number"] == int(depends_on)]
    else:
        row_src = df[df["change_id"] == depends_on]

    if len(row_src) != 0:
        source_numbers = flatten_list(row_src[["number"]].to_numpy())

        source_numbers = list(dict.fromkeys(source_numbers))
        obj["Source"] = source_numbers
        obj["Source_repo"] = row_src["project"].head(1).tolist()[0]

    return obj

In [16]:
def build_needed_chain(row):
    obj = {}
    needed_by = row["needed_by"]
    obj["Source"] = row["number"]
    obj["Source_repo"] = row["project"]
    row_src = None
    if needed_by.isnumeric():
        row_target = df[df["number"] == int(needed_by)]
    else:
        row_target = df[df["change_id"] == needed_by]

    if len(row_target) != 0:
        target_numbers = flatten_list(row_target[["number"]].to_numpy())

        target_numbers = list(dict.fromkeys(target_numbers))
        obj["Target"] = target_numbers
        obj["Target_repo"] = row_target["project"].head(1).tolist()[0]

    return obj

In [17]:
df_subset_columns = ["change_id", "project", "depends_on", "number"]

df_subset_dep = df.loc[df["depends_on"].isnull() == False,
                       df_subset_columns].copy().reset_index(drop=True)

df_depends_on = df_subset_dep.apply(build_depends_chain, axis=1)

df_depends_on = pd.json_normalize(data=df_depends_on, errors="ignore")

In [20]:
subset_needed_columns = ["change_id", "project", "needed_by", "number"]
df_subset_needed = df.loc[df["needed_by"].notnull(),
                subset_needed_columns].copy().reset_index(drop=True)

df_needed_by = df_subset_needed.apply(build_needed_chain, axis=1)

df_needed_by = pd.json_normalize(data=df_needed_by, errors="ignore")

In [None]:
df_columns_dep = ["change_id", "project", "depends_on", "number"]

df_subset_dep = df_temp.loc[df_temp["depends_on"].notnull(),
                       df_columns_dep].copy().reset_index(drop=True)

df_depends_on_temp = df_subset_dep.apply(build_depends_chain, axis=1)

df_depends_on_temp = pd.json_normalize(data=df_depends_on_temp, errors="ignore")

In [None]:
subset_needed_columns_temp = ["change_id", "project", "needed_by", "number"]
df_subset_needed_temp = df_temp.loc[df_temp["needed_by"].notnull(),
                subset_needed_columns_temp].copy().reset_index(drop=True)

df_needed_by_temp = df_subset_needed_temp.apply(build_needed_chain, axis=1)

df_needed_by_temp = pd.json_normalize(data=df_needed_by_temp, errors="ignore")

In [22]:
df_depends_on.isnull().sum()
# df_depends_on_temp.isnull().sum()

Target            0
Target_repo       0
Source         1184
Source_repo    1184
dtype: int64

In [23]:
df_depends_on.dropna(inplace=True)
# df_depends_on_temp.dropna(inplace=True)

In [24]:
df_depends_on.isnull().sum()
# df_depends_on_temp.isnull().sum()

Target         0
Target_repo    0
Source         0
Source_repo    0
dtype: int64

In [25]:
df_depends_on = df_depends_on.explode(column="Source").reset_index(drop=True)
# df_depends_on_temp = df_depends_on_temp.explode(column="Source").reset_index(drop=True)

In [26]:
len(df_depends_on)
# len(df_depends_on_temp)

36232

In [27]:
df_needed_by.isnull().sum()
# df_needed_by_temp.isnull().sum()

Source            0
Source_repo       0
Target         1192
Target_repo    1192
dtype: int64

In [28]:
df_needed_by.dropna(inplace=True)
# df_needed_by_temp.dropna(inplace=True)

In [29]:
df_needed_by.isnull().sum()
# df_needed_by_temp.isnull().sum()

Source         0
Source_repo    0
Target         0
Target_repo    0
dtype: int64

In [30]:
len(df_needed_by)
# len(df_needed_by_temp)

732

In [31]:
df_needed_by = df_needed_by.explode(column="Target").reset_index(drop=True)
# df_needed_by_temp = df_needed_by_temp.explode(column="Target").reset_index(drop=True)

In [32]:
len(df_needed_by)
# len(df_needed_by_temp)

732

In [33]:
evolution_columns = ["Source", "Target", "Source_repo", "Target_repo"]

df_depends_on = df_depends_on.loc[:, evolution_columns]
# df_depends_on_temp = df_depends_on_temp.loc[:, evolution_columns]

In [34]:
df_needed_by = df_needed_by.loc[:, evolution_columns]
# df_needed_by_temp = df_needed_by_temp.loc[:, evolution_columns]

In [35]:
df_depends_needed = pd.concat((df_depends_on, df_needed_by)).reset_index(drop=True)
# df_depends_needed_temp = pd.concat((df_depends_on_temp, df_needed_by_temp)).reset_index(drop=True)

In [36]:
df_depends_needed.duplicated().sum()
# df_depends_needed_temp.duplicated().sum()

341

In [37]:
df_depends_needed.drop_duplicates(inplace=True)
# df_depends_needed_temp.drop_duplicates(inplace=True)

In [38]:
df_depends_needed.duplicated().sum()
# df_depends_needed_temp.duplicated().sum()

0

In [39]:
len(df_depends_needed)
# len(df_depends_needed_temp)

36623

In [40]:
df_depends_needed[df_depends_needed["Source"] == df_depends_needed["Target"]].count()
# df_depends_needed_temp[df_depends_needed_temp["Source"] == df_depends_needed_temp["Target"]].count()

Source         13
Target         13
Source_repo    13
Target_repo    13
dtype: int64

In [41]:
df_depends_needed = df_depends_needed[df_depends_needed["Source"] != df_depends_needed["Target"]]
# df_depends_needed_temp = df_depends_needed_temp[df_depends_needed_temp["Source"] != df_depends_needed_temp["Target"]]

In [42]:
df_depends_needed[df_depends_needed["Source"] == df_depends_needed["Target"]].count()
# df_depends_needed_temp[df_depends_needed_temp["Source"] == df_depends_needed_temp["Target"]].count()

Source         0
Target         0
Source_repo    0
Target_repo    0
dtype: int64

In [48]:
df_depends_needed.count()
# df_depends_needed_temp.count()

Source         33352
Target         33352
Source_repo    33352
Target_repo    33352
dtype: int64

In [44]:
df_depends_needed = df_depends_needed.loc[df_depends_needed["Source_repo"]!=df_depends_needed["Target_repo"]].reset_index(drop=True)
# df_depends_needed_temp = df_depends_needed_temp.reset_index(drop=True)

In [45]:
df_depends_needed["Source"] = df_depends_needed[["Source"]].astype(int)
df_depends_needed["Target"] = df_depends_needed[["Target"]].astype(int)

# df_depends_needed_temp["Source"] = df_depends_needed_temp[["Source"]].astype(int)
# df_depends_needed_temp["Target"] = df_depends_needed_temp[["Target"]].astype(int)

In [55]:
len(df_depends_needed.loc[df_depends_needed["Source"] == 652339, ["Target_repo"]].value_counts())

24

In [42]:
df_depends_needed.to_csv("./Files/Number/depends_needed.csv", index=False)
# df_depends_needed_temp.to_csv("./experiments/depends_needed.csv", index=False)

### Building Data for graph

In [34]:
df_depends_needed = pd.read_csv("./Files/source_target_evolution.csv")

In [40]:
# df_depends_needed[df_depends_needed["Source"]==865314]
df.columns

Index(['id', 'project', 'branch', 'topic', 'change_id', 'subject', 'status',
       'created', 'updated', 'submitted', 'insertions', 'deletions',
       'total_comment_count', 'number', 'current_revision',
       'discussion_messages_count', 'reviewers_count', 'revisions_count',
       'files_count', 'owner_account_id', 'owner_name', 'owner_username',
       'commit_message'],
      dtype='object')

In [79]:
from itertools import chain
from itertools import product
from itertools import starmap
from functools import partial

chaini = chain.from_iterable

G = nx.DiGraph(df_depends_needed_temp[["Source", "Target"]].values.tolist())
roots = (v for v, d in G.in_degree() if d == 0)
leaves = (v for v, d in G.out_degree() if d == 0)
all_paths = partial(nx.all_simple_paths, G)
paths = list(chaini(starmap(all_paths, product(roots, leaves))))

In [None]:
pd.DataFrame(paths).to_csv("./experiments/paths.csv", index=False)

In [8]:
from itertools import chain, product, starmap
from functools import partial

graph = nx.from_pandas_edgelist(df=df_depends_needed_temp, source="Source", target="Target", create_using=nx.DiGraph)

roots = (node for node, d in graph.in_degree if d == 0)

leaves = (node for node, d in graph.out_degree if d == 0)

all_paths = partial(nx.all_simple_paths, graph)

paths = list(chain.from_iterable(starmap(all_paths, product(roots, leaves))))

In [10]:
pd.DataFrame({"Path": paths}).to_csv("./Files/Number/depends_on_needed_by.csv", index=False)

In [146]:
os_edges = df_depends_on[["Source", "Target"]].copy()
os_nodes_1 = df_depends_on[["Source", "Source_repo"]].copy()
os_nodes_1 = os_nodes_1.rename(columns={
    "Source": "id",
    "Source_repo": "label"
})
os_nodes_2 = df_depends_on[["Target", "Target_repo"]].copy()
os_nodes_2 = os_nodes_2.rename(columns={
    "Target": "id",
    "Target_repo": "label"
})

In [147]:
os_edges = os_edges.reset_index(drop=True)

In [148]:
os_nodes = pd.concat((os_nodes_1, os_nodes_2))
os_nodes = os_nodes.drop_duplicates()
os_nodes = os_nodes.reset_index(drop=True)

### Related-Bug

In [86]:
def combine_projects(row):
    combined_projects = ""
    projects = list(dict.fromkeys(row["project"]))
    for i in range(len(projects)):
        combined_projects += projects[i]
        if i + 1 < len(projects):
            combined_projects += " "
    return combined_projects

In [87]:
def retrieve_project_numbers(x):
    return x["number"].values

def remove_single_components(arr):
    result = []
    for item in arr:
        if len(dict.fromkeys(item)) > 1:
            result.append(list(item))

    return result

In [89]:
# df_main_related_bug = df[df["related_bug"].isnull() == False].copy()

# df_main_related_bug["related_bug"] = df_main_related_bug[["related_bug"]].astype(int).reset_index(drop=True)

# df_related_bug_subset = df_main_related_bug.copy()[[
#     "related_bug", "number"
# ]].groupby("related_bug").apply(retrieve_project_numbers).reset_index(
#     drop=True)

# related_bug_number_co_changes = remove_single_components(df_related_bug_subset)

df_main_related_bug_temp = df_temp[df_temp["related_bug"].isnull() == False].copy()

df_main_related_bug_temp["related_bug"] = df_main_related_bug_temp[["related_bug"]].astype(int).reset_index(drop=True)

df_related_bug_subset_temp = df_main_related_bug_temp.copy()[[
    "related_bug", "number"
]].groupby("related_bug").apply(retrieve_project_numbers).reset_index(
    drop=True)

related_bug_number_co_changes_temp = remove_single_components(df_related_bug_subset_temp)

### Topic

In [90]:
# df_main_topic = df[df["topic"].isnull() == False].copy()

# df_topic_subset = df_main_topic.copy()[[
#     "topic", "number"
# ]].groupby("topic").apply(retrieve_project_numbers).reset_index(
#     drop=True)

# topic_number_co_changes = remove_single_components(df_topic_subset)


df_main_topic_temp = df_temp[df_temp["topic"].isnull() == False].copy()

df_topic_subset_temp = df_main_topic_temp.copy()[[
    "topic", "number"
]].groupby("topic").apply(retrieve_project_numbers).reset_index(
    drop=True)

topic_number_co_changes_temp = remove_single_components(df_topic_subset_temp)

### Subject

In [170]:
df_main_subject = df[df["subject"].isnull() == False].copy()

df_subject_subset = df_main_subject.copy()[[
    "subject", "number"
]].groupby("subject").apply(retrieve_project_numbers).reset_index(
    drop=True)

subject_number_co_changes = remove_single_components(df_subject_subset)

### Change-id

In [91]:
# df_main_change_id = df[df["change_id"].isnull() == False].copy()

# df_change_id_subset = df_main_change_id.copy()[[
#     "change_id", "number"
# ]].groupby("change_id").apply(retrieve_project_numbers).reset_index(
#     drop=True)

# change_id_number_co_changes = remove_single_components(df_change_id_subset)


df_main_change_id_temp = df_temp[df_temp["change_id"].isnull() == False].copy()

df_change_id_subset_temp = df_main_change_id_temp.copy()[[
    "change_id", "number"
]].groupby("change_id").apply(retrieve_project_numbers).reset_index(
    drop=True)

change_id_number_co_changes_temp = remove_single_components(df_change_id_subset_temp)

### All possible paths including depend-on, related-bug, topic, subject and change-id based co-changes

In [43]:
def combine_co_changes_number(main_array, second_array):
    result = main_array.copy()
    
    for arr_item in second_array:

        added = False
        for j in range(len(main_array)):
            row_dep = main_array[j]
            check_any = any(item in row_dep for item in arr_item)

            if check_any:
                result[j] = list(dict.fromkeys(result[j] + arr_item))
                added = True

        if not added:
            result.append(arr_item)
    return result

### Test

In [44]:
depends_on_test = [["A", "C"], ["D", "C"], ["F", "K"]]
other_arrays_test = [["B", "C"], ["A", "F"], ["L", "K"], ["Z", "N"]]

test = combine_co_changes_number(depends_on_test, other_arrays_test)
print(test)

[['A', 'C', 'B', 'F'], ['D', 'C', 'B'], ['F', 'K', 'A', 'L'], ['Z', 'N']]


In [24]:
# related_bug_number_co_changes = pd.read_csv("./Files/Number/related_bug.csv")
# related_bug_number_co_changes = related_bug_number_co_changes["Path"].apply(ast.literal_eval).values.tolist()

In [98]:
# result_number_co_changes = combine_co_changes_number(paths, related_bug_number_co_changes)
result_number_co_changes = combine_co_changes_number(paths, related_bug_number_co_changes_temp)

In [29]:
topic_number_co_changes = pd.read_csv("./Files/Number/topic.csv")
topic_number_co_changes = topic_number_co_changes["Path"].apply(ast.literal_eval).values.tolist()

In [99]:
# result_number_co_changes = combine_co_changes_number(result_number_co_changes, topic_number_co_changes)
result_number_co_changes = combine_co_changes_number(result_number_co_changes, topic_number_co_changes_temp)

In [32]:
change_id_number_co_changes = pd.read_csv("./Files/Number/change_id.csv")
change_id_number_co_changes = change_id_number_co_changes["Path"].apply(ast.literal_eval).values.tolist()

In [100]:
# result_number_co_changes = combine_co_changes_number(result_number_co_changes, change_id_number_co_changes)
result_number_co_changes = combine_co_changes_number(result_number_co_changes, change_id_number_co_changes_temp)

In [102]:
# pd.DataFrame({"Path": paths}).to_csv("./experiments/Number/depends_on.csv", index=False)
pd.DataFrame({"Path": related_bug_number_co_changes_temp}).to_csv("./experiments/related_bug.csv", index=False)
pd.DataFrame({"Path": topic_number_co_changes_temp}).to_csv("./experiments/topic.csv", index=False)
pd.DataFrame({"Path": change_id_number_co_changes_temp}).to_csv("./experiments/change_id.csv", index=False)
pd.DataFrame({"Path": result_number_co_changes}).to_csv("./experiments/all_paths.csv", index=False)

In [308]:
pd.DataFrame({"Path": paths}).to_csv("./Co-changes/Number/depends_on.csv", index=False)
pd.DataFrame({"Path": related_bug_number_co_changes}).to_csv("./Co-changes/Number/related_bug.csv", index=False)
pd.DataFrame({"Path": topic_number_co_changes}).to_csv("./Co-changes/Number/topic.csv", index=False)
pd.DataFrame({"Path": subject_number_co_changes}).to_csv("./Co-changes/Number/subject.csv", index=False)
pd.DataFrame({"Path": change_id_number_co_changes}).to_csv("./Co-changes/Number/change_id.csv", index=False)
pd.DataFrame({"Path": result_number_co_changes}).to_csv("./Co-changes/Number/all_paths.csv", index=False)

### Simple test of the 1st item of the paths

In [332]:
print(result_number_co_changes[0])
print(list(dict.fromkeys(paths[0]+result_number_co_changes[0])))

[865314, 865315, 212127, 599335, 579277, 510663, 485956, 465150, 461122, 360999, 360991, 96492]
[865314, 865315, 212127, 599335, 579277, 510663, 485956, 465150, 461122, 360999, 360991, 96492]


### Extending the csv file with single-component related changes that do not appear in other co-changes

In [None]:
result_number_co_changes = pd.read_csv("./Experiments/all_paths.csv")

In [104]:
all_paths_flattend = list(dict.fromkeys(flatten_list(result_number_co_changes)))

In [105]:
# single_component_changes = df.loc[~df["number"].isin(all_paths_flattend), ["number", "project"]].reset_index(drop=True)
single_component_changes = df_temp.loc[~df_temp["number"].isin(all_paths_flattend), ["number", "project"]].reset_index(drop=True)

In [106]:
single_component_changes_number = single_component_changes["number"].map(lambda x: [x]).tolist()

In [107]:
extended_paths_number = result_number_co_changes + single_component_changes_number

In [108]:
print("result_number_co_changes %d"%len(result_number_co_changes))
print("single_component_changes_number %d"%len(single_component_changes_number))
print("extended_paths_number %d"%len(extended_paths_number))

result_number_co_changes 6200
single_component_changes_number 12805
extended_paths_number 19005


In [261]:
pd.DataFrame({"Path": extended_paths_number}).to_csv("./experiments/all_paths.csv", index=False)

In [463]:
result_number_co_changes = pd.read_csv("./Co-changes/Number/all_paths.csv", converters={'Path': pd.eval})

### Replace numbers with repos in extended_paths_number 

In [70]:
def number_to_repo(data, df):
    '''Replace numbers of data with their corresponding repository names, then removing any duplicates.
    '''
    df_subset = df[["number", "project"]]
    result = []
    for i in range(len(data)):
        row = data[i]
        new_row = df_subset.loc[
            df_subset["number"].isin(row), ["project"]].drop_duplicates(subset="project").values.reshape(-1).tolist()
        result.append(new_row)
    return result

In [None]:
extended_paths_repo = number_to_repo(extended_paths_number)
# extended_paths_number[1]

In [45]:
def merge_numbers(data):
    result = []
    data_copy = data.copy()
    for i in range(len(data)):
        new_item = {}
        not_to_add_indices = []

        if len(data[i]) == 0:
            continue

        for j in range(len(data_copy)):

            check_any = any(item in data[i] for item in data_copy[j])
            if check_any:
                new_item = {**new_item, **dict.fromkeys(data[i] + data_copy[j])}
                data_copy[j] = []
                not_to_add_indices.append(j)
        if len(new_item) == 0:
            print(new_item)
            result.append(data[i])
        else:
            result.append(list(new_item))

        for ntai in not_to_add_indices:
            data[ntai] = []

    return result

In [46]:
# test = flatten_list(result_number_co_changes.copy().values)
test = [["X", "K", "L"], ["A", "B", "D"], ["C", "D"], ["K", "X"],
        ["M", "N", "Q"], ["F", "C"]]
# test = [["A", "B", "F"], ["X", "K"], ["M", "O", "C"], ["F", "J", "C"],
#         ["C", "M", "D"]]

In [47]:
merge_numbers(test)

[['X', 'K', 'L'], ['A', 'B', 'D', 'C'], ['M', 'N', 'Q'], ['F', 'C']]

In [262]:
merged_extended_paths_number_temp = merge_numbers(extended_paths_number)

In [267]:
pd.DataFrame({"Path": merged_extended_paths_number_temp}).to_csv("./experiments/merged_extended_paths.csv", index=False)

In [269]:
possible_path_repo = number_to_repo(merged_extended_paths_number_temp, df)

In [271]:
pd.DataFrame({"Path": possible_path_repo}).to_csv("./experiments/extended_paths.csv", index=False)