In [3]:
import pandas as pd
from datetime import datetime
import utils.helpers as hpr
import re
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import ast
from efficient_apriori import apriori
import numpy as np
import utils.helpers as hpr
from scipy import stats

In [4]:
def combine_openstack_data():
    '''Combine generated csv files into a single DataFrame object

    Reads every file that ``hpr.list_file`` reports for ``<hpr.DIR>Changes/``,
    concatenates them, keeps one row per ``number`` (the first occurrence in
    read order) and sorts the result by ``updated`` in descending order with
    a fresh 0..n-1 index.

    Returns:
        pandas.DataFrame: the combined changes; an empty DataFrame when the
        listing yields no files.
    '''
    data_path = "%sChanges/" % hpr.DIR

    # Read everything first and concatenate once: growing a DataFrame with
    # pd.concat inside the loop re-copies all previously read rows per file.
    frames = [
        pd.read_csv("%s%s" % (data_path, file_name))
        for file_name in hpr.list_file(data_path)
    ]
    if not frames:
        # Nothing to combine: pd.concat([]) raises, and an empty frame has no
        # "number"/"updated" columns to de-duplicate or sort on.
        return pd.DataFrame([])

    df = pd.concat(frames)

    # NOTE(review): de-duplication happens before sorting, so the copy kept
    # for a repeated "number" is the first one read, not necessarily the most
    # recently updated one - confirm this is intended.
    df = df.drop_duplicates(subset=["number"])

    df = df.sort_values(by="updated", ascending=False).reset_index(drop=True)

    return df

In [5]:
# Load every change from the per-file CSV exports into one DataFrame.
df = combine_openstack_data()

In [9]:
# Load the dependency pairs and keep only the cross-project ones whose source
# and target changes are both MERGED.
# NOTE(review): the K-means cells of this notebook filter the same CSV with
# `is_cross == "Cross"` and `Source_status` / `Target_status`; confirm which
# column names and which `is_cross` encoding the file actually uses.
df_all_dependencies = pd.read_csv("./Files/all_dependencies.csv").loc[
    lambda deps: (deps["is_cross"] == True)
    & (deps["status_source"] == "MERGED")
    & (deps["status_target"] == "MERGED")
]

In [17]:
# Draw a fixed-size random sample of dependency pairs and export it with two
# empty columns ("description", "relation_type") to be filled in afterwards.
SAMPLE_SIZE = 381
# Fixed seed so that Restart & Run All reproduces exactly the same sample
# (an unseeded .sample() exports a different file on every run).
SAMPLE_SEED = 42

study_sample = (
    df_all_dependencies[["Source", "Target", "Source_repo", "Target_repo", "same_dev"]]
    .sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)
# Strip the "openstack/" namespace from repository names (literal match, not a regex).
study_sample["Source_repo"] = study_sample["Source_repo"].str.replace("openstack/", "", regex=False)
study_sample["Target_repo"] = study_sample["Target_repo"].str.replace("openstack/", "", regex=False)
study_sample["description"] = None
study_sample["relation_type"] = None
study_sample.to_csv("./RQs/RQ6/study_sample.csv", index_label="index")

### Association rules mining algorithm

In [None]:
# Each "Path" cell holds a Python literal; parse it and collapse every chain
# to its distinct members so each chain becomes one transaction.
chains = [
    list(set(ast.literal_eval(raw_path)))
    for raw_path in pd.read_csv("./Files/Repo/extended_paths.csv")["Path"]
]

In [None]:
# Run the Apriori algorithm and keep the frequent itemsets and association rules
itemsets, rules = apriori(
    chains,
    min_support=0.00005,
    min_confidence=0.00008,
    verbosity=0,
)
# Rank the rules from highest to lowest lift, breaking ties by confidence
items = list(rules)
items.sort(key=lambda r: (r.lift, r.confidence), reverse=True)
len(items)

In [None]:
# Keep only the rules with exactly 1 item on the left hand side and
# 1 item on the right hand side, ordered by increasing lift.
# The rule objects are read from `items` (the ranked Apriori output) instead
# of `rules`, because `rules` is rebound below to a list of plain dicts;
# reading from `items` keeps this cell safe to re-run. The selected set of
# rules is the same; only the relative order of rules with equal lift follows
# the ranking of `items`.
rules_rhs = [rule for rule in items if len(rule.lhs) == 1 and len(rule.rhs) == 1]

rules = [
    {"antecedent": rule.lhs[0], "consequent": rule.rhs[0], "supp": rule.support, "conf": rule.confidence}
    for rule in sorted(rules_rhs, key=lambda rule: rule.lift)
]

In [None]:
# Tabulate the pairwise rules, highest confidence first.
# The columns are named explicitly so that "conf" exists even when no rule
# passed the filter; otherwise sort_values fails on an empty, column-less frame.
df_rules = (
    pd.DataFrame(rules, columns=["antecedent", "consequent", "supp", "conf"])
    .sort_values(by="conf", ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Number of rules whose confidence is below 0.10
int((df_rules["conf"] < .10).sum())

In [None]:
# Persist the association-rule table (the row index is not written).
df_rules.to_csv("./RQs/RQ6/Files/association_rules.csv", index=False)

### K-means

In [None]:
# Reload the full dependency table from disk; this rebinds
# df_all_dependencies, replacing the filtered version built earlier.
df_all_dependencies = pd.read_csv("./Files/all_dependencies.csv")

In [None]:
def count_cross_project(p, dependencies=None):
    '''Count the distinct source changes of project ``p`` that take part in a
    cross-project dependency.

    A row matches when its ``Source_repo`` equals ``p``, ``is_cross`` equals
    "Cross", both ``Source_status`` and ``Target_status`` are "MERGED" and
    neither ``is_source_bot`` nor ``is_target_bot`` is set.

    Args:
        p: repository name compared against the ``Source_repo`` column.
        dependencies: optional DataFrame to query; when omitted the
            notebook-level ``df_all_dependencies`` is used.

    Returns:
        int: number of distinct ``Source`` values among the matching rows.
    '''
    if dependencies is None:
        dependencies = df_all_dependencies

    # NOTE(review): an earlier cell filters the same CSV with
    # `is_cross == True` and `status_source` / `status_target`; confirm which
    # column names and which `is_cross` encoding the file actually uses.
    matches = (
        (dependencies["Source_repo"] == p) &
        (dependencies["is_cross"] == "Cross") &
        (dependencies["Source_status"] == "MERGED") &
        (dependencies["Target_status"] == "MERGED") &
        (dependencies["is_source_bot"] == False) &
        (dependencies["is_target_bot"] == False)
    )
    # Select the single column as a Series (same form as the Total_changes
    # computation) so the values are already flat and can be de-duplicated.
    nbr_cross_project = len(set(dependencies.loc[matches, "Source"].values))
    return nbr_cross_project

In [None]:
# One row per source repository with (a) its number of distinct source changes
# among merged, non-bot dependency pairs and (b) how many of those are
# cross-project. Projects with a zero count are kept here; the cells that
# export and cluster the data filter them out.
df_source_projects = pd.DataFrame({"Project": df_all_dependencies["Source_repo"].unique().tolist()})

# The status/bot filter does not depend on the project, so apply it once and
# count distinct changes per repository in a single grouped pass instead of
# rescanning the whole table for every project.
eligible_dependencies = df_all_dependencies.loc[
    (df_all_dependencies["Source_status"]=="MERGED") &
    (df_all_dependencies["Target_status"]=="MERGED") &
    (df_all_dependencies["is_source_bot"]==False) &
    (df_all_dependencies["is_target_bot"]==False), ["Source_repo", "Source"]]
changes_per_project = eligible_dependencies.groupby("Source_repo")["Source"].nunique()

# Projects without any eligible row are absent from the grouped counts; they get 0.
df_source_projects["Total_changes"] = (
    df_source_projects["Project"].map(changes_per_project).fillna(0).astype(int)
)
df_source_projects["Total_cross_project_changes"] = df_source_projects["Project"].map(count_cross_project)

In [None]:
# Export the two count columns for every project where both counts are non-zero.
(
    df_source_projects
    .query("Total_changes != 0 and Total_cross_project_changes != 0")
    [["Total_changes", "Total_cross_project_changes"]]
    .to_csv("./RQs/RQ5_MA/k_means_data.csv", index=False)
)

In [None]:
# data: (total changes, cross-project changes) for projects where both are non-zero
X = df_source_projects.loc[(df_source_projects["Total_changes"]!=0)&(df_source_projects["Total_cross_project_changes"]!=0), ["Total_changes", "Total_cross_project_changes"]].values

# Rows with a zero count were excluded above, so the logarithm is finite
log_X = np.log(X)

# Create a KMeans instance with 3 clusters.
# random_state and n_init are pinned so that re-running the notebook yields
# the same clusters regardless of the library's default initialisation count.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)

# Fit the data to the KMeans model
kmeans.fit(log_X)

# Get the cluster labels assigned to each data point
labels = kmeans.labels_

# Get the coordinates of the cluster centers
centers = kmeans.cluster_centers_

# Visualize the data points and cluster centers.
# Both axes show natural-log values, so the labels say so explicitly.
fig, ax = plt.subplots()
ax.scatter(log_X[:, 0], log_X[:, 1], c=labels)
ax.scatter(centers[:, 0], centers[:, 1], marker='x', color='red')
ax.set_xlabel('log(# of all changes)')
ax.set_ylabel('log(# of cross-project changes)')
ax.set_title('K-means Clustering')
plt.show()

In [None]:
# Spearman rank correlation between the two (log-transformed) count columns.
# The coefficient is rank-based, so the log transform does not change it.
stats.spearmanr(log_X[:, 0], log_X[:, 1])