In [None]:
import csv
from xml.etree import ElementTree
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import os
from matplotlib.ticker import StrMethodFormatter
import statsmodels.api as sm
import matplotlib.pyplot as plt
import scipy.stats as stats
from pyclustertend import hopkins, vat, assess_tendency_by_mean_metric_score
from sklearn.preprocessing import scale, MinMaxScaler, minmax_scale, RobustScaler,robust_scale
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics import silhouette_score
import sklearn.metrics as compute_metrics
import seaborn as sns

In [None]:
# --- Input locations ---------------------------------------------------------
# Optional CSV with reference labels; while it is None the reference-label
# comparison further down is skipped.
real_labels_file = None
# CSV with the per-method metrics and the CLevel_* label columns.
metrics_labelled_file = "/classification_all/complete_classification.csv"
# CSV with the changed-line counts (read with sep=';' below).
chg_lines_file = "/changed_lines.csv"

# --- Column groups -----------------------------------------------------------
# Metric columns.
var_list = [
    'LOC', 'CC', 'NP', 'NV', 'NEST',
    'Ca', 'Ce', 'NChg', 'NCall',
]
# One label column per labelling approach.
label_list = [
    'CLevel_threshold',
    'CLevel_k_means',
    'CLevel_em',
]

def get_labelled_data():
    """Read the semicolon-separated CSV at `metrics_labelled_file` into a DataFrame."""
    return pd.read_csv(metrics_labelled_file, sep=';')


# Load once; later cells merge against this frame.
labelled_df = get_labelled_data()
labelled_df

# Using expert knowledge

In [None]:
# The reference labels are optional: without a configured file the frame stays
# None and the guarded classification report below is not run.
real_labels_data = (
    pd.read_csv(real_labels_file, sep=';') if real_labels_file is not None else None
)
if real_labels_data is not None:
    print(real_labels_data.describe())

In [None]:

def print_cm(cm, labels):
    """Pretty-print a confusion matrix as a fixed-width text table.

    Parameters
    ----------
    cm : 2-D array indexable as ``cm[i, j]``
        Matrix values; row ``i`` / column ``j`` correspond to ``labels[i]`` /
        ``labels[j]``.
    labels : sequence
        Names used for both the header row and the first column.

    Every cell is right-aligned in a 10-character column and values are shown
    with one decimal place.
    """
    width = 10
    label_fmt = "%{0}s".format(width)
    value_fmt = "%{0}.1f".format(width)

    # Header: a blank corner cell followed by the column names.
    print(" " * width + "".join(label_fmt % name for name in labels))

    # Body: the row name followed by that row's values.
    n_cols = len(labels)
    for row_idx, row_name in enumerate(labels):
        cells = "".join(value_fmt % cm[row_idx, col_idx] for col_idx in range(n_cols))
        print(label_fmt % row_name + cells)


def classification_report(real, predicted):
    """Print agreement scores between two label assignments.

    Parameters
    ----------
    real : array-like
        Reference labels.
    predicted : array-like
        Labels to evaluate against ``real``.

    Prints the adjusted Rand index, the accuracy, sklearn's textual
    classification report and the confusion matrix; the last two are restricted
    to the labels 'high', 'regular' and 'low', in that order.
    """
    labels = ['high', 'regular', 'low']

    # Compute everything first so nothing is printed if a metric call fails.
    summary_scores = (
        ('ARI ', adjusted_rand_score(labels_true=real, labels_pred=predicted)),
        ('Accuracy ', compute_metrics.accuracy_score(y_true=real, y_pred=predicted)),
    )
    report = compute_metrics.classification_report(y_true=real, y_pred=predicted, labels=labels)
    conf_matrix = compute_metrics.confusion_matrix(y_true=real, y_pred=predicted, labels=labels)

    for caption, score in summary_scores:
        print(caption, score)
    print(report)
    print('Confusion matrix')
    print_cm(conf_matrix, labels)


def classification_report_for_all():
    """Print a classification report for every CLevel_* label column.

    Reads `real_labels_file` and `metrics_labelled_file` (both
    semicolon-separated), inner-joins them on 'Method' and compares the
    'CLevel' column of the former with each label column of the latter.
    """
    pred_labels_var = ['CLevel_threshold', 'CLevel_k_means', 'CLevel_em']

    reference = pd.read_csv(real_labels_file, sep=';')
    labelled = pd.read_csv(metrics_labelled_file, sep=';')

    # Only methods present in both files take part in the comparison.
    data_combined = reference[['Method', 'CLevel']].merge(
        labelled[['Method'] + pred_labels_var],
        on='Method',
        how='inner',
    )

    for y_pred in pred_labels_var:
        print('------- {} ------'.format(y_pred))
        classification_report(data_combined['CLevel'], data_combined[y_pred])


# Run only when the reference labels were loaded in the earlier cell.
if real_labels_data is not None:
    classification_report_for_all()

# Using changed lines

In [None]:

# Load the changed-lines CSV (semicolon-separated) and display it.
chg_lines_data = pd.read_csv(chg_lines_file, sep=';')
chg_lines_data

In [None]:
# Attach the changed-line count to every labelled method. The inner join keeps
# only rows whose 'Method' equals a 'Previous_Method_Parsed' value.
df = labelled_df.merge(
    chg_lines_data[['Previous_Method_Parsed', 'ChgLines']],
    how='inner',
    left_on='Method',
    right_on='Previous_Method_Parsed',
)
df

In [None]:
# Number of missing values in each column of the merged frame.
df.isna().sum()

In [None]:
# Rows with at least one changed line.
df.loc[df['ChgLines'] > 0]

In [None]:
# Rank of each level; used as the sort key so methods are ordered
# low -> regular -> high in the plots below.
custom_dict = dict(low=0, regular=1, high=2)

In [None]:
# Order the methods by their threshold-based level (low -> regular -> high)
# and use the resulting row position as the x coordinate.
sub_df1 = (
    df[['Method', 'CLevel_threshold', 'ChgLines']]
    .sort_values(by=['CLevel_threshold'], key=lambda levels: levels.map(custom_dict), ignore_index=True)
    .assign(method_idx=lambda frame: frame.index)
)

fig, ax = plt.subplots(figsize=(15, 4), dpi=80)
sns.scatterplot(
    data=sub_df1, x="method_idx", y="ChgLines", hue="CLevel_threshold",
    palette={'low':'blue', 'regular':'#DCB732', 'high':'red'},
    linewidth=0, alpha=0.5, ax=ax,
)
ax.legend(loc='upper left')
ax.set_title('Number of changed lines after the threshold-based clustering')
ax.set_ylabel('Number of lines changed')
ax.set_xlabel('Methods')
# plt.savefig(plots_save_to_location + '/chg-lines_threshold.pdf', bbox_inches = 'tight', pad_inches = 0)
plt.show()

In [None]:
# Total and summary statistics of the changed lines per threshold-based level.
grouped_df1 = sub_df1.groupby('CLevel_threshold')
chg_lines_by_threshold = grouped_df1[['ChgLines']]
print(chg_lines_by_threshold.sum())
chg_lines_by_threshold.describe()

In [None]:
# Order the methods by their K-means level (low -> regular -> high) and use
# the resulting row position as the x coordinate.
sub_df2 = df[['Method', 'CLevel_k_means', 'ChgLines']]
sub_df2 = sub_df2.sort_values(by=['CLevel_k_means'], key=lambda x: x.map(custom_dict), ignore_index=True)
sub_df2['method_idx'] = sub_df2.index

fig, ax = plt.subplots(figsize=(15,4), dpi= 80)
# Same marker styling as the threshold-based plot so the figures are comparable.
sns.scatterplot(data=sub_df2, x="method_idx", y="ChgLines", hue="CLevel_k_means", linewidth=0, alpha=0.5,
                palette={'low':'blue', 'regular':'#DCB732', 'high':'red'}, ax=ax)
ax.legend(loc='upper left')
ax.set_title('Number of changed lines after the K-means clustering')
ax.set_ylabel('Number of lines changed')
ax.set_xlabel('Methods')

# `plots_save_to_location` is not defined in the visible notebook cells, so an
# unconditional savefig raises NameError on a fresh kernel. Export the PDF only
# when that variable has been set.
save_dir = globals().get('plots_save_to_location')
if save_dir:
    fig.savefig(save_dir + '/chg-lines_k_means.pdf', bbox_inches='tight', pad_inches=0)
plt.show()

In [None]:
# Total and summary statistics of the changed lines per K-means level.
grouped_df2 = sub_df2.groupby('CLevel_k_means')
chg_lines_by_k_means = grouped_df2[['ChgLines']]
print(chg_lines_by_k_means.sum())
chg_lines_by_k_means.describe()

In [None]:
# Order the methods by their EM level (low -> regular -> high) and use the
# resulting row position as the x coordinate.
sub_df3 = df[['Method', 'CLevel_em', 'ChgLines']]
sub_df3 = sub_df3.sort_values(by=['CLevel_em'], key=lambda x: x.map(custom_dict), ignore_index=True)
sub_df3['method_idx'] = sub_df3.index

fig, ax = plt.subplots(figsize=(15,4), dpi= 80)
# Same marker styling as the threshold-based plot so the figures are comparable.
sns.scatterplot(data=sub_df3, x="method_idx", y="ChgLines", hue="CLevel_em", linewidth=0, alpha=0.5,
                palette={'low':'blue', 'regular':'#DCB732', 'high':'red'}, ax=ax)
ax.legend(loc='upper left')
ax.set_title('Number of changed lines after the EM clustering')
ax.set_ylabel('Number of lines changed')
ax.set_xlabel('Methods')

# `plots_save_to_location` is not defined in the visible notebook cells, so an
# unconditional savefig raises NameError on a fresh kernel. Export the PDF only
# when that variable has been set.
save_dir = globals().get('plots_save_to_location')
if save_dir:
    fig.savefig(save_dir + '/chg-lines_em.pdf', bbox_inches='tight', pad_inches=0)
plt.show()

In [None]:
# Total and summary statistics of the changed lines per EM level.
grouped_df3 = sub_df3.groupby('CLevel_em')
chg_lines_by_em = grouped_df3[['ChgLines']]
print(chg_lines_by_em.sum())
chg_lines_by_em.describe()

In [None]:
# Reshape to long format: one row per (method, labelling approach), with the
# approach name in 'CType' and the assigned level in 'CLevel'.
# melt keeps only the id_vars/value_vars columns, so no prior column selection
# is needed; var_name/value_name name the output columns directly.
m_df = pd.melt(
    df,
    id_vars=['Method', 'ChgLines'],
    value_vars=['CLevel_threshold', 'CLevel_k_means', 'CLevel_em'],
    var_name='CType',
    value_name='CLevel',
)
m_df

In [None]:
# Changed lines per labelling approach, one dodged strip per level.
fig, ax = plt.subplots(figsize=(10, 5), dpi=80)
sns.stripplot(
    data=m_df, x='CType', y='ChgLines', hue='CLevel',
    hue_order=["low", "regular", "high"],
    palette={'low':'blue', 'regular':'#DCB732', 'high':'red'},
    dodge=True, jitter=0.25, size=5, linewidth=.3,
    ax=ax,
)
# plt.title('Results for the threshold approach')
ax.set_xlabel('')  # the tick labels already name the approach
plt.show()

# Changed lines correlation with the other metrics

In [None]:
# Min-max scale the metric columns and ChgLines to [0, 1] on a copy of `df`.
scaled_data = df.copy()
list_columns = ['LOC', 'CC', 'NP', 'NV', 'NEST', 'Ca', 'Ce', 'NChg', 'NCall', 'ChgLines']

for col_name in list_columns:
    col = scaled_data[col_name]
    min_col = col.min()
    # Design alternative: using 0 instead of the observed minimum would preserve
    # ratios between values (e.g. LOC 25, 50 -> 0.5, 1).
    col_range = col.max() - min_col
    # A constant column has a zero range; dividing by 1 maps it to 0.0 instead
    # of turning the whole column into NaN (0/0).
    divisor = col_range if col_range != 0 else 1
    scaled_data[col_name] = (col - min_col) / divisor

scaled_data

In [None]:
# Kendall rank correlation between the scaled metrics and ChgLines.
fig, ax = plt.subplots(figsize=(10, 5))
p_corr = scaled_data[list_columns].corr(method='kendall')
sns.heatmap(
    p_corr,
    annot=True,
    cmap='coolwarm',
    xticklabels=p_corr.columns,
    yticklabels=p_corr.columns,
    ax=ax,
)

In [None]:
# Same Kendall correlation on the unscaled values. Kendall's tau depends only
# on ranks, so this is expected to match the scaled heatmap (NOTE(review):
# confirm, then consider dropping one of the two).
fig, ax = plt.subplots(figsize=(10, 5))
p_corr = df[list_columns].corr(method='kendall')
sns.heatmap(
    p_corr,
    annot=True,
    cmap='coolwarm',
    xticklabels=p_corr.columns,
    yticklabels=p_corr.columns,
    ax=ax,
)