In [None]:
import csv
from xml.etree import ElementTree
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import os
from matplotlib.ticker import StrMethodFormatter
import statsmodels.api as sm
import matplotlib.pyplot as plt
import scipy.stats as stats
from pyclustertend import hopkins, vat, assess_tendency_by_mean_metric_score
from sklearn.preprocessing import scale, StandardScaler, MinMaxScaler, minmax_scale, RobustScaler,robust_scale
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import seaborn as sns
import sklearn.metrics as compute_metrics
from sklearn.metrics.cluster import adjusted_rand_score

In [None]:
# Path of a ';'-separated CSV with test-coverage data, read further down when set.
# While it is None, an empty frame with the columns
# Method / CoveredStatements / TotalStatements is used instead.
test_cov_file = None

# Path of a ';'-separated CSV whose 'Method' and 'CLevel' columns are merged with
# the labelled data; while it is None that merge is skipped.
true_labels = None

# Prefix that is concatenated with each figure file name passed to savefig.
save_to_folder = "/classification/plots/"

def get_labelled_data():
    """Read the ';'-separated labelled-methods CSV and return it as a DataFrame."""
    return pd.read_csv("/classification/all_labels.csv", sep=';')

# Label table read from init_all_labels.csv; its CLevel_* columns are compared
# against the ones in `all_labels` in the metrics cell below.
complete_data_labels = pd.read_csv("/classification/init_all_labels.csv", sep=';')

# Main labelled table used by the rest of the notebook.
all_labels = get_labelled_data()
all_labels

In [None]:
# Columns that are excluded from the numeric processing below
# (method identifier and the three criticality-level label columns).
str_cols = ['Method', 'CLevel_threshold', 'CLevel_k_means', 'CLevel_em']

# Every remaining column of `all_labels`, in its original order.
list_columns = [name for name in all_labels.columns if name not in str_cols]
list_columns

In [None]:
# Optionally join externally provided labels ('CLevel') onto the labelled data.
# `df` stays None when no `true_labels` file is configured.
df = None
if true_labels is not None:
    df_true_labells = pd.read_csv(true_labels, sep=';')
    # Only the method identifier and its label are needed from that file.
    df_true_labells = df_true_labells[['Method', 'CLevel']]
    df = pd.merge(df_true_labells, all_labels, how='left', on='Method')
    # Fix the column order, placing 'CLevel' next to the three computed levels.
    df = df[['Method', 'LOC', 'CC', 'NP', 'NV', 'NEST', 'Ca', 'Ce', 'NChg',
             'NCall', 'CLevel', 'CLevel_threshold', 'CLevel_k_means', 'CLevel_em']]
    # NOTE(review): the original line ended with a dangling
    # `sep=';', index=False)` that made the cell a SyntaxError. It looks like the
    # remainder of a removed `to_csv` call; add an explicit
    # `df.to_csv(<path>, sep=';', index=False)` here if the merged table should
    # be written to disk.
df

In [None]:
# Min-max scale every metric column of a copy of `all_labels`:
# (value - column minimum) / (column maximum - column minimum).
scaled_data = all_labels.copy()

for metric in list_columns:
    column = scaled_data[metric]
    lowest = column.min()
    # Alternative kept from the original notes: use 0 as the minimum so that
    # ratios between values are preserved (e.g. LOC 25, 50 -> 0.5, 1).
    span = column.max() - lowest
    scaled_data[metric] = column.sub(lowest).div(span)

scaled_data

In [None]:
# Display only the columns listed in `list_columns` from the scaled table.
scaled_data[list_columns]

In [None]:
# Principal component analysis of the standardised metric columns.
# Produces `res`, a frame with the projection of every row onto the two
# components that explain the most variance ('PC1', 'PC2').
X = all_labels[list_columns]
X_scaled = StandardScaler().fit_transform(X)

features = X_scaled.T
cov_matrix = np.cov(features)

# The covariance matrix is symmetric, so eigh is the appropriate solver: it
# returns real eigenvalues/eigenvectors. It yields them in ascending order
# (and plain eig gives no ordering guarantee at all), so sort explicitly by
# decreasing eigenvalue; otherwise columns 0 and 1 of `vectors` are not
# guaranteed to be the first two principal components.
values, vectors = np.linalg.eigh(cov_matrix)
order = np.argsort(values)[::-1]
values = values[order]
vectors = vectors[:, order]

# Share of the total variance explained by each component, largest first.
explained_variances = list(values / np.sum(values))

# Map each component's explained-variance ratio to the input column with the
# largest absolute loading on that component. An eigenvalue belongs to a
# component, not to the input column at the same position, so pairing
# values[i] with list_columns[i] would not describe any real relationship.
# A column may therefore appear for several components or for none.
importance = {}
for i, val in enumerate(explained_variances):
    dominant = int(np.argmax(np.abs(vectors[:, i])))
    importance[val] = list_columns[dominant]

print(np.sum(explained_variances), '\n', explained_variances)
dict_keys = sorted(importance, reverse=True)
all_in_order = "".join(importance[k] + "  " for k in dict_keys)
print(all_in_order)
print(dict_keys)

# Project the standardised data onto the two leading components.
projected_1 = X_scaled.dot(vectors[:, 0])
projected_2 = X_scaled.dot(vectors[:, 1])
res = pd.DataFrame(projected_1, columns=['PC1'])
res['PC2'] = projected_2
res.head()

In [None]:
# Scatter plots of the PCA projection (`res`), one panel per labelling
# approach, coloured by the criticality level assigned by that approach.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18,5))
ax= axes.flatten()
clvls = ['CLevel_threshold', 'CLevel_k_means', 'CLevel_em']
titles = ['Threshold', 'K-means', 'EM']
for i, lvl in enumerate(clvls):
    sns.scatterplot(x=res['PC1'], y=res['PC2'], hue=all_labels[lvl],
              palette={'low':'blue', 'regular':'#DCB732', 'high':'red'},
              hue_order=[ "high", "regular","low"], s=20, ax=ax[i])
    ax[i].legend(loc="lower left", title=titles[i])

# Save through the figure object and before plt.show(): once the figure has
# been shown, pyplot may no longer treat it as the current figure, and a
# subsequent plt.savefig() can then write an empty file.
fig.savefig(save_to_folder + 'pca.pdf', bbox_inches = 'tight', pad_inches = 0)

plt.show()

In [None]:
# Rank rows by the sum of selected scaled metrics and split the ranking into
# three bands: rows before `first_cut` are "low", rows from `first_cut` up to
# (not including) `second_cut` are "regular", the rest are "high".
metrics_list = ['LOC',  'NP', 'Ca', 'Ce', 'NChg']

temp_df = (
    scaled_data[metrics_list]
    .sum(axis=1)
    .to_frame("CRank")
    .sort_values(by='CRank', ignore_index=True)
)
n = temp_df.shape[0]
first_cut = round(n*0.7)
second_cut = round(n*0.9)

print(first_cut, second_cut, n)

# After sorting with ignore_index=True the row label equals the rank position,
# so the bands can be expressed directly on positions.
rank_position = np.arange(n)
temp_df["CLevel"] = np.where(
    rank_position < first_cut,
    "low",
    np.where(rank_position < second_cut, "regular", "high"),
)
print(temp_df.describe())
grouped_temp_df = temp_df.groupby('CLevel')
grouped_temp_df.describe()

In [None]:
# Long-format version of the scaled table: one row per (row, metric) pair with
# the metric name in 'variable' and its scaled value in 'value'; the columns in
# `str_cols` are carried along as identifiers.
df = scaled_data.melt(id_vars=str_cols, value_vars=list_columns)
df

In [None]:
# Summary statistics of the scaled table.
scaled_data.describe()

In [None]:
# For each labelling approach, compare the labels in `complete_data_labels`
# (passed first, i.e. in the "true" position) with those in `all_labels`.
class_labels = ['high', 'regular', 'low']
for lvl in ['CLevel_threshold', 'CLevel_k_means', 'CLevel_em']:
    print(lvl)
    l1, l2 = complete_data_labels[lvl], all_labels[lvl]

    ari = adjusted_rand_score(l1, l2)
    # average=None yields one precision / recall value per entry of class_labels.
    pr = compute_metrics.precision_score(l1, l2, labels=class_labels, average=None)
    acc = compute_metrics.accuracy_score(l1, l2)
    recall = compute_metrics.recall_score(l1, l2, labels=class_labels, average=None)
    print(f'ari: {ari}  precision: {pr}, recall: {recall} accuracy: {acc}')

In [None]:
# Display the 'CLevel_threshold' column of `complete_data_labels`.
complete_data_labels['CLevel_threshold']

In [None]:
# Strip plot of `df` ('variable' on x, 'value' on y), coloured by the
# 'CLevel_threshold' column, saved as threshold.pdf.
level_palette = {'low':'blue', 'regular':'#DCB732', 'high':'red'}
fig, ax = plt.subplots(figsize=(15,7), dpi= 80)
sns.stripplot(data=df, x='variable', y='value', hue='CLevel_threshold',
              palette=level_palette, hue_order=["low", "regular", "high"],
              jitter=0.25, size=5, linewidth=.3, dodge=True, ax=ax)
# Axis labels are intentionally blank.
ax.set_xlabel('')
ax.set_ylabel('')
fig.savefig(save_to_folder + 'threshold.pdf', bbox_inches = 'tight', pad_inches = 0)

In [None]:
# Strip plot of `df` ('variable' on x, 'value' on y), coloured by the
# 'CLevel_k_means' column, saved as k-means.pdf.
level_palette = {'low':'blue', 'regular':'#DCB732', 'high':'red'}
fig, ax = plt.subplots(figsize=(15,7), dpi= 80)
sns.stripplot(data=df, x='variable', y='value', hue='CLevel_k_means',
              palette=level_palette, hue_order=["low", "regular", "high"],
              jitter=0.25, size=5, linewidth=.3, dodge=True, ax=ax)
# Axis labels are intentionally blank.
ax.set_xlabel('')
ax.set_ylabel('')

fig.savefig(save_to_folder + 'k-means.pdf', bbox_inches = 'tight', pad_inches = 0)

In [None]:
# Strip plot of `df` ('variable' on x, 'value' on y), coloured by the
# 'CLevel_em' column, saved as em.pdf.
level_palette = {'low':'blue', 'regular':'#DCB732', 'high':'red'}
fig, ax = plt.subplots(figsize=(15,7), dpi= 80)
sns.stripplot(data=df, x='variable', y='value', hue='CLevel_em',
              palette=level_palette, hue_order=["low", "regular", "high"],
              jitter=0.25, size=5, linewidth=.3, dodge=True, ax=ax)
# Axis labels are intentionally blank.
ax.set_xlabel('')
ax.set_ylabel('')
fig.savefig(save_to_folder + 'em.pdf', bbox_inches = 'tight', pad_inches = 0)

In [None]:
# Load the coverage table when a file is configured; otherwise fall back to an
# empty frame that still has the expected columns so the merge below works.
if test_cov_file is not None:
    test_data = pd.read_csv(test_cov_file, sep=';')
else:
    test_data = pd.DataFrame(columns = ['Method', 'CoveredStatements', 'TotalStatements'])

# Left join: every labelled method is kept, coverage columns are NaN where no
# matching 'Method' exists in the coverage table.
data_combined = all_labels[str_cols].merge(test_data, on='Method', how='left')

# Number of missing values per column after the join.
data_combined.isna().sum()

In [None]:
from pywaffle import Waffle

# Coverage ratio and method counts per criticality level for the threshold
# approach, followed by a waffle chart of the method counts.
print('Result for threshold approach')

levels = data_combined['CLevel_threshold']
low_ = data_combined[levels == "low"]
regular_ = data_combined[levels == "regular"]
high_ = data_combined[levels == "high"]

if data_combined['TotalStatements'].sum() == 0:
    # No statement counts at all: report 0 for every level.
    l_p = r_p = h_p = 0
else:
    l_p, r_p, h_p = [
        part['CoveredStatements'].sum() / part['TotalStatements'].sum()
        for part in (low_, regular_, high_)
    ]

print(f'low: {l_p}  regular: {r_p}  high: {h_p}  test coverage percetange')

data = {
    'Low critical': len(low_),
    'Regular critical': len(regular_),
    'High critical': len(high_),
}
print('Methods number', data)

background = '#EEEEEE'
fig = plt.figure(
    FigureClass=Waffle,
    rows=10,
    values=data,
    colors=("#232066", "#DCB732", "#983D3D"),
    labels=list(data),
    legend=dict(loc='lower left', bbox_to_anchor=(0, -0.4), ncol=len(data), framealpha=0),
    interval_ratio_x=0.5,
    interval_ratio_y=0.5,
    figsize=(20,10),
)
fig.gca().set_facecolor(background)
fig.set_facecolor(background)
plt.title('Result for threshold approach')
plt.show()

In [None]:
# Coverage ratio and method counts per criticality level for the k-means
# labels, followed by a waffle chart of the method counts.
print('Result for k_means algorithm')

levels = data_combined['CLevel_k_means']
low_ = data_combined[levels == "low"]
regular_ = data_combined[levels == "regular"]
high_ = data_combined[levels == "high"]

if data_combined['TotalStatements'].sum() == 0:
    # No statement counts at all: report 0 for every level.
    l_p = r_p = h_p = 0
else:
    l_p, r_p, h_p = [
        part['CoveredStatements'].sum() / part['TotalStatements'].sum()
        for part in (low_, regular_, high_)
    ]
print(f'low: {l_p}  regular: {r_p}  high: {h_p}  test coverage percetange')


data = {
    'Low critical': len(low_),
    'Regular critical': len(regular_),
    'High critical': len(high_),
}
print('Methods number', data)

background = '#EEEEEE'
fig = plt.figure(
    FigureClass=Waffle,
    rows=10,
    values=data,
    colors=("#232066", "#DCB732", "#983D3D"),
    labels=list(data),
    legend=dict(loc='lower left', bbox_to_anchor=(0, -0.4), ncol=len(data), framealpha=0),
    interval_ratio_x=0.5,
    interval_ratio_y=0.5,
    figsize=(20,10),
)
fig.gca().set_facecolor(background)
fig.set_facecolor(background)
plt.title('Result for k_means algorithm')
plt.show()

In [None]:
# Coverage ratio and method counts per criticality level for the EM labels,
# followed by a waffle chart of the method counts.
print('Result for EM algorithm')

levels = data_combined['CLevel_em']
low_ = data_combined[levels == "low"]
regular_ = data_combined[levels == "regular"]
high_ = data_combined[levels == "high"]

if data_combined['TotalStatements'].sum() == 0:
    # No statement counts at all: report 0 for every level.
    l_p = r_p = h_p = 0
else:
    l_p, r_p, h_p = [
        part['CoveredStatements'].sum() / part['TotalStatements'].sum()
        for part in (low_, regular_, high_)
    ]

print(f'low: {l_p}  regular: {r_p}  high: {h_p}  test coverage percetange')


data = {
    'Low critical': len(low_),
    'Regular critical': len(regular_),
    'High critical': len(high_),
}
print('Methods number', data)

background = '#EEEEEE'
fig = plt.figure(
    FigureClass=Waffle,
    rows=10,
    values=data,
    colors=("#232066", "#DCB732", "#983D3D"),
    labels=list(data),
    legend=dict(loc='lower left', bbox_to_anchor=(0, -0.4), ncol=len(data), framealpha=0),
    interval_ratio_x=0.5,
    interval_ratio_y=0.5,
    figsize=(20,10),
)
fig.gca().set_facecolor(background)
fig.set_facecolor(background)
plt.title('Result for EM algorithm')
plt.show()

In [None]:
# Overall coverage ratio: covered statements / total statements over all rows.
# Guard the zero-denominator case (e.g. when no coverage file was loaded and
# every 'TotalStatements' entry is missing) the same way the per-level cells
# do, reporting 0 instead of dividing by zero.
total_statements = data_combined['TotalStatements'].sum()
if total_statements == 0:
    total = 0
else:
    total = data_combined['CoveredStatements'].sum() / total_statements
print('total coverage', total)

In [None]:
# Rows labelled "low" by the threshold approach ...
r = data_combined[data_combined['CLevel_threshold'] == "low"]
# ... that are also labelled "low" by k-means. The mask is built from `r`
# itself so that its index matches the frame being filtered; a mask taken from
# the full `data_combined` has a different index and makes pandas reindex it
# with a warning.
r[r['CLevel_k_means'] == "low"]