In [None]:
import csv
from xml.etree import ElementTree
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import os
from matplotlib.ticker import StrMethodFormatter
import statsmodels.api as sm
import matplotlib.pyplot as plt
import scipy.stats as stats
from pyclustertend import hopkins, vat, assess_tendency_by_mean_metric_score
from sklearn.preprocessing import scale, MinMaxScaler, minmax_scale, RobustScaler,robust_scale
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import seaborn as sns

In [None]:
# ---- Input files -----------------------------------------------------------
# CSV containing all the labels (one row per method).
data_file = "/classification/2_all_labels.csv"
# CSV with the changed-lines data that is merged in for the validation plots.
chg_lines_file = "/changed_lines.csv"

# ---- Output ----------------------------------------------------------------
# Directory every figure of this notebook is saved into.
plots_save_to_location = "/classification/plots_1"

# ---- Column groups ---------------------------------------------------------
# Metric columns that are min-max scaled and melted for the strip plots.
var_list = [
    'LOC', 'CC', 'NP', 'NV', 'NEST',
    'Ca', 'Ce', 'NChg', 'NCall',
]
# Label columns, one per labelling approach.
label_list = [
    'CLevel_threshold',
    'CLevel_k_means',
    'CLevel_em',
]

In [None]:
# Read the labelled data set; the file uses ';' as its field separator.
data = pd.read_csv(filepath_or_buffer=data_file, sep=';')
# Bare last expression -> rendered as the cell output.
data

In [None]:
# Sum of the LOC column over all rows of the data set.
data.loc[:, 'LOC'].sum()

In [None]:
# Min-max scale every metric column into [0, 1] on a copy, so that the raw
# `data` frame stays untouched for the later cells that still read it.
scaled_data = data.copy()

for col_name in var_list:
    col = scaled_data[col_name]
    min_col, max_col = col.min(), col.max()
    col_range = max_col - min_col
    if col_range == 0:
        # Constant column: (col - min) / 0 would turn every value into NaN.
        # Map it to 0.0 instead, which is also what sklearn's MinMaxScaler
        # produces for zero-range features.
        scaled_data[col_name] = 0.0
    else:
        scaled_data[col_name] = (col - min_col) / col_range

scaled_data

In [None]:
# Reshape the scaled frame from wide to long: the method name and the three
# label columns are kept as identifiers, while every metric in `var_list`
# becomes one (variable, value) row.
identifier_columns = ['Method', 'CLevel_threshold', 'CLevel_k_means', 'CLevel_em']
melted_data = scaled_data.melt(id_vars=identifier_columns, value_vars=var_list)
melted_data

In [None]:
# Strip plot of every scaled metric, coloured by the threshold-based label.

# savefig does not create missing parent directories; make sure the output
# directory exists so a fresh environment does not fail with FileNotFoundError.
os.makedirs(plots_save_to_location, exist_ok=True)

fig, ax = plt.subplots(figsize=(10,5), dpi= 80)
sns.stripplot(data=melted_data, x='variable', y='value', hue='CLevel_threshold',
              palette={'low':'blue', 'regular':'#DCB732', 'high':'red'},
              hue_order=["low", "regular", "high"],
              jitter=0.25, size=8, ax=ax, linewidth=.5, dodge=True)

print("Threshold clustering")
# Save through the explicit figure handle instead of pyplot's implicit
# "current figure" state.
fig.savefig(plots_save_to_location + '/melted_threshold', bbox_inches = 'tight', pad_inches = 0)

In [None]:
# Strip plot of every scaled metric, coloured by the k-means label.

# savefig does not create missing parent directories; make sure the output
# directory exists so a fresh environment does not fail with FileNotFoundError.
os.makedirs(plots_save_to_location, exist_ok=True)

fig, ax = plt.subplots(figsize=(10,5), dpi= 80)
sns.stripplot(data=melted_data, x='variable', y='value', hue='CLevel_k_means',
              palette={'low':'blue', 'regular':'#DCB732', 'high':'red'},
              hue_order=["low", "regular", "high"],
              jitter=0.25, size=8, ax=ax, linewidth=.5, dodge=True)

print("k-means clustering")
# Save through the explicit figure handle instead of pyplot's implicit
# "current figure" state.
fig.savefig(plots_save_to_location + '/melted_k_means', bbox_inches = 'tight', pad_inches = 0)

In [None]:
# Strip plot of every scaled metric, coloured by the EM label.

# savefig does not create missing parent directories; make sure the output
# directory exists so a fresh environment does not fail with FileNotFoundError.
os.makedirs(plots_save_to_location, exist_ok=True)

fig, ax = plt.subplots(figsize=(10,5), dpi= 80)
sns.stripplot(data=melted_data, x='variable', y='value', hue='CLevel_em',
              palette={'low':'blue', 'regular':'#DCB732', 'high':'red'},
              hue_order=["low", "regular", "high"],
              jitter=0.25, size=8, ax=ax, linewidth=.5, dodge=True)

print("EM clustering")
# Save through the explicit figure handle instead of pyplot's implicit
# "current figure" state.
fig.savefig(plots_save_to_location + '/melted_em', bbox_inches = 'tight', pad_inches = 0)

# Validation

In [None]:
# Read the changed-lines file; like the label file it is ';'-separated.
chg_lines_data = pd.read_csv(filepath_or_buffer=chg_lines_file, sep=';')
# Bare last expression -> rendered as the cell output.
chg_lines_data

In [None]:
# Attach the ChgLines value to every labelled method. The join is an inner
# one on data.Method == chg_lines_data.Previous_Method_Parsed, so rows
# without a partner on the other side are dropped.
chg_lines_subset = chg_lines_data[['Previous_Method_Parsed', 'ChgLines']]
df = data.merge(
    chg_lines_subset,
    how='inner',
    left_on='Method',
    right_on='Previous_Method_Parsed',
)
df

In [None]:
# Per-column count of missing values in the merged frame.
df.isnull().sum()

In [None]:
# Rank assigned to each label; used as the sort key (via Series.map) when the
# validation frames are ordered low -> regular -> high. Only the relative
# order of the numbers matters for that sort.
custom_dict = dict(low=0, regular=1, high=3)

In [None]:
# Validation plot: ChgLines per method, with the methods ordered and coloured
# by their threshold-based label.

# savefig does not create missing parent directories; make sure the output
# directory exists so a fresh environment does not fail with FileNotFoundError.
os.makedirs(plots_save_to_location, exist_ok=True)

sub_df1 = df[['Method','CLevel_threshold', 'ChgLines']]
# Order the rows by label rank (see custom_dict) and renumber them from 0, so
# that the new index can serve as the x position of each method.
sub_df1 = sub_df1.sort_values(by=['CLevel_threshold'], key=lambda x: x.map(custom_dict), ignore_index=True)
sub_df1['method_idx'] = sub_df1.index

fig, ax = plt.subplots(figsize=(10,5), dpi= 80)
# Draw on the axes created above explicitly instead of relying on pyplot's
# implicit "current axes".
sns.scatterplot(data=sub_df1, x="method_idx", y="ChgLines", hue="CLevel_threshold",
                palette={'low':'blue', 'regular':'#DCB732', 'high':'red'}, ax=ax)
ax.legend(loc='upper left')
fig.savefig(plots_save_to_location + '/chg-lines_threshold', bbox_inches = 'tight', pad_inches = 0)

In [None]:
# Validation plot: ChgLines per method, with the methods ordered and coloured
# by their k-means label.

# savefig does not create missing parent directories; make sure the output
# directory exists so a fresh environment does not fail with FileNotFoundError.
os.makedirs(plots_save_to_location, exist_ok=True)

sub_df2 = df[['Method', 'CLevel_k_means', 'ChgLines']]
# Order the rows by label rank (see custom_dict) and renumber them from 0, so
# that the new index can serve as the x position of each method.
sub_df2 = sub_df2.sort_values(by=['CLevel_k_means'], key=lambda x: x.map(custom_dict), ignore_index=True)
sub_df2['method_idx'] = sub_df2.index

fig, ax = plt.subplots(figsize=(10,5), dpi= 80)
# Draw on the axes created above explicitly instead of relying on pyplot's
# implicit "current axes".
sns.scatterplot(data=sub_df2, x="method_idx", y="ChgLines", hue="CLevel_k_means",
                palette={'low':'blue', 'regular':'#DCB732', 'high':'red'}, ax=ax)
ax.legend(loc='upper left')
fig.savefig(plots_save_to_location + '/chg-lines_k_means', bbox_inches = 'tight', pad_inches = 0)

In [None]:
# Validation plot: ChgLines per method, with the methods ordered and coloured
# by their EM label.

# savefig does not create missing parent directories; make sure the output
# directory exists so a fresh environment does not fail with FileNotFoundError.
os.makedirs(plots_save_to_location, exist_ok=True)

sub_df3 = df[['Method', 'CLevel_em', 'ChgLines']]
# Order the rows by label rank (see custom_dict) and renumber them from 0, so
# that the new index can serve as the x position of each method.
sub_df3 = sub_df3.sort_values(by=['CLevel_em'], key=lambda x: x.map(custom_dict), ignore_index=True)
sub_df3['method_idx'] = sub_df3.index

fig, ax = plt.subplots(figsize=(10,5), dpi= 80)
# Draw on the axes created above explicitly instead of relying on pyplot's
# implicit "current axes".
sns.scatterplot(data=sub_df3, x="method_idx", y="ChgLines", hue="CLevel_em",
                palette={'low':'blue', 'regular':'#DCB732', 'high':'red'}, ax=ax)
ax.legend(loc='upper left')
fig.savefig(plots_save_to_location + '/chg-lines_em', bbox_inches = 'tight', pad_inches = 0)