#df_orig - 10000 ads
#df_uniq - around 9000 ads
#df_non_noisy - around 5500 ads
#Append the rows that need to be deleted at the end of the dataframe

In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import numpy as np
from sklearn.decomposition import TruncatedSVD

stop_words = set(stopwords.words('english'))
FILE = '../data/ht_unique_jun.csv'
OUTPUT_FILE = '../data/svd_results/results_temp8.csv'
OUTPUT_PROP_FILE = '../data/svd_results/cluster_properties_temp8.json'

df_orig = pd.read_csv(FILE)

# Build a cleaned copy of every ad body. Cleaning order matters and is kept:
# digits are removed first, then runs of non-ASCII characters become a single
# space, then anything that looks like an HTML tag is dropped.
cleanr = re.compile('<.*?>')  # compiled once instead of once per row
content = []
for body in df_orig['body']:
    # Missing bodies are read by pandas as float NaN; treat them as empty text.
    c = body if isinstance(body, str) else ''
    c = re.sub(r'\d+', '', c)
    c = re.sub(r'[^\x00-\x7F]+',' ', c)
    c = cleanr.sub('', c)
    content.append(c)
df_orig['content_p'] = content

# TF-IDF over word 2- and 3-grams. stop_words is handed over as a list because
# scikit-learn >= 1.2 validates the parameter and rejects a set.
vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(2,3), norm='l2',
    smooth_idf=True, stop_words=list(stop_words), min_df=2, max_df=0.8)
bigram_matrix = vectorizer.fit_transform(content)
print(bigram_matrix.shape)
# print (bigram_matrix[0])

# svd = TruncatedSVD(n_components=3)
# svd.fit_transform(bigram_matrix)
# np.save('../data/modalities_data/tf_idf_bigrams.npy', bigram_matrix)
print ("Done")

In [None]:
# Displays the frame with duplicate (title, body) rows dropped. The result is
# not assigned, so df_orig itself is left unchanged by this cell.
df_orig.drop_duplicates(['title', 'body'])

In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

# Horizontal bar chart of the number of ads per label value (0 through 6),
# saved to disk. Looking up counts[value] raises if a label value is absent.
counts = df_orig['label'].value_counts()
ordered_counts = [counts[label_value] for label_value in range(0, 7)]
c = ['green'] * 3 + ['grey'] + ['red'] * 3
t_l = [
    '0-Strongly likely Not Trafficking',
    '1-Likely Not Trafficking',
    '2-Weakly likely Not Trafficking',
    '3-Unsure',
    '4-Weakly likely Trafficking',
    '5-Likely Trafficking',
    '6-Strongly Likely Trafficking',
]
bar_positions = range(0, 7)
plt.barh(bar_positions, width=ordered_counts, color=c, alpha=0.4)
plt.yticks(bar_positions, t_l)
plt.xlabel('Ad Count')
plt.tight_layout()
# plt.show()
plt.savefig('../results/ads_count_dist.png')

In [None]:
print (ordered_counts)

In [None]:
# Collapse the 0-6 label into a binary target: a label of 4 or more becomes 1,
# anything else (including values that fail the comparison) becomes 0.
true_labels = df_orig['label'].values.tolist()
binary_true_labels = [1 if label >= 4 else 0 for label in true_labels]

df_orig['binary_label'] = binary_true_labels
# df_unique = df_orig.copy()

In [None]:
# Project the sparse TF-IDF matrix onto 20 dense components. No random_state
# is passed, so the embedding is not pinned to a seed.
svd = TruncatedSVD(n_components=20)
encoded_vecs = svd.fit_transform(bigram_matrix)

In [None]:
# First clustering pass: HDBSCAN over the 20-dimensional SVD embedding.
import hdbscan
clusterer = hdbscan.HDBSCAN(min_cluster_size=3, algorithm='best', alpha=1.0)
clusterer.fit(encoded_vecs)
#     print (clusterer.labels_)
labels = clusterer.labels_
# The label -1 is excluded from the cluster count and tallied separately.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
unique_labels = set(labels)
probs = clusterer.probabilities_
#     colors = [plt.cm.Spectral(each)
#               for each in np.linspace(0, 1, len(set(true_labels))]
# Number of distinct label values; this count includes -1 when it occurs.
print ("Number of labels : " + str(len(list(set(clusterer.labels_)))))

In [None]:
# Attach the cluster assignment and initialise the near-duplicate bookkeeping:
# sim_check starts False for every ad and sim_index starts at -1.
df_orig['cluster_label'] = labels
df_orig['sim_check'] = False
df_orig['sim_index'] = -1

In [None]:
def calculate_pairwise_similarities(mat):
    """Flag pairs of rows that are element-wise identical.

    Parameters
    ----------
    mat : sparse matrix exposing ``todense()``, or a dense 2-D array-like
        One row per item to compare.

    Returns
    -------
    numpy.ndarray
        Float array of shape (n_rows, n_rows). For i < j, entry [i, j] is 1.0
        when row i equals row j in every column and 0.0 otherwise. The
        diagonal and the lower triangle are left at 0.0.
    """
    dense = np.asarray(mat.todense()) if hasattr(mat, 'todense') else np.asarray(mat)
    n_rows = dense.shape[0]
    sim_scores = np.zeros((n_rows, n_rows))
    for i in range(n_rows - 1):
        # Compare row i against all later rows in one vectorised step.
        sim_scores[i, i + 1:] = (dense[i + 1:] == dense[i]).all(axis=1)
    return sim_scores

In [None]:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Within every first-pass cluster, compare ads by cosine similarity of their
# 2-/3-gram count vectors and flag near-duplicates (similarity > 0.998).
# For each near-duplicate pair the ad that appears earlier in the cluster is
# flagged, so the last ad of a duplicate group is the one that survives.
filtered_df = pd.DataFrame(columns=df_orig.columns)
blacklisted_global_indices = []
for l in unique_labels:
    df_fil = df_orig[df_orig['cluster_label']==l].copy()
    if l == -1:
        # Ads labelled -1 are not compared with each other; keep them all.
        filtered_df = pd.concat((filtered_df, df_fil), axis=0)
        continue
    # Remember the df_orig index labels; positions below refer to this list.
    indices = list(df_fil.index)
    df_fil.reset_index(drop=True, inplace=True)

    ads = list(df_fil['content_p'])
    # scikit-learn >= 1.2 rejects a set for stop_words, so pass a list.
    count_vectorizer = CountVectorizer(lowercase=True, ngram_range=(2,3), stop_words=list(stop_words))
    ads_vectors = count_vectorizer.fit_transform(ads)
    sim_scores = cosine_similarity(ads_vectors, dense_output=True)
    # Strict lower triangle only: every unordered pair is seen exactly once
    # as (row, col) with row > col, and self-similarity is ignored.
    sim_scores = np.tril(sim_scores, -1)

    _, earlier_positions = np.where(sim_scores>0.998)
    blacklisted_lookup = {indices[pos] for pos in earlier_positions}
    blacklisted_indices = sorted(blacklisted_lookup)
    blacklisted_global_indices.extend(blacklisted_indices)

    print (indices, set(blacklisted_indices))
    remain_ind = [x for x in indices if x not in blacklisted_lookup]
    # NOTE(review): every flagged ad of the cluster is pointed at the first
    # surviving ad of that cluster, not necessarily at the ad it duplicates.
    df_orig.loc[blacklisted_indices, 'sim_index'] = remain_ind[0] if len(remain_ind) >0 else -1
blacklisted_global_indices = list(set(blacklisted_global_indices))
df_orig.loc[blacklisted_global_indices, 'sim_check'] = True

print (df_orig.shape)

In [None]:
print (len(blacklisted_global_indices))

In [None]:
# Manual spot check: print the body text of four ads selected by hard-coded
# df_orig index labels.
print (df_orig.at[720, 'body'])
print (df_orig.at[3916, 'body'])
print (df_orig.at[1319, 'body'])
print (df_orig.at[4352, 'body'])

In [None]:
print (len(set(blacklisted_global_indices)))

In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import numpy as np
from sklearn.decomposition import TruncatedSVD

stop_words = set(stopwords.words('english'))

# Second pass: work only on ads that were not flagged as near-duplicates.
filtered_df = df_orig[df_orig['sim_check'] == False].copy()
print (filtered_df.shape)

# Clean every ad body: drop digits, drop the names listed in the ';'-separated
# Name column (case-insensitive), replace non-ASCII runs with a space and
# strip HTML-like tags. The last two steps run for every ad, whether or not
# it has a Name value (previously they were skipped for ads without one).
cleanr = re.compile('<.*?>')
content = []
for body, names in zip(filtered_df['body'], filtered_df['Name']):
    # Missing values are read by pandas as float NaN; treat them as absent.
    c = body if isinstance(body, str) else ''
    c = re.sub(r'\d+', '', c)
    if isinstance(names, str):
        for n in names.split(';'):
            name_regex = re.compile(re.escape(n), re.IGNORECASE)
            c = name_regex.sub('', c)
    c = re.sub(r'[^\x00-\x7F]+',' ', c)
    c = cleanr.sub('', c)
    content.append(c)
filtered_df['content_p'] = content


# TF-IDF over word 2- to 4-grams. stop_words is passed as a list because
# scikit-learn >= 1.2 validates the parameter and rejects a set.
vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(2,4), norm='l2',
     stop_words=list(stop_words), min_df=2, max_df=0.8)
bigram_matrix = vectorizer.fit_transform(content)
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# when it exists and keep the result a plain list either way.
if hasattr(vectorizer, 'get_feature_names_out'):
    features_col = list(vectorizer.get_feature_names_out())
else:
    features_col = vectorizer.get_feature_names()
print(bigram_matrix.shape)
# print (bigram_matrix[0])

# svd = TruncatedSVD(n_components=3)
# svd.fit_transform(bigram_matrix)
# np.save('../data/modalities_data/tf_idf_bigrams.npy', bigram_matrix)
print ("Done")

In [None]:
# Row sums of the TF-IDF matrix; the second print is the number of ads whose
# row sums to zero, i.e. ads with no retained n-gram.
s = bigram_matrix.sum(axis=1)
print (s.shape)
print (len(np.argwhere(s==0)))

In [None]:
# Project the second-pass TF-IDF matrix onto 64 dense components. No
# random_state is passed, so the embedding is not pinned to a seed.
svd = TruncatedSVD(n_components=64)
encoded_vecs = svd.fit_transform(bigram_matrix)
# print (a.shape)

In [None]:
print (encoded_vecs.shape)

In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from pylab import rcParams
# rcParams['figure.figsize'] = 10,100
# Render the whole embedding matrix as an image (one row per ad, one column
# per SVD component) with a colour bar for the value scale.
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111)
plt.imshow(encoded_vecs)
ax.set_aspect('equal')
plt.colorbar(orientation='vertical')
plt.show()

In [None]:
from scipy.spatial import distance

# Euclidean distance of every encoded ad from the origin of the SVD space.
# The origin is sized from the embedding itself so it always matches the
# number of SVD components in use.
origin = np.zeros(encoded_vecs.shape[1])
dist = np.array([distance.euclidean(origin, vec) for vec in encoded_vecs], dtype=float)

# Row positions (shape (k, 1)) of ads lying closer than 0.1 to the origin;
# these are treated as noise in the following cells.
noisy_set = np.argwhere(dist < 0.1)


In [None]:
print(type(bigram_matrix))
print (bigram_matrix.shape)

In [None]:
# Same check as before: number of ads whose TF-IDF row sums to zero.
s = bigram_matrix.sum(axis=1)
print (s.shape)
print (len(np.argwhere(s==0)))

In [None]:
# Start with no ad marked as noise, keep the current index labels in the
# 'index1' column, then switch to a fresh 0..n-1 index.
filtered_df['noise'] = False
filtered_df['index1'] = filtered_df.index
filtered_df.reset_index(drop=True, inplace=True)
print (max(filtered_df.index))

In [None]:
print (type(bigram_matrix))

In [None]:
# Replace the sparse TF-IDF matrix by its dense form; the row deletion and
# sub-graph extraction in later cells operate on this dense object.
bigram_matrix = bigram_matrix.todense()

In [None]:
print (type(bigram_matrix))

In [None]:
from scipy.spatial import distance

# noisy_set holds one single-element row per low-norm ad; take that element
# from each row to obtain a flat list, then flag those rows in filtered_df.
noisy_list = [entry[0] for entry in noisy_set]
print(len(noisy_list))
filtered_df.loc[noisy_list, 'noise'] = True

# noisy_list

# noisy_list

In [None]:
# Temporarily index filtered_df by the labels saved in 'index1' so its noise
# flag can be joined onto df_orig by index (outer join), then rebuild the
# 'index1' column and go back to a fresh 0..n-1 index.
# NOTE(review): df_orig gains a 'noise' column here, so running this cell a
# second time would presumably collide on that column name - confirm.
filtered_df.set_index('index1', inplace=True)
df_orig = df_orig.join(filtered_df['noise'], how='outer')
filtered_df['index1'] = filtered_df.index
filtered_df.reset_index(drop=True, inplace=True)

In [None]:
# Keep only ads not marked as noise and remove the same row positions from
# the dense TF-IDF matrix and from the SVD embedding, so that all three stay
# row-aligned.
df_nonoise = filtered_df[filtered_df['noise'] == False].copy()
df_nonoise.reset_index(drop=True, inplace=True)
bigram_matrix = np.delete(bigram_matrix, noisy_list, axis=0)
encoded_vecs = np.delete(encoded_vecs, noisy_list, axis=0)

In [None]:
print (encoded_vecs.shape)

In [None]:
# Flatten the (k, 1) array returned by np.argwhere into a 1-D array.
noisy_set = noisy_set.reshape(-1)

In [None]:
# One plotting colour per remaining ad, chosen by its binary label:
# index 0 -> 'green', index 1 -> 'red' ('blue' is available for index 2).
colors = ['green', 'red', 'blue']
binary_true_labels = list(df_nonoise['binary_label'])
color_array = [colors[int(flag)] for flag in binary_true_labels]

In [None]:
print (len(color_array))

In [None]:
# Embed the SVD vectors with UMAP using its default settings; the result is
# used for the scatter plots below.
import umap
X_embedded = umap.UMAP().fit_transform(encoded_vecs)

In [None]:
# ######################### HBDSCAN ############################
# Second clustering pass: HDBSCAN over the noise-filtered SVD embedding.
import hdbscan
clusterer = hdbscan.HDBSCAN(min_cluster_size=3, algorithm='best', alpha=1.0)
clusterer.fit(encoded_vecs)
#     print (clusterer.labels_)
labels = clusterer.labels_
labels_clustering = labels
# The label -1 is excluded from the cluster count and tallied separately.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
unique_labels = set(labels)
probs = clusterer.probabilities_
#     colors = [plt.cm.Spectral(each)
#               for each in np.linspace(0, 1, len(set(true_labels))]
# Number of distinct label values; this count includes -1 when it occurs.
print ("Number of labels : " + str(len(list(set(clusterer.labels_)))))
#     palette = sns.color_palette()
#     cluster_colors = [sns.desaturate(palette[col], sat)
#                       if col >= 0 else (0.5, 0.5, 0.5) for col, sat in
#                       zip(clusterer.labels_, clusterer.probabilities_)]

In [None]:
# Fraction of positive ads (binary label 1) that received a cluster label
# other than -1.
binary_true_labels = np.asarray(binary_true_labels)
# noise_labels = np.where(labels==-1)
no_noise_labels = np.where(labels!=-1)
tr_labels = np.where(binary_true_labels==1)
print (binary_true_labels.shape)
print (no_noise_labels[0].shape)
print (tr_labels)
# lst3 = [value for value in tr_labels if value in noise_labels] 
# Positions that are both clustered and positive.
common_tp = np.intersect1d(no_noise_labels[0], tr_labels[0])
print (common_tp.shape)
# Divides by the number of positives; raises if there are none.
total_p = len(common_tp)/len((tr_labels[0]))
print (total_p)

In [None]:
# without_threshold_labels = [0]*len(binary_true_labels)
# for i in common_tp:
#     without_threshold_labels[i] = 1

# pscore = classification_report(binary_true_labels, without_threshold_labels)
# print (pscore)

In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

In [None]:
from pylab import rcParams
rcParams['figure.figsize'] = 20,10

# Sorted distance-from-origin curve for the ads that HDBSCAN labelled -1.
noise_labels = np.where(labels==-1)
print (len(noise_labels[0]))
noise_vecs = encoded_vecs[noise_labels[0],:][:]
# Measure the noise vectors themselves (indexing encoded_vecs by the loop
# counter would measure the first k ads instead of the noise ads).
noise_dist = np.array([distance.euclidean(origin, vec) for vec in noise_vecs], dtype=float)
print (noise_dist)
noise_dist = np.sort(noise_dist)
plt.plot(noise_dist)
plt.show()

In [None]:
# Scatter the UMAP embedding, drawing ads that belong to any cluster in red
# (alpha 0.1) and ads labelled -1 in near-transparent black (alpha 0.01).
# The clustered positions are computed in one pass; collecting them cluster by
# cluster for every ad repeats each index once per cluster member.
anomaly_indices = np.flatnonzero(labels != -1).tolist()
rgba_colors = np.zeros((len(binary_true_labels),4))
rgba_colors[:, 3] = 0.01
rgba_colors[anomaly_indices, 0] = 1
rgba_colors[anomaly_indices, 3] = 0.1

rcParams['figure.figsize'] = 20,10
plt.scatter(X_embedded.T[0], X_embedded.T[1], color=rgba_colors)

In [None]:

# UMAP scatter coloured by binary label, with all ticks and tick labels
# hidden, saved to disk.
rcParams['figure.figsize'] = 20,10
plt.scatter(X_embedded.T[0], X_embedded.T[1], c=color_array, alpha=0.1)
# plt.title("Clustering accuracy={}, fmeasure_synth={}, number_of_labels={}".format(clustering_acc, fmeasure, 
#                                                                                   len(unique_labels)))
# tick_params expects booleans; the legacy string 'off' is a non-empty string
# and therefore does not switch anything off in current Matplotlib.
plt.tick_params(
    axis='both',       # apply to both the x- and the y-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # no ticks along the bottom edge
    top=False,         # no ticks along the top edge
    left=False,        # no ticks along the left edge
    right=False,       # no ticks along the right edge
    labelbottom=False, # no x tick labels
    labelleft=False)   # no y tick labels
plt.text(15, 15, 'TLDetect', fontsize=24)
plt.savefig('../results/embedding_scatter_trafficking10k.png')
plt.show()

In [None]:
# Figure size for the component-pair strips plotted below.
rcParams['figure.figsize'] = 10,2


# All unordered pairs of the first six component indices. In the visible code
# 'tuples' and 'count' are only referenced by the commented-out loop below.
a = [0,1,2,3,4,5]
# b = [0,1,2,3,4,5]

from itertools import combinations 
tuples = list(combinations(a, 2))
count = 0
# for i in range(5):
#     if i%5 == 0:
#         plt.show()
#     else:
#         continue
#     ax1 = plt.subplot(1,5,1)
#     ax2 = plt.subplot(1,5,2)
#     ax3 = plt.subplot(1,5,3)
#     ax4 = plt.subplot(1,5,4)
#     ax5 = plt.subplot(1,5,5)
    
#     ax1.scatter(encoded_vecs.T[tuples[i][0]], encoded_vecs.T[tuples[i][1]], c=color_array, alpha=0.2)
#     ax1.title.set_text("{},{}".format(tuples[i][0], tuples[i][1]))
    
#     ax2.scatter(encoded_vecs.T[tuples[i+1][0]], encoded_vecs.T[tuples[i+1][1]], c=color_array, alpha=0.2)
#     ax2.title.set_text("{},{}".format(tuples[i+1][0], tuples[i+1][1]))
    
#     ax3.scatter(encoded_vecs.T[tuples[i+2][0]], encoded_vecs.T[tuples[i+2][1]], c=color_array, alpha=0.2)
#     ax3.title.set_text("{},{}".format(tuples[i+2][0], tuples[i+2][1]))
    
#     ax4.scatter(encoded_vecs.T[tuples[i+3][0]], encoded_vecs.T[tuples[i+3][1]], c=color_array, alpha=0.2)
#     ax4.title.set_text("{},{}".format(tuples[i+3][0], tuples[i+3][1]))
    
#     ax5.scatter(encoded_vecs.T[tuples[i+4][0]], encoded_vecs.T[tuples[i+4][1]], c=color_array, alpha=0.2)
#     ax5.title.set_text("{},{}".format(tuples[i+4][0], tuples[i+4][1]))
    
    
#     ax2.scatter(encoded_vecs.T[0], encoded_vecs.T[2], c=color_array, alpha=0.2)
#     ax2.title.set_text("{},{}".format(tup[i][0], tup[i][1]))
#     plt.tight_layout()
#     # plt.savefig('../results/svd_components_0.png')
#     plt.show()


# For each of the first ten SVD components, draw a strip of five scatter
# panels pairing component i with components i+1 .. i+5, coloured by binary
# label, and save one image per strip.
for i in range(10):
    for panel, offset in enumerate(range(1, 6), start=1):
        axis = plt.subplot(1, 5, panel)
        axis.scatter(encoded_vecs.T[i], encoded_vecs.T[i + offset], c=color_array, alpha=0.1)
        axis.title.set_text("{},{}".format(i, i + offset))

    plt.tight_layout()
    plt.savefig('../results/svd_components_{}.png'.format(i))
    plt.show()


In [None]:
# rcParams['figure.figsize'] = 6,2
# fig, axs = plt.subplots(1, 3)
# # ax1 = plt.subplot(1,4,1)
# # ax2 = plt.subplot(1,4,2)
# # ax3 = plt.subplot(1,4,3)
# # ax4 = plt.subplot(1,4,4)

# axs[0, 0].scatter(encoded_vecs.T[0], encoded_vecs.T[3], c=color_array, alpha=0.2)
# axs[0, 0].text(0.5, 0.3, "{},{}".format(0, 3))
# # ax1.title.set_text("{},{}".format(i, i+1))

# axs[0, 1].scatter(encoded_vecs.T[2], encoded_vecs.T[3], c=color_array, alpha=0.2)
# axs[0, 1].text(-0.05, 0.3,"{},{}".format(2, 3))

# axs[0, 2].scatter(encoded_vecs.T[7], encoded_vecs.T[10], c=color_array, alpha=0.2)
# axs[0, 2].text(0.4, 0.4,"{},{}".format(1, 5))

# axs[1, 0].scatter(encoded_vecs.T[4], encoded_vecs.T[9], c=color_array, alpha=0.2)
# axs[1, 0].text(0.6, 0.5,"{},{}".format(4, 9))

# axs[1, 1].scatter(encoded_vecs.T[1], encoded_vecs.T[5], c=color_array, alpha=0.2)
# axs[1, 1].text(0.5, 0.6,"{},{}".format(1, 5))

# axs[1, 2].scatter(encoded_vecs.T[5], encoded_vecs.T[9], c=color_array, alpha=0.2)
# axs[1, 2].text(0.5, 0.5,"{},{}".format(1, 5))



# plt.tight_layout()
# plt.savefig('../results/svd_components_trafficking10k.png'.format(i))
# plt.show()

In [None]:
def get_all_subgraphs(mat_data, cl_ind):
    """Extract the shell, outer and core sub-matrices around one cluster.

    Rows of ``mat_data`` are treated as ads and columns as n-grams.

    * Columns non-zero in exactly one cluster row are "one-count" columns;
      columns non-zero in more than one cluster row are "core" columns.
    * Rows of the whole matrix with more than one non-zero core column form
      the "shell" rows (this may include rows outside the cluster and may
      exclude cluster rows with at most one core column).

    All three results have the shell rows as rows and the one-count columns
    followed by the core columns as columns:

    * shell: non-cluster rows have their one-count columns zeroed;
    * outer: additionally, non-cluster rows are zeroed entirely;
    * core: additionally, all one-count columns are zeroed.

    Parameters
    ----------
    mat_data : numpy.matrix or 2-D numpy.ndarray
        Dense ad-by-n-gram matrix. It is modified in place (entries are set
        to zero), so pass a copy if the original values are still needed.
    cl_ind : sequence of int
        Row positions of the ads that belong to the cluster.

    Returns
    -------
    tuple of numpy.ndarray
        ``(shell_subgraph, outer_subgraph, core_subgraph)``.
    """
    # No-copy ndarray view: gives plain 1-D reductions for both np.matrix and
    # ndarray input, while writes still reach the caller's object.
    mat = np.asarray(mat_data)
    cl_ind = np.asarray(cl_ind, dtype=np.intp)

    # How many cluster rows use each column.
    bigrams_count = np.count_nonzero(mat[cl_ind], axis=0)
    one_count_index = np.flatnonzero(bigrams_count == 1)
    core_bigrams_index = np.flatnonzero(bigrams_count > 1)
    outer_bigrams_index = np.concatenate((one_count_index, core_bigrams_index))

    # Rows anywhere in the matrix that touch more than one core column.
    ads_count = np.count_nonzero(mat[:, core_bigrams_index], axis=1)
    ads_in_shell_index = np.flatnonzero(ads_count > 1)
    not_core_ads = np.setdiff1d(ads_in_shell_index, cl_ind)

    selection = np.ix_(ads_in_shell_index, outer_bigrams_index)

    # Each snapshot is taken after one more masking step; order matters.
    mat[np.ix_(not_core_ads, one_count_index)] = 0
    shell_subgraph = np.asarray(mat[selection])

    mat[not_core_ads, :] = 0
    outer_subgraph = np.asarray(mat[selection])

    mat[:, one_count_index] = 0
    core_subgraph = np.asarray(mat[selection])

    return shell_subgraph, outer_subgraph, core_subgraph

In [None]:
# Scatter of SVD component 2 against component 3, coloured by binary label.
rcParams['figure.figsize'] = 20,10
plt.scatter(encoded_vecs.T[2], encoded_vecs.T[3], c=color_array, alpha=0.2)
plt.show()

In [None]:
# Attach the second-pass cluster assignment and membership probability to the
# noise-free frame, then compute the mean 0-6 label of every cluster (one
# entry per value in unique_labels, in its iteration order).
df_nonoise['cluster_label'] = labels
df_nonoise['probabilities'] = probs
avg_scores = [
    sum(members['label']) / len(members['label'])
    for members in (
        df_nonoise[df_nonoise['cluster_label'] == cluster_value]
        for cluster_value in unique_labels
    )
]



In [None]:
import math

def calculate_unweighted_density(core_mat):
    """Return the edge density of a bipartite matrix, ignoring edge weights.

    The result is ``nonzero_entries / (active_columns * active_rows + 1)``,
    where an active row/column is one with at least one non-zero entry. The
    ``+ 1`` keeps the denominator positive for an all-zero matrix.

    Parameters
    ----------
    core_mat : 2-D array
        Presumably ads as rows and n-grams as columns; the formula is
        symmetric in the two axes, so the orientation does not matter here.

    Returns
    -------
    float
    """
    nonzero_entries = np.count_nonzero(core_mat)
    active_columns = np.count_nonzero(np.count_nonzero(core_mat, axis=0))
    active_rows = np.count_nonzero(np.count_nonzero(core_mat, axis=1))
    return nonzero_entries / (active_columns * active_rows + 1)

def calculate_weighted_density(core_mat):
    """Return the weighted edge density of a bipartite matrix.

    The result is ``sum_of_entries / (active_columns * active_rows + 1)``,
    where an active row/column is one with at least one non-zero entry.

    Parameters
    ----------
    core_mat : 2-D array
        Presumably ads as rows and n-grams as columns; the formula is
        symmetric in the two axes.

    Returns
    -------
    numpy floating scalar
    """
    total_weight = np.sum(core_mat)
    active_columns = np.count_nonzero(np.count_nonzero(core_mat, axis=0))
    active_rows = np.count_nonzero(np.count_nonzero(core_mat, axis=1))
    return total_weight / (active_columns * active_rows + 1)

def calculate_unweighted_fraudar_score(core_mat):
    """Return the number of edges per active node, ignoring edge weights.

    The result is ``nonzero_entries / (active_columns + active_rows + 1)``,
    where an active row/column is one with at least one non-zero entry.

    Parameters
    ----------
    core_mat : 2-D array
        Presumably ads as rows and n-grams as columns; the formula is
        symmetric in the two axes.

    Returns
    -------
    float
    """
    nonzero_entries = np.count_nonzero(core_mat)
    active_columns = np.count_nonzero(np.count_nonzero(core_mat, axis=0))
    active_rows = np.count_nonzero(np.count_nonzero(core_mat, axis=1))
    return nonzero_entries / (active_columns + active_rows + 1)

def calculate_weighted_fraudar_score(core_mat):
    """Return the total edge weight per active node.

    The result is ``sum_of_entries / (active_columns + active_rows + 1)``,
    where an active row/column is one with at least one non-zero entry.

    Parameters
    ----------
    core_mat : 2-D array
        Presumably ads as rows and n-grams as columns; the formula is
        symmetric in the two axes.

    Returns
    -------
    numpy floating scalar
    """
    total_weight = np.sum(core_mat)
    active_columns = np.count_nonzero(np.count_nonzero(core_mat, axis=0))
    active_rows = np.count_nonzero(np.count_nonzero(core_mat, axis=1))
    return total_weight / (active_columns + active_rows + 1)

def calculate_unweighted_edge_per_score(core_mat, outer_mat):
    """Return the core-to-outer edge-count ratio, scaled by log of core size.

    The result is
    ``(core_edges + 1) / (outer_edges + 1) * log(active_rows + 1)``
    where edges are non-zero entries and ``active_rows`` is the number of
    rows of ``core_mat`` with at least one non-zero entry.

    NOTE(review): the count used in the logarithm is taken per row
    (``axis=1``); if rows are ads, this is the number of ads rather than the
    number of n-grams - confirm which one is intended.

    Parameters
    ----------
    core_mat, outer_mat : 2-D arrays

    Returns
    -------
    float
    """
    active_rows = np.count_nonzero(np.count_nonzero(core_mat, axis=1))
    edge_ratio = (np.count_nonzero(core_mat) + 1) / (np.count_nonzero(outer_mat) + 1)
    return edge_ratio * math.log(active_rows + 1)

def calculate_weighted_edge_per_score(core_mat, outer_mat):
    """Return the core-to-outer edge-weight ratio, scaled by log of core size.

    The result is
    ``(core_weight + 1) / (outer_weight + 1) * log(active_rows + 1)``
    where the weights are the sums of all entries and ``active_rows`` is the
    number of rows of ``core_mat`` with at least one non-zero entry.

    NOTE(review): the count used in the logarithm is taken per row
    (``axis=1``); if rows are ads, this is the number of ads rather than the
    number of n-grams - confirm which one is intended.

    Parameters
    ----------
    core_mat, outer_mat : 2-D arrays

    Returns
    -------
    numpy floating scalar
    """
    active_rows = np.count_nonzero(np.count_nonzero(core_mat, axis=1))
    weight_ratio = (np.sum(core_mat) + 1) / (np.sum(outer_mat) + 1)
    return weight_ratio * math.log(active_rows + 1)


def calculate_custom_score(core_mat, outer_mat):
    """Return a degree-based score for a core sub-matrix.

    With ``col_degrees`` the per-column non-zero counts of ``core_mat``,
    ``active_rows`` / ``active_columns`` the numbers of rows / columns having
    a non-zero entry, and ``outer_edges`` the non-zero count of
    ``outer_mat``, the result is::

        sum(col_degrees / active_rows) / (outer_edges + 1)
            * log(active_rows + 1) * log(active_columns + 1)

    When ``core_mat`` has no non-zero entry the division by ``active_rows``
    is a division by zero inside NumPy.

    Parameters
    ----------
    core_mat, outer_mat : 2-D arrays

    Returns
    -------
    numpy floating scalar
    """
    col_degrees = np.count_nonzero(np.asarray(core_mat), axis=0)
    active_rows = np.count_nonzero(np.count_nonzero(core_mat, axis=1))
    active_columns = np.count_nonzero(np.count_nonzero(core_mat, axis=0))
    outer_edges = np.count_nonzero(outer_mat)

    normalised_degree_sum = np.sum(col_degrees / active_rows)
    size_factor = (math.log(active_rows + 1)) * (math.log(active_columns + 1))
    return (normalised_degree_sum / (outer_edges + 1)) * (math.log(active_rows + 1)) * (math.log(active_columns + 1)) if False else (normalised_degree_sum / (outer_edges + 1)) * size_factor
# #     print (mat.shape)
#     edges_nonzero = np.count_nonzero(mat, axis=0)
#     unique, counts = np.unique(edges_nonzero, return_counts=True)
#     degree_counts = dict(zip(unique, counts))
#     numerator = 0.0
#     denominator = 0.0
#     half = max(mat.shape[0]/2, 2)
#     for k, v in degree_counts.items():
#         if k == 0:
#             continue
#         elif k <= half:
#             denominator += k*v
#         else:
#             denominator += k*v
#             numerator += k*v
#     if denominator == 0.0:
#         return 0.0
#     else:
#         return numerator/denominator

# def calculate_weighted_edge_per_score(mat):
#     return 0.0

#Should be shell_mat instead of outer_mat, change once you figure out how to get shell subgraph.
def calculate_unweighted_modularity_score(core_mat, outer_mat, total_edges):
    """Return a modularity-style score for the core, ignoring edge weights.

    For every non-zero entry (i, j) of ``core_mat`` the term
    ``1 - row_degree[i] * col_degree[j] / total_edges`` is accumulated, where
    the degrees are the non-zero counts of row i / column j of ``outer_mat``.
    The sum is divided by ``total_edges`` and multiplied by
    ``log(active_rows + 1)``, with ``active_rows`` the number of rows of
    ``core_mat`` that have a non-zero entry.

    The accumulation is vectorised over the non-zero entries instead of
    visiting every (row, column) combination in Python.

    Parameters
    ----------
    core_mat, outer_mat : 2-D arrays of identical shape
    total_edges : number
        Normalisation constant (the caller passes the non-zero count of the
        full matrix). Must be non-zero.

    Returns
    -------
    float
    """
    core = np.asarray(core_mat)
    outer = np.asarray(outer_mat)

    row_degrees = np.count_nonzero(outer, axis=1)
    col_degrees = np.count_nonzero(outer, axis=0)

    rows, cols = np.nonzero(core)
    expected = (row_degrees[rows] * col_degrees[cols]) / total_edges
    summation = float(np.sum(1.0 - expected))

    active_rows = np.unique(rows).size
    return (summation/total_edges)*(math.log(active_rows+1))

def calculate_weighted_modularity_score(core_mat, outer_mat, total_edges):
    """Return a modularity-style score for the core using edge weights.

    For every non-zero entry (i, j) of ``core_mat`` the term
    ``core[i, j] - row_weight[i] * col_weight[j] / total_edges`` is
    accumulated, where the weights are the sums of row i / column j of
    ``outer_mat``. The sum is divided by ``total_edges`` and multiplied by
    ``log(active_rows + 1)``, with ``active_rows`` the number of rows of
    ``core_mat`` that have a non-zero entry.

    The accumulation is vectorised over the non-zero entries instead of
    visiting every (row, column) combination in Python.

    Parameters
    ----------
    core_mat, outer_mat : 2-D arrays of identical shape
    total_edges : number
        Normalisation constant (the caller passes the sum of the full
        matrix). Must be non-zero.

    Returns
    -------
    float
    """
    core = np.asarray(core_mat)
    outer = np.asarray(outer_mat)

    row_weights = np.sum(outer, axis=1)
    col_weights = np.sum(outer, axis=0)

    rows, cols = np.nonzero(core)
    expected = (row_weights[rows] * col_weights[cols]) / total_edges
    summation = float(np.sum(core[rows, cols] - expected))

    active_rows = np.unique(rows).size
    return (summation/total_edges)*(math.log(active_rows+1))

def calculate_pairwise_modularity(mat):
    """Score every unordered pair of rows of a sparse matrix.

    For i < j, entry [i][j] of the result is the value returned by
    ``calculate_modularity_score`` for the two-row matrix made of row i and
    row j. The diagonal and the lower triangle stay 0.

    NOTE(review): ``calculate_modularity_score`` is not defined in the part
    of the file visible here - confirm it exists before calling this.

    Parameters
    ----------
    mat : sparse matrix exposing ``todense()``

    Returns
    -------
    numpy.ndarray of shape (n_rows, n_rows)
    """
    dense_rows = np.asarray(mat.todense())
    n_rows = dense_rows.shape[0]
    sim_scores = np.zeros((n_rows, n_rows))
    for first in range(n_rows):
        for second in range(first + 1, n_rows):
            pair = np.vstack((dense_rows[first], dense_rows[second]))
            sim_scores[first][second] = calculate_modularity_score(pair)

    return sim_scores

In [None]:
# Number of ads whose TF-IDF row sums to zero, checked again on the current
# bigram_matrix.
s = bigram_matrix.sum(axis=1)
print (s.shape)
print (len(np.argwhere(s==0)))

In [None]:
import gc
from sklearn.metrics.pairwise import cosine_similarity

def get_all_metrics(bigram_matrix, unique_labels, labels, df_data):
    eigen_ratios = []
    weighted_cluster_density = []
    unweighted_cluster_density = []
    unweighted_fraudar_scores = []
    weighted_fraudar_scores = []
    unweighted_outer_edge_perc_scores = []
    weighted_outer_edge_perc_scores = []
    unweighted_shell_edge_perc_scores = []
    weighted_shell_edge_perc_scores = []
    weighted_outer_modularity_scores = []
    unweighted_outer_modularity_scores = []
    weighted_shell_modularity_scores = []
    unweighted_shell_modularity_scores = []
    pairwise_similarity = []
    custom_score = []
    avg_label_scores = []
    max_label_scores = []
    sum_label_scores = []
    avg_binary_scores = []
    max_binary_scores = []
    sum_binary_scores = []
    clusters = []
    cluster_counts = []
    
    total_edges_unweighted = np.count_nonzero(bigram_matrix)
    total_edges_weighted = np.sum(bigram_matrix)
    for l in unique_labels:
#         s = bigram_matrix.sum(axis=1)
        if l== -1:
            weighted_cluster_density.append(0)
            unweighted_cluster_density.append(0)
            weighted_fraudar_scores.append(0)
            unweighted_fraudar_scores.append(0)
            weighted_outer_edge_perc_scores.append(0)
            unweighted_outer_edge_perc_scores.append(0)
            weighted_shell_edge_perc_scores.append(0)
            unweighted_shell_edge_perc_scores.append(0)
            unweighted_outer_modularity_scores.append(0)
            weighted_outer_modularity_scores.append(0)
            unweighted_shell_modularity_scores.append(0)
            weighted_shell_modularity_scores.append(0)
            pairwise_similarity.append(0)
            custom_score.append(0)
            cluster_counts.append(len(cluster_idx))
            eigen_ratios.append(0)
            clusters.append(l)
            
            max_label_scores.append(0)
            avg_label_scores.append(0)
            sum_label_scores.append(0)

            max_binary_scores.append(0)
            avg_binary_scores.append(0)
            sum_binary_scores.append(0)
            continue
#         print (s.shape)
#         print ("bigram matrix sum : {}".format(bigram_matrix.sum()))
#         print ("Zero elems: {}".format(len(np.argwhere(s==0))))
        cluster_idx = np.argwhere(labels == l).reshape(-1)
#         print (l, len(cluster_idx))
        
        print (cluster_idx)
        shell_subgraph, outer_subgraph, core_subgraph = get_all_subgraphs(bigram_matrix.copy(), cluster_idx)
#         print (l, len(cluster_idx), core_subgraph.sum(), outer_subgraph.sum(), shell_subgraph.sum())
        

        df_filt = df_data[df_data['cluster_label']== l]
        if len(df_filt) == 0 or core_subgraph.shape[0] == 0 or shell_subgraph.shape[0] == 0 or outer_subgraph.shape[0] == 0:
            weighted_cluster_density.append(0)
            unweighted_cluster_density.append(0)
            weighted_fraudar_scores.append(0)
            unweighted_fraudar_scores.append(0)
            weighted_outer_edge_perc_scores.append(0)
            unweighted_outer_edge_perc_scores.append(0)
            weighted_shell_edge_perc_scores.append(0)
            unweighted_shell_edge_perc_scores.append(0)
            unweighted_outer_modularity_scores.append(0)
            weighted_outer_modularity_scores.append(0)
            unweighted_shell_modularity_scores.append(0)
            weighted_shell_modularity_scores.append(0)
            pairwise_similarity.append(0)
            custom_score.append(0)
            cluster_counts.append(len(cluster_idx))
            eigen_ratios.append(0)
            clusters.append(l)
            
            max_label_scores.append(0)
            avg_label_scores.append(0)
            sum_label_scores.append(0)

            max_binary_scores.append(0)
            avg_binary_scores.append(0)
            sum_binary_scores.append(0)
            continue
#         elif core_subgraph.shape[0] == 0 or shell_subgraph.shape[0] == 0 or outer_subgraph.shape[0] == 0:
#             continue
        print (l, df_filt.shape)
        max_label_scores.append(max(df_filt['label']))
        avg_label_scores.append(sum(df_filt['label'])/len(df_filt['label']))
        sum_label_scores.append(sum(df_filt['label']))

        max_binary_scores.append(max(df_filt['binary_label']))
        avg_binary_scores.append(sum(df_filt['binary_label'])/len(df_filt['binary_label']))
        sum_binary_scores.append(sum(df_filt['binary_label']))
        
        local_content = list(df_filt['content_p'])
        count_vectorizer = TfidfVectorizer(ngram_range=(2,2), use_idf=False)
        count_data = count_vectorizer.fit_transform(local_content)

        svd = TruncatedSVD(n_components=2)
        local_vecs = svd.fit_transform(count_data)
        w = svd.singular_values_
        eigen_rat = w[1]/w[0]
        eigen_ratios.append(eigen_rat)
        
        print (outer_subgraph.shape)
        print (core_subgraph.shape)
        pairwise_sim_mat = cosine_similarity(outer_subgraph, dense_output=True)
        pairwise_sim_mat = np.tril(pairwise_sim_mat, -1)
#         print (sum(pairwise_sim_mat).shape)
        print (pairwise_sim_mat.sum())
        an_score = calculate_weighted_edge_per_score(core_subgraph, outer_subgraph)
        names = [x.lower() if type(x) == type('') else None for x in df_filt['Name'].unique()]
        if math.nan in names:
            names.remove(math.nan)
        if None in names:
            names.remove(None)
        names = list(set(names))
        c_score = an_score * max(0, len(names)-1)
        print ("Scores : {}, {}, {}".format(an_score, len(names), c_score))
        weighted_cluster_density.append(calculate_weighted_density(core_subgraph))
        unweighted_cluster_density.append(calculate_unweighted_density(core_subgraph))
        weighted_fraudar_scores.append(calculate_weighted_fraudar_score(core_subgraph))
        unweighted_fraudar_scores.append(calculate_unweighted_fraudar_score(core_subgraph))
        weighted_outer_edge_perc_scores.append(calculate_weighted_edge_per_score(core_subgraph, outer_subgraph))
        unweighted_outer_edge_perc_scores.append(calculate_unweighted_edge_per_score(core_subgraph, outer_subgraph))
        weighted_shell_edge_perc_scores.append(calculate_weighted_edge_per_score(core_subgraph, shell_subgraph))
        unweighted_shell_edge_perc_scores.append(calculate_unweighted_edge_per_score(core_subgraph, shell_subgraph))
        unweighted_outer_modularity_scores.append(calculate_unweighted_modularity_score(core_subgraph, outer_subgraph, total_edges_unweighted))
        weighted_outer_modularity_scores.append(calculate_weighted_modularity_score(core_subgraph, outer_subgraph, total_edges_weighted))
        unweighted_shell_modularity_scores.append(calculate_unweighted_modularity_score(core_subgraph, shell_subgraph, total_edges_unweighted))
        weighted_shell_modularity_scores.append(calculate_weighted_modularity_score(core_subgraph, shell_subgraph, total_edges_weighted))
        custom_score.append(c_score)
        pairwise_similarity.append(pairwise_sim_mat.sum()/len(cluster_idx))
        cluster_counts.append(len(cluster_idx))
        clusters.append(l)
        
        count_data = []
        local_content = []
        shell_subgraph = []
        core_subgraph = []
        outer_subgraph = []
        if l % 50 == 0:
            print (l)
            gc.collect()
#     original_labels = labels.copy()

    metrics = {}
    metrics['weighted_cluster_density'] = weighted_cluster_density
    metrics['unweighted_cluster_density'] = unweighted_cluster_density
    metrics['weighted_fraudar_scores'] = weighted_fraudar_scores
    metrics['unweighted_fraudar_scores'] = unweighted_fraudar_scores
    metrics['weighted_outer_edge_perc_scores'] = weighted_outer_edge_perc_scores
    metrics['unweighted_outer_edge_perc_scores'] = unweighted_outer_edge_perc_scores
    metrics['weighted_shell_edge_perc_scores'] = weighted_shell_edge_perc_scores
    metrics['unweighted_shell_edge_perc_scores'] = unweighted_shell_edge_perc_scores
    metrics['unweighted_outer_modularity_scores'] = unweighted_outer_modularity_scores
    metrics['weighted_outer_modularity_scores'] = weighted_outer_modularity_scores
    metrics['unweighted_shell_modularity_scores'] = unweighted_shell_modularity_scores
    metrics['weighted_shell_modularity_scores'] = weighted_shell_modularity_scores
    metrics['pairwise_similarity'] = pairwise_similarity
    metrics['custom_score'] = custom_score
    metrics['avg_label_scores'] = avg_label_scores
    metrics['sum_label_scores'] = sum_label_scores
    metrics['max_label_scores'] = max_label_scores
    metrics['avg_binary_scores'] = avg_binary_scores
    metrics['max_binary_scores'] = max_binary_scores
    metrics['sum_binary_scores'] = sum_binary_scores
    metrics['eigen_ratios'] = eigen_ratios
    metrics['clusters'] = clusters
    metrics['labels'] = labels.copy()
    metrics['cluster_counts'] = cluster_counts
    
    return metrics


In [None]:
# Sum the TF-IDF weights of every row, then report how many rows sum to exactly
# zero (i.e. rows of the matrix that carry no weight at all).
s = bigram_matrix.sum(axis=1)
print(s.shape)
print(np.count_nonzero(s == 0))

In [None]:
# Compute the metric lists for every label in `unique_labels` under the current
# `labels` assignment; the result is the baseline used by the later cells.
pre_merging_metrics = get_all_metrics(bigram_matrix, unique_labels, labels, df_nonoise)

In [None]:
# Show the highest custom score and the cluster label it belongs to.
pre_custom_scores = pre_merging_metrics['custom_score']
best_pre_custom_score = max(pre_custom_scores)
print(best_pre_custom_score)
print(pre_merging_metrics['clusters'][pre_custom_scores.index(best_pre_custom_score)])

In [None]:

# Count the distinct (case-insensitive) values of the 'Name' column among the
# rows assigned to cluster 375.
df_filt = df_nonoise[df_nonoise['cluster_label'] == 375]
# Only genuine strings are names. Filtering here drops NaN and every other
# non-string value; the previous remove-one-None approach let a None survive
# (and be counted) whenever more than one non-string value was present.
names = list({x.lower() for x in df_filt['Name'].unique() if isinstance(x, str)})
print(len(names))

In [None]:
# Row positions labelled 9, followed by the total weight those rows hold in
# the TF-IDF matrix (the bare expression is shown as the cell output).
cluster_idx = np.flatnonzero(labels == 9)
print(cluster_idx)
bigram_matrix[cluster_idx, :].sum()

In [None]:
# Largest weighted outer-edge score, then a scatter of that score against the
# average label score of each cluster.
pre_outer_edge_scores = pre_merging_metrics['weighted_outer_edge_perc_scores']
print(max(pre_outer_edge_scores))
plt.scatter(
    pre_merging_metrics['avg_label_scores'],
    pre_outer_edge_scores,
    alpha=0.2,
)
plt.show()

In [None]:
# Force a garbage-collection pass; the bare call's return value (the number of
# unreachable objects found) is displayed as the cell output.
import gc
gc.collect()

In [None]:
rcParams['figure.figsize'] = 5,5
clusters = pre_merging_metrics['clusters']
# Pin the eigen ratio of the -1 cluster to 1 so that it always exceeds the
# `> 0.8` eigen-ratio threshold applied when selecting clusters to re-run.
# Guarded so the cell still works when the clustering produced no -1 label
# (list.index would otherwise raise ValueError).
if -1 in clusters:
    pre_merging_metrics['eigen_ratios'][clusters.index(-1)] = 1

# plt.plot(pre_merging_metrics['eigen_ratios'])
# plt.show()

# Eigen ratio of each cluster against its average label score.
plt.scatter(pre_merging_metrics['avg_label_scores'], pre_merging_metrics['eigen_ratios'], alpha=0.2)
plt.show()

In [None]:
# Partition the cluster labels by eigen ratio: above 0.8 goes to
# `clusters_nonhomogenous`, everything else to `non_noisy_clusters`.
eigen_ratios = pre_merging_metrics['eigen_ratios']
eigen_np = np.asarray(eigen_ratios)
clusters_nonhomogenous_index = np.flatnonzero(eigen_np > 0.8)
clusters_nonhomogenous = [clusters[pos] for pos in clusters_nonhomogenous_index]
# print (clusters_nonhomogenous)
nonhomogenous_lookup = set(clusters_nonhomogenous)
non_noisy_clusters = [label for label in clusters if label not in nonhomogenous_lookup]

In [None]:
# Sanity check: dimensions of the TF-IDF matrix before selecting rows to re-cluster.
print (bigram_matrix.shape)

In [None]:
# Gather the row positions of every ad that sits in a non-homogeneous cluster,
# grouped cluster by cluster, and slice those rows out of the TF-IDF matrix.
# The order of `rerun_cluster_idx` defines the row order of `rerun_bigram_data`.
original_labels = pre_merging_metrics['labels'].copy()
rerun_cluster_idx = [
    row_pos
    for noisy_label in clusters_nonhomogenous
    for row_pos in np.argwhere(original_labels == noisy_label).reshape(-1)
]
print(len(rerun_cluster_idx))
rerun_bigram_data = bigram_matrix[rerun_cluster_idx, :]

In [None]:
# Reduce the selected rows to 64 SVD components before re-clustering.
# NOTE(review): no random_state is passed, so repeated runs may not reproduce
# the same components — confirm whether reproducibility matters here.
svd = TruncatedSVD(n_components=64)
encoded_vecs_rerun = svd.fit_transform(rerun_bigram_data)

In [None]:
# Embed the SVD vectors with UMAP (default settings); the first two output
# columns are used for plotting in the next cell.
import umap
X_embedded_rerun = umap.UMAP().fit_transform(encoded_vecs_rerun)

In [None]:
# Scatter plot of the UMAP embedding of the re-clustered ads, with all axis
# ticks and tick labels hidden.
rcParams['figure.figsize'] = 20,10
plt.scatter(X_embedded_rerun.T[0], X_embedded_rerun.T[1], alpha=0.1)
# plt.title("Clustering accuracy={}, fmeasure_synth={}, number_of_labels={}".format(clustering_acc, fmeasure, 
#                                                                                   len(unique_labels)))
plt.tick_params(
    axis='both',        # apply to both the x- and y-axis
    which='both',       # both major and minor ticks are affected
    bottom=False,       # ticks along the bottom edge are off
    top=False,          # ticks along the top edge are off
    left=False,         # ticks along the left edge are off
    right=False,        # ticks along the right edge are off
    labelbottom=False,  # no tick labels under the x-axis
    # These three used to be the string 'off', which is truthy and therefore
    # switches the ticks/labels ON in current matplotlib; booleans are required.
    labelleft=False,
)
plt.text(15, 15, 'TLDetect', fontsize=24)
# plt.savefig('../results/embedding_scatter_trafficking10k.png')
plt.show()

In [None]:
# Re-fit the existing clusterer on the SVD vectors of the non-homogeneous
# clusters and keep its labels and membership probabilities.
clusterer.fit(encoded_vecs_rerun)
#     print (clusterer.labels_)
rerun_clustering_labels = clusterer.labels_
rerun_probs = clusterer.probabilities_

# Summary values for the re-run. These previously read the pre-existing
# `labels` array, so they described the old clustering instead of this fit.
rerun_labels_clustering = rerun_clustering_labels
rerun_unique_labels = set(rerun_clustering_labels)
# The label -1 is not counted as a cluster.
rerun_n_clusters_ = len(rerun_unique_labels) - (1 if -1 in rerun_unique_labels else 0)
# Number of points that received the -1 label in the re-run.
n_rerun_ = list(rerun_clustering_labels).count(-1)
#     colors = [plt.cm.Spectral(each)
#               for each in np.linspace(0, 1, len(set(true_labels))]
print ("Number of labels : " + str(len(rerun_unique_labels)))
#     palette = sns.color_palette()
#     palette = sns.color_palette()

In [None]:
# Colour every point of the embedding: points whose label is not -1 are drawn
# red with alpha 0.1, all remaining points black with alpha 0.01.
#
# The row positions are taken in one vectorised pass. The earlier version
# re-scanned `labels` once per ad and appended the whole cluster each time,
# which was quadratic and produced a list full of duplicates; the set of
# highlighted rows is the same.
clustered_positions = np.flatnonzero(np.asarray(labels) != -1)
anomaly_indices = list(clustered_positions)

rgba_colors = np.zeros((len(binary_true_labels), 4))  # RGBA, red channel starts at 0
rgba_colors[:, 3] = 0.01
rgba_colors[clustered_positions, 0] = 1
rgba_colors[clustered_positions, 3] = 0.1

rcParams['figure.figsize'] = 20,10
plt.scatter(X_embedded.T[0], X_embedded.T[1], color=rgba_colors)

In [None]:
# Write the re-run labels back into `original_labels`. Re-run cluster ids are
# shifted past the largest existing label so they cannot collide with it; ads
# labelled -1 by the re-run stay -1. Probabilities are overwritten in both cases.
max_clusters = max(pre_merging_metrics['clusters'])
# labels = pre_merging_metrics['labels']
rerun_clusters = []
for ind, ad in enumerate(rerun_cluster_idx):
    rerun_label = rerun_clustering_labels[ind]
    if rerun_label == -1:
        shifted_label = -1
    else:
        shifted_label = rerun_label + max_clusters + 1
    original_labels[ad] = shifted_label
    probs[ad] = rerun_probs[ind]
    rerun_clusters.append(shifted_label)

# Distinct labels produced by the re-run, followed by the untouched clusters.
rerun_clusters = list(set(rerun_clusters))
clusters = rerun_clusters + non_noisy_clusters
print(clusters)

In [None]:
# Store the updated labels and membership probabilities on the dataframe.
# assumes both arrays are positionally aligned with df_nonoise's rows — TODO confirm
df_nonoise['cluster_label'] = original_labels
df_nonoise['probabilities'] = probs

In [None]:

# Compute metrics only for the labels produced by the re-run; entries of the
# result are looked up later by position in `rerun_clusters`.
post_splitting_metrics = get_all_metrics(bigram_matrix, rerun_clusters, original_labels, df_nonoise)

In [None]:
# Show the highest custom score among the re-run clusters and its cluster label.
split_custom_scores = post_splitting_metrics['custom_score']
best_split_custom_score = max(split_custom_scores)
print(best_split_custom_score)
print(post_splitting_metrics['clusters'][split_custom_scores.index(best_split_custom_score)])

In [None]:

# Count the distinct (case-insensitive) values of the 'Name' column among the
# rows assigned to cluster 485.
df_filt = df_nonoise[df_nonoise['cluster_label'] == 485]
# Only genuine strings are names. Filtering here drops NaN and every other
# non-string value; the previous remove-one-None approach let a None survive
# (and be counted) whenever more than one non-string value was present.
names = list({x.lower() for x in df_filt['Name'].unique() if isinstance(x, str)})
print(len(names))

In [None]:
# Assemble one metrics dict covering every label in `clusters`: labels created
# by the re-run take their values from `post_splitting_metrics`, labels that
# were left untouched take theirs from `pre_merging_metrics`.
eigen_ratios = []
weighted_cluster_density = []
unweighted_cluster_density = []
unweighted_fraudar_scores = []
weighted_fraudar_scores = []
unweighted_outer_edge_perc_scores = []
weighted_outer_edge_perc_scores = []
unweighted_shell_edge_perc_scores = []
weighted_shell_edge_perc_scores = []
weighted_outer_modularity_scores = []
unweighted_outer_modularity_scores = []
weighted_shell_modularity_scores = []
unweighted_shell_modularity_scores = []
pairwise_similarity = []
custom_score = []
avg_label_scores = []
max_label_scores = []
sum_label_scores = []
avg_binary_scores = []
max_binary_scores = []
sum_binary_scores = []
cluster_counts = []
print (len(rerun_clusters))
print (len(post_splitting_metrics['eigen_ratios']))

# Metric key -> the list declared above. Both source dicts use these same keys,
# so a single loop can copy every metric for a cluster.
per_cluster_lists = {
    'weighted_cluster_density': weighted_cluster_density,
    'unweighted_cluster_density': unweighted_cluster_density,
    'weighted_fraudar_scores': weighted_fraudar_scores,
    'unweighted_fraudar_scores': unweighted_fraudar_scores,
    'weighted_outer_edge_perc_scores': weighted_outer_edge_perc_scores,
    'unweighted_outer_edge_perc_scores': unweighted_outer_edge_perc_scores,
    'weighted_shell_edge_perc_scores': weighted_shell_edge_perc_scores,
    'unweighted_shell_edge_perc_scores': unweighted_shell_edge_perc_scores,
    'unweighted_outer_modularity_scores': unweighted_outer_modularity_scores,
    'weighted_outer_modularity_scores': weighted_outer_modularity_scores,
    'unweighted_shell_modularity_scores': unweighted_shell_modularity_scores,
    'weighted_shell_modularity_scores': weighted_shell_modularity_scores,
    'pairwise_similarity': pairwise_similarity,
    'custom_score': custom_score,
    'avg_label_scores': avg_label_scores,
    'sum_label_scores': sum_label_scores,
    'max_label_scores': max_label_scores,
    'avg_binary_scores': avg_binary_scores,
    'max_binary_scores': max_binary_scores,
    'sum_binary_scores': sum_binary_scores,
    'eigen_ratios': eigen_ratios,
    'cluster_counts': cluster_counts,
}

for i in clusters:
    if i in rerun_clusters:
        source = post_splitting_metrics
        ind = rerun_clusters.index(i)
    elif i in non_noisy_clusters:
        source = pre_merging_metrics
        ind = pre_merging_metrics['clusters'].index(i)
    else:
        continue
    # Every metric is read at position `ind`. `cluster_counts` for untouched
    # clusters used to be indexed with the cluster label `i` instead, which
    # picked up the count of an unrelated cluster.
    for key, values in per_cluster_lists.items():
        values.append(source[key][ind])

# Same keys, in the same order, as before.
metrics = {key: values for key, values in per_cluster_lists.items() if key != 'cluster_counts'}
metrics['clusters'] = clusters
# NOTE(review): this stores `labels`, not the updated `original_labels` that
# was written to df_nonoise['cluster_label'] — confirm which one is intended.
metrics['cluster_label'] = labels.copy()
metrics['cluster_counts'] = cluster_counts

post_noisy_split_metrics = metrics
print (len(post_noisy_split_metrics['eigen_ratios']))
print (len(clusters))

In [None]:
# Show the label array stored alongside the stitched metrics.
print (post_noisy_split_metrics['cluster_label'])

In [None]:
# Side-by-side scatter plots of average label score against one metric,
# before the re-run (left) and after it (right). One figure per metric.
rerun_comparisons = [
    (
        'eigen_ratios',
        "Before Rerun - Avg score vs Eigen Ratios",
        "After Rerun - Avg score vs Eigen Ratios",
    ),
    (
        'weighted_outer_edge_perc_scores',
        "Before Rerun - Avg score vs Weighted Outer Edge Percentage",
        "After Rerun - Avg score vs Weighted Outer Edge Percentage",
    ),
    (
        'weighted_fraudar_scores',
        "Before Rerun - Avg score vs Weighted Fraudar Score",
        "After Rerun - Avg score vs Weighted Fraudar Score",
    ),
    (
        'weighted_outer_modularity_scores',
        "Before Rerun - Avg score vs Weighted outer Modularity",
        "After Rerun - Avg score vs Weighted Outer modularity",
    ),
]

for metric_key, before_title, after_title in rerun_comparisons:
    ax1 = plt.subplot(1, 2, 1)
    ax2 = plt.subplot(1, 2, 2)

    ax1.scatter(
        pre_merging_metrics['avg_label_scores'],
        pre_merging_metrics[metric_key],
        alpha=0.2,
    )
    ax1.title.set_text(before_title)
    ax2.scatter(
        post_noisy_split_metrics['avg_label_scores'],
        post_noisy_split_metrics[metric_key],
        alpha=0.2,
    )
    ax2.title.set_text(after_title)

    plt.show()


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Represent each cluster by the mean TF-IDF row of its members, then compute
# the cosine similarity between every pair of cluster vectors.
cluster_labels_results = df_nonoise['cluster_label']
n_features = bigram_matrix.shape[1]
cluster_vectors = np.zeros((len(clusters), n_features))
for ind, cl in enumerate(clusters):
    member_mask = np.asarray(cluster_labels_results == cl)
    cluster_idx = np.flatnonzero(member_mask)
    tf_cluster_mat = bigram_matrix[cluster_idx, :]
    tf_cluster_mat_flat = tf_cluster_mat.mean(axis=0)
    print(tf_cluster_mat_flat.shape)
    cluster_vectors[ind] = tf_cluster_mat_flat

pairwise_sim_mat = cosine_similarity(cluster_vectors, dense_output=True)

In [None]:
# Keep only the strictly-lower triangle so each pair of clusters is seen once
# and a cluster is never paired with itself (done in place).
pairwise_sim_mat[np.triu_indices_from(pairwise_sim_mat)] = 0.0

# Pairs whose cosine similarity exceeds 0.8, expressed as cluster labels.
sim_clusters = np.nonzero(pairwise_sim_mat > 0.8)
cluster_tuples = zip(*sim_clusters)
clusters_to_be_merged = [
    (clusters[row_pos], clusters[col_pos]) for row_pos, col_pos in cluster_tuples
]
print(clusters_to_be_merged)

In [None]:
# For every candidate pair, compare the weighted outer-edge score of each
# cluster on its own with the score of the cluster obtained by merging them.
# Nothing is written back here; this cell only prints diagnostics.
pre_clusters = np.array(cluster_labels_results)
print (bigram_matrix.shape)
# Loop-invariant, so computed once rather than on every iteration.
total_edges = np.sum(bigram_matrix)
for cl in clusters_to_be_merged:
    # Hypothetical labelling in which cl[1] has been folded into cl[0].
    np_clusters = pre_clusters.copy()
    np_clusters[np_clusters == cl[1]] = cl[0]
#     adj_matrix_copy = bigram_matrix.copy()
    pre_cl_ind1 = np.flatnonzero(pre_clusters == cl[0])
    pre_cl_ind2 = np.flatnonzero(pre_clusters == cl[1])
    print (pre_cl_ind1)
    # A fresh copy of the matrix is handed to every call.
    # NOTE(review): presumably because get_all_subgraphs modifies its argument — confirm.
    pre_shell1, pre_outer1, pre_core1 = get_all_subgraphs(bigram_matrix.copy(), pre_cl_ind1)
    pre_shell2, pre_outer2, pre_core2 = get_all_subgraphs(bigram_matrix.copy(), pre_cl_ind2)
    mod1 = calculate_weighted_edge_per_score(pre_core1, pre_outer1)
    mod2 = calculate_weighted_edge_per_score(pre_core2, pre_outer2)
    print (np.sum(pre_shell1))
    post_cl_ind = np.flatnonzero(np_clusters == cl[0])
    post_shell, post_outer, post_core = get_all_subgraphs(bigram_matrix.copy(), post_cl_ind)
    post_mod = calculate_weighted_edge_per_score(post_core, post_outer)
    
    print ("Cluster1: {}, Cluster2: {}, Pre mod1: {}, Pre Mod2: {}, Post Mod: {}".format(cl[0], cl[1], mod1, mod2, post_mod))

In [None]:
# Merge every pair in `clusters_to_be_merged` (the second label is folded into
# the first), drop the metrics of the absorbed clusters, recompute the metrics
# of the clusters that grew, and collect everything in `post_merging_metric`.
#
# The names below are aliases of the lists inside `post_noisy_split_metrics`
# (and `clusters` is the list stored there too), so the deletions further down
# also change that dict.
eigen_ratios = post_noisy_split_metrics['eigen_ratios']
weighted_cluster_density = post_noisy_split_metrics['weighted_cluster_density']
unweighted_cluster_density = post_noisy_split_metrics['unweighted_cluster_density']
unweighted_fraudar_scores = post_noisy_split_metrics['unweighted_fraudar_scores']
weighted_fraudar_scores = post_noisy_split_metrics['weighted_fraudar_scores']
unweighted_outer_edge_perc_scores = post_noisy_split_metrics['unweighted_outer_edge_perc_scores']
weighted_outer_edge_perc_scores = post_noisy_split_metrics['weighted_outer_edge_perc_scores']
unweighted_shell_edge_perc_scores = post_noisy_split_metrics['unweighted_shell_edge_perc_scores']
weighted_shell_edge_perc_scores = post_noisy_split_metrics['weighted_shell_edge_perc_scores']
weighted_outer_modularity_scores = post_noisy_split_metrics['weighted_outer_modularity_scores']
unweighted_outer_modularity_scores = post_noisy_split_metrics['unweighted_outer_modularity_scores']
weighted_shell_modularity_scores = post_noisy_split_metrics['weighted_shell_modularity_scores']
unweighted_shell_modularity_scores = post_noisy_split_metrics['unweighted_shell_modularity_scores']
pairwise_similarity = post_noisy_split_metrics['pairwise_similarity']
custom_score = post_noisy_split_metrics['custom_score']
avg_label_scores = post_noisy_split_metrics['avg_label_scores']
max_label_scores = post_noisy_split_metrics['max_label_scores']
sum_label_scores = post_noisy_split_metrics['sum_label_scores']
avg_binary_scores = post_noisy_split_metrics['avg_binary_scores']
max_binary_scores = post_noisy_split_metrics['max_binary_scores']
sum_binary_scores = post_noisy_split_metrics['sum_binary_scores']
cluster_counts = post_noisy_split_metrics['cluster_counts']
df_a = df_nonoise.copy()
# labels = df_data['cluster_label']

# Metric key -> per-cluster list, so deletions and refreshes are one loop each.
per_cluster_lists = {
    'weighted_cluster_density': weighted_cluster_density,
    'unweighted_cluster_density': unweighted_cluster_density,
    'weighted_fraudar_scores': weighted_fraudar_scores,
    'unweighted_fraudar_scores': unweighted_fraudar_scores,
    'weighted_outer_edge_perc_scores': weighted_outer_edge_perc_scores,
    'unweighted_outer_edge_perc_scores': unweighted_outer_edge_perc_scores,
    'weighted_shell_edge_perc_scores': weighted_shell_edge_perc_scores,
    'unweighted_shell_edge_perc_scores': unweighted_shell_edge_perc_scores,
    'unweighted_outer_modularity_scores': unweighted_outer_modularity_scores,
    'weighted_outer_modularity_scores': weighted_outer_modularity_scores,
    'unweighted_shell_modularity_scores': unweighted_shell_modularity_scores,
    'weighted_shell_modularity_scores': weighted_shell_modularity_scores,
    'pairwise_similarity': pairwise_similarity,
    'custom_score': custom_score,
    'avg_label_scores': avg_label_scores,
    'sum_label_scores': sum_label_scores,
    'max_label_scores': max_label_scores,
    'avg_binary_scores': avg_binary_scores,
    'max_binary_scores': max_binary_scores,
    'sum_binary_scores': sum_binary_scores,
    'eigen_ratios': eigen_ratios,
    'cluster_counts': cluster_counts,
}

to_calculate_clusters = []
for tup in clusters_to_be_merged:
    # A label that was already absorbed by an earlier pair is skipped,
    # e.g. (a, b) followed by (c, b).
    if tup[1] in clusters:
        # Assign the result back instead of calling replace(..., inplace=True)
        # on the selected column: the chained in-place form does not reliably
        # update the dataframe (it is a no-op under pandas Copy-on-Write).
        df_nonoise['cluster_label'] = df_nonoise['cluster_label'].replace(tup[1], tup[0])
#         print (df_a['cluster_label'].unique())

        ind = clusters.index(tup[1])
        for values in per_cluster_lists.values():
            del values[ind]
        del clusters[ind]
        if tup[1] in to_calculate_clusters:
            to_calculate_clusters.remove(tup[1])
        # Queue each surviving cluster once. A duplicate entry could outlive
        # the single `remove` above and later fail in `clusters.index`.
        if tup[0] not in to_calculate_clusters:
            to_calculate_clusters.append(tup[0])

# Point `labels` at the merged label column.
labels = df_nonoise['cluster_label']

metrics = get_all_metrics(bigram_matrix, to_calculate_clusters, df_nonoise['cluster_label'], df_nonoise)
for i, cl in enumerate(to_calculate_clusters):
    ind = clusters.index(cl)
    # Take every refreshed value from the freshly computed `metrics`.
    # Previously only eigen_ratios did; the other metrics were read from
    # post_noisy_split_metrics at position i, i.e. stale values of some
    # unrelated cluster.
    for key, values in per_cluster_lists.items():
        values[ind] = metrics[key][i]

# Same keys, in the same order, as before.
post_merging_metric = {key: values for key, values in per_cluster_lists.items() if key != 'cluster_counts'}
post_merging_metric['clusters'] = clusters
post_merging_metric['cluster_label'] = df_nonoise['cluster_label'].copy()
post_merging_metric['cluster_counts'] = cluster_counts

In [None]:
# Sanity check: number of tracked clusters, number of distinct labels in the
# dataframe, and length of one metric list.
print (len(post_merging_metric['clusters']), df_nonoise['cluster_label'].nunique(), len(post_merging_metric['eigen_ratios']))

In [None]:
# Re-index df_nonoise by its 'index1' column (in place). The join in the next
# cell aligns on this index.
# NOTE(review): presumably 'index1' holds the original df_orig row index — confirm.
# Running this cell twice raises KeyError because the column is consumed.
df_nonoise.set_index('index1', inplace=True)



In [None]:
# df_a = df_orig.copy()
# Copy the cluster labels into 'final_label' and attach that column to df_orig
# by index. Rows of df_orig with no match in df_nonoise get NaN.
df_nonoise['final_label'] = df_nonoise['cluster_label']
df_orig = df_orig.join(df_nonoise['final_label'], how='outer')



In [None]:
# Number of rows that did not receive a label from the join (cell output).
df_orig['final_label'].isna().sum()



In [None]:
# Shape of the subset of rows flagged as noise.
print (df_orig[df_orig['noise'] == True].shape)


In [None]:
# Fill in 'final_label' row by row, in index order: rows flagged as noise get
# -2; otherwise rows flagged by 'sim_check' copy whatever label the row named
# in 'sim_index' currently has. The lookup reads the live dataframe, so the
# result depends on the order in which rows are visited.
row_flags = zip(df_orig.index, df_orig['noise'], df_orig['sim_check'], df_orig['sim_index'])
for ind, is_noise, is_similar, similar_to in row_flags:
    if is_noise == True:
        df_orig.at[ind, 'final_label'] = -2
        continue
    if is_similar == True:
        df_orig.at[ind, 'final_label'] = df_orig.at[similar_to, 'final_label']
    

In [None]:
# Second, identical labelling pass over df_orig: noise rows get -2, and rows
# flagged by 'sim_check' copy the current label of the row named in 'sim_index'.
# Because the lookup reads the live dataframe, repeating the pass lets a row
# pick up a label that its 'sim_index' row only received during the earlier pass.
# NOTE(review): presumably intentional — confirm this is not an accidental duplicate cell.
for ind, row in df_orig.iterrows():
    if row['noise'] == True:
        df_orig.at[ind, 'final_label'] = -2
    elif row['sim_check'] == True:
        df_orig.at[ind, 'final_label'] = df_orig.at[row['sim_index'], 'final_label']
    

In [None]:
# For every cluster, print its label and the number of distinct
# (case-insensitive) values of the 'Name' column among its rows.
for cl in clusters:
    df_filt = df_orig[df_orig['final_label'] == cl]
    # Only genuine strings are names. Filtering here drops NaN and every other
    # non-string value; the previous remove-one-None approach let a None
    # survive (and be counted) whenever more than one non-string value was present.
    names = list({x.lower() for x in df_filt['Name'].unique() if isinstance(x, str)})
    print (cl, len(names))
# print (post_merging_metric['cluster_counts'][post_merging_metric['custom_score'].index(max(post_merging_metric['custom_score']))])

In [None]:
# Label of the cluster with the highest custom score after merging.
print (post_merging_metric['clusters'][post_merging_metric['custom_score'].index(max(post_merging_metric['custom_score']))])

In [None]:
# Print the distinct (case-insensitive) 'Name' values of cluster 375, then the
# body text of every ad in that cluster.
df_filt = df_orig[df_orig['final_label'] == 375]

# Only genuine strings are names. Filtering here drops NaN and every other
# non-string value; the previous remove-one-None approach let a None survive
# whenever more than one non-string value was present.
names = list({x.lower() for x in df_filt['Name'].unique() if isinstance(x, str)})
print (names)
for body in df_filt['body']:
    print ('------------------------------------------------------------')
    print (body)

In [None]:
# custom_score  = post_merging_metric['custom_score']
# multiplier = (max(custom_score) - min(custom_score))
# custom_score = list(map(lambda x: x/multiplier, custom_score))
# # custom_score = list(map(lambda x: x*2, custom_score))
# post_merging_metric['c_score'] = custom_score

In [None]:
# Short aliases for the post-merge metric lists used by the plotting cells.
clusters = post_merging_metric['clusters']

# Label-based and text-based scores.
avg_label_scores = post_merging_metric['avg_label_scores']
avg_binary_scores = post_merging_metric['avg_binary_scores']
text_similarity = post_merging_metric['pairwise_similarity']

# Weighted ("w_") and unweighted ("uw_") graph metrics.
w_density = post_merging_metric['weighted_cluster_density']
uw_density = post_merging_metric['unweighted_cluster_density']
w_fraudar = post_merging_metric['weighted_fraudar_scores']
uw_fraudar = post_merging_metric['unweighted_fraudar_scores']
w_outer_edge = post_merging_metric['weighted_outer_edge_perc_scores']
uw_outer_edge = post_merging_metric['unweighted_outer_edge_perc_scores']
w_shell_edge = post_merging_metric['weighted_shell_edge_perc_scores']
uw_shell_edge = post_merging_metric['unweighted_shell_edge_perc_scores']
w_outer_mod = post_merging_metric['weighted_outer_modularity_scores']
uw_outer_mod = post_merging_metric['unweighted_outer_modularity_scores']
w_shell_mod = post_merging_metric['weighted_shell_modularity_scores']
uw_shell_mod = post_merging_metric['unweighted_shell_modularity_scores']

# The "suspicious" score is the weighted outer-edge score under another name
# (the same list object as w_outer_edge).
suspicious_scores = post_merging_metric['weighted_outer_edge_perc_scores']

In [None]:
# Spot check: score and label of the first tracked cluster.
print (suspicious_scores[0], clusters[0])

In [None]:
# df_filt = df_orig[df_orig['cluster_label'] == 322]

# names = [x.lower() if type(x) == type('') else None for x in df_filt['Name'].unique()]
# if math.nan in names:
#     names.remove(math.nan)
# if None in names:
#     names.remove(None)
# names = list(set(names))
# print (names)
# for ind, row in df_filt.iterrows():
#     print ('------------------------------------------------------------')
#     print (row['body'])

In [None]:
import seaborn as sns


# LOWESS-smoothed scatter plots over all clusters.
# Each panel is (x values, y values, x label, y label, interval keyword, scatter styling).
all_cluster_panels = [
    (avg_label_scores, suspicious_scores, 'Average Label Score', 'anomaly score',
     {'x_ci': 68}, {'alpha': 0.2}),
    (avg_label_scores, w_outer_edge, 'Average Label Score', 'Weighted outer edge percentage',
     {'x_ci': 68}, {'alpha': 0.2}),
    (text_similarity, w_outer_edge, 'Text Similarity', 'Weighted outer edge percentage',
     {'ci': 68}, {'color': 'b', 'alpha': 0.2}),
    (avg_label_scores, text_similarity, 'Average Label Score', 'Text Similarity',
     {'ci': 68}, {'color': 'b', 'alpha': 0.2}),
]
for panel_x, panel_y, panel_xlabel, panel_ylabel, panel_ci_kw, panel_scatter_kw in all_cluster_panels:
    ax = sns.regplot(x=panel_x, y=panel_y, truncate=False, lowess=True,
                     scatter_kws=panel_scatter_kw, line_kws={'color': 'red'}, **panel_ci_kw)
    ax.set(xlabel=panel_xlabel, ylabel=panel_ylabel)
    plt.show()
# import seaborn as sns
# ax = sns.regplot(x=avg_label_scores, y=w_outer_edge, ci=68, truncate=False, lowess=True, scatter_kws = {'color': 'b', 'alpha': 0.2}, line_kws = {'color': 'red'})
# ax.set(xlabel='Average Label Score', ylabel='Weighted outer edge percentage')
# plt.show()

# import seaborn as sns
# ax = sns.regplot(x=avg_label_scores, y=w_outer_edge, ci=68, truncate=False, lowess=True, scatter_kws = {'color': 'b', 'alpha': 0.2}, line_kws = {'color': 'red'})
# ax.set(xlabel='Average Label Score', ylabel='Weighted outer edge percentage')
# plt.show()

# import seaborn as sns
# ax = sns.regplot(x=avg_label_scores, y=w_outer_edge, ci=68, truncate=False, lowess=True, scatter_kws = {'color': 'b', 'alpha': 0.2}, line_kws = {'color': 'red'})
# ax.set(xlabel='Average Label Score', ylabel='Weighted outer edge percentage')
# plt.show()

# import seaborn as sns
# ax = sns.regplot(x=avg_label_scores, y=w_outer_edge, ci=68, truncate=False, lowess=True, scatter_kws = {'color': 'b', 'alpha': 0.2}, line_kws = {'color': 'red'})
# ax.set(xlabel='Average Label Score', ylabel='Weighted outer edge percentage')
# plt.show()


# plt.scatter(avg_label_scores, w_outer_edge, alpha=0.2)
# plt.title('Avg label score vs weighted outer edge percentage')
# plt.xlabel('avg label score')
# plt.ylabel('weighted outer edge percentage')
# # w = np.linalg.lstsq(avg_label_scores, w_outer_edge)[0]
# # yh = np.dot(avg_label_scores,w)
# # plt.plot(avg_label_scores, yh, 'r-')
# plt.show()

# plt.scatter(avg_label_scores, w_shell_edge, alpha=0.2)
# plt.title('Avg label score vs weighted shell edge percentage')
# plt.xlabel('avg label score')
# plt.ylabel('weighted shell edge percentage')
# plt.show()

# plt.scatter(text_similarity, w_outer_edge, alpha=0.2)
# plt.title('Avg text similarity vs weighted outer edge percentage')
# plt.xlabel('avg text similarity')
# plt.ylabel('weighted outer edge percentage')

# # plt.scatter(avg_label_scores, w_shell_edge, alpha=0.2)
# # plt.title('Avg label score vs weighted shell edge percentage')
# # plt.xlabel('avg label score')
# # plt.ylabel('weighted shell edge percentage')
# plt.show()

In [None]:
def filter_by_name(clusters, df_data):
    """Return the clusters whose rows carry more than one distinct name.

    Parameters
    ----------
    clusters : iterable
        Cluster labels to test; each is compared with ``df_data['cluster_label']``.
    df_data : pandas.DataFrame
        Frame providing the ``cluster_label`` and ``Name`` columns.

    Returns
    -------
    list
        The labels from ``clusters``, in their original order (duplicates
        kept), for which ``Name`` holds at least two different strings.
        Names are compared case-insensitively.
    """
    filtered_clusters = []
    for cl in clusters:
        cluster_names = df_data.loc[df_data['cluster_label'] == cl, 'Name']
        # Only real strings count as names. NaN, None and other non-string
        # values are treated as missing, so they never inflate the count.
        distinct_names = {
            name.lower() for name in cluster_names.unique() if isinstance(name, str)
        }
        if len(distinct_names) > 1:
            filtered_clusters.append(cl)
    return filtered_clusters

def filter_by_threshold(clusters, metric, threshold=0.0):
    """Return the clusters whose metric value is strictly above ``threshold``.

    ``clusters`` and ``metric`` are read in parallel: the entry at position
    ``i`` of ``metric`` decides whether ``clusters[i]`` is kept. The order of
    ``clusters`` is preserved in the result.
    """
    kept = []
    for position, value in enumerate(metric):
        if value > threshold:
            kept.append(clusters[position])

    return kept

In [None]:
# suspicious_scores = post_merging_metric['weighted_outer_edge_perc_scores']
# text_similarity = post_merging_metric['pairwise_similarity']
# clusters = post_merging_metric['clusters']
# avg_label_scores = post_merging_metric['avg_label_scores']
# avg_binary_scores = post_merging_metric['avg_binary_scores']
# w_density = post_merging_metric['weighted_cluster_density']
# uw_density = post_merging_metric['unweighted_cluster_density']
# w_fraudar = post_merging_metric['weighted_fraudar_scores']
# uw_fraudar = post_merging_metric['unweighted_fraudar_scores']
# w_outer_edge = post_merging_metric['weighted_outer_edge_perc_scores']
# uw_outer_edge = post_merging_metric['unweighted_outer_edge_perc_scores']
# w_shell_edge = post_merging_metric['weighted_shell_edge_perc_scores']
# uw_shell_edge = post_merging_metric['unweighted_shell_edge_perc_scores']
# w_outer_mod = post_merging_metric['weighted_outer_modularity_scores']
# uw_outer_mod = post_merging_metric['unweighted_outer_modularity_scores']
# w_shell_mod = post_merging_metric['weighted_shell_modularity_scores']
# uw_shell_mod = post_merging_metric['unweighted_shell_modularity_scores']

In [None]:
# Keep only the clusters returned by filter_by_name and narrow every metric
# list to the same positions.
filtered_clusters = filter_by_name(clusters, df_orig.copy())
print (filtered_clusters)
selected_clusters = set(filtered_clusters)  # set only for fast membership tests
index_list = [i for i, e in enumerate(clusters) if e in selected_clusters]
# filtered_clusters must be a list in index_list order: later cells zip it
# positionally with the other filtered_* lists, and a set has no such order.
filtered_clusters = [clusters[i] for i in index_list]
filtered_avg_label_scores = [avg_label_scores[i] for i in index_list]
filtered_text_similarity = [text_similarity[i] for i in index_list]
filtered_suspicious_scores = [suspicious_scores[i] for i in index_list]
filtered_avg_binary_scores = [avg_binary_scores[i] for i in index_list]
filtered_w_density = [w_density[i] for i in index_list]
filtered_uw_density = [uw_density[i] for i in index_list]
filtered_w_fraudar = [w_fraudar[i] for i in index_list]
filtered_uw_fraudar = [uw_fraudar[i] for i in index_list]
filtered_w_outer_edge = [w_outer_edge[i] for i in index_list]
filtered_uw_outer_edge = [uw_outer_edge[i] for i in index_list]
filtered_w_shell_edge = [w_shell_edge[i] for i in index_list]
filtered_uw_shell_edge = [uw_shell_edge[i] for i in index_list]
filtered_w_outer_mod = [w_outer_mod[i] for i in index_list]
filtered_uw_outer_mod = [uw_outer_mod[i] for i in index_list]
filtered_w_shell_mod = [w_shell_mod[i] for i in index_list]
filtered_uw_shell_mod = [uw_shell_mod[i] for i in index_list]

In [None]:
from scipy.stats import spearmanr
import plotly.express as px

# Order every per-cluster list by descending suspicious score while keeping
# them positionally aligned. The rows are sorted ascending and then reversed,
# so ties on the score fall back to the remaining tuple fields.
# filtered_avg_binary_scores travels inside the rows too (as the last field);
# it is correlated position-by-position with the other lists further down.
ranked_rows = sorted(zip(
    filtered_suspicious_scores, filtered_clusters, filtered_avg_label_scores, filtered_text_similarity,
    filtered_w_density, filtered_uw_density, filtered_w_fraudar, filtered_uw_fraudar,
    filtered_w_outer_edge, filtered_uw_outer_edge, filtered_w_shell_edge, filtered_uw_shell_edge,
    filtered_w_outer_mod, filtered_uw_outer_mod, filtered_w_shell_mod, filtered_uw_shell_mod,
    filtered_avg_binary_scores))
ranked_rows.reverse()
(filtered_suspicious_scores, filtered_clusters, filtered_avg_label_scores, filtered_text_similarity,
 filtered_w_density, filtered_uw_density, filtered_w_fraudar, filtered_uw_fraudar,
 filtered_w_outer_edge, filtered_uw_outer_edge, filtered_w_shell_edge, filtered_uw_shell_edge,
 filtered_w_outer_mod, filtered_uw_outer_mod, filtered_w_shell_mod, filtered_uw_shell_mod,
 filtered_avg_binary_scores) = map(list, zip(*ranked_rows))
# For each cutoff k, plot and summarise the leading k entries of the filtered
# lists and print Spearman correlations of the label/binary scores against
# every graph metric.
top_k = [10, 50, 100, 150, 200, 250, 300, 350, 400]
rcParams['figure.figsize'] = 4,4
for k in top_k:
    print ('==================================k = {}========================================'.format(k) )
    # Leading-k slices; they hold fewer than k items when fewer clusters remain.
    label_k = filtered_avg_label_scores[:k]
    binary_k = filtered_avg_binary_scores[:k]
    similarity_k = filtered_text_similarity[:k]

    plt.hist(label_k, bins=7)
    plt.show()
    
    plt.hist(binary_k, bins=3)
    plt.show()
    
    plt.plot(similarity_k, marker='o', linestyle='--')
    plt.xlabel(' Clusters')
    plt.ylabel('avg text similarity')
    plt.show()
    
    # Average over the clusters actually present, not the nominal k.
    n_top = max(len(label_k), 1)
    print ("Average of average label scores : {}".format(sum(label_k)/n_top))
    print ("Average text similarity : {}".format(sum(similarity_k)/n_top))
    print ('\n')
    eig_ts_pscore = spearmanr(label_k, similarity_k)[0]
    eig_w_den_pscore = spearmanr(label_k, filtered_w_density[:k])[0]
    eig_uw_den_pscore = spearmanr(label_k, filtered_uw_density[:k])[0]
    eig_w_fraud_pscore = spearmanr(label_k, filtered_w_fraudar[:k])[0]
    eig_uw_fraud_pscore = spearmanr(label_k, filtered_uw_fraudar[:k])[0]
    eig_w_edge_out_pscore = spearmanr(label_k, filtered_w_outer_edge[:k])[0]
    eig_uw_edge_out_pscore = spearmanr(label_k, filtered_uw_outer_edge[:k])[0]
    eig_w_edge_shell_pscore = spearmanr(label_k, filtered_w_shell_edge[:k])[0]
    eig_uw_edge_shell_pscore = spearmanr(label_k, filtered_uw_shell_edge[:k])[0]
    eig_w_mod_out_pscore = spearmanr(label_k, filtered_w_outer_mod[:k])[0]
    eig_uw_mod_out_pscore = spearmanr(label_k, filtered_uw_outer_mod[:k])[0]
    eig_w_mod_shell_pscore = spearmanr(label_k, filtered_w_shell_mod[:k])[0]
    eig_uw_mod_shell_pscore = spearmanr(label_k, filtered_uw_shell_mod[:k])[0]
    
    print ("Spearman Correlation | Average Label Score |          Text Similarity          | {0:.2f} ".format(eig_ts_pscore))
    print ("Spearman Correlation | Average Label Score |     Weighted Cluster Density      | {0:.2f} ".format(eig_w_den_pscore))
    print ("Spearman Correlation | Average Label Score |    UnWeighted Cluster Density     | {0:.2f} ".format(eig_uw_den_pscore))
    print ("Spearman Correlation | Average Label Score |      Weighted Fraudar Score       | {0:.2f} ".format(eig_w_fraud_pscore))
    print ("Spearman Correlation | Average Label Score |     UnWeighted Fraudar Score      | {0:.2f} ".format(eig_uw_fraud_pscore))
    print ("Spearman Correlation | Average Label Score |     Weighted Outer Edge Perc      | {0:.2f} ".format(eig_w_edge_out_pscore))
    print ("Spearman Correlation | Average Label Score |    UnWeighted Outer Edge Perc     | {0:.2f} ".format(eig_uw_edge_out_pscore))
    # The shell-edge rows report the shell-edge correlations.
    print ("Spearman Correlation | Average Label Score |     Weighted Shell Edge Perc      | {0:.2f} ".format(eig_w_edge_shell_pscore))
    print ("Spearman Correlation | Average Label Score |    UnWeighted Shell Edge Perc     | {0:.2f} ".format(eig_uw_edge_shell_pscore))
    print ("Spearman Correlation | Average Label Score |  Weighted Outer Modularity Score  | {0:.2f} ".format(eig_w_mod_out_pscore))
    print ("Spearman Correlation | Average Label Score | UnWeighted Outer Modularity Score | {0:.2f} ".format(eig_uw_mod_out_pscore))
    print ("Spearman Correlation | Average Label Score |  Weighted Shell Modularity Score  | {0:.2f} ".format(eig_w_mod_shell_pscore))
    print ("Spearman Correlation | Average Label Score | UnWeighted Shell Modularity Score | {0:.2f} ".format(eig_uw_mod_shell_pscore))
    
    eig_w_den_pscore = spearmanr(binary_k, filtered_w_density[:k])[0]
    eig_uw_den_pscore = spearmanr(binary_k, filtered_uw_density[:k])[0]
    eig_w_fraud_pscore = spearmanr(binary_k, filtered_w_fraudar[:k])[0]
    eig_uw_fraud_pscore = spearmanr(binary_k, filtered_uw_fraudar[:k])[0]
    eig_w_edge_out_pscore = spearmanr(binary_k, filtered_w_outer_edge[:k])[0]
    eig_uw_edge_out_pscore = spearmanr(binary_k, filtered_uw_outer_edge[:k])[0]
    eig_w_edge_shell_pscore = spearmanr(binary_k, filtered_w_shell_edge[:k])[0]
    eig_uw_edge_shell_pscore = spearmanr(binary_k, filtered_uw_shell_edge[:k])[0]
    eig_w_mod_out_pscore = spearmanr(binary_k, filtered_w_outer_mod[:k])[0]
    eig_uw_mod_out_pscore = spearmanr(binary_k, filtered_uw_outer_mod[:k])[0]
    eig_w_mod_shell_pscore = spearmanr(binary_k, filtered_w_shell_mod[:k])[0]
    eig_uw_mod_shell_pscore = spearmanr(binary_k, filtered_uw_shell_mod[:k])[0]
    
    print ('\n')
    print ("Spearman Correlation | Average Binary Score |     Weighted Cluster Density      | {0:.2f} ".format(eig_w_den_pscore))
    print ("Spearman Correlation | Average Binary Score |    UnWeighted Cluster Density     | {0:.2f} ".format(eig_uw_den_pscore))
    print ("Spearman Correlation | Average Binary Score |      Weighted Fraudar Score       | {0:.2f} ".format(eig_w_fraud_pscore))
    print ("Spearman Correlation | Average Binary Score |     UnWeighted Fraudar Score      | {0:.2f} ".format(eig_uw_fraud_pscore))
    print ("Spearman Correlation | Average Binary Score |     Weighted Outer Edge Perc      | {0:.2f} ".format(eig_w_edge_out_pscore))
    print ("Spearman Correlation | Average Binary Score |    UnWeighted Outer Edge Perc     | {0:.2f} ".format(eig_uw_edge_out_pscore))
    print ("Spearman Correlation | Average Binary Score |     Weighted Shell Edge Perc      | {0:.2f} ".format(eig_w_edge_shell_pscore))
    print ("Spearman Correlation | Average Binary Score |    UnWeighted Shell Edge Perc     | {0:.2f} ".format(eig_uw_edge_shell_pscore))
    print ("Spearman Correlation | Average Binary Score |  Weighted Outer Modularity Score  | {0:.2f} ".format(eig_w_mod_out_pscore))
    print ("Spearman Correlation | Average Binary Score | UnWeighted Outer Modularity Score | {0:.2f} ".format(eig_uw_mod_out_pscore))
    print ("Spearman Correlation | Average Binary Score |  Weighted Shell Modularity Score  | {0:.2f} ".format(eig_w_mod_shell_pscore))
    print ("Spearman Correlation | Average Binary Score | UnWeighted Shell Modularity Score | {0:.2f} ".format(eig_uw_mod_shell_pscore))
print (len(clusters))

In [None]:
rcParams['figure.figsize'] = 10,10

# LOWESS-smoothed scatter plots of the name-filtered clusters.
# Each panel is (x values, y values, x label, y label, interval keyword, scatter styling).
name_filtered_title = 'After filtering out clusters with only one individual'
name_filtered_panels = [
    (filtered_avg_label_scores, filtered_suspicious_scores, 'Average Label Score', 'anomaly score',
     {'x_ci': 68}, {'alpha': 0.2}),
    (filtered_avg_label_scores, filtered_w_outer_edge, 'Average Label Score', 'Weighted outer edge percentage',
     {'x_ci': 68}, {'alpha': 0.2}),
    (filtered_text_similarity, filtered_w_outer_edge, 'Text Similarity', 'Weighted outer edge percentage',
     {'ci': 68}, {'color': 'b', 'alpha': 0.2}),
    (filtered_avg_label_scores, filtered_text_similarity, 'Average Label Score', 'Text Similarity',
     {'ci': 68}, {'color': 'b', 'alpha': 0.2}),
]
for panel_x, panel_y, panel_xlabel, panel_ylabel, panel_ci_kw, panel_scatter_kw in name_filtered_panels:
    ax = sns.regplot(x=panel_x, y=panel_y, truncate=False, lowess=True,
                     scatter_kws=panel_scatter_kw, line_kws={'color': 'red'}, **panel_ci_kw)
    ax.set(xlabel=panel_xlabel, ylabel=panel_ylabel)
    plt.title(name_filtered_title)
    plt.show()

In [None]:
# Second filter: keep only clusters whose suspicious score is above a tiny
# positive threshold, then narrow every filtered_* list to those positions.
f2_clusters = filter_by_threshold(filtered_clusters, filtered_suspicious_scores, threshold=0.00000001)
print (filtered_clusters)
f2_clusters = set(f2_clusters)
index_list = [i for i, e in enumerate(filtered_clusters) if e in f2_clusters]
filtered_avg_label_scores = [filtered_avg_label_scores[i] for i in index_list]
filtered_text_similarity = [filtered_text_similarity[i] for i in index_list]
filtered_suspicious_scores = [filtered_suspicious_scores[i] for i in index_list]
filtered_avg_binary_scores = [filtered_avg_binary_scores[i] for i in index_list]
filtered_w_density = [filtered_w_density[i] for i in index_list]
filtered_uw_density = [filtered_uw_density[i] for i in index_list]
filtered_w_fraudar = [filtered_w_fraudar[i] for i in index_list]
filtered_uw_fraudar = [filtered_uw_fraudar[i] for i in index_list]
filtered_w_outer_edge = [filtered_w_outer_edge[i] for i in index_list]
filtered_uw_outer_edge = [filtered_uw_outer_edge[i] for i in index_list]
filtered_w_shell_edge = [filtered_w_shell_edge[i] for i in index_list]
filtered_uw_shell_edge = [filtered_uw_shell_edge[i] for i in index_list]
filtered_w_outer_mod = [filtered_w_outer_mod[i] for i in index_list]
filtered_uw_outer_mod = [filtered_uw_outer_mod[i] for i in index_list]
filtered_w_shell_mod = [filtered_w_shell_mod[i] for i in index_list]
filtered_uw_shell_mod = [filtered_uw_shell_mod[i] for i in index_list]
# Narrow the cluster ids too, so they stay aligned with the lists above.
# Done last because index_list was computed from the unfiltered ids.
filtered_clusters = [filtered_clusters[i] for i in index_list]

In [None]:
from scipy.stats import spearmanr
import plotly.express as px

# Order every per-cluster list by descending suspicious score while keeping
# them positionally aligned. The rows are sorted ascending and then reversed,
# so ties on the score fall back to the remaining tuple fields.
# filtered_avg_binary_scores travels inside the rows too (as the last field);
# it is correlated position-by-position with the other lists further down.
ranked_rows = sorted(zip(
    filtered_suspicious_scores, filtered_clusters, filtered_avg_label_scores, filtered_text_similarity,
    filtered_w_density, filtered_uw_density, filtered_w_fraudar, filtered_uw_fraudar,
    filtered_w_outer_edge, filtered_uw_outer_edge, filtered_w_shell_edge, filtered_uw_shell_edge,
    filtered_w_outer_mod, filtered_uw_outer_mod, filtered_w_shell_mod, filtered_uw_shell_mod,
    filtered_avg_binary_scores))
ranked_rows.reverse()
(filtered_suspicious_scores, filtered_clusters, filtered_avg_label_scores, filtered_text_similarity,
 filtered_w_density, filtered_uw_density, filtered_w_fraudar, filtered_uw_fraudar,
 filtered_w_outer_edge, filtered_uw_outer_edge, filtered_w_shell_edge, filtered_uw_shell_edge,
 filtered_w_outer_mod, filtered_uw_outer_mod, filtered_w_shell_mod, filtered_uw_shell_mod,
 filtered_avg_binary_scores) = map(list, zip(*ranked_rows))
# For each cutoff k, plot and summarise the leading k entries of the filtered
# lists and print Spearman correlations of the label/binary scores against
# every graph metric.
top_k = [10, 50, 100, 150, 200, 250, 300, 350, 400]
rcParams['figure.figsize'] = 4,4
for k in top_k:
    print ('==================================k = {}========================================'.format(k) )
    # Leading-k slices; they hold fewer than k items when fewer clusters remain.
    label_k = filtered_avg_label_scores[:k]
    binary_k = filtered_avg_binary_scores[:k]
    similarity_k = filtered_text_similarity[:k]

    plt.hist(label_k, bins=7)
    plt.show()
    
    plt.hist(binary_k, bins=3)
    plt.show()
    
    plt.plot(similarity_k, marker='o', linestyle='--')
    plt.xlabel(' Clusters')
    plt.ylabel('avg text similarity')
    plt.show()
    
    # Average over the clusters actually present, not the nominal k.
    n_top = max(len(label_k), 1)
    print ("Average of average label scores : {}".format(sum(label_k)/n_top))
    print ("Average text similarity : {}".format(sum(similarity_k)/n_top))
    print ('\n')
    eig_ts_pscore = spearmanr(label_k, similarity_k)[0]
    eig_w_den_pscore = spearmanr(label_k, filtered_w_density[:k])[0]
    eig_uw_den_pscore = spearmanr(label_k, filtered_uw_density[:k])[0]
    eig_w_fraud_pscore = spearmanr(label_k, filtered_w_fraudar[:k])[0]
    eig_uw_fraud_pscore = spearmanr(label_k, filtered_uw_fraudar[:k])[0]
    eig_w_edge_out_pscore = spearmanr(label_k, filtered_w_outer_edge[:k])[0]
    eig_uw_edge_out_pscore = spearmanr(label_k, filtered_uw_outer_edge[:k])[0]
    eig_w_edge_shell_pscore = spearmanr(label_k, filtered_w_shell_edge[:k])[0]
    eig_uw_edge_shell_pscore = spearmanr(label_k, filtered_uw_shell_edge[:k])[0]
    eig_w_mod_out_pscore = spearmanr(label_k, filtered_w_outer_mod[:k])[0]
    eig_uw_mod_out_pscore = spearmanr(label_k, filtered_uw_outer_mod[:k])[0]
    eig_w_mod_shell_pscore = spearmanr(label_k, filtered_w_shell_mod[:k])[0]
    eig_uw_mod_shell_pscore = spearmanr(label_k, filtered_uw_shell_mod[:k])[0]
    
    print ("Spearman Correlation | Average Label Score |          Text Similarity          | {0:.2f} ".format(eig_ts_pscore))
    print ("Spearman Correlation | Average Label Score |     Weighted Cluster Density      | {0:.2f} ".format(eig_w_den_pscore))
    print ("Spearman Correlation | Average Label Score |    UnWeighted Cluster Density     | {0:.2f} ".format(eig_uw_den_pscore))
    print ("Spearman Correlation | Average Label Score |      Weighted Fraudar Score       | {0:.2f} ".format(eig_w_fraud_pscore))
    print ("Spearman Correlation | Average Label Score |     UnWeighted Fraudar Score      | {0:.2f} ".format(eig_uw_fraud_pscore))
    print ("Spearman Correlation | Average Label Score |     Weighted Outer Edge Perc      | {0:.2f} ".format(eig_w_edge_out_pscore))
    print ("Spearman Correlation | Average Label Score |    UnWeighted Outer Edge Perc     | {0:.2f} ".format(eig_uw_edge_out_pscore))
    # The shell-edge rows report the shell-edge correlations.
    print ("Spearman Correlation | Average Label Score |     Weighted Shell Edge Perc      | {0:.2f} ".format(eig_w_edge_shell_pscore))
    print ("Spearman Correlation | Average Label Score |    UnWeighted Shell Edge Perc     | {0:.2f} ".format(eig_uw_edge_shell_pscore))
    print ("Spearman Correlation | Average Label Score |  Weighted Outer Modularity Score  | {0:.2f} ".format(eig_w_mod_out_pscore))
    print ("Spearman Correlation | Average Label Score | UnWeighted Outer Modularity Score | {0:.2f} ".format(eig_uw_mod_out_pscore))
    print ("Spearman Correlation | Average Label Score |  Weighted Shell Modularity Score  | {0:.2f} ".format(eig_w_mod_shell_pscore))
    print ("Spearman Correlation | Average Label Score | UnWeighted Shell Modularity Score | {0:.2f} ".format(eig_uw_mod_shell_pscore))
    
    eig_w_den_pscore = spearmanr(binary_k, filtered_w_density[:k])[0]
    eig_uw_den_pscore = spearmanr(binary_k, filtered_uw_density[:k])[0]
    eig_w_fraud_pscore = spearmanr(binary_k, filtered_w_fraudar[:k])[0]
    eig_uw_fraud_pscore = spearmanr(binary_k, filtered_uw_fraudar[:k])[0]
    eig_w_edge_out_pscore = spearmanr(binary_k, filtered_w_outer_edge[:k])[0]
    eig_uw_edge_out_pscore = spearmanr(binary_k, filtered_uw_outer_edge[:k])[0]
    eig_w_edge_shell_pscore = spearmanr(binary_k, filtered_w_shell_edge[:k])[0]
    eig_uw_edge_shell_pscore = spearmanr(binary_k, filtered_uw_shell_edge[:k])[0]
    eig_w_mod_out_pscore = spearmanr(binary_k, filtered_w_outer_mod[:k])[0]
    eig_uw_mod_out_pscore = spearmanr(binary_k, filtered_uw_outer_mod[:k])[0]
    eig_w_mod_shell_pscore = spearmanr(binary_k, filtered_w_shell_mod[:k])[0]
    eig_uw_mod_shell_pscore = spearmanr(binary_k, filtered_uw_shell_mod[:k])[0]
    
    print ('\n')
    print ("Spearman Correlation | Average Binary Score |     Weighted Cluster Density      | {0:.2f} ".format(eig_w_den_pscore))
    print ("Spearman Correlation | Average Binary Score |    UnWeighted Cluster Density     | {0:.2f} ".format(eig_uw_den_pscore))
    print ("Spearman Correlation | Average Binary Score |      Weighted Fraudar Score       | {0:.2f} ".format(eig_w_fraud_pscore))
    print ("Spearman Correlation | Average Binary Score |     UnWeighted Fraudar Score      | {0:.2f} ".format(eig_uw_fraud_pscore))
    print ("Spearman Correlation | Average Binary Score |     Weighted Outer Edge Perc      | {0:.2f} ".format(eig_w_edge_out_pscore))
    print ("Spearman Correlation | Average Binary Score |    UnWeighted Outer Edge Perc     | {0:.2f} ".format(eig_uw_edge_out_pscore))
    print ("Spearman Correlation | Average Binary Score |     Weighted Shell Edge Perc      | {0:.2f} ".format(eig_w_edge_shell_pscore))
    print ("Spearman Correlation | Average Binary Score |    UnWeighted Shell Edge Perc     | {0:.2f} ".format(eig_uw_edge_shell_pscore))
    print ("Spearman Correlation | Average Binary Score |  Weighted Outer Modularity Score  | {0:.2f} ".format(eig_w_mod_out_pscore))
    print ("Spearman Correlation | Average Binary Score | UnWeighted Outer Modularity Score | {0:.2f} ".format(eig_uw_mod_out_pscore))
    print ("Spearman Correlation | Average Binary Score |  Weighted Shell Modularity Score  | {0:.2f} ".format(eig_w_mod_shell_pscore))
    print ("Spearman Correlation | Average Binary Score | UnWeighted Shell Modularity Score | {0:.2f} ".format(eig_uw_mod_shell_pscore))
print (len(clusters))

In [None]:
# Spearman rank correlation between the average label score and the suspicious
# score over the first ten entries of the filtered lists.
eig_ts_pscore = spearmanr(
    filtered_avg_label_scores[:10],
    filtered_suspicious_scores[:10],
)[0]
print(eig_ts_pscore)

In [None]:

rcParams['figure.figsize'] = 10,10

# LOWESS-smoothed scatter plots of the clusters that survived both filters.
# All four panels share one title. The threshold filter in this notebook uses
# 0.00000001, i.e. it drops zero-score clusters, so the former
# "above threshold-0.5" wording was stale.
threshold_filtered_title = 'After filtering out clusters with only one individual and with score = 0'
# Each panel is (x values, y values, x label, y label, interval keyword, scatter styling).
threshold_filtered_panels = [
    (filtered_avg_label_scores, filtered_suspicious_scores, 'Average Label Score', 'anomaly score',
     {'x_ci': 68}, {'alpha': 0.2}),
    (filtered_avg_label_scores, filtered_w_outer_edge, 'Average Label Score', 'Weighted outer edge percentage',
     {'x_ci': 68}, {'alpha': 0.2}),
    (filtered_text_similarity, filtered_w_outer_edge, 'Text Similarity', 'Weighted outer edge percentage',
     {'ci': 68}, {'color': 'b', 'alpha': 0.2}),
    (filtered_avg_label_scores, filtered_text_similarity, 'Average Label Score', 'Text Similarity',
     {'ci': 68}, {'color': 'b', 'alpha': 0.2}),
]
for panel_x, panel_y, panel_xlabel, panel_ylabel, panel_ci_kw, panel_scatter_kw in threshold_filtered_panels:
    ax = sns.regplot(x=panel_x, y=panel_y, truncate=False, lowess=True,
                     scatter_kws=panel_scatter_kw, line_kws={'color': 'red'}, **panel_ci_kw)
    ax.set(xlabel=panel_xlabel, ylabel=panel_ylabel)
    plt.title(threshold_filtered_title)
    plt.show()

In [None]:
# Unpack the per-cluster metric sequences stored in post_merging_metric.
# Later cells index all of them by the same position as `clusters`.
clusters = post_merging_metric['clusters']

# Score used for ranking; this is the same entry as w_outer_edge below.
suspicious_scores = post_merging_metric['weighted_outer_edge_perc_scores']

# Label-derived scores and text similarity.
avg_label_scores, avg_binary_scores = (
    post_merging_metric['avg_label_scores'],
    post_merging_metric['avg_binary_scores'],
)
text_similarity = post_merging_metric['pairwise_similarity']

# Graph metrics, each in a weighted (w_) and an unweighted (uw_) variant.
w_density, uw_density = (
    post_merging_metric['weighted_cluster_density'],
    post_merging_metric['unweighted_cluster_density'],
)
w_fraudar, uw_fraudar = (
    post_merging_metric['weighted_fraudar_scores'],
    post_merging_metric['unweighted_fraudar_scores'],
)
w_outer_edge, uw_outer_edge = (
    post_merging_metric['weighted_outer_edge_perc_scores'],
    post_merging_metric['unweighted_outer_edge_perc_scores'],
)
w_shell_edge, uw_shell_edge = (
    post_merging_metric['weighted_shell_edge_perc_scores'],
    post_merging_metric['unweighted_shell_edge_perc_scores'],
)
w_outer_mod, uw_outer_mod = (
    post_merging_metric['weighted_outer_modularity_scores'],
    post_merging_metric['unweighted_outer_modularity_scores'],
)
w_shell_mod, uw_shell_mod = (
    post_merging_metric['weighted_shell_modularity_scores'],
    post_merging_metric['unweighted_shell_modularity_scores'],
)

In [None]:
# Keep only the clusters returned by filter_by_name and narrow every metric
# list to the same positions.
filtered_clusters = filter_by_name(clusters, df_orig.copy())
print (filtered_clusters)
selected_clusters = set(filtered_clusters)  # set only for fast membership tests
index_list = [i for i, e in enumerate(clusters) if e in selected_clusters]
# filtered_clusters must be a list in index_list order: later cells zip it
# positionally with the other filtered_* lists, and a set has no such order.
filtered_clusters = [clusters[i] for i in index_list]
filtered_avg_label_scores = [avg_label_scores[i] for i in index_list]
filtered_text_similarity = [text_similarity[i] for i in index_list]
filtered_suspicious_scores = [suspicious_scores[i] for i in index_list]
filtered_avg_binary_scores = [avg_binary_scores[i] for i in index_list]
filtered_w_density = [w_density[i] for i in index_list]
filtered_uw_density = [uw_density[i] for i in index_list]
filtered_w_fraudar = [w_fraudar[i] for i in index_list]
filtered_uw_fraudar = [uw_fraudar[i] for i in index_list]
filtered_w_outer_edge = [w_outer_edge[i] for i in index_list]
filtered_uw_outer_edge = [uw_outer_edge[i] for i in index_list]
filtered_w_shell_edge = [w_shell_edge[i] for i in index_list]
filtered_uw_shell_edge = [uw_shell_edge[i] for i in index_list]
filtered_w_outer_mod = [w_outer_mod[i] for i in index_list]
filtered_uw_outer_mod = [uw_outer_mod[i] for i in index_list]
filtered_w_shell_mod = [w_shell_mod[i] for i in index_list]
filtered_uw_shell_mod = [uw_shell_mod[i] for i in index_list]

In [None]:
# Inspect the suspicious scores that remain after the name filter.
print(filtered_suspicious_scores)

In [None]:
from scipy.stats import spearmanr
import plotly.express as px

# Sort every filtered per-cluster list together by suspicious score, highest
# first. Rows are compared as whole tuples, so equal scores are ordered by the
# columns that follow.
#
# filtered_avg_binary_scores is carried as the last column so it stays aligned
# with the other lists, and every list is reordered exactly once from its own
# column (filtered_uw_outer_edge included).
_rows_high_to_low = reversed(sorted(zip(
    filtered_suspicious_scores, filtered_clusters, filtered_avg_label_scores, filtered_text_similarity,
    filtered_w_density, filtered_uw_density, filtered_w_fraudar, filtered_uw_fraudar,
    filtered_w_outer_edge, filtered_uw_outer_edge, filtered_w_shell_edge, filtered_uw_shell_edge,
    filtered_w_outer_mod, filtered_uw_outer_mod, filtered_w_shell_mod, filtered_uw_shell_mod,
    filtered_avg_binary_scores)))
(filtered_suspicious_scores, filtered_clusters, filtered_avg_label_scores, filtered_text_similarity,
 filtered_w_density, filtered_uw_density, filtered_w_fraudar, filtered_uw_fraudar,
 filtered_w_outer_edge, filtered_uw_outer_edge, filtered_w_shell_edge, filtered_uw_shell_edge,
 filtered_w_outer_mod, filtered_uw_outer_mod, filtered_w_shell_mod, filtered_uw_shell_mod,
 filtered_avg_binary_scores) = (list(_column) for _column in zip(*_rows_high_to_low))

# Cut-offs used by the top-k report that follows.
top_k = [10, 50, 100, 150, 200, 250, 300, 350, 400]
rcParams['figure.figsize'] = 4,4
# (label, series) pairs correlated against the reference scores. The label is
# centred in a 33-character column of the printed table.
_graph_metrics = [
    ('Weighted Cluster Density', filtered_w_density),
    ('UnWeighted Cluster Density', filtered_uw_density),
    ('Weighted Fraudar Score', filtered_w_fraudar),
    ('UnWeighted Fraudar Score', filtered_uw_fraudar),
    ('Weighted Outer Edge Perc', filtered_w_outer_edge),
    ('UnWeighted Outer Edge Perc', filtered_uw_outer_edge),
    ('Weighted Shell Edge Perc', filtered_w_shell_edge),
    ('UnWeighted Shell Edge Perc', filtered_uw_shell_edge),
    ('Weighted Outer Modularity Score', filtered_w_outer_mod),
    ('UnWeighted Outer Modularity Score', filtered_uw_outer_mod),
    ('Weighted Shell Modularity Score', filtered_w_shell_mod),
    ('UnWeighted Shell Modularity Score', filtered_uw_shell_mod),
]


def _print_spearman_table(reference_name, reference, metrics, k):
    """Print Spearman's rho between reference[:k] and each series[:k].

    Every row is computed from its own series, so the shell-edge rows report
    the shell-edge correlation rather than repeating the outer-edge one.
    """
    for label, series in metrics:
        rho = spearmanr(reference[:k], series[:k])[0]
        print("Spearman Correlation | {} | {:^33} | {:.2f} ".format(reference_name, label, rho))


for k in top_k:
    print('==================================k = {}========================================'.format(k))
    _label_top = filtered_avg_label_scores[:k]
    _text_top = filtered_text_similarity[:k]

    plt.hist(_label_top, bins=7)
    plt.show()

    plt.hist(filtered_avg_binary_scores[:k], bins=3)
    plt.show()

    plt.plot(_text_top, marker='o', linestyle='--')
    plt.xlabel(' Clusters')
    plt.ylabel('avg text similarity')
    plt.show()

    # Divide by the number of clusters actually present in the slice, so the
    # mean stays correct when fewer than k clusters exist (max(..., 1) only
    # guards the empty case against a ZeroDivisionError).
    print("Average of average label scores : {}".format(sum(_label_top) / max(len(_label_top), 1)))
    print("Average text similarity : {}".format(sum(_text_top) / max(len(_text_top), 1)))
    print('\n')

    _print_spearman_table(
        'Average Label Score', filtered_avg_label_scores,
        [('Text Similarity', filtered_text_similarity)] + _graph_metrics, k)

    print('\n')
    _print_spearman_table('Average Binary Score', filtered_avg_binary_scores, _graph_metrics, k)
print(len(clusters))

In [None]:
# Three scatter plots with a LOWESS trend line over the filtered clusters.
# Each spec is (x values, y values, x label, y label, interval kwarg, scatter
# style). `sns` is presumably seaborn, imported by an earlier cell.
# NOTE(review): the first plot passes x_ci=68 where the other two pass ci=68;
# kept as found - confirm whether ci was intended.
_shared_title = 'After filtering out clusters with only one individual'
_regplot_specs = (
    (filtered_avg_label_scores, filtered_w_outer_edge,
     'Average Label Score', 'Weighted outer edge percentage',
     {'x_ci': 68}, {'alpha': 0.2}),
    (filtered_text_similarity, filtered_w_outer_edge,
     'Text Similarity', 'Weighted outer edge percentage',
     {'ci': 68}, {'color': 'b', 'alpha': 0.2}),
    (filtered_avg_label_scores, filtered_text_similarity,
     'Average Label Score', 'Text Similarity',
     {'ci': 68}, {'color': 'b', 'alpha': 0.2}),
)
for _xs, _ys, _x_label, _y_label, _interval_kw, _scatter_style in _regplot_specs:
    ax = sns.regplot(x=_xs, y=_ys, truncate=False, lowess=True,
                     scatter_kws=_scatter_style, line_kws={'color': 'red'},
                     **_interval_kw)
    ax.set(xlabel=_x_label, ylabel=_y_label)
    plt.title(_shared_title)
    plt.show()

In [None]:
# Bind the post-merging metrics to short module-level names: each name on the
# left receives post_merging_metric[key] for the key at the same position.
# suspicious_scores and w_outer_edge both read the weighted outer-edge key.
(
    suspicious_scores, text_similarity, clusters,
    avg_label_scores, avg_binary_scores,
    w_density, uw_density,
    w_fraudar, uw_fraudar,
    w_outer_edge, uw_outer_edge,
    w_shell_edge, uw_shell_edge,
    w_outer_mod, uw_outer_mod,
    w_shell_mod, uw_shell_mod,
    cluster_counts,
) = (
    post_merging_metric[_metric_key]
    for _metric_key in (
        'weighted_outer_edge_perc_scores', 'pairwise_similarity', 'clusters',
        'avg_label_scores', 'avg_binary_scores',
        'weighted_cluster_density', 'unweighted_cluster_density',
        'weighted_fraudar_scores', 'unweighted_fraudar_scores',
        'weighted_outer_edge_perc_scores', 'unweighted_outer_edge_perc_scores',
        'weighted_shell_edge_perc_scores', 'unweighted_shell_edge_perc_scores',
        'weighted_outer_modularity_scores', 'unweighted_outer_modularity_scores',
        'weighted_shell_modularity_scores', 'unweighted_shell_modularity_scores',
        'cluster_counts',
    )
)

In [None]:
# Sort all per-cluster lists together by suspicious score, highest first.
# Rows are compared as whole tuples, so equal scores are ordered by the
# columns that follow.
#
# avg_binary_scores is carried as the last column: it is indexed by cluster
# position later on, so it has to be reordered together with `clusters`.
_rows_high_to_low = reversed(sorted(zip(
    suspicious_scores, clusters, avg_label_scores, text_similarity,
    w_density, uw_density, w_fraudar, uw_fraudar, w_outer_edge,
    uw_outer_edge, w_shell_edge, uw_shell_edge, w_outer_mod,
    uw_outer_mod, w_shell_mod, uw_shell_mod, cluster_counts,
    avg_binary_scores)))
(suspicious_scores, clusters, avg_label_scores, text_similarity,
 w_density, uw_density, w_fraudar, uw_fraudar, w_outer_edge,
 uw_outer_edge, w_shell_edge, uw_shell_edge, w_outer_mod,
 uw_outer_mod, w_shell_mod, uw_shell_mod, cluster_counts,
 avg_binary_scores) = (list(_column) for _column in zip(*_rows_high_to_low))

In [None]:
# Fit a TF-IDF model on the pre-processed ad text of df_orig, using 2- and
# 3-word n-grams as features, and print the shape of the resulting matrix.
content = list(df_orig['content_p'])
vectorizer = TfidfVectorizer(
    ngram_range=(2, 3),
    stop_words=stop_words,
    lowercase=True,
    min_df=2,
    max_df=0.8,
    norm='l2',
    smooth_idf=True,
)
bigram_matrix = vectorizer.fit_transform(content)

print(bigram_matrix.shape)

In [None]:
# df_data['cluster_label'] = labels

In [None]:
# Remove the entry of the noise cluster (label -1) from every per-cluster list
# so that the remaining positions keep referring to the same cluster.
# cluster_counts is included: it is sorted together with the other lists and
# would otherwise end up shifted by one relative to them.
# NOTE(review): this assumes every list below is ordered like `clusters`;
# confirm that for avg_binary_scores, which is not visibly re-sorted with the
# others in every version of the sorting cell.
if -1 in clusters:
    noise_index = clusters.index(-1)
    for _per_cluster_list in (
            suspicious_scores, text_similarity, clusters, avg_label_scores,
            avg_binary_scores, w_density, uw_density, w_fraudar, uw_fraudar,
            w_outer_edge, uw_outer_edge, w_shell_edge, uw_shell_edge,
            w_outer_mod, uw_outer_mod, w_shell_mod, uw_shell_mod,
            cluster_counts):
        del _per_cluster_list[noise_index]
else:
    # Happens when the cell is re-run: the noise entry is already gone.
    noise_index = None
    print("No noise cluster (-1) found; nothing removed")

In [None]:
# Renumber the rows 0..len(df_orig)-1 so that a row's label equals its
# position; the row labels are later used as positions into the similarity
# matrix. reset_index(drop=True) renumbers in place, whereas
# reindex(range(n)) would look rows up by their old labels, discarding rows
# whose label falls outside the range and inserting all-NaN rows for labels
# that are missing.
df_orig = df_orig.reset_index(drop=True)

In [None]:
# Collect the df_orig row labels of every ad whose final_label is one of the
# first k entries of `clusters`, skipping the special labels -1 and -2. The
# shape of each cluster's slice is printed along the way.
k = 50
sort_ind = []
for cl in clusters[:k]:
    if cl in (-1, -2):
        continue
    df_f = df_orig[df_orig['final_label'] == cl]
    print(df_f.shape)
    sort_ind.extend(df_f.index)
print(sort_ind)

In [None]:

# Pairwise cosine similarity between the rows of bigram_matrix, drawn as a
# heatmap restricted to (and ordered by) the rows collected in sort_ind.
sim_matrix = cosine_similarity(bigram_matrix)
print(sim_matrix.shape)

# Tick labels and the colour bar are switched off through the heatmap
# arguments themselves.
cmap = sns.cubehelix_palette(50, hue=0.05, rot=0, light=0.9, dark=0, as_cmap=True)
_ordered_block = sim_matrix[np.ix_(sort_ind, sort_ind)]
ax = sns.heatmap(_ordered_block, cmap=cmap, xticklabels=False, yticklabels=False, cbar=False)
plt.savefig('../results/sim_matrix.png')

In [None]:
# Summary statistics over the first k entries of the per-cluster lists.
k = 50


def _mean_of_top(values, k):
    """Return the mean of values[:k].

    The sum is divided by the number of entries actually present, so the
    result stays a true mean when fewer than k entries exist; an empty slice
    yields NaN instead of raising ZeroDivisionError.
    """
    top = values[:k]
    count = len(top)
    return sum(top) / count if count else float('nan')


total_ads_in_clusters = sum(cluster_counts)
avg_of_avg_label_scores = _mean_of_top(avg_label_scores, k)
avg_of_anomaly_scores = _mean_of_top(suspicious_scores, k)
avg_text_similarity = _mean_of_top(text_similarity, k)
avg_cluster_size = _mean_of_top(cluster_counts, k)

print(("Total ads in clusters: {}, Average of label scores per cluster: {}, "
       "average of anomaly scores per cluster: {}, Average text sim: {}, avg cluster size: {}").format(
    total_ads_in_clusters, avg_of_avg_label_scores, avg_of_anomaly_scores, avg_text_similarity, avg_cluster_size))

In [None]:
from operator import add

# Split the first 458 entries of post_merging_metric['clusters'] into three
# groups by average label score (> 3.5, > 2, everything else); cluster -1 is
# left out.
clusters = post_merging_metric['clusters']
scores = post_merging_metric['avg_label_scores']
df_plot = df_orig[df_orig['sim_check'] == False]

class1_clusters, class2_clusters, class3_clusters = [], [], []
for _position, _cluster_id in enumerate(clusters[:458]):
    if _cluster_id == -1:
        continue
    _score = scores[_position]
    if _score > 3.5:
        _group = class1_clusters
    elif _score > 2:
        _group = class2_clusters
    else:
        _group = class3_clusters
    _group.append(_cluster_id)

# For every group, count the ads in df_plot with label < 3, == 3 and > 3,
# summed over the group's clusters. Row i of `purity` holds the three totals
# of group i; the same totals are stored on the group's dict.
classes = [{'clusters': _members, 'purity': []}
           for _members in (class1_clusters, class2_clusters, class3_clusters)]
purity = np.zeros((3, 3))
for _row, _bucket in enumerate(classes):
    _totals = [0, 0, 0]
    for _cluster_id in _bucket['clusters']:
        _ads = df_plot[df_plot['final_label'] == _cluster_id]
        _tallies = (
            len(_ads[_ads['label'] < 3]),
            len(_ads[_ads['label'] == 3]),
            len(_ads[_ads['label'] > 3]),
        )
        _totals = [_running + _found for _running, _found in zip(_totals, _tallies)]
    purity[_row] = np.array(_totals)
    _bucket['purity'] = _totals


In [None]:
# Show the 3x3 table of label counts per cluster group.
print(purity)

In [None]:
# Stacked bar chart: one bar per cluster group, with the three label-count
# columns of `purity` stacked on top of each other.
rcParams['figure.figsize'] = 6,6
xaxis = range(3)
print(xaxis)
colors = ['Non-trafficking', 'Unsure', 'Trafficking']  # legend entries, in stacking order

_stack_base = None  # height already drawn below the current segment
for _segment, _face_colour in zip(purity.T[:3], ('green', 'grey', 'red')):
    plt.bar(xaxis, _segment, bottom=_stack_base, color=_face_colour,
            alpha=0.5, edgecolor='black', width=0.4)
    _stack_base = _segment if _stack_base is None else _stack_base + _segment

plt.legend(colors, loc=2, fontsize='large')
plt.xticks(range(3), ['Corroboration', 'Scooping', 'New Attack'], rotation=0, fontsize='xx-large')
plt.ylabel('Escort Ads', fontsize='xx-large')
plt.show()

In [None]:
# Number of clusters that landed in each of the three score groups.
print(*(len(_group) for _group in (class1_clusters, class2_clusters, class3_clusters)))

In [None]:
# Stacked bar chart with one segment per column of `purity`.
# The segment styles follow a seven-label scheme (three greens, one grey,
# three reds). Only as many styles as `purity` has columns are used, so the
# cell no longer indexes past the end of a narrower matrix, and `width` is
# defined here instead of only in a comment.
# NOTE(review): with the 3-column purity built above, only the three green
# styles are drawn - confirm whether a 7-column purity is expected here.
xaxis = range(purity.shape[0])
width = 0.35
_segment_styles = [
    ('green', 0.6), ('green', 0.5), ('green', 0.4),
    ('grey', 0.4),
    ('red', 0.5), ('red', 0.6), ('red', 0.7),
]
_stack_base = np.zeros(purity.shape[0])  # height already drawn per bar
for _segment, (_face_colour, _opacity) in zip(purity.T, _segment_styles):
    plt.bar(xaxis, _segment, width, bottom=_stack_base, color=_face_colour, alpha=_opacity)
    _stack_base = _stack_base + _segment
plt.show()

In [None]:
# Bind the pre-merging metrics to short module-level names: each name on the
# left receives pre_merging_metrics[key] for the key at the same position.
# suspicious_scores and w_outer_edge both read the weighted outer-edge key.
(
    suspicious_scores, text_similarity, clusters,
    avg_label_scores, avg_binary_scores,
    w_density, uw_density,
    w_fraudar, uw_fraudar,
    w_outer_edge, uw_outer_edge,
    w_shell_edge, uw_shell_edge,
    w_outer_mod, uw_outer_mod,
    w_shell_mod, uw_shell_mod,
) = (
    pre_merging_metrics[_metric_key]
    for _metric_key in (
        'weighted_outer_edge_perc_scores', 'pairwise_similarity', 'clusters',
        'avg_label_scores', 'avg_binary_scores',
        'weighted_cluster_density', 'unweighted_cluster_density',
        'weighted_fraudar_scores', 'unweighted_fraudar_scores',
        'weighted_outer_edge_perc_scores', 'unweighted_outer_edge_perc_scores',
        'weighted_shell_edge_perc_scores', 'unweighted_shell_edge_perc_scores',
        'weighted_outer_modularity_scores', 'unweighted_outer_modularity_scores',
        'weighted_shell_modularity_scores', 'unweighted_shell_modularity_scores',
    )
)

# Sort all per-cluster lists together by suspicious score, highest first.
# Rows are compared as whole tuples, so equal scores are ordered by the
# columns that follow.
#
# avg_binary_scores is carried as the last column so that it stays aligned
# with `clusters` like every other per-cluster list.
_rows_high_to_low = reversed(sorted(zip(
    suspicious_scores, clusters, avg_label_scores, text_similarity,
    w_density, uw_density, w_fraudar, uw_fraudar, w_outer_edge,
    uw_outer_edge, w_shell_edge, uw_shell_edge, w_outer_mod,
    uw_outer_mod, w_shell_mod, uw_shell_mod,
    avg_binary_scores)))
(suspicious_scores, clusters, avg_label_scores, text_similarity,
 w_density, uw_density, w_fraudar, uw_fraudar, w_outer_edge,
 uw_outer_edge, w_shell_edge, uw_shell_edge, w_outer_mod,
 uw_outer_mod, w_shell_mod, uw_shell_mod,
 avg_binary_scores) = (list(_column) for _column in zip(*_rows_high_to_low))

# Fit a TF-IDF model on the pre-processed ad text of df_data, using 2- and
# 3-word n-grams as features, and print the shape of the resulting matrix.
content = list(df_data['content_p'])
vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(2, 3), norm='l2',
    smooth_idf=True, stop_words=stop_words, min_df=2, max_df=0.8)
bigram_matrix = vectorizer.fit_transform(content)

print(bigram_matrix.shape)

df_data['cluster_label'] = labels

# Renumber the rows 0..len(df_data)-1 so that a row's label equals its
# position; the row labels are later used as positions into the similarity
# matrix. reset_index(drop=True) renumbers in place, whereas
# reindex(range(n)) would look rows up by their old labels, discarding rows
# whose label falls outside the range and inserting all-NaN rows for labels
# that are missing.
df_data = df_data.reset_index(drop=True)

# Collect the df_data row labels of every ad whose cluster_label is one of the
# first 50 entries of `clusters`, skipping the special labels -1 and -2. The
# shape of each cluster's slice is printed along the way.
# NOTE(review): k is set to 20 but the loop slices a literal 50 - confirm
# which of the two is intended.
k = 20
sort_ind = []
for cl in clusters[:50]:
    if cl in (-1, -2):
        continue
    df_f = df_data[df_data['cluster_label'] == cl]
    print(df_f.shape)
    sort_ind.extend(df_f.index)
print(sort_ind)


# Pairwise cosine similarity between the rows of bigram_matrix, shown as an
# image restricted to (and ordered by) the rows collected in sort_ind.
sim_matrix = cosine_similarity(bigram_matrix)
print(sim_matrix.shape)
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(111)
# Hide every tick and tick label. Real booleans are required here: the string
# 'off' is truthy, so matplotlib versions that no longer special-case it
# would switch those ticks on instead of off.
ax.tick_params(
    axis='both',       # apply to both axes
    which='both',      # both major and minor ticks
    bottom=False,
    top=False,
    left=False,
    right=False,
    labelbottom=False,
    labelleft=False)
plt.imshow(sim_matrix[sort_ind, :][:,sort_ind])
plt.savefig('../results/sim_matrix.png')
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

# Cut-off constants, one per cluster metric (fraudar score, density,
# modularity).
# NOTE(review): no active code in the visible part of this file reads them -
# confirm whether they are still needed.
fraudar_threshold = 1
density_threshold = 0.05
modularity_threshold = 0.3

def execute_filtering(metric, cutoff_metric, cluster_counts, clusters, binary_true_labels, name_filter=True, threshold_filter=True, filter_type='FRAUDAR'):
    """Mark the ads of clusters whose metric exceeds a cut-off.

    The clusters are sorted by ascending `metric`; every cluster from the
    first one whose metric is greater than `cutoff_metric` onwards is treated
    as anomalous, and the ads carrying its label in df_data['cluster_label']
    get a 1 in the local prediction vector.

    Args:
        metric: one score per cluster.
        cutoff_metric: clusters scoring above this value are anomalous.
        cluster_counts: one entry per cluster, sorted together with `metric`.
        clusters: cluster labels, parallel to `metric`.
        binary_true_labels: not used by the visible part of the function.
        name_filter: not used by the visible part of the function.
        threshold_filter: when False, every cluster is treated as anomalous.
        filter_type: only referenced by the commented-out branches below.

    Returns:
        None.

    Reads the module-level names df_data, bigram_matrix and
    post_merging_metric.

    NOTE(review): the definition looks cut short in this file - it builds
    several locals that are never used and returns nothing. The remainder
    needs to be restored from the original notebook.
    """
    # Plain ndarray so that the equality test below yields a boolean array
    # regardless of how pandas and NumPy interoperate.
    labels = np.asarray(df_data['cluster_label'])
#     if filter_type == 'FRAUDAR':
#         anomaly_metric,cluster_counts, clusters = zip(*sorted(zip(anomaly_metric, cluster_counts, clusters)))
#         metric = fraudar_scores
#         thresh = fraudar_threshold
#     elif filter_type == 'DENSITY':
#         cluster_density, fraudar_scores, cluster_counts, clusters = zip(*sorted(zip(cluster_density, fraudar_scores, cluster_counts, clusters)))
#         metric = cluster_density
#         thresh = density_threshold
#     elif filter_type == 'MODULARITY':
#         modularity_scores, cluster_density, fraudar_scores, cluster_counts, clusters = zip(*sorted(zip(modularity_scores, cluster_density, fraudar_scores, cluster_counts, clusters)))
#         metric = modularity_scores
#         thresh = modularity_threshold
    metric, cluster_counts, clusters = zip(*sorted(zip(metric, cluster_counts, clusters)))
    if threshold_filter:
        # Position of the first cluster above the cut-off. The check is
        # "is None" rather than truthiness, because position 0 (every cluster
        # is above the cut-off) is a valid answer.
        max_index = next(
            (index for index, item in enumerate(metric) if item > cutoff_metric),
            None)
        if max_index is None:
            max_index = len(metric) - 1
    else:
        max_index = 0

    input_size = bigram_matrix.shape[0]
    binary_pred_labels = [0] * input_size
    for c in clusters[max_index:]:
        # Positions of the ads that belong to cluster c.
        for i in np.flatnonzero(labels == c):
            binary_pred_labels[i] = 1

    anomaly_clusters = clusters[max_index:]
    anomaly_cluster_metric = metric[max_index:]

    filtered_clusters = []
    filtered_cluster_metric = []
    df_orig = df_data.copy()

    for ind, c in enumerate(anomaly_clusters):
        if anomaly_cluster_metric[ind] == 1.0:
            continue

        # NOTE(review): this assignment only creates an unused local; it looks
        # like the first line of a following module-level group that ended up
        # inside the loop. Kept as found.
        suspicious_scores = post_merging_metric['weighted_outer_edge_perc_scores']
# Bind the post-merging metrics to short module-level names.
# suspicious_scores is assigned here at module level as well: the filtering
# code that follows indexes it with positions taken from the post-merging
# `clusters`, so it has to come from the same post_merging_metric lists as
# everything else in this group.
suspicious_scores = post_merging_metric['weighted_outer_edge_perc_scores']
text_similarity = post_merging_metric['pairwise_similarity']
clusters = post_merging_metric['clusters']
avg_label_scores = post_merging_metric['avg_label_scores']
avg_binary_scores = post_merging_metric['avg_binary_scores']
w_density = post_merging_metric['weighted_cluster_density']
uw_density = post_merging_metric['unweighted_cluster_density']
w_fraudar = post_merging_metric['weighted_fraudar_scores']
uw_fraudar = post_merging_metric['unweighted_fraudar_scores']
w_outer_edge = post_merging_metric['weighted_outer_edge_perc_scores']
uw_outer_edge = post_merging_metric['unweighted_outer_edge_perc_scores']
w_shell_edge = post_merging_metric['weighted_shell_edge_perc_scores']
uw_shell_edge = post_merging_metric['unweighted_shell_edge_perc_scores']
w_outer_mod = post_merging_metric['weighted_outer_modularity_scores']
uw_outer_mod = post_merging_metric['unweighted_outer_modularity_scores']
w_shell_mod = post_merging_metric['weighted_shell_modularity_scores']
uw_shell_mod = post_merging_metric['unweighted_shell_modularity_scores']

# Keep only the clusters returned by filter_by_name and cut every per-cluster
# metric list down to the same positions.
_name_filtered = filter_by_name(clusters, df_orig.copy())
print(_name_filtered)
_kept_ids = set(_name_filtered)  # set: constant-time membership test below
index_list = [i for i, e in enumerate(clusters) if e in _kept_ids]


def _pick(values):
    """Return the entries of `values` at the positions listed in index_list."""
    return [values[i] for i in index_list]


# filtered_clusters is rebuilt from index_list instead of being left as a set:
# a set has no defined order (and drops duplicates), so it could not be zipped
# element-for-element with the filtered_* metric lists below.
filtered_clusters = _pick(clusters)
filtered_avg_label_scores = _pick(avg_label_scores)
filtered_text_similarity = _pick(text_similarity)
filtered_suspicious_scores = _pick(suspicious_scores)
filtered_avg_binary_scores = _pick(avg_binary_scores)
filtered_w_density = _pick(w_density)
filtered_uw_density = _pick(uw_density)
filtered_w_fraudar = _pick(w_fraudar)
filtered_uw_fraudar = _pick(uw_fraudar)
filtered_w_outer_edge = _pick(w_outer_edge)
filtered_uw_outer_edge = _pick(uw_outer_edge)
filtered_w_shell_edge = _pick(w_shell_edge)
filtered_uw_shell_edge = _pick(uw_shell_edge)
filtered_w_outer_mod = _pick(w_outer_mod)
filtered_uw_outer_mod = _pick(uw_outer_mod)
filtered_w_shell_mod = _pick(w_shell_mod)
filtered_uw_shell_mod = _pick(uw_shell_mod)

# Show the suspicious scores of the clusters kept by the name filter.
print(filtered_suspicious_scores)

from scipy.stats import spearmanr
import plotly.express as px

# Sort every filtered per-cluster list together by suspicious score, highest
# first. Rows are compared as whole tuples, so equal scores are ordered by the
# columns that follow.
#
# filtered_avg_binary_scores is carried as the last column so it stays aligned
# with the other lists, and every list is reordered exactly once from its own
# column (filtered_uw_outer_edge included).
_rows_high_to_low = reversed(sorted(zip(
    filtered_suspicious_scores, filtered_clusters, filtered_avg_label_scores, filtered_text_similarity,
    filtered_w_density, filtered_uw_density, filtered_w_fraudar, filtered_uw_fraudar,
    filtered_w_outer_edge, filtered_uw_outer_edge, filtered_w_shell_edge, filtered_uw_shell_edge,
    filtered_w_outer_mod, filtered_uw_outer_mod, filtered_w_shell_mod, filtered_uw_shell_mod,
    filtered_avg_binary_scores)))
(filtered_suspicious_scores, filtered_clusters, filtered_avg_label_scores, filtered_text_similarity,
 filtered_w_density, filtered_uw_density, filtered_w_fraudar, filtered_uw_fraudar,
 filtered_w_outer_edge, filtered_uw_outer_edge, filtered_w_shell_edge, filtered_uw_shell_edge,
 filtered_w_outer_mod, filtered_uw_outer_mod, filtered_w_shell_mod, filtered_uw_shell_mod,
 filtered_avg_binary_scores) = (list(_column) for _column in zip(*_rows_high_to_low))

# Cut-offs used by the top-k report that follows.
top_k = [10, 50, 100, 150, 200, 250, 300, 350, 400]
rcParams['figure.figsize'] = 4,4
for k in top_k:
    print ('==================================k = {}========================================'.format(k) )
    plt.hist(filtered_avg_label_scores[:k], bins=7)
    plt.show()
    
    plt.hist(filtered_avg_binary_scores[:k], bins=3)
    plt.show()
    
    plt.plot(filtered_text_similarity[:k], marker='o', linestyle='--')
    plt.xlabel(' Clusters')
    plt.ylabel('avg text similarity')
    plt.show()
    
    print ("Average of average label scores : {}".format(sum(filtered_avg_label_scores[:k])/k))
    print ("Average text similarity : {}".format(sum(filtered_text_similarity[:k])/k))
    print ('\n')
    eig_ts_pscore = spearmanr(filtered_avg_label_scores[:k], filtered_text_similarity[:k])[0]
    eig_w_den_pscore = spearmanr(filtered_avg_label_scores[:k], filtered_w_density[:k])[0]
    eig_uw_den_pscore = spearmanr(filtered_avg_label_scores[:k], filtered_uw_density[:k])[0]
    eig_w_fraud_pscore = spearmanr(filtered_avg_label_scores[:k], filtered_w_fraudar[:k])[0]
    eig_uw_fraud_pscore = spearmanr(filtered_avg_label_scores[:k], filtered_uw_fraudar[:k])[0]
    eig_w_edge_out_pscore = spearmanr(filtered_avg_label_scores[:k], filtered_w_outer_edge[:k])[0]
    eig_uw_edge_out_pscore = spearmanr(filtered_avg_label_scores[:k], filtered_uw_outer_edge[:k])[0]
    eig_w_edge_shell_pscore = spearmanr(filtered_avg_label_scores[:k], filtered_w_shell_edge[:k])[0]
    eig_uw_edge_shell_pscore = spearmanr(filtered_avg_label_scores[:k], filtered_uw_shell_edge[:k])[0]
    eig_w_mod_out_pscore = spearmanr(filtered_avg_label_scores[:k], filtered_w_outer_mod[:k])[0]
    eig_uw_mod_out_pscore = spearmanr(filtered_avg_label_scores[:k], filtered_uw_outer_mod[:k])[0]
    eig_w_mod_shell_pscore = spearmanr(filtered_avg_label_scores[:k], filtered_w_shell_mod[:k])[0]
    eig_uw_mod_shell_pscore = spearmanr(filtered_avg_label_scores[:k], filtered_uw_shell_mod[:k])[0]
    
    print ("Spearman Correlation | Average Label Score |          Text Similarity          | {0:.2f} ".format(eig_ts_pscore))
    print ("Spearman Correlation | Average Label Score |     Weighted Cluster Density      | {0:.2f} ".format(eig_w_den_pscore))
    print ("Spearman Correlation | Average Label Score |    UnWeighted Cluster Density     | {0:.2f} ".format(eig_uw_den_pscore))
    print ("Spearman Correlation | Average Label Score |      Weighted Fraudar Score       | {0:.2f} ".format(eig_w_fraud_pscore))
    print ("Spearman Correlation | Average Label Score |     UnWeighted Fraudar Score      | {0:.2f} ".format(eig_uw_fraud_pscore))
    print ("Spearman Correlation | Average Label Score |     Weighted Outer Edge Perc      | {0:.2f} ".format(eig_w_edge_out_pscore))
    print ("Spearman Correlation | Average Label Score |    UnWeighted Outer Edge Perc     | {0:.2f} ".format(eig_uw_edge_out_pscore))
    print ("Spearman Correlation | Average Label Score |     Weighted Shell Edge Perc      | {0:.2f} ".format(eig_w_edge_out_pscore))
    print ("Spearman Correlation | Average Label Score |    UnWeighted Shell Edge Perc     | {0:.2f} ".format(eig_uw_edge_out_pscore))
    print ("Spearman Correlation | Average Label Score |  Weighted Outer Modularity Score  | {0:.2f} ".format(eig_w_mod_out_pscore))
    print ("Spearman Correlation | Average Label Score | UnWeighted Outer Modularity Score | {0:.2f} ".format(eig_uw_mod_out_pscore))
    print ("Spearman Correlation | Average Label Score |  Weighted Shell Modularity Score  | {0:.2f} ".format(eig_w_mod_shell_pscore))
    print ("Spearman Correlation | Average Label Score | UnWeighted Shell Modularity Score | {0:.2f} ".format(eig_uw_mod_shell_pscore))
    
    eig_w_den_pscore = spearmanr(filtered_avg_binary_scores[:k], filtered_w_density[:k])[0]
    eig_uw_den_pscore = spearmanr(filtered_avg_binary_scores[:k], filtered_uw_density[:k])[0]
    eig_w_fraud_pscore = spearmanr(filtered_avg_binary_scores[:k], filtered_w_fraudar[:k])[0]
    eig_uw_fraud_pscore = spearmanr(filtered_avg_binary_scores[:k], filtered_uw_fraudar[:k])[0]
    eig_w_edge_out_pscore = spearmanr(filtered_avg_binary_scores[:k], filtered_w_outer_edge[:k])[0]
    eig_uw_edge_out_pscore = spearmanr(filtered_avg_binary_scores[:k], filtered_uw_outer_edge[:k])[0]
    eig_w_edge_shell_pscore = spearmanr(filtered_avg_binary_scores[:k], filtered_w_shell_edge[:k])[0]
    eig_uw_edge_shell_pscore = spearmanr(filtered_avg_binary_scores[:k], filtered_uw_shell_edge[:k])[0]
    eig_w_mod_out_pscore = spearmanr(filtered_avg_binary_scores[:k], filtered_w_outer_mod[:k])[0]
    eig_uw_mod_out_pscore = spearmanr(filtered_avg_binary_scores[:k], filtered_uw_outer_mod[:k])[0]
    eig_w_mod_shell_pscore = spearmanr(filtered_avg_binary_scores[:k], filtered_w_shell_mod[:k])[0]
    eig_uw_mod_shell_pscore = spearmanr(filtered_avg_binary_scores[:k], filtered_uw_shell_mod[:k])[0]
    
    print ('\n')
    print ("Spearman Correlation | Average Binary Score |     Weighted Cluster Density      | {0:.2f} ".format(eig_w_den_pscore))
    print ("Spearman Correlation | Average Binary Score |    UnWeighted Cluster Density     | {0:.2f} ".format(eig_uw_den_pscore))
    print ("Spearman Correlation | Average Binary Score |      Weighted Fraudar Score       | {0:.2f} ".format(eig_w_fraud_pscore))
    print ("Spearman Correlation | Average Binary Score |     UnWeighted Fraudar Score      | {0:.2f} ".format(eig_uw_fraud_pscore))
    print ("Spearman Correlation | Average Binary Score |     Weighted Outer Edge Perc      | {0:.2f} ".format(eig_w_edge_out_pscore))
    print ("Spearman Correlation | Average Binary Score |    UnWeighted Outer Edge Perc     | {0:.2f} ".format(eig_uw_edge_out_pscore))
    print ("Spearman Correlation | Average Binary Score |     Weighted Shell Edge Perc      | {0:.2f} ".format(eig_w_edge_out_pscore))
    print ("Spearman Correlation | Average Binary Score |    UnWeighted Shell Edge Perc     | {0:.2f} ".format(eig_uw_edge_out_pscore))
    print ("Spearman Correlation | Average Binary Score |  Weighted Outer Modularity Score  | {0:.2f} ".format(eig_w_mod_out_pscore))
    print ("Spearman Correlation | Average Binary Score | UnWeighted Outer Modularity Score | {0:.2f} ".format(eig_uw_mod_out_pscore))
    print ("Spearman Correlation | Average Binary Score |  Weighted Shell Modularity Score  | {0:.2f} ".format(eig_w_mod_shell_pscore))
    print ("Spearman Correlation | Average Binary Score | UnWeighted Shell Modularity Score | {0:.2f} ".format(eig_uw_mod_shell_pscore))
print (len(clusters))

ax = sns.regplot(x=filtered_avg_label_scores, y=filtered_w_outer_edge, x_ci=68, truncate=False, lowess=True, scatter_kws = {'alpha': 0.2}, line_kws = {'color': 'red'})
ax.set(xlabel='Average Label Score', ylabel='Weighted outer edge percentage')
plt.title('After filtering out clusters with only one individual')
plt.show()

# import seaborn as sns
ax = sns.regplot(x=filtered_text_similarity, y=filtered_w_outer_edge, ci=68, truncate=False, lowess=True, scatter_kws = {'color': 'b', 'alpha': 0.2}, line_kws = {'color': 'red'})
ax.set(xlabel='Text Similarity', ylabel='Weighted outer edge percentage')
plt.title('After filtering out clusters with only one individual')
plt.show()


ax = sns.regplot(x=filtered_avg_label_scores, y=filtered_text_similarity, ci=68, truncate=False, lowess=True, scatter_kws = {'color': 'b', 'alpha': 0.2}, line_kws = {'color': 'red'})
ax.set(xlabel='Average Label Score', ylabel='Text Similarity')
plt.title('After filtering out clusters with only one individual')
plt.show()

    binary_dense_pred_labels = [0]*input_size
    for ind, c in enumerate(filtered_clusters):
        
        cluster_idx = np.argwhere(labels == c).reshape(-1)
#         score = sum(df_fil['label'])/len(df_fil)
#         if score >= 0:
        for i in cluster_idx:
            if probs[i] > 0.0:
                binary_dense_pred_labels[i] = 1
    #     print ("Number of predicted ads : {}".format(binary_dense_pred_labels.count(1)))
    fmeasure = f1_score(binary_true_labels,binary_dense_pred_labels,average='macro')
    print ("F-measure : {}".format(fmeasure))
    pscore = classification_report(binary_true_labels, binary_dense_pred_labels)
    print (pscore)
    
    return filtered_clusters, filtered_cluster_metric


In [None]:
from scipy.stats import spearmanr
# Spearman rank correlation between the average label score of the first
# `top_k` clusters and each cluster-level metric (only the coefficient, i.e.
# element 0 of the spearmanr result, is kept).
# NOTE(review): `eigen_ratios` is loaded but never used below even though the
# result names carry an `eig_` prefix -- confirm which series was intended.
eigen_ratios = post_merging_metric['eigen_ratios']
top_label_scores = avg_label_scores[:top_k]

# Density and Fraudar scores
eig_w_den_pscore = spearmanr(top_label_scores, w_density[:top_k])[0]
eig_uw_den_pscore = spearmanr(top_label_scores, uw_density[:top_k])[0]
eig_w_fraud_pscore = spearmanr(top_label_scores, w_fraudar[:top_k])[0]
eig_uw_fraud_pscore = spearmanr(top_label_scores, uw_fraudar[:top_k])[0]

# Outer / shell edge percentages
eig_w_edge_out_pscore = spearmanr(top_label_scores, w_outer_edge[:top_k])[0]
eig_uw_edge_out_pscore = spearmanr(top_label_scores, uw_outer_edge[:top_k])[0]
eig_w_edge_shell_pscore = spearmanr(top_label_scores, w_shell_edge[:top_k])[0]
eig_uw_edge_shell_pscore = spearmanr(top_label_scores, uw_shell_edge[:top_k])[0]

# Outer / shell modularity
eig_w_mod_out_pscore = spearmanr(top_label_scores, w_outer_mod[:top_k])[0]
eig_uw_mod_out_pscore = spearmanr(top_label_scores, uw_outer_mod[:top_k])[0]
eig_w_mod_shell_pscore = spearmanr(top_label_scores, w_shell_mod[:top_k])[0]
eig_uw_mod_shell_pscore = spearmanr(top_label_scores, uw_shell_mod[:top_k])[0]

In [None]:
# Report the Spearman coefficients computed in the previous cell, one row per
# cluster metric.
# Fix: the two "Shell Edge Perc" rows used to print the *outer* edge
# coefficients (copy-paste slip); they now print the shell-edge ones.
print ("Spearman Correlation | Eigen Ratio |     Weighted Cluster Density      | {0:.2f} ".format(eig_w_den_pscore))
print ("Spearman Correlation | Eigen Ratio |    UnWeighted Cluster Density     | {0:.2f} ".format(eig_uw_den_pscore))
print ("Spearman Correlation | Eigen Ratio |      Weighted Fraudar Score       | {0:.2f} ".format(eig_w_fraud_pscore))
print ("Spearman Correlation | Eigen Ratio |     UnWeighted Fraudar Score      | {0:.2f} ".format(eig_uw_fraud_pscore))
print ("Spearman Correlation | Eigen Ratio |     Weighted Outer Edge Perc      | {0:.2f} ".format(eig_w_edge_out_pscore))
print ("Spearman Correlation | Eigen Ratio |    UnWeighted Outer Edge Perc     | {0:.2f} ".format(eig_uw_edge_out_pscore))
print ("Spearman Correlation | Eigen Ratio |     Weighted Shell Edge Perc      | {0:.2f} ".format(eig_w_edge_shell_pscore))
print ("Spearman Correlation | Eigen Ratio |    UnWeighted Shell Edge Perc     | {0:.2f} ".format(eig_uw_edge_shell_pscore))
print ("Spearman Correlation | Eigen Ratio |  Weighted Outer Modularity Score  | {0:.2f} ".format(eig_w_mod_out_pscore))
print ("Spearman Correlation | Eigen Ratio | UnWeighted Outer Modularity Score | {0:.2f} ".format(eig_uw_mod_out_pscore))
print ("Spearman Correlation | Eigen Ratio |  Weighted Shell Modularity Score  | {0:.2f} ".format(eig_w_mod_shell_pscore))
print ("Spearman Correlation | Eigen Ratio | UnWeighted Shell Modularity Score | {0:.2f} ".format(eig_uw_mod_shell_pscore))

In [None]:
# Print the ad bodies of the first 100 clusters for manual inspection.
# Negative cluster ids are skipped; ads are separated by a dashed rule and
# clusters by a rule of '=' characters.
for cl in clusters[:100]:
    if cl < 0:
        continue
    cluster_bodies = df_orig.loc[df_orig['final_label'] == cl, 'body']
    for body in cluster_bodies:
        print(body)
        print('-------------------------------------------------------------')
    print('==========================================================')

In [None]:
# Scratch cell: draw one value from {0, 1} with weights 0.33 / 0.67.
# `random.choices` returns a list, so the result is a one-element list.
random.choices([0,1], weights=[0.33, 0.67], k=1)



In [None]:
import random

# Build a binary prediction per ad from its final cluster label.
#   * final_label -1 / -2 : random guess (P(1) = 0.33) when `use_random`,
#     otherwise 0.
#   * any other label     : 1, or -- when `use_metric` is set -- 1 only if the
#     cluster's metric score exceeds `thresholds`.
# Fix: the comparison used an undefined (or stale, list-valued) `threshold`
# instead of the `thresholds` cut-off defined in this cell.
# NOTE(review): `binary_pred_labels[ind]` uses the DataFrame index label as a
# position -- assumes df_orig has a default 0..n-1 index; confirm.
binary_true_labels = df_orig['binary_label']
binary_pred_labels = np.zeros(len(binary_true_labels))
clusters = post_merging_metric['clusters']
metric = post_merging_metric['weighted_outer_edge_perc_scores']
thresholds = 3
use_metric = False
use_random = True
for ind, row in df_orig.iterrows():
    cl = row['final_label']
    if cl in (-1, -2):
        # Without `use_random` the entry keeps its initial value of 0.
        if use_random:
            binary_pred_labels[ind] = random.choices([0,1], weights=[0.67, 0.33], k=1)[0]
    elif use_metric:
        score = metric[clusters.index(cl)]
        if score > thresholds:
            binary_pred_labels[ind] = 1
    else:
        binary_pred_labels[ind] = 1



In [None]:
def get_weighted_acc(tp, tn, n, p):
    """Return the class-weighted (balanced) accuracy.

    True positives are re-weighted by ``n / p`` so that both classes carry the
    same total weight: ``(tp * n / p + tn) / (2 * n)``, which equals the mean
    of the true-positive and true-negative rates.

    Args:
        tp: number of true positives.
        tn: number of true negatives.
        n: total number of actual negatives.
        p: total number of actual positives.

    Returns:
        The weighted accuracy as a float in [0, 1].

    Raises:
        ZeroDivisionError: if ``n`` or ``p`` is a plain Python zero.
    """
    # Fix: the numerator previously added `n` instead of `tn`, which ignored
    # the true-negative count entirely and inflated the result.
    return (tp * n / p + tn) / (2 * n)

def get_tpr(tp, fn):
    """Return the true-positive rate: ``tp`` over all actual positives."""
    actual_positives = tp + fn
    return tp / actual_positives

def get_fpr(fp, tn):
    """Return the false-positive rate: ``fp`` over all actual negatives."""
    actual_negatives = fp + tn
    return fp / actual_negatives

In [None]:
from sklearn.metrics import classification_report, f1_score, confusion_matrix


# Summarise the binary predictions: class totals, weighted accuracy,
# confusion matrix and the per-class classification report.
c_mat = confusion_matrix(binary_true_labels, binary_pred_labels)

# scikit-learn lays the matrix out as rows = true class, columns = predicted.
tn, fp = c_mat[0, 0], c_mat[0, 1]
fn, tp = c_mat[1, 0], c_mat[1, 1]

total_n = np.count_nonzero(binary_true_labels == 0)
total_p = np.count_nonzero(binary_true_labels == 1)
weight_acc = get_weighted_acc(tp, tn, total_n, total_p)

print(total_p, total_n)
print("Weighted Accuracy : {}".format(round(weight_acc, 2)))

print(c_mat)

pscore = classification_report(binary_true_labels, binary_pred_labels)
print(pscore)

In [None]:
# Re-fit the 2-3 gram TF-IDF representation on the pre-processed ad text.
content = list(df_orig['content_p'])
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words=stop_words,
    ngram_range=(2, 3),
    min_df=2,
    max_df=0.8,
    norm='l2',
    smooth_idf=True,
)
bigram_matrix = vectorizer.fit_transform(content)

print(bigram_matrix.shape)

In [None]:
# Collect the row index labels of every ad that belongs to a real cluster
# (ids -1 and -2 are skipped) with at least 20 members, cluster by cluster.
sort_ind = []
for cl in clusters:
    if cl == -1 or cl == -2:
        continue
    df_f = df_orig[df_orig['final_label'] == cl]
    if len(df_f) >= 20:
        sort_ind.extend(df_f.index)
print(sort_ind)

In [None]:

# Heat-map of pairwise cosine similarity between ads, with rows and columns
# re-ordered by `sort_ind` (built in the previous cell). Saved to
# ../results/sim_matrix.png.
sim_matrix = cosine_similarity(bigram_matrix)
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(111)
# Hide every tick and tick label on both axes.
# Fix: `right`, `left` and `labelleft` were passed the string 'off', which
# current Matplotlib treats as a truthy value (i.e. "on"); use booleans like
# the other arguments.
ax.tick_params(
    axis='both',       # apply to the x- and y-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,
    top=False,
    left=False,
    right=False,
    labelbottom=False,
    labelleft=False)
plt.imshow(sim_matrix[sort_ind, :][:,sort_ind])
plt.savefig('../results/sim_matrix.png')

In [None]:
import random

# ROC-style sweep: for each cut-off on the weighted outer-edge percentage,
# flag every ad whose cluster score exceeds the cut-off and record the
# resulting false-/true-positive rates.
rcParams['figure.figsize'] = 4,4
clusters = post_merging_metric['clusters']
metric = post_merging_metric['weighted_outer_edge_perc_scores']
threshold = [0, 0.25, 0.5, 1, 1.5, 2, 2.5, 3, 3.5]
use_metric = True
use_random = True
fprs = []
tprs = []
for thresh in threshold:
    binary_true_labels = df_orig['binary_label']
    binary_pred_labels = np.zeros(len(binary_true_labels))
    for ind, row in df_orig.iterrows():
        cl = row['final_label']
        if cl == -1:
            # Label -1 is always predicted 0 (the array's initial value).
            continue
        if cl == -2:
            # Label -2 gets a coin flip when `use_random`, otherwise stays 0.
            if use_random:
                binary_pred_labels[ind] = random.choice([0,1])
            continue
        # Clustered ad: positive unless the metric gate is on and not passed.
        if not use_metric or metric[clusters.index(cl)] > thresh:
            binary_pred_labels[ind] = 1

    c_mat = confusion_matrix(binary_true_labels, binary_pred_labels)
    tn, fp = c_mat[0][0], c_mat[0][1]
    fn, tp = c_mat[1][0], c_mat[1][1]

    fprs.append(get_fpr(fp, tn))
    tprs.append(get_tpr(tp, fn))

plt.step(fprs, tprs)
plt.ylabel('TPR (Sensitivity)')
plt.xlabel('FPR (1-Specificity)')
plt.show()



In [None]:
def calculate_score(df, score_type):
    """Return a label-based score for the rows of *df*.

    Args:
        df: DataFrame holding the rows of one cluster; must provide the
            ``binary_label`` column for ``'max_binary'`` and the ``label``
            column for ``'avg_label'``.
        score_type: ``'max_binary'`` for the maximum of ``binary_label`` or
            ``'avg_label'`` for the mean of ``label``.

    Returns:
        The requested score.

    Raises:
        ValueError: if ``score_type`` is not one of the supported names
            (previously this surfaced as an UnboundLocalError), or if *df* is
            empty and ``'max_binary'`` is requested.
        ZeroDivisionError: if *df* is empty and ``'avg_label'`` is requested.
    """
    if score_type == 'max_binary':
        return max(df['binary_label'])
    if score_type == 'avg_label':
        return sum(df['label'])/len(df)
    raise ValueError(
        "Unknown score_type {!r}; expected 'max_binary' or 'avg_label'".format(score_type))

In [None]:
#Supervised Setting

import random

# Supervised variant of the ROC sweep: each cluster is scored by the average
# label of its members that are NOT in a random 10% sample of rows, and ads
# are flagged when their cluster score exceeds the cut-off.
binary_true_labels = df_orig['binary_label']

n_sample = int(0.1 * len(df_orig))
random_sample = random.sample(range(len(df_orig)), n_sample)
# Fix: only add the -2 sentinel once; the unconditional append grew the shared
# cluster list by one entry on every re-run of this cell.
if -2 not in clusters:
    clusters.append(-2)
score_vals = []

for cl in clusters:
    if cl == -1 or cl == -2:
        score_vals.append(-1)
        continue
    df_f = df_orig[df_orig['final_label'] == cl]
    # Fix: remove the sampled rows with one boolean mask instead of dropping
    # them in place from a filtered slice while iterating over it.
    df_f = df_f[~df_f.index.isin(random_sample)]
    if len(df_f) == 0:
        # Every member was sampled out: no labels left to average. Use the
        # same -1 sentinel as the unscored labels so the cluster is never
        # flagged, instead of dividing by zero.
        score_vals.append(-1)
    else:
        score_vals.append(calculate_score(df_f, 'avg_label'))

print (len(score_vals))


clusters = post_merging_metric['clusters']
metric = score_vals
use_metric = True
use_random = True
threshold = [0,0.5,1,1.5,2,2.5,3,3.5,4,4.5,5,5.5,6]
tprs = []
fprs= []
for thresh in threshold:
    binary_pred_labels = np.zeros(len(binary_true_labels))
    for ind, row in df_orig.iterrows():
        cl = row['final_label']
        if cl == -1:
            binary_pred_labels[ind] = 0
        elif cl == -2:
            if use_random:
                binary_pred_labels[ind] = random.choice([0,1])
            else:
                binary_pred_labels[ind] = 0
        else:
            if use_metric:
                i = clusters.index(cl)
                score = metric[i]
                if score > thresh:
                    binary_pred_labels[ind] = 1
            else:
                binary_pred_labels[ind] = 1

    c_mat = confusion_matrix(binary_true_labels, binary_pred_labels)

    tn, fp, fn, tp = c_mat[0][0], c_mat[0][1], c_mat[1][0], c_mat[1][1]

    fprs.append(get_fpr(fp, tn))
    tprs.append(get_tpr(tp, fn))

plt.step(fprs, tprs)
plt.ylabel('TPR (Sensitivity)')
plt.xlabel('FPR (1-Specificity)')
plt.show()

In [None]:
# Re-draw the step curve from the most recently computed `fprs` / `tprs`.
plt.step(fprs, tprs)
plt.xlabel('FPR (1-Specificity)')
plt.ylabel('TPR (Sensitivity)')
plt.show()

In [None]:
from sklearn.metrics import classification_report, f1_score, confusion_matrix


# Class totals, weighted accuracy and classification report for the most
# recent `binary_pred_labels`.
c_mat = confusion_matrix(binary_true_labels, binary_pred_labels)

# scikit-learn lays the matrix out as rows = true class, columns = predicted.
tn, fp = c_mat[0, 0], c_mat[0, 1]
fn, tp = c_mat[1, 0], c_mat[1, 1]

total_n = np.count_nonzero(binary_true_labels == 0)
total_p = np.count_nonzero(binary_true_labels == 1)
weight_acc = get_weighted_acc(tp, tn, total_n, total_p)

print(total_p, total_n)
print("Weighted Accuracy : {}".format(round(weight_acc, 2)))

pscore = classification_report(binary_true_labels, binary_pred_labels)
print(pscore)

In [None]:
# Sanity check: print the lengths of the cluster id list and of the
# average-label-score list side by side.
print (len(post_merging_metric['clusters']), len(post_merging_metric['avg_label_scores']))

In [None]:
from operator import add

# Bucket the first 458 clusters into three classes by average label score
# (> 3.5, (2, 3.5], and the rest; cluster id -1 is skipped), then count per
# class how many member ads have label < 3, == 3 and > 3.
class1_clusters = []
class2_clusters = []
class3_clusters = []
clusters = post_merging_metric['clusters']
scores = post_merging_metric['avg_label_scores']
df_plot = df_orig[df_orig['sim_check'] == False]
for ind, cl in enumerate(clusters[:458]):
    if cl == -1:
        continue
    cluster_score = scores[ind]
    if cluster_score > 3.5:
        class1_clusters.append(cl)
    elif cluster_score > 2:
        class2_clusters.append(cl)
    else:
        class3_clusters.append(cl)

# purity[k] holds the three label counts for class k + 1.
purity = np.zeros((3,3))
classes = [{'clusters': member_clusters, 'purity': []}
           for member_clusters in (class1_clusters, class2_clusters, class3_clusters)]
for ind, cls in enumerate(classes):
    t_p = [0, 0, 0]
    for cl in cls['clusters']:
        cluster_labels = df_plot.loc[df_plot['final_label'] == cl, 'label']
        p = [int((cluster_labels < 3).sum()),
             int((cluster_labels == 3).sum()),
             int((cluster_labels > 3).sum())]
        t_p = [running + extra for running, extra in zip(t_p, p)]
    purity[ind] = np.array(t_p)
    cls['purity'] = t_p


In [None]:
# Stacked bar chart with one bar per row of `purity`; the three columns are
# stacked bottom-to-top in green, grey and red.
xaxis = range(3)
width = 0.35
first_col, second_col, third_col = purity.T[0], purity.T[1], purity.T[2]
gap_1 = [lower + upper for lower, upper in zip(first_col, second_col)]
p0 = plt.bar(xaxis, first_col, width, color='green', alpha=0.4)
p1 = plt.bar(xaxis, second_col, width, bottom=first_col, color='grey', alpha=0.6 )
p2 = plt.bar(xaxis, third_col, width, bottom=gap_1, color='red', alpha=0.6)
plt.show()

In [None]:
from sklearn.metrics import classification_report, f1_score

# Overall classification metrics for `binary_all_labels`, followed by a table
# of per-class cluster counts and average binary-label purity. Clusters are
# bucketed by average label score: >= 3.5, [2, 3.5), and the rest.
true_labels_all = df_original['binary_label']
print (true_labels_all)
c_mat = confusion_matrix(true_labels_all, binary_all_labels)
b_score = balanced_accuracy_score(true_labels_all, binary_all_labels)
tn, fp, fn, tp = c_mat[0][0], c_mat[0][1], c_mat[1][0], c_mat[1][1]
total_n = len(np.where(true_labels_all==0)[0])
total_p = len(np.where(true_labels_all==1)[0])
weight_acc = get_weighted_acc(tp, tn, total_n, total_p)
print (total_n, total_p)
pscore = classification_report(true_labels_all, binary_all_labels)
fmeasure = f1_score(true_labels_all, binary_all_labels,average='weighted')
clusters = list(set(df_data['cluster_label']))
print (pscore)
print ("Fmeasure: {}".format(fmeasure))
print ("Weighted Acc : {}".format(weight_acc))
class1_purity = []
class2_purity = []
class3_purity = []
class1_count = 0
class2_count = 0
class3_count = 0
avg_label_score = []
for c in clusters:
    df_fil = df_data[df_data['cluster_label']==c]
    if len(df_fil) == 0:
        # Fix: the empty-cluster guard only covered `score`; the purity on the
        # following line still divided by zero. Both now default to 0.
        score = 0
        binary_label_purity = 0
    else:
        score = sum(df_fil['label'])/len(df_fil)
        binary_label_purity = sum(df_fil['binary_label'])/len(df_fil)
    if score >= 3.5:
        class1_count += 1
        class1_purity.append(binary_label_purity)
    elif score >= 2:
        class2_count += 1
        class2_purity.append(binary_label_purity)
    else:
        class3_count += 1
        class3_purity.append(binary_label_purity)

# Fix: a class with no clusters used to raise ZeroDivisionError here; report
# NaN for its average instead.
class1_avg = round(sum(class1_purity)/class1_count, 2) if class1_count else float('nan')
class2_avg = round(sum(class2_purity)/class2_count, 2) if class2_count else float('nan')
class3_avg = round(sum(class3_purity)/class3_count, 2) if class3_count else float('nan')

print ("\n")
print ("|           |   Total   | Filtered |  Avg Purity  |")
print ("|  Class 1  |    {}    |    {}    |     {}     |".format(class1_count, class1_count, class1_avg))
print ("|  Class 2  |    {}    |    {}    |     {}     |".format(class2_count, class2_count, class2_avg))
print ("|  Class 3  |    {}    |    {}    |     {}     |".format(class3_count, class3_count, class3_avg))


In [None]:
# Quick look: average label score (x) against cluster count (y) per cluster.
plt.scatter(post_merging_metric['avg_label_scores'], post_merging_metric['cluster_counts'], alpha=0.2)

In [None]:
# Maps a metric's display name to a metric key string (the same key strings
# are used to index `post_merging_metric` elsewhere in this notebook).
# NOTE(review): unlike METRIC_CUTOFF_MAP below, this map has no entries for
# the *_SHELL_MODULARITY and *_SHELL_EDGE_PERCENTAGE metrics -- confirm
# whether that omission is intentional.
METRIC_MAP = {
    'WEIGHTED_FRAUDAR' : 'weighted_fraudar_scores',
    'WEIGHTED_OUTER_MODULARITY' : 'weighted_outer_modularity_scores',
    'WEIGHTED_OUTER_EDGE_PERCENTAGE' : 'weighted_outer_edge_perc_scores',
    'WEIGHTED_DENSITY' : 'weighted_cluster_density',
    'UNWEIGHTED_FRAUDAR' : 'unweighted_fraudar_scores',
    'UNWEIGHTED_OUTER_MODULARITY' : 'unweighted_outer_modularity_scores',
    'UNWEIGHTED_OUTER_EDGE_PERCENTAGE' : 'unweighted_outer_edge_perc_scores',
    'UNWEIGHTED_DENSITY' : 'unweighted_cluster_density'
}

# Per-metric cut-off values; in the scatter plots below they are drawn as the
# horizontal line and passed as the "low metric" threshold of quadrant 4.
METRIC_CUTOFF_MAP = {
    'WEIGHTED_FRAUDAR' : 0.2,
    'WEIGHTED_OUTER_MODULARITY' : 0.0005,
    'WEIGHTED_SHELL_MODULARITY' : 0.0005,
    'WEIGHTED_OUTER_EDGE_PERCENTAGE' : 0.5,
    'WEIGHTED_SHELL_EDGE_PERCENTAGE' : 0.2,
    'WEIGHTED_DENSITY' : 0.05,
    'UNWEIGHTED_FRAUDAR' : 2,
    'UNWEIGHTED_OUTER_MODULARITY' : 0.0005,
    'UNWEIGHTED_SHELL_MODULARITY' : 0.0005,
    'UNWEIGHTED_OUTER_EDGE_PERCENTAGE' : 0.5,
    'UNWEIGHTED_SHELL_EDGE_PERCENTAGE' : 0.2,
    'UNWEIGHTED_DENSITY' : 0.5
}

In [None]:
def get_fourth_quadrant_count(avg_label_scores, metric, threshold):
    """Return the indices of clusters with a high label score but a low metric.

    Despite the name, the function returns the list of matching indices, not
    their count; callers take ``len()`` of the result.

    Args:
        avg_label_scores: sequence of per-cluster average label scores.
        metric: sequence of per-cluster metric values, aligned with
            ``avg_label_scores``.
        threshold: metric values strictly below this are considered low.

    Returns:
        List (ascending) of indices ``i`` with ``avg_label_scores[i] > 3`` and
        ``metric[i] < threshold``.
    """
    avg_label_scores_np = np.array(avg_label_scores)
    metric_np = np.array(metric)
    trafficking_indices = np.where(avg_label_scores_np > 3)[0]
    # A set makes each membership test O(1); the previous list lookup made the
    # intersection quadratic in the number of clusters.
    low_metric_indices = set(np.where(metric_np < threshold)[0])

    return [x for x in trafficking_indices if x in low_metric_indices]

In [None]:
# Scatter every cluster metric against the average label score. Each plot
# shows the x = 3 line, the metric's cut-off from METRIC_CUTOFF_MAP, and the
# number of clusters in "quadrant 4" (label score > 3, metric below cut-off),
# and is saved under ../results/anomaly_metric_scatter_plots/.
#
# Fixes relative to the previous copy-pasted version:
#   * the unweighted shell-modularity plot scattered the *weighted* series and
#     was titled "Weighted Shell Modularity", while its quadrant count, cut-off
#     and file name were the unweighted ones;
#   * the final cluster-count plot was also titled "Weighted Shell Modularity".
rcParams['figure.figsize'] = 4,4
print ("########## AVERAGE LABEL SCORES ################")

# (key in post_merging_metric, key in METRIC_CUTOFF_MAP, title suffix, file stem)
scatter_specs = [
    ('weighted_cluster_density', 'WEIGHTED_DENSITY',
     'Weighted Cluster Density', 'weighted_density'),
    ('unweighted_cluster_density', 'UNWEIGHTED_DENSITY',
     'UnWeighted Cluster Density', 'unweighted_density'),
    ('weighted_fraudar_scores', 'WEIGHTED_FRAUDAR',
     'Weighted Fraudar Scores', 'weighted_fraudar'),
    ('unweighted_fraudar_scores', 'UNWEIGHTED_FRAUDAR',
     'UnWeighted Fraudar Scores', 'unweighted_fraudar'),
    ('weighted_outer_edge_perc_scores', 'WEIGHTED_OUTER_EDGE_PERCENTAGE',
     'Weighted Outer Edge Perc', 'weighted_outer_edge'),
    ('unweighted_outer_edge_perc_scores', 'UNWEIGHTED_OUTER_EDGE_PERCENTAGE',
     'Unweighted Outer Edge Perc', 'unweighted_outer_edge'),
    ('unweighted_shell_edge_perc_scores', 'UNWEIGHTED_SHELL_EDGE_PERCENTAGE',
     'Unweighted Shell Edge Perc', 'unweighted_shell_edge'),
    ('weighted_shell_edge_perc_scores', 'WEIGHTED_SHELL_EDGE_PERCENTAGE',
     'weighted Shell Edge Perc', 'weighted_shell_edge'),
    ('weighted_outer_modularity_scores', 'WEIGHTED_OUTER_MODULARITY',
     'Weighted Outer Modularity', 'weighted_outer_modularity'),
    ('unweighted_outer_modularity_scores', 'UNWEIGHTED_OUTER_MODULARITY',
     'Unweighted Outer Modularity', 'unweighted_outer_modularity'),
    ('weighted_shell_modularity_scores', 'WEIGHTED_SHELL_MODULARITY',
     'Weighted Shell Modularity', 'weighted_shell_modularity'),
    ('unweighted_shell_modularity_scores', 'UNWEIGHTED_SHELL_MODULARITY',
     'Unweighted Shell Modularity', 'unweighted_shell_modularity'),
]

for metric_key, cutoff_key, title_suffix, file_stem in scatter_specs:
    metric_values = post_merging_metric[metric_key]
    cutoff = METRIC_CUTOFF_MAP[cutoff_key]

    plt.scatter(post_merging_metric['avg_label_scores'], metric_values, alpha=0.2)
    quad4 = get_fourth_quadrant_count(post_merging_metric['avg_label_scores'],
                                      metric_values, cutoff)
    plt.legend(['Number of clusters in Quad4={}'.format(len(quad4))])
    plt.axvline(x=3, color='red', linestyle='--')
    plt.axhline(y=cutoff, color='black', linestyle='--')
    plt.title("Average Label Scores vs " + title_suffix)
    plt.savefig('../results/anomaly_metric_scatter_plots/' + file_stem + '.png')
    plt.show()


# Cluster size against average label score (displayed only, not saved).
plt.scatter(post_merging_metric['avg_label_scores'], post_merging_metric['cluster_counts'], alpha=0.2)
plt.title("Average Label Scores vs Cluster Counts")
plt.show()

In [None]:
# Evaluate using overall modularity of the graph
#Increase in eigen ratio in total results (% of results with eigenratio above 0.8)
# % of results in quadrant 4 with and without rerunning on noisy set and merging of clusters
# total number of anomalous clusters
# Average cluster sizes before and after rerunning
# for merged clusters, if modularity or density increased or decreased
from sklearn.metrics import silhouette_score
# Silhouette score (Euclidean distance) of `encoded_vecs` under the cluster
# assignment stored in df_data['cluster_label'].
score = silhouette_score(encoded_vecs, list(df_data['cluster_label']), metric='euclidean')

print (score)

In [None]:
# Inputs for the random-sample evaluation further down:
#   scoring_vals  - per-cluster max binary score
#   labels        - cluster label per row of df_data (copied into a list)
#   clusters      - cluster ids (presumably aligned with scoring_vals -- verify)
#   binary_labels - binary labels taken from df_original
scoring_vals = post_merging_metric['max_binary_scores']
labels = list(df_data['cluster_label'].copy())
clusters = post_merging_metric['clusters']
binary_labels = df_original['binary_label']
print (binary_labels.shape)
# scoring_vals, labels, clusters, binary_labels = zip(*sorted(zip(scoring_vals, labels, clusters, binary_labels)))
# print (binary_labels.shape)

In [None]:
# Expand the cluster labels to one entry per row of df_original: rows whose
# position is in `noisy_set` get -1, all other rows consume the next value of
# `labels` in order.
n_rows = len(df_original)
labels_all = np.zeros(n_rows)
cnt = 0
for i in range(n_rows):
    if i in noisy_set:
        labels_all[i] = -1
        continue
    labels_all[i] = labels[cnt]
    cnt += 1

In [None]:
# Sanity check: shape of the expanded per-row label array.
print (labels_all.shape)

In [None]:
def get_weighted_acc(tp, tn, n, p):
    """Return the class-weighted (balanced) accuracy.

    True positives are re-weighted by ``n / p`` so that both classes carry the
    same total weight: ``(tp * n / p + tn) / (2 * n)``, which equals the mean
    of the true-positive and true-negative rates.

    Args:
        tp: number of true positives.
        tn: number of true negatives.
        n: total number of actual negatives.
        p: total number of actual positives.

    Returns:
        The weighted accuracy as a float in [0, 1].

    Raises:
        ZeroDivisionError: if ``n`` or ``p`` is a plain Python zero.
    """
    # Fix: the numerator previously added `n` instead of `tn`, which ignored
    # the true-negative count entirely and inflated the result.
    return (tp * n / p + tn) / (2 * n)

In [None]:
import random
from sklearn.metrics import confusion_matrix, balanced_accuracy_score
from sklearn.metrics import roc_curve, auc

# Evaluate the cluster-level anomaly decision on a random 10% sample of the rows
# of df_original.
n_sample = int(0.1 * len(df_original))
random_sample = random.sample(range(len(df_original)), n_sample)

print (n_sample)
binary_labels = np.array(binary_labels)
true_labels_random_sample = binary_labels[random_sample]
pred_labels_random_sample = np.zeros(n_sample)

print (len(np.where(true_labels_random_sample==1)[0]))
threshold = 0
# A cluster is flagged when its score exceeds the threshold; the noise
# cluster (-1) is never flagged.
anomalous_clusters = [c for ind, c in enumerate(clusters)
                      if c != -1 and scoring_vals[ind] > threshold]
# Set copy so the per-sample membership test below is O(1).
anomalous_cluster_set = set(anomalous_clusters)

# A sampled row is predicted positive when it is not noisy and its cluster is flagged.
for i, sam in enumerate(random_sample):
    if sam not in noisy_set and labels_all[sam] in anomalous_cluster_set:
        pred_labels_random_sample[i] = 1

pscore = classification_report(true_labels_random_sample, list(pred_labels_random_sample))
fmeasure = f1_score(true_labels_random_sample,list(pred_labels_random_sample),average='weighted')
# labels=[0, 1] keeps the matrix 2x2 even if the sample or the predictions
# happen to contain a single class, so the unpacking below cannot fail.
c_mat = confusion_matrix(true_labels_random_sample, pred_labels_random_sample, labels=[0, 1])
b_score = balanced_accuracy_score(true_labels_random_sample, pred_labels_random_sample)
tn, fp, fn, tp = c_mat.ravel()
total_n = int(np.count_nonzero(true_labels_random_sample == 0))
total_p = int(np.count_nonzero(true_labels_random_sample == 1))
# The weighted accuracy is undefined when one of the classes is absent from the sample.
if total_n and total_p:
    weight_acc = get_weighted_acc(tp, tn, total_n, total_p)
else:
    weight_acc = float('nan')
print ("weighted acc : {}".format(round(weight_acc, 2)))
print ("Confusion matrix:")
print (c_mat)
print (pscore)
# b_score comes from balanced_accuracy_score, so label it accordingly.
print ("Balanced Accuracy : {}".format(round(b_score, 2)))

In [None]:

from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

# Cutoff constants for the fraudar, density and modularity metrics.
# NOTE(review): none of them is referenced in this cell — presumably consumed
# by filtering code elsewhere in the notebook; confirm before changing.
fraudar_threshold = 1
density_threshold = 0.05
modularity_threshold = 0.3

def execute_filtering_by_count(metric, cluster_counts, clusters, binary_true_labels, n_top, name_filter=True):
    """Flag the ads of the top-ranked clusters as anomalous and print the scores.

    Clusters are ranked by ``metric``, highest first (ties fall back to
    ``cluster_counts`` and then the cluster id). Walking down the ranking,
    the noise cluster (``-1``) is skipped and, when ``name_filter`` is set,
    a cluster is kept only if its ads carry more than one distinct
    (case-insensitive) string value in the ``Name`` column. The walk stops
    as soon as ``n_top`` clusters have been kept.

    Every ad belonging to a kept cluster is predicted ``1``, all others
    ``0``; the macro F1 and the classification report against
    ``binary_true_labels`` are printed.

    Relies on the module-level ``df_data`` (columns ``cluster_label`` and
    ``Name``), ``bigram_matrix`` (its row count sizes the prediction
    vector) and the already-imported ``f1_score`` / ``classification_report``.

    Args:
        metric: per-cluster scores, aligned with ``clusters``.
        cluster_counts: values zipped with ``metric``; only used to break ties.
        clusters: cluster ids.
        binary_true_labels: ground-truth binary label per ad.
        n_top: number of clusters to keep before stopping.
        name_filter: keep only clusters with more than one distinct name.

    Returns:
        Tuple ``(filtered_clusters, filtered_cluster_metric)`` in ranking order.
    """
    labels = df_data['cluster_label']

    # Ascending sort followed by a reversal, so the ranking is highest-first.
    # An empty input simply yields an empty ranking.
    ranked = sorted(zip(metric, cluster_counts, clusters))
    ranked.reverse()

    input_size = bigram_matrix.shape[0]
    filtered_clusters = []
    filtered_cluster_metric = []

    for cluster_metric, _count, c in ranked:
        if c == -1:
            continue
        keep = True
        if name_filter:
            cluster_names = df_data.loc[labels == c, 'Name'].unique()
            # Non-string entries (e.g. NaN for a missing name) are ignored.
            names = {x.lower() for x in cluster_names if isinstance(x, str)}
            keep = len(names) > 1
        if keep:
            filtered_clusters.append(c)
            filtered_cluster_metric.append(cluster_metric)
        if len(filtered_clusters) == n_top:
            break

    # Positional indices of all ads that belong to one of the kept clusters.
    binary_dense_pred_labels = np.zeros(input_size, dtype=int)
    member_idx = np.flatnonzero(np.isin(np.asarray(labels), filtered_clusters))
    binary_dense_pred_labels[member_idx] = 1

    fmeasure = f1_score(binary_true_labels,binary_dense_pred_labels,average='macro')
    print ("F-measure : {}".format(fmeasure))
    pscore = classification_report(binary_true_labels, binary_dense_pred_labels)
    print (pscore)

    return filtered_clusters, filtered_cluster_metric


In [None]:

        
#     max_index = 1
#     max_diff = densities[1] - densities[0]
name_filter = [True, False]

filter_metrics = ['WEIGHTED_FRAUDAR', 'WEIGHTED_OUTER_MODULARITY', 'WEIGHTED_OUTER_EDGE_PERCENTAGE', 'WEIGHTED_DENSITY', 
                 'UNWEIGHTED_FRAUDAR', 'UNWEIGHTED_OUTER_MODULARITY', 'UNWEIGHTED_OUTER_EDGE_PERCENTAGE', 'UNWEIGHTED_DENSITY']


def _avg_purity(purities):
    """Return the mean of *purities* rounded to 2 decimals, or NaN if the list is empty."""
    if not purities:
        return float('nan')
    return round(sum(purities)/len(purities), 2)


# For every (name filtering, anomaly metric) combination: keep the top 100
# clusters, bucket them by average label score and print the purity per bucket.
for n_filter in name_filter:
    for filter_param in filter_metrics:
        class1_purity = []
        class2_purity = []
        class3_purity = []
        class1_count = 0
        class2_count = 0
        class3_count = 0
        print ('==============Anomally Metric: {}, Name Filtering: {}======================'.format(filter_param, 
                                                                                                    n_filter))
        metric = post_merging_metric[METRIC_MAP[filter_param]]
        metric_cutoff = METRIC_CUTOFF_MAP[filter_param]
        cluster_counts = post_merging_metric['cluster_counts']
        clusters = list(set(df_data['cluster_label']))

        filtered_clusters, filtered_cluster_metric = execute_filtering_by_count(metric, 
                                                                       cluster_counts, clusters, 
                                                                       binary_true_labels, 100, n_filter)
        print ("Number of anomalous clusters : {}".format(len(filtered_clusters)))
        avg_label_score = []
        for c in filtered_clusters:
            df_fil = df_data[df_data['cluster_label']==c]
            if len(df_fil) == 0:
                # An empty cluster has neither a score nor a purity; treat both as 0
                # instead of dividing by zero.
                score = 0
                binary_label_purity = 0
            else:
                score = sum(df_fil['label'])/len(df_fil)
                binary_label_purity = sum(df_fil['binary_label'])/len(df_fil)
            avg_label_score.append(score)
            if score >= 3.5:
                class1_count += 1
                class1_purity.append(binary_label_purity)
            elif score >= 2:
                class2_count += 1
                class2_purity.append(binary_label_purity)
            else:
                class3_count += 1
                class3_purity.append(binary_label_purity)

        pre_cl = np.array(post_merging_metric['avg_label_scores'])
        print ("\n")
        print ("|           |   Total   | Filtered |  Avg Purity  |")
        # The "Total" bins use the same half-open ranges as the filtered bins
        # above, so a score of exactly 2.0 is counted once (in Class 2).
        print ("|  Class 1  |    {}    |    {}    |     {}     |".format(len(np.where(pre_cl>=3.5)[0]), class1_count, _avg_purity(class1_purity)))
        print ("|  Class 2  |    {}    |    {}    |     {}     |".format(len(np.where((pre_cl>=2.0) & (pre_cl<3.5))[0]), class2_count, _avg_purity(class2_purity)))
        print ("|  Class 3  |    {}    |    {}    |     {}     |".format(len(np.where(pre_cl<2.0)[0]), class3_count, _avg_purity(class3_purity)))
    

In [None]:

        
#     max_index = 1
#     max_diff = densities[1] - densities[0]

settings = [{'name_filter': False, 'threshold_filter' : True},
           {'name_filter': True, 'threshold_filter': False},
           {'name_filter' : True, 'threshold_filter': True}]

filter_metrics = ['WEIGHTED_FRAUDAR', 'WEIGHTED_OUTER_MODULARITY', 'WEIGHTED_OUTER_EDGE_PERCENTAGE', 'WEIGHTED_DENSITY', 
                 'UNWEIGHTED_FRAUDAR', 'UNWEIGHTED_OUTER_MODULARITY', 'UNWEIGHTED_OUTER_EDGE_PERCENTAGE', 'UNWEIGHTED_DENSITY']


def _summarize_filtered_clusters(filtered_clusters):
    """Bucket *filtered_clusters* by average label score and print the purity table.

    Buckets: Class 1 is ``score >= 3.5``, Class 2 is ``2 <= score < 3.5`` and
    Class 3 is ``score < 2``. The "Total" column applies the same ranges to
    ``post_merging_metric['avg_label_scores']``.

    Returns:
        ``(avg_label_score, purities)`` where ``avg_label_score`` holds one
        score per filtered cluster and ``purities`` is a 3-tuple of lists with
        the binary-label purity of the clusters in each class.
    """
    avg_label_score = []
    purities = ([], [], [])
    for c in filtered_clusters:
        df_fil = df_data[df_data['cluster_label']==c]
        if len(df_fil) == 0:
            # An empty cluster has neither a score nor a purity; use 0 for
            # both instead of dividing by zero.
            score = 0
            binary_label_purity = 0
        else:
            score = sum(df_fil['label'])/len(df_fil)
            binary_label_purity = sum(df_fil['binary_label'])/len(df_fil)
        avg_label_score.append(score)
        if score >= 3.5:
            bucket = 0
        elif score >= 2:
            bucket = 1
        else:
            bucket = 2
        purities[bucket].append(binary_label_purity)

    pre_cl = np.array(post_merging_metric['avg_label_scores'])
    totals = (len(np.where(pre_cl>=3.5)[0]),
              len(np.where((pre_cl>=2.0) & (pre_cl<3.5))[0]),
              len(np.where(pre_cl<2.0)[0]))

    print ("\n")
    print ("|           |   Total   | Filtered |  Avg Purity  |")
    for class_name, total, class_purity in zip(("Class 1", "Class 2", "Class 3"), totals, purities):
        # NaN marks a class without any filtered cluster (no purity to average).
        avg_purity = round(sum(class_purity)/len(class_purity), 2) if class_purity else float('nan')
        print ("|  {}  |    {}    |    {}    |     {}     |".format(class_name, total, len(class_purity), avg_purity))
    return avg_label_score, purities


for setting in settings:
    if setting['threshold_filter'] is True:
        for filter_param in filter_metrics:
            print ('==============Anomally Metric: {}, Name Filtering: {}======================'.format(filter_param, 
                                                                                                        setting['name_filter']))
            metric = post_merging_metric[METRIC_MAP[filter_param]]
            metric_cutoff = METRIC_CUTOFF_MAP[filter_param]
            cluster_counts = post_merging_metric['cluster_counts']
            clusters = list(set(df_data['cluster_label']))

            filtered_clusters, filtered_cluster_metric = execute_filtering(metric, metric_cutoff, 
                                                                           cluster_counts, clusters, 
                                                                           binary_true_labels,setting['name_filter'], 
                                                                           setting['threshold_filter'], filter_param)
            print ("Number of anomalous clusters : {}".format(len(filtered_clusters)))
            avg_label_score, (class1_purity, class2_purity, class3_purity) = _summarize_filtered_clusters(filtered_clusters)
            class1_count, class2_count, class3_count = len(class1_purity), len(class2_purity), len(class3_purity)
    else:
        print ("===================Only Name Filter===========================")
        # NOTE: metric, metric_cutoff, cluster_counts, clusters and filter_param
        # are whatever the last threshold-filter iteration left behind, so a
        # threshold setting must come first in `settings`.
        filtered_clusters, filtered_cluster_metric = execute_filtering(metric, metric_cutoff, cluster_counts, clusters, 
                                                                           binary_true_labels,setting['name_filter'], setting['threshold_filter'], filter_param)
        print ("Number of anomalous clusters : {}".format(len(filtered_clusters)))
        avg_label_score, (class1_purity, class2_purity, class3_purity) = _summarize_filtered_clusters(filtered_clusters)
        class1_count, class2_count, class3_count = len(class1_purity), len(class2_purity), len(class3_purity)
    
    
#     rcParams['figure.figsize'] = 5,5
#     num_bins = 7
#     n, bins, patches = plt.hist(avg_label_score, num_bins, facecolor='blue', alpha=0.5)
#     plt.show()


In [None]:
# Scatter of the per-cluster average label score against the weighted
# outer-edge percentage score.
rcParams['figure.figsize'] = 10,10
plt.scatter(post_merging_metric['avg_label_scores'], post_merging_metric['weighted_outer_edge_perc_scores'], alpha=0.2)
# Title now matches the two series that are actually plotted.
plt.title("Average Label Scores vs Weighted Outer Edge Percentage")
plt.show()

In [None]:
import json

# Write the dataframe to CSV, then dump a per-cluster property summary as JSON.
df_orig.to_csv(OUTPUT_FILE)
avg_label_score = post_merging_metric['avg_label_scores']
filtered_cluster_metric = post_merging_metric['weighted_outer_edge_perc_scores']
cluster_prop = {}
for cl in filtered_clusters:
    print (cl)
    # The noise cluster (-1) gets no entry.
    if cl != -1:
        index = clusters.index(cl)
        words = ', '.join(bigrams_dict[int(cl)])
        bigrams = ', '.join(top_bigrams[int(cl)])
        cluster_prop[int(cl)] = {
            'avg_label_score' : avg_label_score[index],
            'cluster_metric' : filtered_cluster_metric[index],
            'word_list' : words,
            'top_bigrams' : bigrams,
        }
with open(OUTPUT_PROP_FILE, 'w') as f:
    json.dump(cluster_prop, f)

In [None]:

from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.metrics import silhouette_score

# Build per-cluster keyword summaries for every non-noise cluster:
#   bigrams_dict[l] - top features of a single-topic LDA fitted on the cluster's rows
#   top_bigrams[l]  - up to five high-weight bigrams that pass the stop-word/length filter
df_data.reset_index(drop=True, inplace=True)
# NOTE(review): the loop below evaluates `labels == l` element-wise, which
# assumes this value is array-like (not a plain list) — confirm.
labels = post_merging_metric['cluster_label']
clusters = list(set(df_data['cluster_label']))
import math

# Extra domain words added to the stop-word set; used by the bigram filter below.
stop_words = set(list(stop_words) + ['height', 'weight', 'age', 'am', 'lbs', 'years', 'year'])
content = df_data['content_p']
print (len(content))
content = content.replace(np.nan, '', regex=True)
# Bigram TF-IDF over all rows; its cluster rows are the input of the LDA fit.
vectorizer1 = TfidfVectorizer(lowercase=True, ngram_range=(2,2))
b_mat = vectorizer1.fit_transform(content)
# NOTE(review): get_feature_names() was removed from recent scikit-learn
# releases in favour of get_feature_names_out() — confirm the pinned version.
features_col = vectorizer1.get_feature_names()

top_bigrams = {}
bigrams_dict = {}

for l in clusters:
    
    # Positions of the rows whose label equals this cluster.
    cluster_idx = np.argwhere(labels == l).reshape(-1)
#     clusters.append(l)
#     shell_subgraph, outer_subgraph, core_subgraph = get_all_subgraphs(bigram_matrix, cluster_idx)
    
    # The noise cluster gets no summary.
    if l== -1:
        continue
#     cluster_ajac = (bigram_matrix[cluster_idx,:][:])
    df_filt = df_data[df_data['cluster_label']== l]
    local_content = list(df_filt['content_p'])
#     print (content.shape)
    # Per-cluster bigram term-frequency matrix (use_idf=False), fitted on the
    # cluster's own documents only.
    count_vectorizer = TfidfVectorizer(ngram_range=(2,2), use_idf=False)
    count_data = count_vectorizer.fit_transform(local_content).todense()
    
    # NOTE(review): local_vecs is not used anywhere in this cell — confirm
    # whether a later cell relies on it.
    svd = TruncatedSVD(n_components=2)
    local_vecs = svd.fit_transform(count_data)
    
    # Rows of the global bigram matrix that belong to this cluster.
    ajac_big = (b_mat[cluster_idx,:][:]).todense()
#     top_bigram_ajac = (count_data[cluster_idx,:][:]).todense()
    top_bigram_features = count_vectorizer.get_feature_names()
    
    # Total weight of every bigram within the cluster.
    bigram_sums = np.sum(np.asarray(count_data), axis=0)
#     print (bigram_sums)
    # Indices of the 19 largest sums, in descending order.
    bigram_index = bigram_sums.argsort()[:-20:-1]
#     print (bigram_index)
    n_top_words = 20
    number_topics = 1
    # Create and fit the LDA model (a single topic) on the cluster's rows.
    lda = LDA(n_components=number_topics)
    lda.fit(ajac_big)
    bigrams_list = []
    for topic_idx, topic in enumerate(lda.components_):
#         print("\nTopic #%d:" % topic_idx)
#         print(", ".join([words[i]
#                         for i in topic.argsort()[:-n_top_words - 1:-1]]))
        # The n_top_words highest-weighted features of the topic.
        word_list = [features_col[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    # With one topic, word_list is that topic's feature list.
    bigrams_dict[l] = word_list
    
    
    # Walk the heaviest bigrams and keep the first five whose two tokens are
    # not stop words, have at least 3 characters each and differ from each other.
    for i in bigram_index:
#         print (bigram_sums[i])
        f = top_bigram_features[i]
        if f.split()[0] in stop_words or f.split()[1] in stop_words or len(f.split()[0]) < 3 or \
        len(f.split()[1]) < 3 or f.split()[0] == f.split()[1]:
            continue
        bigrams_list.append(f)
        if len(bigrams_list) == 5:
            break
        
#     print (bigrams_list)
    top_bigrams[l] = bigrams_list
    print (bigrams_list)
#     for ix, row in df_filt.iterrows():
#         print ('---------------------------------')
#         print (row['body'])
#     break
    # Progress indicator: printed for cluster ids divisible by 50.
    if l % 50 == 0:
        print (l)

# original_labels = labels.copy()


In [None]:
def _count_by_class(scores):
    """Return how many *scores* fall in each class: >= 3.5, [2.0, 3.5) and < 2.0."""
    return (int(np.count_nonzero(scores >= 3.5)),
            int(np.count_nonzero((scores >= 2.0) & (scores < 3.5))),
            int(np.count_nonzero(scores < 2.0)))


print (len(filtered_clusters))
print (len(clusters))
pre_cl = np.array(post_merging_metric['avg_label_scores'])
avg_l = np.array(avg_label_score)
print (len(np.where(pre_cl>=3.5)[0]))
print (len(np.where(avg_l>=3.5)[0]))


# One row per class: count over all clusters | count over the filtered clusters.
# Both columns use the same disjoint bins, so a score of exactly 2.0 or 3.5 is
# counted once and the two columns are directly comparable.
for total_count, filtered_count in zip(_count_by_class(pre_cl), _count_by_class(avg_l)):
    print ("     {}     |    {}    |".format(total_count, filtered_count))

# print ("     {}     |    {}    |".format(len(np.where(pre_cl>=3.5)[0])), len(np.where(avg_l>=3.5)[0]))

In [None]:
# Keep the clusters whose ads carry more than one distinct (case-insensitive) name.
filtered_clusters = []

for c in clusters:
    df_fil = df_data[df_data['cluster_label']==c]
    # Non-string entries (e.g. NaN for a missing name) are ignored, however
    # many of them there are.
    names = list({x.lower() for x in df_fil['Name'].unique() if isinstance(x, str)})
    if len(names) > 1:
        filtered_clusters.append(c)
#         filtered_cluster_metric.append(anomaly_cluster_metric[ind])

In [None]:
# List the distinct cluster labels currently stored in df_data.
print (df_data['cluster_label'].unique())

In [None]:
# Number of distinct cluster labels (shown as the cell's output).
df_data['cluster_label'].nunique()


In [None]:
from scipy.stats import spearmanr
eigen_ratios = post_merging_metric['eigen_ratios']


def _eigen_rho(metric_key):
    """Spearman rank correlation between the eigen ratios and one cluster metric."""
    return spearmanr(eigen_ratios, post_merging_metric[metric_key])[0]


# Density
eig_w_den_pscore = _eigen_rho('weighted_cluster_density')
eig_uw_den_pscore = _eigen_rho('unweighted_cluster_density')
# Fraudar
eig_w_fraud_pscore = _eigen_rho('weighted_fraudar_scores')
eig_uw_fraud_pscore = _eigen_rho('unweighted_fraudar_scores')
# Edge percentage (outer, then shell)
eig_w_edge_out_pscore = _eigen_rho('weighted_outer_edge_perc_scores')
eig_uw_edge_out_pscore = _eigen_rho('unweighted_outer_edge_perc_scores')
eig_w_edge_shell_pscore = _eigen_rho('weighted_shell_edge_perc_scores')
eig_uw_edge_shell_pscore = _eigen_rho('unweighted_shell_edge_perc_scores')
# Modularity (outer, then shell)
eig_w_mod_out_pscore = _eigen_rho('weighted_outer_modularity_scores')
eig_uw_mod_out_pscore = _eigen_rho('unweighted_outer_modularity_scores')
eig_w_mod_shell_pscore = _eigen_rho('weighted_shell_modularity_scores')
eig_uw_mod_shell_pscore = _eigen_rho('unweighted_shell_modularity_scores')

In [None]:
# Report the Spearman correlation of the eigen ratios with every cluster metric.
print ("Spearman Correlation | Eigen Ratio |     Weighted Cluster Density      | {0:.2f} ".format(eig_w_den_pscore))
print ("Spearman Correlation | Eigen Ratio |    UnWeighted Cluster Density     | {0:.2f} ".format(eig_uw_den_pscore))
print ("Spearman Correlation | Eigen Ratio |      Weighted Fraudar Score       | {0:.2f} ".format(eig_w_fraud_pscore))
print ("Spearman Correlation | Eigen Ratio |     UnWeighted Fraudar Score      | {0:.2f} ".format(eig_uw_fraud_pscore))
print ("Spearman Correlation | Eigen Ratio |     Weighted Outer Edge Perc      | {0:.2f} ".format(eig_w_edge_out_pscore))
print ("Spearman Correlation | Eigen Ratio |    UnWeighted Outer Edge Perc     | {0:.2f} ".format(eig_uw_edge_out_pscore))
# The shell rows must report the shell correlations, not the outer ones again.
print ("Spearman Correlation | Eigen Ratio |     Weighted Shell Edge Perc      | {0:.2f} ".format(eig_w_edge_shell_pscore))
print ("Spearman Correlation | Eigen Ratio |    UnWeighted Shell Edge Perc     | {0:.2f} ".format(eig_uw_edge_shell_pscore))
print ("Spearman Correlation | Eigen Ratio |  Weighted Outer Modularity Score  | {0:.2f} ".format(eig_w_mod_out_pscore))
print ("Spearman Correlation | Eigen Ratio | UnWeighted Outer Modularity Score | {0:.2f} ".format(eig_uw_mod_out_pscore))
print ("Spearman Correlation | Eigen Ratio |  Weighted Shell Modularity Score  | {0:.2f} ".format(eig_w_mod_shell_pscore))
print ("Spearman Correlation | Eigen Ratio | UnWeighted Shell Modularity Score | {0:.2f} ".format(eig_uw_mod_shell_pscore))

In [None]:
# Per-cluster max / average / sum of the raw label and of the binary label,
# collected in the order of `unique_labels`.
avg_label_scores, max_label_scores, sum_label_scores = [], [], []
avg_binary_scores, max_binary_scores, sum_binary_scores = [], [], []

df_data['cluster_label'] = labels
df_data['probabilities'] = probs
df_data['binary_label'] = binary_true_labels

for c in unique_labels:
    df_fil = df_data[df_data['cluster_label']==c]
    cluster_label_col = df_fil['label']
    cluster_binary_col = df_fil['binary_label']

    max_label_scores.append(max(cluster_label_col))
    label_total = sum(cluster_label_col)
    avg_label_scores.append(label_total/len(cluster_label_col))
    sum_label_scores.append(label_total)

    max_binary_scores.append(max(cluster_binary_col))
    binary_total = sum(cluster_binary_col)
    avg_binary_scores.append(binary_total/len(cluster_binary_col))
    sum_binary_scores.append(binary_total)
    

In [None]:
# Zero out every score of the noise cluster (-1) so it does not influence
# the statistics computed from these lists.
noisy_cluster_index = clusters.index(-1)
for _score_list in (avg_label_scores, max_label_scores, sum_label_scores,
                    max_binary_scores, avg_binary_scores, sum_binary_scores):
    _score_list[noisy_cluster_index] = 0

In [None]:
score = spearmanr(eigen_ratios, avg_scores)[0]
print (score)

# Spearman correlation of the eigen ratios with each per-cluster label statistic.
for _row_template, _cluster_scores in (
        ("Eigen Ratios |  Average Label Scores  | {} ", avg_label_scores),
        ("Eigen Ratios |    Max Label Scores    | {} ", max_label_scores),
        ("Eigen Ratios |    Sum Label Scores    | {} ", sum_label_scores),
        ("Eigen Ratios |    Max Binary Scores   | {} ", max_binary_scores),
        ("Eigen Ratios |  Average Binary Scores | {} ", avg_binary_scores),
        ("Eigen Ratios |   Sum Binary Scores    | {} ", sum_binary_scores)):
    print (_row_template.format(spearmanr(eigen_ratios, _cluster_scores)[0]))


In [None]:
# Spearman rank correlation between the per-cluster average label score and
# each anomaly metric, one table row per metric.
_avg_label_spearman_rows = (
    ("Spearman Correlation | Average Label Score |       Weighted Density         | {0:.2f} ", 'weighted_cluster_density'),
    ("Spearman Correlation | Average Label Score |      Unweighted Density        | {0:.2f} ", 'unweighted_cluster_density'),
    ("Spearman Correlation | Average Label Score |       Weighted Fraudar         | {0:.2f} ", 'weighted_fraudar_scores'),
    ("Spearman Correlation | Average Label Score |      Unweighted Fraudar        | {0:.2f} ", 'unweighted_fraudar_scores'),
    ("Spearman Correlation | Average Label Score |    Weighted Outer Edge Perc    | {0:.2f} ", 'weighted_outer_edge_perc_scores'),
    ("Spearman Correlation | Average Label Score |   Unweighted Outer Edge Perc   | {0:.2f} ", 'unweighted_outer_edge_perc_scores'),
    ("Spearman Correlation | Average Label Score |    Weighted Shell Edge Perc    | {0:.2f} ", 'weighted_shell_edge_perc_scores'),
    ("Spearman Correlation | Average Label Score |   Unweighted Shell Edge Perc   | {0:.2f} ", 'unweighted_shell_edge_perc_scores'),
    ("Spearman Correlation | Average Label Score |    Weighted Outer Modularity   | {0:.2f} ", 'weighted_outer_modularity_scores'),
    ("Spearman Correlation | Average Label Score |   Unweighted Outer Modularity  | {0:.2f} ", 'unweighted_outer_modularity_scores'),
    ("Spearman Correlation | Average Label Score |    Weighted Shell Modularity   | {0:.2f} ", 'weighted_shell_modularity_scores'),
    ("Spearman Correlation | Average Label Score |   Unweighted Shell Modularity  | {0:.2f} ", 'unweighted_shell_modularity_scores'),
)
for _row_template, _metric_key in _avg_label_spearman_rows:
    _rho = spearmanr(post_merging_metric['avg_label_scores'], post_merging_metric[_metric_key])[0]
    print (_row_template.format(_rho))

In [None]:
from scipy.stats import pearsonr

# Pearson correlation between the per-cluster average label score and each
# anomaly metric, one table row per metric.
_avg_label_pearson_rows = (
    ("Pearson Correlation | Average Label Score |       Weighted Density         | {0:.2f} ", 'weighted_cluster_density'),
    ("Pearson Correlation | Average Label Score |      Unweighted Density        | {0:.2f} ", 'unweighted_cluster_density'),
    ("Pearson Correlation | Average Label Score |       Weighted Fraudar         | {0:.2f} ", 'weighted_fraudar_scores'),
    ("Pearson Correlation | Average Label Score |      Unweighted Fraudar        | {0:.2f} ", 'unweighted_fraudar_scores'),
    ("Pearson Correlation | Average Label Score |    Weighted Outer Edge Perc    | {0:.2f} ", 'weighted_outer_edge_perc_scores'),
    ("Pearson Correlation | Average Label Score |   Unweighted Outer Edge Perc   | {0:.2f} ", 'unweighted_outer_edge_perc_scores'),
    ("Pearson Correlation | Average Label Score |    Weighted Shell Edge Perc    | {0:.2f} ", 'weighted_shell_edge_perc_scores'),
    ("Pearson Correlation | Average Label Score |   Unweighted Shell Edge Perc   | {0:.2f} ", 'unweighted_shell_edge_perc_scores'),
    ("Pearson Correlation | Average Label Score |    Weighted Outer Modularity   | {0:.2f} ", 'weighted_outer_modularity_scores'),
    ("Pearson Correlation | Average Label Score |   Unweighted Outer Modularity  | {0:.2f} ", 'unweighted_outer_modularity_scores'),
    ("Pearson Correlation | Average Label Score |    Weighted Shell Modularity   | {0:.2f} ", 'weighted_shell_modularity_scores'),
    ("Pearson Correlation | Average Label Score |   Unweighted Shell Modularity  | {0:.2f} ", 'unweighted_shell_modularity_scores'),
)
for _row_template, _metric_key in _avg_label_pearson_rows:
    _r = pearsonr(post_merging_metric['avg_label_scores'], post_merging_metric[_metric_key])[0]
    print (_row_template.format(_r))


In [None]:
# Spearman rank correlation between the per-cluster average binary score and
# each anomaly metric, one table row per metric.
_avg_binary_spearman_rows = (
    ("Spearman Correlation | Average Binary Score |       Weighted Density         | {0:.2f} ", 'weighted_cluster_density'),
    ("Spearman Correlation | Average Binary Score |      Unweighted Density        | {0:.2f} ", 'unweighted_cluster_density'),
    ("Spearman Correlation | Average Binary Score |       Weighted Fraudar         | {0:.2f} ", 'weighted_fraudar_scores'),
    ("Spearman Correlation | Average Binary Score |      Unweighted Fraudar        | {0:.2f} ", 'unweighted_fraudar_scores'),
    ("Spearman Correlation | Average Binary Score |    Weighted Outer Edge Perc    | {0:.2f} ", 'weighted_outer_edge_perc_scores'),
    ("Spearman Correlation | Average Binary Score |   Unweighted Outer Edge Perc   | {0:.2f} ", 'unweighted_outer_edge_perc_scores'),
    ("Spearman Correlation | Average Binary Score |    Weighted Shell Edge Perc    | {0:.2f} ", 'weighted_shell_edge_perc_scores'),
    ("Spearman Correlation | Average Binary Score |   Unweighted Shell Edge Perc   | {0:.2f} ", 'unweighted_shell_edge_perc_scores'),
    ("Spearman Correlation | Average Binary Score |    Weighted Outer Modularity   | {0:.2f} ", 'weighted_outer_modularity_scores'),
    ("Spearman Correlation | Average Binary Score |   Unweighted Outer Modularity  | {0:.2f} ", 'unweighted_outer_modularity_scores'),
    ("Spearman Correlation | Average Binary Score |    Weighted Shell Modularity   | {0:.2f} ", 'weighted_shell_modularity_scores'),
    ("Spearman Correlation | Average Binary Score |   Unweighted Shell Modularity  | {0:.2f} ", 'unweighted_shell_modularity_scores'),
)
for _row_template, _metric_key in _avg_binary_spearman_rows:
    _rho = spearmanr(post_merging_metric['avg_binary_scores'], post_merging_metric[_metric_key])[0]
    print (_row_template.format(_rho))

In [None]:
from scipy.stats import pearsonr

# Pearson correlation between the average binary score and every cluster
# metric column of post_merging_metric, printed one row per metric.
for _row_format, _metric_column in (
    ("Pearson Correlation | Average Binary Score |       Weighted Density         | {0:.2f} ", 'weighted_cluster_density'),
    ("Pearson Correlation | Average Binary Score |      Unweighted Density        | {0:.2f} ", 'unweighted_cluster_density'),
    ("Pearson Correlation | Average Binary Score |       Weighted Fraudar         | {0:.2f} ", 'weighted_fraudar_scores'),
    ("Pearson Correlation | Average Binary Score |      Unweighted Fraudar        | {0:.2f} ", 'unweighted_fraudar_scores'),
    ("Pearson Correlation | Average Binary Score |    Weighted Outer Edge Perc    | {0:.2f} ", 'weighted_outer_edge_perc_scores'),
    ("Pearson Correlation | Average Binary Score |   Unweighted Outer Edge Perc   | {0:.2f} ", 'unweighted_outer_edge_perc_scores'),
    ("Pearson Correlation | Average Binary Score |    Weighted Shell Edge Perc    | {0:.2f} ", 'weighted_shell_edge_perc_scores'),
    ("Pearson Correlation | Average Binary Score |   Unweighted Shell Edge Perc   | {0:.2f} ", 'unweighted_shell_edge_perc_scores'),
    ("Pearson Correlation | Average Binary Score |    Weighted Outer Modularity   | {0:.2f} ", 'weighted_outer_modularity_scores'),
    ("Pearson Correlation | Average Binary Score |   Unweighted Outer Modularity  | {0:.2f} ", 'unweighted_outer_modularity_scores'),
    ("Pearson Correlation | Average Binary Score |    Weighted Shell Modularity   | {0:.2f} ", 'weighted_shell_modularity_scores'),
    ("Pearson Correlation | Average Binary Score |   Unweighted Shell Modularity  | {0:.2f} ", 'unweighted_shell_modularity_scores'),
):
    # pearsonr returns (correlation, p-value); only the coefficient is shown.
    _coefficient = pearsonr(post_merging_metric['avg_binary_scores'],
                            post_merging_metric[_metric_column])[0]
    print(_row_format.format(_coefficient))


In [None]:
# Spearman rank correlation between the max label score and every cluster
# metric column of post_merging_metric, printed one row per metric.
for _row_format, _metric_column in (
    ("Spearman Correlation | Max Label Score |       Weighted Density         | {0:.2f} ", 'weighted_cluster_density'),
    ("Spearman Correlation | Max Label Score |      Unweighted Density        | {0:.2f} ", 'unweighted_cluster_density'),
    ("Spearman Correlation | Max Label Score |       Weighted Fraudar         | {0:.2f} ", 'weighted_fraudar_scores'),
    ("Spearman Correlation | Max Label Score |      Unweighted Fraudar        | {0:.2f} ", 'unweighted_fraudar_scores'),
    ("Spearman Correlation | Max Label Score |    Weighted Outer Edge Perc    | {0:.2f} ", 'weighted_outer_edge_perc_scores'),
    ("Spearman Correlation | Max Label Score |   Unweighted Outer Edge Perc   | {0:.2f} ", 'unweighted_outer_edge_perc_scores'),
    ("Spearman Correlation | Max Label Score |    Weighted Shell Edge Perc    | {0:.2f} ", 'weighted_shell_edge_perc_scores'),
    ("Spearman Correlation | Max Label Score |   Unweighted Shell Edge Perc   | {0:.2f} ", 'unweighted_shell_edge_perc_scores'),
    ("Spearman Correlation | Max Label Score |    Weighted Outer Modularity   | {0:.2f} ", 'weighted_outer_modularity_scores'),
    ("Spearman Correlation | Max Label Score |   Unweighted Outer Modularity  | {0:.2f} ", 'unweighted_outer_modularity_scores'),
    ("Spearman Correlation | Max Label Score |    Weighted Shell Modularity   | {0:.2f} ", 'weighted_shell_modularity_scores'),
    ("Spearman Correlation | Max Label Score |   Unweighted Shell Modularity  | {0:.2f} ", 'unweighted_shell_modularity_scores'),
):
    # spearmanr returns (correlation, p-value); only the coefficient is shown.
    _coefficient = spearmanr(post_merging_metric['max_label_scores'],
                             post_merging_metric[_metric_column])[0]
    print(_row_format.format(_coefficient))

In [None]:
# Spearman rank correlation between the max binary score and every cluster
# metric column of post_merging_metric, printed one row per metric.
for _row_format, _metric_column in (
    ("Spearman Correlation | Max Binary Score |       Weighted Density         | {0:.2f} ", 'weighted_cluster_density'),
    ("Spearman Correlation | Max Binary Score |      Unweighted Density        | {0:.2f} ", 'unweighted_cluster_density'),
    ("Spearman Correlation | Max Binary Score |       Weighted Fraudar         | {0:.2f} ", 'weighted_fraudar_scores'),
    ("Spearman Correlation | Max Binary Score |      Unweighted Fraudar        | {0:.2f} ", 'unweighted_fraudar_scores'),
    ("Spearman Correlation | Max Binary Score |    Weighted Outer Edge Perc    | {0:.2f} ", 'weighted_outer_edge_perc_scores'),
    ("Spearman Correlation | Max Binary Score |   Unweighted Outer Edge Perc   | {0:.2f} ", 'unweighted_outer_edge_perc_scores'),
    ("Spearman Correlation | Max Binary Score |    Weighted Shell Edge Perc    | {0:.2f} ", 'weighted_shell_edge_perc_scores'),
    ("Spearman Correlation | Max Binary Score |   Unweighted Shell Edge Perc   | {0:.2f} ", 'unweighted_shell_edge_perc_scores'),
    ("Spearman Correlation | Max Binary Score |    Weighted Outer Modularity   | {0:.2f} ", 'weighted_outer_modularity_scores'),
    ("Spearman Correlation | Max Binary Score |   Unweighted Outer Modularity  | {0:.2f} ", 'unweighted_outer_modularity_scores'),
    ("Spearman Correlation | Max Binary Score |    Weighted Shell Modularity   | {0:.2f} ", 'weighted_shell_modularity_scores'),
    ("Spearman Correlation | Max Binary Score |   Unweighted Shell Modularity  | {0:.2f} ", 'unweighted_shell_modularity_scores'),
):
    # spearmanr returns (correlation, p-value); only the coefficient is shown.
    _coefficient = spearmanr(post_merging_metric['max_binary_scores'],
                             post_merging_metric[_metric_column])[0]
    print(_row_format.format(_coefficient))

In [None]:
from scipy.stats import pearsonr

# Pearson correlation between the max binary score and every cluster metric
# column of post_merging_metric, printed one row per metric.
for _row_format, _metric_column in (
    ("Pearson Correlation | Max Binary Score |       Weighted Density         | {0:.2f} ", 'weighted_cluster_density'),
    ("Pearson Correlation | Max Binary Score |      Unweighted Density        | {0:.2f} ", 'unweighted_cluster_density'),
    ("Pearson Correlation | Max Binary Score |       Weighted Fraudar         | {0:.2f} ", 'weighted_fraudar_scores'),
    ("Pearson Correlation | Max Binary Score |      Unweighted Fraudar        | {0:.2f} ", 'unweighted_fraudar_scores'),
    ("Pearson Correlation | Max Binary Score |    Weighted Outer Edge Perc    | {0:.2f} ", 'weighted_outer_edge_perc_scores'),
    ("Pearson Correlation | Max Binary Score |   Unweighted Outer Edge Perc   | {0:.2f} ", 'unweighted_outer_edge_perc_scores'),
    ("Pearson Correlation | Max Binary Score |    Weighted Shell Edge Perc    | {0:.2f} ", 'weighted_shell_edge_perc_scores'),
    ("Pearson Correlation | Max Binary Score |   Unweighted Shell Edge Perc   | {0:.2f} ", 'unweighted_shell_edge_perc_scores'),
    ("Pearson Correlation | Max Binary Score |    Weighted Outer Modularity   | {0:.2f} ", 'weighted_outer_modularity_scores'),
    ("Pearson Correlation | Max Binary Score |   Unweighted Outer Modularity  | {0:.2f} ", 'unweighted_outer_modularity_scores'),
    ("Pearson Correlation | Max Binary Score |    Weighted Shell Modularity   | {0:.2f} ", 'weighted_shell_modularity_scores'),
    ("Pearson Correlation | Max Binary Score |   Unweighted Shell Modularity  | {0:.2f} ", 'unweighted_shell_modularity_scores'),
):
    # pearsonr returns (correlation, p-value); only the coefficient is shown.
    _coefficient = pearsonr(post_merging_metric['max_binary_scores'],
                            post_merging_metric[_metric_column])[0]
    print(_row_format.format(_coefficient))


In [None]:
# Pearson correlation between the unweighted shell edge-percentage scores and
# the unweighted shell modularity scores; the result is the cell's output.
pearsonr(unweighted_shell_edge_perc_scores, unweighted_shell_modularity_scores)

In [None]:
# Draw a 7-bin histogram for each of the average / max / sum label score lists.
rcParams['figure.figsize'] = 5,5
num_bins = 7
for _score_values, _plot_title in (
    (avg_label_scores, "Average Label Scores Histogram"),
    (max_label_scores, "Max Label Scores Histogram"),
    (sum_label_scores, "Sum Label Scores Histogram"),
):
    n, bins, patches = plt.hist(_score_values, num_bins, facecolor='blue', alpha=0.5)
    plt.title(_plot_title)
    plt.show()


In [None]:
# Max label score (x axis) against each cluster metric (y axis), one figure
# per metric.
for _metric_values, _plot_title in (
    (weighted_cluster_density, "Max Label Scores vs Weighted Cluster Density"),
    (unweighted_cluster_density, "Max Label Scores vs UnWeighted Cluster Density"),
    (weighted_fraudar_scores, "Max Label Scores vs Weighted Fraudar Scores"),
    (unweighted_fraudar_scores, "Max Label Scores vs UnWeighted Fraudar Scores"),
    (weighted_outer_edge_perc_scores, "Max Label Scores vs Weighted Outer Edge Perc"),
    (unweighted_outer_edge_perc_scores, "Max Label Scores vs Unweighted Outer Edge Perc"),
    (weighted_shell_edge_perc_scores, "Max Label Scores vs Weighted Shell Edge Perc"),
    (unweighted_shell_edge_perc_scores, "Max Label Scores vs Unweighted Shell Edge Perc"),
    (weighted_outer_modularity_scores, "Max Label Scores vs Weighted Outer Modularity"),
    (unweighted_outer_modularity_scores, "Max Label Scores vs Unweighted Outer Modularity"),
    (weighted_shell_modularity_scores, "Max Label Scores vs Weighted Shell Modularity"),
    (unweighted_shell_modularity_scores, "Max Label Scores vs Unweighted Shell Modularity"),
):
    plt.scatter(max_label_scores, _metric_values, alpha=0.2)
    plt.title(_plot_title)
    plt.show()


In [None]:
# Sum label score (x axis) against each cluster metric (y axis), one figure
# per metric.
for _metric_values, _plot_title in (
    (weighted_cluster_density, "Sum Label Scores vs Weighted Cluster Density"),
    (unweighted_cluster_density, "Sum Label Scores vs UnWeighted Cluster Density"),
    (weighted_fraudar_scores, "Sum Label Scores vs Weighted Fraudar Scores"),
    (unweighted_fraudar_scores, "Sum Label Scores vs UnWeighted Fraudar Scores"),
    (weighted_outer_edge_perc_scores, "Sum Label Scores vs Weighted Outer Edge Perc"),
    (unweighted_outer_edge_perc_scores, "Sum Label Scores vs Unweighted Outer Edge Perc"),
    (weighted_shell_edge_perc_scores, "Sum Label Scores vs Weighted Shell Edge Perc"),
    (unweighted_shell_edge_perc_scores, "Sum Label Scores vs Unweighted Shell Edge Perc"),
    (weighted_outer_modularity_scores, "Sum Label Scores vs Weighted Outer Modularity"),
    (unweighted_outer_modularity_scores, "Sum Label Scores vs Unweighted Outer Modularity"),
    (weighted_shell_modularity_scores, "Sum Label Scores vs Weighted Shell Modularity"),
    (unweighted_shell_modularity_scores, "Sum Label Scores vs Unweighted Shell Modularity"),
):
    plt.scatter(sum_label_scores, _metric_values, alpha=0.2)
    plt.title(_plot_title)
    plt.show()

In [None]:
# Average binary score (x axis) against each cluster metric (y axis), one
# figure per metric.
for _metric_values, _plot_title in (
    (weighted_cluster_density, "Average Binary Scores vs Weighted Cluster Density"),
    (unweighted_cluster_density, "Average Binary Scores vs UnWeighted Cluster Density"),
    (weighted_fraudar_scores, "Average Binary Scores vs Weighted Fraudar Scores"),
    (unweighted_fraudar_scores, "Average Binary Scores vs UnWeighted Fraudar Scores"),
    (weighted_outer_edge_perc_scores, "Average Binary Scores vs Weighted Outer Edge Perc"),
    (unweighted_outer_edge_perc_scores, "Average Binary Scores vs Unweighted Outer Edge Perc"),
    (weighted_shell_edge_perc_scores, "Average Binary Scores vs Weighted Shell Edge Perc"),
    (unweighted_shell_edge_perc_scores, "Average Binary Scores vs Unweighted Shell Edge Perc"),
    (weighted_outer_modularity_scores, "Average Binary Scores vs Weighted Outer Modularity"),
    (unweighted_outer_modularity_scores, "Average Binary Scores vs Unweighted Outer Modularity"),
    (weighted_shell_modularity_scores, "Average Binary Scores vs Weighted Shell Modularity"),
    (unweighted_shell_modularity_scores, "Average Binary Scores vs Unweighted Shell Modularity"),
):
    plt.scatter(avg_binary_scores, _metric_values, alpha=0.2)
    plt.title(_plot_title)
    plt.show()

In [None]:
# Max binary score (x axis) against each cluster metric (y axis), one figure
# per metric.
for _metric_values, _plot_title in (
    (weighted_cluster_density, "Max Binary Scores vs Weighted Cluster Density"),
    (unweighted_cluster_density, "Max Binary Scores vs UnWeighted Cluster Density"),
    (weighted_fraudar_scores, "Max Binary Scores vs Weighted Fraudar Scores"),
    (unweighted_fraudar_scores, "Max Binary Scores vs UnWeighted Fraudar Scores"),
    (weighted_outer_edge_perc_scores, "Max Binary Scores vs Weighted Outer Edge Perc"),
    (unweighted_outer_edge_perc_scores, "Max Binary Scores vs Unweighted Outer Edge Perc"),
    (weighted_shell_edge_perc_scores, "Max Binary Scores vs Weighted Shell Edge Perc"),
    (unweighted_shell_edge_perc_scores, "Max Binary Scores vs Unweighted Shell Edge Perc"),
    (weighted_outer_modularity_scores, "Max Binary Scores vs Weighted Outer Modularity"),
    (unweighted_outer_modularity_scores, "Max Binary Scores vs Unweighted Outer Modularity"),
    (weighted_shell_modularity_scores, "Max Binary Scores vs Weighted Shell Modularity"),
    (unweighted_shell_modularity_scores, "Max Binary Scores vs Unweighted Shell Modularity"),
):
    plt.scatter(max_binary_scores, _metric_values, alpha=0.2)
    plt.title(_plot_title)
    plt.show()

In [None]:
# Sum binary score (x axis) against each cluster metric (y axis), one figure
# per metric.
for _metric_values, _plot_title in (
    (weighted_cluster_density, "Sum Binary Scores vs Weighted Cluster Density"),
    (unweighted_cluster_density, "Sum Binary Scores vs UnWeighted Cluster Density"),
    (weighted_fraudar_scores, "Sum Binary Scores vs Weighted Fraudar Scores"),
    (unweighted_fraudar_scores, "Sum Binary Scores vs UnWeighted Fraudar Scores"),
    (weighted_outer_edge_perc_scores, "Sum Binary Scores vs Weighted Outer Edge Perc"),
    (unweighted_outer_edge_perc_scores, "Sum Binary Scores vs Unweighted Outer Edge Perc"),
    (weighted_shell_edge_perc_scores, "Sum Binary Scores vs Weighted Shell Edge Perc"),
    (unweighted_shell_edge_perc_scores, "Sum Binary Scores vs Unweighted Shell Edge Perc"),
    (weighted_outer_modularity_scores, "Sum Binary Scores vs Weighted Outer Modularity"),
    (unweighted_outer_modularity_scores, "Sum Binary Scores vs Unweighted Outer Modularity"),
    (weighted_shell_modularity_scores, "Sum Binary Scores vs Weighted Shell Modularity"),
    (unweighted_shell_modularity_scores, "Sum Binary Scores vs Unweighted Shell Modularity"),
):
    plt.scatter(sum_binary_scores, _metric_values, alpha=0.2)
    plt.title(_plot_title)
    plt.show()

In [None]:
# plt.scatter(avg_scores, weighted_fraudar_scores, alpha=0.2)
# plt.show()

# plt.scatter(max_scores, weighted_fraudar_scores, alpha=0.2)
# plt.show()
# percentage of clusters in quadrant 4
# new definition for edge percentage
# look into clusters with high label but low anomaly score
# correct modularity


In [None]:
# Show the eigen ratio stored at the position that cluster label -1 occupies
# in `clusters` (eigen_ratios is index-aligned with clusters).
eigen_ratios[clusters.index(-1)]

In [None]:
# Inspect the dimensions of noise_data (displayed as the cell output).
noise_data.shape

In [None]:
# Fit a bigram-only TF-IDF model over the preprocessed ad text.
# Missing values are replaced by empty strings so the vectorizer only ever
# sees str input.
content = df_data['content_p']
content = content.replace(np.nan, '', regex=True)
vectorizer1 = TfidfVectorizer(lowercase=True, ngram_range=(2,2))
b_mat = vectorizer1.fit_transform(content)
# features_col[i] is the bigram text for column i of b_mat.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# when present and fall back on older releases. list() keeps features_col a
# plain list in both cases.
if hasattr(vectorizer1, 'get_feature_names_out'):
    features_col = list(vectorizer1.get_feature_names_out())
else:
    features_col = vectorizer1.get_feature_names()


# Per-cluster accumulators: the loop that follows appends exactly one entry
# per cluster to each list, keeping them index-aligned with `clusters`.
cluster_counts = []

weighted_cluster_density = []
unweighted_cluster_density = []
unweighted_fraudar_scores = []
weighted_fraudar_scores = []
unweighted_outer_edge_perc_scores = []
weighted_outer_edge_perc_scores = []
unweighted_shell_edge_perc_scores = []
weighted_shell_edge_perc_scores = []
weighted_outer_modularity_scores = []
unweighted_outer_modularity_scores = []
weighted_shell_modularity_scores = []
unweighted_shell_modularity_scores = []
# cluster label -> list of its top bigrams
bigrams_dict = {}
# Global edge totals passed to the modularity helpers.
# np.count_nonzero() cannot evaluate a scipy sparse matrix (it ends up asking
# for the truth value of the whole matrix), so use the object's own
# count_nonzero() when it provides one; dense arrays take the NumPy path.
if hasattr(bigram_matrix, 'count_nonzero'):
    total_edges_unweighted = bigram_matrix.count_nonzero()
else:
    total_edges_unweighted = np.count_nonzero(bigram_matrix)
sil_scores = []
total_edges_weighted = np.sum(bigram_matrix)
eigen_ratios = []
# Per-cluster pass. For every label in `clusters` this appends one value to
# each metric list, to cluster_counts (log10 of the cluster size) and to
# eigen_ratios, and stores the cluster's top bigrams in bigrams_dict.
for l in clusters:

    # Row indices of the documents assigned to this cluster.
    cluster_idx = np.argwhere(labels == l).reshape(-1)
    # NOTE(review): the subgraphs are computed before the noise check even
    # though the noise branch below never uses them.
    shell_subgraph, outer_subgraph, core_subgraph = get_all_subgraphs(bigram_matrix, cluster_idx)

    if l == -1:
        # Label -1 (noise) gets a placeholder 0 for every metric so all lists
        # stay aligned with `clusters`; only its size is recorded for real.
        for _metric_scores in (
            weighted_cluster_density,
            unweighted_cluster_density,
            weighted_fraudar_scores,
            unweighted_fraudar_scores,
            weighted_outer_edge_perc_scores,
            unweighted_outer_edge_perc_scores,
            weighted_shell_edge_perc_scores,
            unweighted_shell_edge_perc_scores,
            unweighted_outer_modularity_scores,
            weighted_outer_modularity_scores,
            unweighted_shell_modularity_scores,
            weighted_shell_modularity_scores,
        ):
            _metric_scores.append(0)
        cluster_counts.append(math.log(len(cluster_idx), 10))
        eigen_ratios.append(0)
        continue

    df_filt = df_data[df_data['cluster_label'] == l]
    local_content = list(df_filt['content_p'])

    # Bigram term-frequency vectors (use_idf=False) fitted on this cluster only.
    count_vectorizer = TfidfVectorizer(ngram_range=(2,2), use_idf=False)
    count_data = count_vectorizer.fit_transform(local_content)

    # Ratio of the second to the first singular value of the cluster's
    # bigram matrix.
    svd = TruncatedSVD(n_components=2)
    local_vecs = svd.fit_transform(count_data)
    w = svd.singular_values_
    eigen_rat = w[1]/w[0]
    eigen_ratios.append(eigen_rat)

    # Dense rows of the corpus-wide bigram TF-IDF for this cluster.
    # toarray() returns a plain ndarray; todense() returns np.matrix, which
    # newer scikit-learn estimators refuse as input.
    ajac_big = b_mat[cluster_idx, :].toarray()
    # NOTE(review): bigram_sums is not read anywhere in this loop.
    bigram_sums = np.max(ajac_big, axis=0)

    # Single-topic LDA; the 20 highest-weighted bigrams of that topic are kept.
    # (LDA is presumably sklearn's LatentDirichletAllocation; confirm import.)
    n_top_words = 20
    number_topics = 1
    lda = LDA(n_components=number_topics)
    lda.fit(ajac_big)
    for topic in lda.components_:
        # With number_topics == 1 this loop body runs exactly once.
        word_list = [features_col[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    bigrams_dict[l] = word_list

    # Graph metrics. "outer" and "shell" variants differ only in which
    # surrounding subgraph is passed alongside the core subgraph.
    weighted_cluster_density.append(calculate_weighted_density(core_subgraph))
    unweighted_cluster_density.append(calculate_unweighted_density(core_subgraph))
    weighted_fraudar_scores.append(calculate_weighted_fraudar_score(core_subgraph))
    unweighted_fraudar_scores.append(calculate_unweighted_fraudar_score(core_subgraph))
    weighted_outer_edge_perc_scores.append(calculate_weighted_edge_per_score(core_subgraph, outer_subgraph))
    unweighted_outer_edge_perc_scores.append(calculate_unweighted_edge_per_score(core_subgraph, outer_subgraph))
    weighted_shell_edge_perc_scores.append(calculate_weighted_edge_per_score(core_subgraph, shell_subgraph))
    unweighted_shell_edge_perc_scores.append(calculate_unweighted_edge_per_score(core_subgraph, shell_subgraph))
    unweighted_outer_modularity_scores.append(calculate_unweighted_modularity_score(core_subgraph, outer_subgraph, total_edges_unweighted))
    weighted_outer_modularity_scores.append(calculate_weighted_modularity_score(core_subgraph, outer_subgraph, total_edges_weighted))
    unweighted_shell_modularity_scores.append(calculate_unweighted_modularity_score(core_subgraph, shell_subgraph, total_edges_unweighted))
    weighted_shell_modularity_scores.append(calculate_weighted_modularity_score(core_subgraph, shell_subgraph, total_edges_weighted))
    cluster_counts.append(math.log(len(cluster_idx), 10))

    # Progress indicator.
    if l % 50 == 0:
        print (l)
# Snapshot of the labels taken once the metric pass is done.
original_labels = labels.copy()


In [None]:
# Plot each cluster's eigen ratio against its position in `clusters`.
x_axis = list(range(len(clusters)))
plt.scatter(x_axis, eigen_ratios, alpha=0.2)

In [None]:
# Print the positions whose eigen ratio exceeds 0.8.
eigen_np = np.array(eigen_ratios)
_above_threshold = eigen_np > 0.8
print(np.nonzero(_above_threshold)[0])

In [None]:
# NOTE(review): np.asarray() does not densify a scipy sparse matrix; it wraps
# it in a 0-d object array. If bigram_matrix is still the sparse TF-IDF result
# at this point, `.toarray()` is presumably what was intended. Confirm, and
# mind the memory cost of a dense matrix.
bigram_matrix = np.asarray(bigram_matrix)

In [None]:
# Number of entries in `clusters`.
print (len(clusters))

In [None]:
# Write every row whose cluster label is listed in clusters_nonhomogenous to
# a CSV file.
_in_nonhomogenous_cluster = df_data['cluster_label'].isin(clusters_nonhomogenous)
df_noise = df_data[_in_nonhomogenous_cluster]
df_noise.to_csv('../data/noisy_rerun.csv')

In [None]:
# Four overview figures: weighted and unweighted metrics, each ordered first
# by average label score and then by cluster count. Every zip/sort/unzip keeps
# the metric values of one cluster together while reordering by the first
# sequence.
rcParams['figure.figsize'] = 20,10

# Weighted metrics ordered by average label score.
avg_sorted_scores, weighted_cluster_density_sorted, weighted_fraudar_scores_sorted, weighted_edge_perc_scores_sorted, weighted_modularity_scores_sorted = \
    zip(*sorted(zip(avg_scores, weighted_cluster_density, weighted_fraudar_scores, weighted_edge_perc_scores, weighted_modularity_scores)))
plt.plot(weighted_cluster_density_sorted, label = 'density')
plt.plot(weighted_fraudar_scores_sorted, label= 'fraudar')
plt.plot(weighted_edge_perc_scores_sorted, label= 'edge percentage')
plt.plot(weighted_modularity_scores_sorted, label= 'modularity')
plt.title("WEIGHTED METRICS sorted by label score")
plt.legend()
plt.show()

# Unweighted metrics ordered by average label score.
avg_sorted_scores, unweighted_cluster_density_sorted, unweighted_fraudar_scores_sorted, unweighted_edge_perc_scores_sorted, unweighted_modularity_scores_sorted = \
    zip(*sorted(zip(avg_scores, unweighted_cluster_density, unweighted_fraudar_scores, unweighted_edge_perc_scores, unweighted_modularity_scores)))
plt.plot(unweighted_cluster_density_sorted, label = 'density')
plt.plot(unweighted_fraudar_scores_sorted, label= 'fraudar')
plt.plot(unweighted_edge_perc_scores_sorted, label= 'edge percentage')
# Fixed: this figure previously drew the *weighted* modularity curve.
plt.plot(unweighted_modularity_scores_sorted, label= 'modularity')
plt.title("Unweighted Metrics sorted by label score")
plt.legend()
plt.show()

# Weighted metrics ordered by cluster count (markers only).
cluster_counts_sorted, weighted_cluster_density_sorted, weighted_fraudar_scores_sorted, weighted_edge_perc_scores_sorted, weighted_modularity_scores_sorted = \
    zip(*sorted(zip(cluster_counts, weighted_cluster_density, weighted_fraudar_scores, weighted_edge_perc_scores, weighted_modularity_scores)))
plt.plot(weighted_cluster_density_sorted, label = 'density', linestyle='None', marker='o')
plt.plot(weighted_fraudar_scores_sorted, label= 'fraudar', linestyle='None', marker='o')
plt.plot(weighted_edge_perc_scores_sorted, label= 'edge percentage', linestyle='None', marker='o')
plt.plot(weighted_modularity_scores_sorted, label= 'modularity', linestyle='None', marker='o')
plt.title("WEIGHTED METRICS vs cluster counts")
plt.legend()
plt.show()

# Unweighted metrics ordered by cluster count (markers only).
# Fixed: the sort key must be the original cluster_counts. Zipping the already
# sorted counts with the unsorted metric lists paired each metric with the
# wrong cluster's count.
cluster_counts_sorted, unweighted_cluster_density_sorted, unweighted_fraudar_scores_sorted, unweighted_edge_perc_scores_sorted, unweighted_modularity_scores_sorted = \
    zip(*sorted(zip(cluster_counts, unweighted_cluster_density, unweighted_fraudar_scores, unweighted_edge_perc_scores, unweighted_modularity_scores)))
plt.plot(unweighted_cluster_density_sorted, label = 'density', linestyle='None', marker='o')
plt.plot(unweighted_fraudar_scores_sorted, label= 'fraudar', linestyle='None', marker='o')
plt.plot(unweighted_edge_perc_scores_sorted, label= 'edge percentage', linestyle='None', marker='o')
plt.plot(unweighted_modularity_scores_sorted, label= 'modularity', linestyle='None', marker='o')
plt.title("Unweighted Metrics vs cluster counts")
plt.legend()
plt.show()



In [None]:
# Silhouette score over the points that are not labelled -1 (noise), followed
# by a joint re-sort of the per-cluster sequences.
original_labels = labels.copy()
# Positions of the points labelled -1.
noise_labels = np.where(original_labels==-1)[0]
print (len(noise_labels))
total_index = list(range(len(encoded_vecs)))
# print(list(set(total_index)- set(noise_labels)))
# Remaining (non-noise) positions; produced by a set difference, so their
# order is whatever set iteration yields. The same rem_index selects both the
# labels and the vectors below, so the two stay paired.
rem_index = list(set(total_index)- set(noise_labels))
# print (rem_index)
original_labels = list(original_labels)
non_noisy_labels = [original_labels[i] for i in rem_index]
non_noisy_vecs = encoded_vecs[rem_index,:][:]
# original_labels = original_labels.reshape(-1, 1)
# print (original_labels.shape)
# non_noisy_labels = original_labels[:, rem_index]
from sklearn.metrics import silhouette_score
# The score is stored but not printed in this cell.
score = silhouette_score(non_noisy_vecs, non_noisy_labels, metric='euclidean')
# sample_scores = silhouette_samples_block(non_noisy_vecs, non_noisy_labels, metric='euclidean')
# print (sample_scores)

#cluster_density, fraudar_scores, cluster_counts, clusters = zip(*sorted(zip(cluster_density, fraudar_scores, cluster_counts, clusters)))
# Re-order all six sequences together, ascending by avg_scores (ties broken by
# the following fields); each name is rebound to a tuple.
avg_scores, modularity_scores, fraudar_scores, cluster_density,cluster_counts, clusters = zip(*sorted(zip(avg_scores, modularity_scores,fraudar_scores, cluster_density, cluster_counts, clusters)))

In [None]:
# rcParams['figure.figsize'] = 20,10
# plt.plot(cluster_density)
# plt.title("CLUSTER DENSITIES")
# plt.show()

# plt.plot(fraudar_scores)
# plt.title("FRAUDAR")
# plt.show()

# plt.plot(modularity_scores)
# plt.title("MODULARITY")
# plt.show()

# plt.plot(avg_scores)
# plt.title("AVERAGE SCORE")
# plt.show()

In [None]:
# Number of entries currently held in filtered_clusters.
print (len(filtered_clusters))


In [None]:
# Histogram of the values in avg_scores, drawn on a small square figure.
num_bins = 7
rcParams['figure.figsize'] = (5, 5)
n, bins, patches = plt.hist(avg_scores, bins=num_bins, alpha=0.5, facecolor='blue')
plt.show()


In [None]:
# Co-sort the three per-cluster sequences so the largest cluster metric comes
# first: ascending tuple sort, then flip the whole ranking in place.
_ranked = sorted(zip(filtered_cluster_metric, avg_label_score, filtered_clusters))
_ranked.reverse()
# Transpose back into three parallel lists.
filtered_cluster_metric, avg_label_score, filtered_clusters = (
    list(_column) for _column in zip(*_ranked)
)

In [None]:
# Tally the clusters into three bands of avg_label_score:
#   slot 0: score < 2, slot 1: 2 <= score <= 3.5, slot 2: everything else.
# Clusters labelled -1 are not counted.
total_count = [0, 0, 0]

for index, cl in enumerate(filtered_clusters):
    if cl != -1:
        # Rows of this cluster; not used by the tally below.
        df_filt = df_data[df_data['cluster_label'] == cl]
        if avg_label_score[index] < 2:
            _band = 0
        elif 2 <= avg_label_score[index] <= 3.5:
            _band = 1
        else:
            _band = 2
        total_count[_band] += 1
print(total_count)

In [None]:
def get_combinations(words_hashmap, phrase, max_words=10):
    """Extend ``phrase`` by chaining bigrams and return every extension found.

    A bigram can be chained when its first word equals the last word of the
    phrase; its second word is then appended. Chaining continues recursively
    until no bigram applies or the phrase reaches ``max_words`` words.

    Args:
        words_hashmap: dict mapping a word to the list of two-word strings
            that start with it (the shape built by ``get_word_hashmap``).
        phrase: whitespace-separated starting phrase.
        max_words: a phrase with this many words is kept but not extended
            further.

    Returns:
        List of extended phrases. Each extension is immediately followed by
        the longer phrases derived from it. Empty when nothing can be chained
        or when ``phrase`` contains no words.
    """
    tokens = phrase.split()
    if not tokens:
        return []

    phrase_list = []
    for bigram in words_hashmap.get(tokens[-1], ()):
        # Skipping bigrams that already occur in the phrase keeps cyclic
        # vocabularies (a -> b -> a ...) from recursing forever.
        if bigram in phrase:
            continue
        new_phrase = phrase + ' ' + bigram.split()[1]
        phrase_list.append(new_phrase)
        if len(tokens) + 1 < max_words:
            phrase_list.extend(
                get_combinations(words_hashmap, new_phrase, max_words))
    return phrase_list
            
def get_word_hashmap(words_list):
    """Index two-word strings by their first word.

    Args:
        words_list: iterable of strings, each made of exactly two
            whitespace-separated words.

    Returns:
        dict mapping each first word to the list of input strings that start
        with it, in input order.

    Raises:
        ValueError: if an entry does not split into exactly two words.
    """
    print ("Creating hashmap")
    words_hashmap = {}
    for words in words_list:
        # Unpacking doubles as validation that the entry really is a bigram.
        first, _second = words.split()
        words_hashmap.setdefault(first, []).append(words)
    return words_hashmap

In [None]:
# Scatter the 2-D embedding, drawing the members of the filtered clusters in
# opaque red and every other point as almost transparent black.
color_array = ['grey'] * len(true_labels)

# Gather the positions of all points that belong to a filtered cluster
# (label -1 is skipped).
anomaly_indices = []
for ind, cl in enumerate(filtered_clusters):
    if cl != -1:
        cluster_idx = np.argwhere(labels == cl).ravel()
        anomaly_indices.extend(list(cluster_idx))

# One RGBA row per point: start at zero everywhere, alpha 0.01.
rgba_colors = np.zeros((len(true_labels), 4))
rgba_colors[:, 3] = 0.01
print(anomaly_indices)
for ind in anomaly_indices:
    # Red channel and alpha both go to 1 for highlighted points.
    rgba_colors[ind, [0, 3]] = 1.0

rcParams['figure.figsize'] = (20, 10)
plt.scatter(X_embedded.T[0], X_embedded.T[1], color=rgba_colors)

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette over every encoded vector, using original_labels as the
# cluster assignment.
score = silhouette_score(X=encoded_vecs, labels=original_labels, metric='euclidean')
print(score)

In [None]:
import time
import csv
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer
from numpy import linalg as LA

def print_topics(model, count_vectorizer, n_top_words):
    """Return the highest-weighted vocabulary terms of the model's last topic.

    Despite the name, nothing is printed.

    Args:
        model: fitted topic model exposing ``components_`` (one weight row
            per topic, one column per vocabulary term).
        count_vectorizer: fitted vectorizer whose feature names line up with
            the columns of ``model.components_``.
        n_top_words: number of terms to return.

    Returns:
        List of up to ``n_top_words`` terms ordered from highest to lowest
        weight. When the model has several topics only the last topic's terms
        are returned; an empty list when there are no topics.
    """
    # get_feature_names() no longer exists in current scikit-learn; prefer
    # its replacement and fall back for older installs.
    get_names = getattr(count_vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = count_vectorizer.get_feature_names
    words = list(get_names())

    word_list = []
    for topic in model.components_:
        # argsort is ascending; the reversed slice picks the n largest weights.
        word_list = [words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]

    return word_list

# Tunables for the per-cluster LDA: how many topics to fit and how many top
# bigrams to keep per cluster.
number_topics, number_words = 1, 30

# model = lda.LDA(n_topics=20, n_iter=1500, random_state=1)
# filtered_cluster_metric, avg_label_score, filtered_clusters = zip(*sorted(zip(filtered_cluster_metric, avg_label_score, filtered_clusters)))

# Accumulators used by the per-cluster loop.
count = 0
trafficking_ads, eigen_ratios = [], []
cluster_word_list, low_eigen_clusters = [], []

# Suffixes '1m' .. '6m' used when building terminal highlight escape codes.
colors_highlight = ['{}m'.format(_digit) for _digit in range(1, 7)]
# Per-cluster keyword extraction and spectral summary.
# For every cluster in filtered_clusters (label -1 is skipped) the loop:
#   * builds a bigram term-frequency matrix from the cluster's 'content_p' texts,
#   * fits an LDA model on it and takes the top `number_words` bigrams,
#   * ranks those bigrams by their maximum score over the cluster's rows of b_mat,
#   * appends ((s0 + 1) / (s1 + 1)) ** 2 to eigen_ratios, where s0 and s1 are the
#     two leading singular values of the local matrix.
# NOTE(review): a cluster labelled -1 adds a '' placeholder to cluster_word_list
# but nothing to eigen_ratios, so eigen_ratios is shorter than filtered_clusters
# whenever -1 is present - keep that in mind before zipping the two together.

# `original_labels` may be a plain list by the time this cell runs; comparing a
# list with a scalar yields a single bool rather than an element-wise mask, so
# the comparison is done on an array view instead.
label_array = np.asarray(original_labels)
for index, cl in enumerate(filtered_clusters):
    if cl == -1:
        cluster_word_list.append('')
        continue
    count += 1
#     if count == 10:
#         break
    cluster_idx = np.argwhere(label_array == cl).reshape(-1)

    # Maximum score of every global feature over the rows of this cluster.
    ajac_big = (b_mat[cluster_idx,:][:]).todense()
    agg_scores = np.max(np.asarray(ajac_big), axis=0)
#     if avg_label_score[index] < 3.5:
#         continue
    df_filt = df_data[df_data['cluster_label']==cl]
    df_filt = df_filt.sort_values(by=['probabilities'], ascending=False)
    content = []
#     for ind, row in df_filt.iterrows():
#         c = ''
#         if row['title'] and type(row['title']) == type(' '):
#             c = row['title']
#         c += row['body']
#         c = re.sub(r'\d+', '', c)
#         if type(row['Name']) == type(''):
#             name = row['Name'].split(';')
#             for n in name:
#                 name_regex = re.compile(re.escape(n), re.IGNORECASE)
#                 c = name_regex.sub('', c)
#             c = re.sub(r'[^\x00-\x7F]+',' ', c)
#             cleanr = re.compile('<.*?>')
#             c = re.sub(cleanr, '', c)
#         content.append(c)

    content = list(df_filt['content_p'])
#     print (content.shape)
    # Create and fit the LDA model
    count_vectorizer = TfidfVectorizer(ngram_range=(2,2), use_idf=False)
    count_data = count_vectorizer.fit_transform(content)
    lda = LDA(n_components=number_topics)
    lda.fit(count_data)
    # get_feature_names() no longer exists in current scikit-learn; prefer its
    # replacement. list() keeps the .index() lookups below working, because the
    # replacement returns an array.
    if hasattr(count_vectorizer, 'get_feature_names_out'):
        col_names = list(count_vectorizer.get_feature_names_out())
    else:
        col_names = count_vectorizer.get_feature_names()
    # Print the topics found by the LDA model
#     print("Topics found via LDA:")

    svd = TruncatedSVD(n_components=10)
    local_vecs = svd.fit_transform(count_data)
    w = svd.singular_values_
    count_data = count_data.todense()
    word_list = print_topics(lda, count_vectorizer, number_words)
    # Maximum local term frequency of every local bigram within the cluster.
    agg_scores_local = np.max(np.asarray(count_data), axis=0)
    tf_idf = []
    tf_local = []
#     print (agg_scores)
    for word in word_list:
        # features_col.index raises ValueError when a local bigram is missing
        # from the global vocabulary.
        big_index = features_col.index(word)
        tf_idf.append(agg_scores[big_index])
        tf_local.append(agg_scores_local[col_names.index(word)])
#     hashmap = get_word_hashmap(word_list)
#     total_list = []
#     for word in word_list:
#         total_list += get_combinations(hashmap, word)
#     print (", ".join(word for word in total_list))
    # Order the bigrams from highest to lowest global score.
    tf_idf, tf_local, word_list = zip(*sorted(zip(tf_idf, tf_local, word_list)))
    tf_idf = list(reversed(tf_idf))
    tf_local = list(reversed(tf_local))
    word_list = list(reversed(word_list))

#     print (tf_idf)
#     print ("-------------------------Local Scores------------------------")
#     print (tf_local)
#     print (word_list)
    cluster_word_list.append(word_list)
#     bigram_list = bigrams_dict[cl]
#     print (", ".join(word for word in word_list))
#     print (bigrams_dict[cl])
#     print (", ".join(word for word in bigram_list))
#     common_bigrams = list(set(word_list) & set(bigram_list))
#     print ("--------------------------------------------------")
#     print (", ".join(word for word in common_bigrams))
#     ads = []
#     print ("\n\n")
    # Squared, +1-smoothed ratio of the two leading singular values.
    eig_ratio = (w[0]+1)/(w[1]+1)
    eig_ratio = eig_ratio * eig_ratio
    eigen_ratios.append(eig_ratio)
#     if eig_ratio > 0.0:
#         print ('=========Cluster = {}, Average Score = {}, Cluster metric = {}, Eigenvalues ratio = {}==========\n'.format(
#             cl, avg_label_score[index], filtered_cluster_metric[index], eig_ratio*eig_ratio))
#         print (bigrams_dict[cl])
#         for ind, row in df_filt.iterrows():
#             body = row['body']
#             title = row['title']
#             for i, bigram in enumerate(word_list):
#     #             print (bigram)
#                 try:
#                     start_ind = body.lower().index(bigram)
#                     end_ind = start_ind + len(bigram)
#     #                 print (start_ind)
#                     body = body[:start_ind] + '\033[4' + colors_highlight[i%7] + body[start_ind:end_ind] + '\033[m' + body[end_ind:]
#                     start_ind = title.lower().index(bigram)
#                     end_ind = start_ind + len(bigram)
#     #                 print (start_ind)
#                     title = title[:start_ind] + '\033[4' + colors_highlight[i%7] + title[start_ind:end_ind] + '\033[m' + title[end_ind:]
#                 except:
#                     pass
#             print ("----------------------------------------------------------------------------------------------------------------")
#             print ("{}, {}, {}".format(title, body, row['Name']))
#             ad = [row['id'], row['title'], row['body'], row['label'], count]
#             trafficking_ads.append(ad)
#     #     break
#         time.sleep(5)
#         print ('\n\n')
plt.plot(eigen_ratios)
plt.show()
# with open('../data/svd_results/trafficking_ads_check_highlight.csv', 'w') as f:
#     csv_writer = csv.writer(f)
#     csv_writer.writerows(trafficking_ads)

In [None]:
# Rank correlation between the cluster metric and the eigen ratios; [0] takes
# the first element of the result (the correlation coefficient if this is
# scipy.stats.spearmanr - the import is not visible here, TODO confirm).
# NOTE(review): eigen_ratios gets no entry for clusters labelled -1, so the two
# sequences only line up when filtered_clusters contains no -1.
spearmanr(filtered_cluster_metric, eigen_ratios)[0]

In [None]:
# Co-sort the clusters by ascending eigen ratio.
# eigen_ratios holds one value per cluster whose label is not -1 (the loop that
# fills it skips -1), so the -1 entries are dropped before pairing; otherwise
# every ratio after a -1 would be attached to the wrong cluster.
# NOTE: after this cell filtered_clusters is a tuple in eigen-ratio order and no
# longer lines up with filtered_cluster_metric / avg_label_score.
eigen_ratios, filtered_clusters = zip(*sorted(zip(
    eigen_ratios,
    [_cluster for _cluster in filtered_clusters if _cluster != -1])))

In [None]:
# Dump the ad bodies of every cluster whose eigen ratio is below 1.5 and
# report how many such clusters there are.
counts = 0

for ind, cl in enumerate(filtered_clusters):
    if not (eigen_ratios[ind] < 1.5):
        continue
    counts += 1
    print('=======================================================================================')
    df_filt = df_data[df_data['cluster_label'] == cl]
    for i, row in df_filt.iterrows():
        print('{}'.format(row['body']))
        print('---------------------------------------------------------------------------------')
print(counts)

In [None]:
# Display the shape of df_results.
df_results.shape

In [None]:
# Number of entries currently held in filtered_clusters.
print (len(filtered_clusters))

In [None]:
# Fit a bigram TF-IDF model on the preprocessed text of df_results.
# Rebinds `vectorizer`, `bigram_matrix` and `features_col`.
content = list(df_results['content_p'])

# stop_words is built as a set; newer scikit-learn releases validate this
# argument and accept only 'english', a list or None, so pass a list (sorted
# for a reproducible order).
vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(2,2), norm='l2',
     stop_words=sorted(stop_words), min_df=2, max_df=0.8)
bigram_matrix = vectorizer.fit_transform(content)
# get_feature_names() no longer exists in current scikit-learn; prefer its
# replacement. features_col is kept as a list because other cells call .index()
# on it.
if hasattr(vectorizer, 'get_feature_names_out'):
    features_col = list(vectorizer.get_feature_names_out())
else:
    features_col = vectorizer.get_feature_names()
print(bigram_matrix.shape)

In [None]:
# Replace the index of df_results in place with the default integer index.
# `drop` is not set, so the previous index is kept as a column.
df_results.reset_index(inplace=True)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Represent each filtered cluster by the mean TF-IDF row of its members, then
# compare all clusters with each other by cosine similarity.
cluster_labels_results = df_results['cluster_label']
cluster_vectors = np.zeros((len(filtered_clusters), bigram_matrix.shape[1]))
for ind, cl in enumerate(filtered_clusters):
    # Row positions in df_results that carry this cluster label.
    cluster_idx = np.argwhere(cluster_labels_results == cl).ravel()
    tf_cluster_mat = bigram_matrix[cluster_idx, :].todense()
    tf_cluster_mat_flat = tf_cluster_mat.mean(axis=0)
    print(tf_cluster_mat_flat.shape)
    cluster_vectors[ind] = tf_cluster_mat_flat

pairwise_sim_mat = cosine_similarity(cluster_vectors, dense_output=True)

In [None]:
# Display the cluster-by-cluster cosine similarity matrix.
pairwise_sim_mat

In [None]:
# Blank out the diagonal in place so a cluster is never matched with itself,
# then list the (row, column) positions whose similarity exceeds 0.9.
np.fill_diagonal(pairwise_sim_mat, 0.0)
np.nonzero(pairwise_sim_mat > 0.9)

In [None]:
# Show the 'body' column of the rows whose cluster_label is 261 (hard-coded
# cluster id picked for manual inspection).
df_results[df_results['cluster_label']==261]['body']
