In [1]:
import numpy as np
from scipy.sparse import csr_matrix

In [6]:
# csr_a[[0,1], :][:, [3,4]].todense()


def get_csr_counts(relation, shape, axis=1):
    """Count the non-zero entries of ``relation`` per row or per column.

    Parameters
    ----------
    relation : matrix-like
        Any object whose ``nonzero()`` returns a ``(row_indices, col_indices)``
        pair, e.g. the boolean sparse matrix produced by ``mat > 0``.
    shape : tuple of int
        ``(n_rows, n_cols)`` of ``relation``; ``shape[axis]`` is the length of
        the returned array.
    axis : int, default 1
        Which index coordinate to group by. This is *not* NumPy's
        reduction-axis convention: ``axis=1`` yields one count per column,
        ``axis=0`` yields one count per row.

    Returns
    -------
    numpy.ndarray
        Float array of length ``shape[axis]``; position ``k`` holds the number
        of non-zero entries whose row (``axis=0``) or column (``axis=1``)
        index equals ``k``.

    Raises
    ------
    IndexError
        If ``relation`` contains an index that does not fit in ``shape[axis]``.
    """
    size = shape[axis]
    indices = np.asarray(relation.nonzero()[axis]).ravel()
    if indices.size == 0:
        # Nothing to count (also covers zero-sized dimensions).
        return np.zeros(size)

    # One linear pass instead of rescanning the index array per distinct index.
    counts = np.bincount(indices, minlength=size)
    if counts.shape[0] > size:
        raise IndexError(
            "index {} is out of bounds for axis {} with size {}".format(
                counts.shape[0] - 1, axis, size
            )
        )
    # Keep the float dtype callers received from the previous np.zeros buffer.
    return counts.astype(float)
# print (np_count)

In [7]:
def get_all_subgraphs(mat_data, cl_ind):
    """Derive the shell, outer and core sub-matrices for a cluster of rows.

    Parameters
    ----------
    mat_data : sparse matrix
        Full matrix; must support fancy indexing, ``> 0`` and
        ``.copy().tolil()``. Rows are referred to as "ads" and columns as
        "bigrams" by the variable names below.
    cl_ind : sequence of int
        Row indices of ``mat_data`` that make up the cluster.

    Returns
    -------
    tuple of three CSR matrices ``(shell_subgraph, outer_subgraph, core_subgraph)``
        All three share the same rows (``ads_in_shell_index``) and columns
        (``outer_bigrams_index``: one-count columns followed by core columns).
        They differ only in which entries have been zeroed; each one is sliced
        from ``mat_copy`` after a further in-place zeroing step, so the order
        of the statements below matters.
    """
    # Restrict to the cluster's rows and count, per column, how many of those
    # rows have a strictly positive entry.
    filtered_sub = mat_data[cl_ind,:][:]
    bigrams_count = get_csr_counts((filtered_sub>0), filtered_sub.shape)
    # Columns absent from the cluster. NOTE(review): computed but never used.
    zero_count_index = np.where(bigrams_count==0)[0]
    # Columns used by exactly one cluster row.
    one_count_index = list(np.where(bigrams_count==1)[0])
    # Columns used by more than one cluster row: the "core" bigrams.
    core_bigrams_index = list(np.where(bigrams_count>1)[0])
    # List concatenation: one-count columns first, then core columns.
    outer_bigrams_index = list(one_count_index + core_bigrams_index)
    
    # All rows of the full matrix, restricted to the core columns.
    bigram_induced_graph = mat_data[:][:,core_bigrams_index]
    # Per row (axis=0 of get_csr_counts): number of core columns it touches.
    ads_count = get_csr_counts((bigram_induced_graph>0), bigram_induced_graph.shape, axis=0)
    # Rows touching more than one core column; these are the rows of every
    # returned sub-matrix.
    ads_in_shell_index = np.where(ads_count>1)[0]
    
    # Selected rows that are not part of the given cluster.
    not_core_ads = list(set(list(ads_in_shell_index)) - set(cl_ind))
    # one_count_index and core_bigrams_index are disjoint by construction
    # (count == 1 vs count > 1), so this keeps every one-count column.
    not_core_bigrams = list(set(list(one_count_index)) - set(core_bigrams_index))
    # Work on a LIL copy so the caller's matrix is untouched and item
    # assignment is supported.
    mat_copy = mat_data.copy().tolil()
    # Step 1: clear entries linking non-cluster rows to one-count columns.
    for i in not_core_ads:
        for j in not_core_bigrams:
            mat_copy[i, j] = 0
    shell_subgraph = mat_copy[ads_in_shell_index,:][:,outer_bigrams_index]
    # Step 2: additionally clear every entry of the non-cluster rows.
    mat_copy[not_core_ads, :] = 0
    outer_subgraph = mat_copy[ads_in_shell_index,:][:,outer_bigrams_index]
    # Step 3: additionally clear the one-count columns, leaving only entries
    # between cluster rows and core columns.
    mat_copy[:,one_count_index] = 0
    core_subgraph = mat_copy[ads_in_shell_index,:][:,outer_bigrams_index]

    
    return shell_subgraph.tocsr(), outer_subgraph.tocsr(), core_subgraph.tocsr()

In [2]:
def get_counts(core_mat):
    """Return ``(ads_count, bigrams_count)`` for ``core_mat``.

    ``ads_count[k]`` is the number of strictly positive entries in row ``k``;
    ``bigrams_count[k]`` is the number of strictly positive entries in column
    ``k``. Both are produced by ``get_csr_counts``.
    """
    positive = core_mat > 0
    dims = core_mat.shape
    per_row = get_csr_counts(positive, dims, axis=0)
    per_column = get_csr_counts(positive, dims, axis=1)
    return per_row, per_column

In [3]:
import math

def calculate_unweighted_density(core_mat, ads_count, bigrams_count):
    """Edge count of ``core_mat`` divided by ``(#ads * #bigrams + 1)``.

    ``#ads`` / ``#bigrams`` are the numbers of positive entries in
    ``ads_count`` / ``bigrams_count``. The ``+ 1`` keeps the denominator
    non-zero when the core is empty.
    """
    n_ads = np.count_nonzero(ads_count > 0)
    n_bigrams = np.count_nonzero(bigrams_count > 0)
    n_edges = csr_matrix.count_nonzero(core_mat)
    return n_edges / (n_ads * n_bigrams + 1)

def calculate_weighted_density(core_mat, ads_count, bigrams_count):
    """Total edge weight of ``core_mat`` divided by ``(#ads * #bigrams + 1)``.

    ``#ads`` / ``#bigrams`` are the numbers of positive entries in
    ``ads_count`` / ``bigrams_count``. The ``+ 1`` keeps the denominator
    non-zero when the core is empty.
    """
    n_ads = np.count_nonzero(ads_count > 0)
    n_bigrams = np.count_nonzero(bigrams_count > 0)
    total_weight = csr_matrix.sum(core_mat)
    return total_weight / (n_ads * n_bigrams + 1)

def calculate_unweighted_fraudar_score(core_mat, ads_count, bigrams_count):
    """Edge count of ``core_mat`` divided by ``(#ads + #bigrams + 1)``.

    ``#ads`` / ``#bigrams`` are the numbers of positive entries in
    ``ads_count`` / ``bigrams_count``; the ``+ 1`` avoids a zero denominator.
    """
    n_ads = np.count_nonzero(ads_count > 0)
    n_bigrams = np.count_nonzero(bigrams_count > 0)
    n_edges = csr_matrix.count_nonzero(core_mat)
    return n_edges / (n_ads + n_bigrams + 1)

def calculate_weighted_fraudar_score(core_mat, ads_count, bigrams_count):
    """Total edge weight of ``core_mat`` divided by ``(#ads + #bigrams + 1)``.

    ``#ads`` / ``#bigrams`` are the numbers of positive entries in
    ``ads_count`` / ``bigrams_count``; the ``+ 1`` avoids a zero denominator.
    """
    n_ads = np.count_nonzero(ads_count > 0)
    n_bigrams = np.count_nonzero(bigrams_count > 0)
    total_weight = csr_matrix.sum(core_mat)
    return total_weight / (n_ads + n_bigrams + 1)

def calculate_unweighted_edge_per_score(core_mat, outer_mat, bigrams_count):
    """Smoothed ratio of core edges to outer edges, scaled by ``log(#bigrams + 1)``.

    Edges are the stored non-zero entries of each matrix; ``#bigrams`` is the
    number of positive entries in ``bigrams_count``. Both edge counts get a
    ``+ 1`` so the ratio is defined when ``outer_mat`` is empty.
    """
    n_bigrams = np.count_nonzero(bigrams_count > 0)
    edge_ratio = (csr_matrix.count_nonzero(core_mat) + 1) / (csr_matrix.count_nonzero(outer_mat) + 1)
    return edge_ratio * math.log(n_bigrams + 1)

def calculate_weighted_edge_per_score(core_mat, outer_mat, bigrams_count):
    """Smoothed ratio of core weight to outer weight, scaled by ``log(#bigrams + 1)``.

    Weights are the sums of all entries of each matrix; ``#bigrams`` is the
    number of positive entries in ``bigrams_count``. Both sums get a ``+ 1``
    so the ratio is defined when ``outer_mat`` is empty.
    """
    n_bigrams = np.count_nonzero(bigrams_count > 0)
    weight_ratio = (csr_matrix.sum(core_mat) + 1) / (csr_matrix.sum(outer_mat) + 1)
    return weight_ratio * math.log(n_bigrams + 1)


def calculate_custom_score(core_mat, outer_mat):
    """Mean core-bigram degree relative to the outer edge count, log-scaled.

    Computes::

        (sum(bigram_degrees) / #core_bigrams) / (outer_edges + 1)
            * log(#core_bigrams + 1) * log(#core_ads + 1)

    where rows are ads and columns are bigrams (the convention used by the
    other score functions in this notebook).

    Parameters
    ----------
    core_mat, outer_mat : array-like or scipy sparse matrix
        Core and outer adjacency matrices. Sparse inputs are densified.

    Returns
    -------
    float
        The score; ``0.0`` when ``core_mat`` has no non-zero entry.

    Notes
    -----
    ``np.count_nonzero(..., axis=0)`` yields one value per *column* (bigram)
    and ``axis=1`` one value per *row* (ad). The previous code had the two
    swapped when counting core ads and core bigrams, so bigram degrees were
    normalised by the number of ads.
    """
    def _dense(mat):
        # np.asarray() on a scipy sparse matrix gives a 0-d object array.
        return mat.toarray() if hasattr(mat, "toarray") else np.asarray(mat)

    core = _dense(core_mat)
    outer = _dense(outer_mat)

    bigram_degrees = np.count_nonzero(core, axis=0)  # per column
    ad_degrees = np.count_nonzero(core, axis=1)      # per row
    bigrams_core_num = int(np.count_nonzero(bigram_degrees > 0))
    ads_core_num = int(np.count_nonzero(ad_degrees > 0))

    if bigrams_core_num == 0:
        # Empty core: avoid 0/0 (NaN) and match the other scores, which are 0.
        return 0.0

    outer_edges = np.count_nonzero(outer)
    mean_bigram_degree = np.sum(bigram_degrees) / bigrams_core_num
    return (
        (mean_bigram_degree / (outer_edges + 1))
        * math.log(bigrams_core_num + 1)
        * math.log(ads_core_num + 1)
    )
# #     print (mat.shape)
#     edges_nonzero = np.count_nonzero(mat, axis=0)
#     unique, counts = np.unique(edges_nonzero, return_counts=True)
#     degree_counts = dict(zip(unique, counts))
#     numerator = 0.0
#     denominator = 0.0
#     half = max(mat.shape[0]/2, 2)
#     for k, v in degree_counts.items():
#         if k == 0:
#             continue
#         elif k <= half:
#             denominator += k*v
#         else:
#             denominator += k*v
#             numerator += k*v
#     if denominator == 0.0:
#         return 0.0
#     else:
#         return numerator/denominator

# def calculate_weighted_edge_per_score(mat):
#     return 0.0

# TODO: this should take shell_mat rather than outer_mat; switch once the shell subgraph can be computed.
def calculate_unweighted_modularity_score(core_mat, outer_mat, total_edges):
    """Modularity-style score of the core, using unweighted degrees.

    For every non-zero entry ``(i, j)`` of ``core_mat`` (restricted to rows
    and columns that have a positive entry) this accumulates
    ``1 - ad_degrees[i] * bigram_degrees[j] / total_edges``, where the degrees
    are non-zero counts of ``outer_mat`` per row and per column. The sum is
    divided by ``total_edges`` and scaled by ``log(#core_ads + 1)``.

    NOTE(review): the ``core_mat[i][j]`` / ``np.asarray(outer_mat)`` usage
    suggests both matrices are expected to be dense 2-D arrays — confirm
    against callers.
    """
    outer_dense = np.asarray(outer_mat)
    ad_degrees = np.count_nonzero(outer_dense, axis=1)      # per row
    bigram_degrees = np.count_nonzero(outer_dense, axis=0)  # per column

    core_positive = core_mat > 0
    ads_core_index = np.where(
        get_csr_counts(core_positive, core_mat.shape, axis=0) > 0
    )[0]
    bigrams_core_index = np.where(
        get_csr_counts(core_positive, core_mat.shape, axis=1) > 0
    )[0]

    summation = 0.0
    for i in ads_core_index:
        for j in bigrams_core_index:
            if core_mat[i][j] == 0:
                continue  # no edge between this ad and this bigram
            summation += 1 - (ad_degrees[i] * bigram_degrees[j]) / total_edges

    return (summation / total_edges) * math.log(len(ads_core_index) + 1)

def calculate_weighted_modularity_score(core_mat, outer_mat, total_edges, ads_count, bigrams_count):
    """Modularity-style score of the core, using edge weights.

    For every non-zero ``core_mat[i, j]`` with ``ads_count[i] > 0`` and
    ``bigrams_count[j] > 1`` this accumulates
    ``core_mat[i, j] - row_sum(outer_mat)[i] * col_sum(outer_mat)[j] / total_edges``.
    The sum is divided by ``total_edges`` and scaled by ``log(#core_ads + 1)``.

    NOTE(review): columns are selected with ``bigrams_count > 1`` whereas the
    other score functions use ``> 0`` — confirm the stricter threshold is
    intended.
    """
    # Sparse .sum(axis=...) returns a 2-D result; flatten to plain vectors.
    ad_strength = np.asarray(csr_matrix.sum(outer_mat, axis=1)).ravel()
    bigram_strength = np.asarray(csr_matrix.sum(outer_mat, axis=0)).ravel()

    core_ads = np.where(ads_count > 0)[0]
    core_bigrams = np.where(bigrams_count > 1)[0]

    summation = 0.0
    for i in core_ads:
        for j in core_bigrams:
            weight = core_mat[i, j]
            if weight == 0:
                continue  # no edge between this ad and this bigram
            summation += weight - (ad_strength[i] * bigram_strength[j]) / total_edges

    return (summation / total_edges) * math.log(len(core_ads) + 1)

def calculate_pairwise_modularity(mat, score_fn=None):
    """Score every unordered pair of rows of ``mat``.

    Parameters
    ----------
    mat : sparse matrix
        Must provide ``todense()``.
    score_fn : callable, optional
        Called with a ``(2, n_cols)`` array holding rows ``i`` and ``j`` and
        returning a scalar. Defaults to the module-level
        ``calculate_modularity_score``.
        NOTE(review): no function of that name is defined in the visible
        notebook, so the default raises ``NameError`` unless it is defined
        elsewhere — pass ``score_fn`` explicitly or restore the definition.

    Returns
    -------
    numpy.ndarray
        ``(n_rows, n_rows)`` float array. Only the strict upper triangle
        (``i < j``) is filled; the diagonal and lower triangle stay zero.
    """
    if score_fn is None:
        # Resolved at call time so the previous behaviour is unchanged.
        score_fn = calculate_modularity_score

    dense = np.asarray(mat.todense())
    n_rows = dense.shape[0]
    sim_scores = np.zeros((n_rows, n_rows))
    for i in range(n_rows):
        # j starts at i + 1, so i == j can never happen here.
        for j in range(i + 1, n_rows):
            sim_scores[i, j] = score_fn(np.vstack((dense[i], dense[j])))

    return sim_scores

In [41]:
import math

def get_tf_idf(csr_mat):
    """Re-weight every entry of ``csr_mat`` by a per-column IDF-like factor.

    Entry ``(i, j)`` is multiplied by::

        log(1 + n_rows) / (1 + bigrams_count[j]) + 1

    where ``bigrams_count[j]`` is the number of strictly positive entries in
    column ``j`` (from ``get_counts``).

    NOTE(review): this divides ``log(1 + n_rows)`` by ``(1 + count)`` instead
    of taking the log of the ratio. The notebook's sanity-check cell
    (``math.log(5)/2 + 1``) evaluates the same expression, so the formula is
    kept as is.

    Parameters
    ----------
    csr_mat : scipy sparse matrix
        Input matrix; it is not modified.

    Returns
    -------
    scipy.sparse.csr_matrix
        Float CSR matrix of the same shape holding the re-weighted values.
        Integer inputs are promoted to float so the weights are not truncated.
    """
    _, bigrams_count = get_counts(csr_mat)
    n_rows = csr_mat.shape[0]
    column_weight = math.log(1 + n_rows) / (1 + np.asarray(bigrams_count, dtype=float)) + 1

    weighted = csr_matrix(csr_mat, dtype=float, copy=True)
    # In CSR, ``indices[k]`` is the column of ``data[k]``: scale only the
    # stored entries instead of visiting every (row, col) cell.
    weighted.data *= column_weight[weighted.indices]
    # The previous LIL round-trip never kept explicit zeros; keep that.
    weighted.eliminate_zeros()
    return weighted

### Coherence

In [48]:
# Graph A: row 0 only uses column 0; rows 1-3 all use columns 1, 2 and 3.
a = [
    [1, 0, 0, 0],
    [0, 1, 1, 1],
    [0, 1, 1, 1],
    [0, 1, 1, 1],
]
csr_a = get_tf_idf(csr_matrix(a, dtype=float))

[[1.80471896 0.         0.         0.        ]
 [0.         1.40235948 1.40235948 1.40235948]
 [0.         1.40235948 1.40235948 1.40235948]
 [0.         1.40235948 1.40235948 1.40235948]]


In [46]:
# Expected get_tf_idf weight for a column with one positive entry in a
# 4-row matrix: log(1 + 4) / (1 + 1) + 1.
math.log(1 + 4) / (1 + 1) + 1

1.8047189562170503

In [49]:
# Graph B: row 0 only uses column 0; rows 1-3 each use two of columns 1-3,
# so every pair of those rows shares exactly one column.
b = [
    [1, 0, 0, 0],
    [0, 1, 1, 0],
    [0, 1, 0, 1],
    [0, 0, 1, 1],
]
csr_b = get_tf_idf(csr_matrix(b, dtype=float))

In [50]:
# Score the cluster made of rows 1-3 of csr_a. Printed in order: weighted
# density, FRAUDAR score, edge-percentage score (vs outer, vs shell) and
# modularity score (vs outer, vs shell).
s, o, c = get_all_subgraphs(csr_a, [1, 2, 3])
ads_count, bigrams_count = get_counts(c)
total_edges_weighted = csr_a.sum()

print(
    calculate_weighted_density(c, ads_count, bigrams_count),
    calculate_weighted_fraudar_score(c, ads_count, bigrams_count),
    *(calculate_weighted_edge_per_score(c, ref, bigrams_count) for ref in (o, s)),
    *(
        calculate_weighted_modularity_score(c, ref, total_edges_weighted, ads_count, bigrams_count)
        for ref in (o, s)
    ),
)

0.9 1.2857142857142858 1.3862943611198906 1.3862943611198906 0.12476649250079012 0.12476649250079012


In [51]:
# Score the cluster made of rows 1-3 of csr_b. Printed in order: weighted
# density, FRAUDAR score, edge-percentage score (vs outer, vs shell) and
# modularity score (vs outer, vs shell).
s, o, c = get_all_subgraphs(csr_b, [1, 2, 3])
ads_count, bigrams_count = get_counts(c)
total_edges_weighted = csr_b.sum()

print(
    calculate_weighted_density(c, ads_count, bigrams_count),
    calculate_weighted_fraudar_score(c, ads_count, bigrams_count),
    *(calculate_weighted_edge_per_score(c, ref, bigrams_count) for ref in (o, s)),
    *(
        calculate_weighted_modularity_score(c, ref, total_edges_weighted, ads_count, bigrams_count)
        for ref in (o, s)
    ),
)

0.9218875824868199 1.316982260695457 1.3862943611198906 1.3862943611198906 0.5129795226206882 0.5129795226206882


### Exclusivity

In [52]:
# Graph A: rows 1-2 share columns 1 and 2 and nothing else; rows 0 and 3
# each use a column of their own (0 and 3).
a = [
    [1, 0, 0, 0],
    [0, 1, 1, 0],
    [0, 1, 1, 0],
    [0, 0, 0, 1],
]
csr_a = get_tf_idf(csr_matrix(a, dtype=float))

In [56]:
# Graph B: rows 1-2 still share columns 1 and 2, but row 1 also uses
# column 0 (shared with row 0) and row 2 also uses column 3 (shared with row 3).
b = [
    [1, 0, 0, 0],
    [1, 1, 1, 0],
    [0, 1, 1, 1],
    [0, 0, 0, 1],
]
csr_b = get_tf_idf(csr_matrix(b, dtype=float))

In [57]:
# Score the cluster made of rows 1-2 of csr_a. Printed in order: weighted
# density, FRAUDAR score, edge-percentage score (vs outer, vs shell) and
# modularity score (vs outer, vs shell).
s, o, c = get_all_subgraphs(csr_a, [1, 2])
ads_count, bigrams_count = get_counts(c)
total_edges_weighted = csr_a.sum()

print(
    calculate_weighted_density(c, ads_count, bigrams_count),
    calculate_weighted_fraudar_score(c, ads_count, bigrams_count),
    *(calculate_weighted_edge_per_score(c, ref, bigrams_count) for ref in (o, s)),
    *(
        calculate_weighted_modularity_score(c, ref, total_edges_weighted, ads_count, bigrams_count)
        for ref in (o, s)
    ),
)

1.22918344331576 1.22918344331576 1.0986122886681098 1.0986122886681098 0.2560852512064189 0.2560852512064189


In [58]:
# Score the cluster made of rows 1-2 of csr_b. Printed in order: weighted
# density, FRAUDAR score, edge-percentage score (vs outer, vs shell) and
# modularity score (vs outer, vs shell).
s, o, c = get_all_subgraphs(csr_b, [1, 2])
ads_count, bigrams_count = get_counts(c)
total_edges_weighted = csr_b.sum()

print(
    calculate_weighted_density(c, ads_count, bigrams_count),
    calculate_weighted_fraudar_score(c, ads_count, bigrams_count),
    *(calculate_weighted_edge_per_score(c, ref, bigrams_count) for ref in (o, s)),
    *(
        calculate_weighted_modularity_score(c, ref, total_edges_weighted, ads_count, bigrams_count)
        for ref in (o, s)
    ),
)

1.22918344331576 1.22918344331576 0.7682442376717832 0.7682442376717832 0.13732653608351375 0.13732653608351375


### Rarity

In [59]:
# Graph A: rows 1-2 share columns 1 and 2, which no other row uses; rows 0
# and 3 each use a column of their own (0 and 3).
a = [
    [1, 0, 0, 0],
    [0, 1, 1, 0],
    [0, 1, 1, 0],
    [0, 0, 0, 1],
]
csr_a = get_tf_idf(csr_matrix(a, dtype=float))

In [60]:
# Graph B: columns 1 and 2 are each used by three rows (column 1 by rows
# 0-2, column 2 by rows 1-3), so the columns shared by rows 1-2 are also
# used outside that pair.
b = [
    [1, 1, 0, 0],
    [0, 1, 1, 0],
    [0, 1, 1, 1],
    [0, 0, 1, 1],
]
csr_b = get_tf_idf(csr_matrix(b, dtype=float))

In [61]:
# Score the cluster made of rows 1-2 of csr_a. Printed in order: weighted
# density, FRAUDAR score, edge-percentage score (vs outer, vs shell) and
# modularity score (vs outer, vs shell).
s, o, c = get_all_subgraphs(csr_a, [1, 2])
ads_count, bigrams_count = get_counts(c)
total_edges_weighted = csr_a.sum()

print(
    calculate_weighted_density(c, ads_count, bigrams_count),
    calculate_weighted_fraudar_score(c, ads_count, bigrams_count),
    *(calculate_weighted_edge_per_score(c, ref, bigrams_count) for ref in (o, s)),
    *(
        calculate_weighted_modularity_score(c, ref, total_edges_weighted, ads_count, bigrams_count)
        for ref in (o, s)
    ),
)

1.22918344331576 1.22918344331576 1.0986122886681098 1.0986122886681098 0.2560852512064189 0.2560852512064189


In [62]:
# Score the cluster made of rows 1-2 of csr_b. Printed in order: weighted
# density, FRAUDAR score, edge-percentage score (vs outer, vs shell) and
# modularity score (vs outer, vs shell).
s, o, c = get_all_subgraphs(csr_b, [1, 2])
ads_count, bigrams_count = get_counts(c)
total_edges_weighted = csr_b.sum()

print(
    calculate_weighted_density(c, ads_count, bigrams_count),
    calculate_weighted_fraudar_score(c, ads_count, bigrams_count),
    *(calculate_weighted_edge_per_score(c, ref, bigrams_count) for ref in (o, s)),
    *(
        calculate_weighted_modularity_score(c, ref, total_edges_weighted, ads_count, bigrams_count)
        for ref in (o, s)
    ),
)

1.1218875824868202 1.1218875824868202 0.8913925244674451 0.8913925244674451 0.21437821884213906 0.21437821884213906


### Support

In [63]:
# Graph A: rows 1-2 share two columns (1 and 2); rows 0 and 3 each use a
# column of their own (0 and 3).
a = [
    [1, 0, 0, 0],
    [0, 1, 1, 0],
    [0, 1, 1, 0],
    [0, 0, 0, 1],
]
csr_a = get_tf_idf(csr_matrix(a, dtype=float))

In [64]:
# Graph B: rows 1-2 share a single column (1); column 2 is unused.
b = [
    [1, 0, 0, 0],
    [0, 1, 0, 0],
    [0, 1, 0, 0],
    [0, 0, 0, 1],
]
csr_b = get_tf_idf(csr_matrix(b, dtype=float))

In [65]:
# Score the cluster made of rows 1-2 of csr_a. Printed in order: weighted
# density, FRAUDAR score, edge-percentage score (vs outer, vs shell) and
# modularity score (vs outer, vs shell).
s, o, c = get_all_subgraphs(csr_a, [1, 2])
ads_count, bigrams_count = get_counts(c)
total_edges_weighted = csr_a.sum()

print(
    calculate_weighted_density(c, ads_count, bigrams_count),
    calculate_weighted_fraudar_score(c, ads_count, bigrams_count),
    *(calculate_weighted_edge_per_score(c, ref, bigrams_count) for ref in (o, s)),
    *(
        calculate_weighted_modularity_score(c, ref, total_edges_weighted, ads_count, bigrams_count)
        for ref in (o, s)
    ),
)

1.22918344331576 1.22918344331576 1.0986122886681098 1.0986122886681098 0.2560852512064189 0.2560852512064189


In [66]:
# Score the cluster made of rows 1-2 of csr_b. Printed in order: weighted
# density, FRAUDAR score, edge-percentage score (vs outer, vs shell) and
# modularity score (vs outer, vs shell).
s, o, c = get_all_subgraphs(csr_b, [1, 2])
ads_count, bigrams_count = get_counts(c)
total_edges_weighted = csr_b.sum()

print(
    calculate_weighted_density(c, ads_count, bigrams_count),
    calculate_weighted_fraudar_score(c, ads_count, bigrams_count),
    *(calculate_weighted_edge_per_score(c, ref, bigrams_count) for ref in (o, s)),
    *(
        calculate_weighted_modularity_score(c, ref, total_edges_weighted, ads_count, bigrams_count)
        for ref in (o, s)
    ),
)

0.0 0.0 0.0 0.0 0.0 0.0


### Support case 2

In [67]:
# Graph A (4 x 5): row 0 only uses column 0; rows 1-3 all use columns 1-4.
a = [
    [1, 0, 0, 0, 0],
    [0, 1, 1, 1, 1],
    [0, 1, 1, 1, 1],
    [0, 1, 1, 1, 1],
]
csr_a = get_tf_idf(csr_matrix(a, dtype=float))

In [68]:
# Graph B (4 x 5): row 0 only uses column 0; rows 1-3 all use columns 1-2;
# columns 3 and 4 are unused.
b = [
    [1, 0, 0, 0, 0],
    [0, 1, 1, 0, 0],
    [0, 1, 1, 0, 0],
    [0, 1, 1, 0, 0],
]
csr_b = get_tf_idf(csr_matrix(b, dtype=float))

In [71]:
# Score the cluster made of rows 1-3 of csr_a. Printed in order: weighted
# density, FRAUDAR score, edge-percentage score (vs outer, vs shell) and
# modularity score (vs outer, vs shell).
s, o, c = get_all_subgraphs(csr_a, [1, 2, 3])
ads_count, bigrams_count = get_counts(c)
total_edges_weighted = csr_a.sum()

print(
    calculate_weighted_density(c, ads_count, bigrams_count),
    calculate_weighted_fraudar_score(c, ads_count, bigrams_count),
    *(calculate_weighted_edge_per_score(c, ref, bigrams_count) for ref in (o, s)),
    *(
        calculate_weighted_modularity_score(c, ref, total_edges_weighted, ads_count, bigrams_count)
        for ref in (o, s)
    ),
)

1.2944856721001772 2.103539217162788 1.6094379124341003 1.6094379124341003 0.1212658582944659 0.1212658582944659


In [72]:
# Score the cluster made of rows 1-2 of csr_b. Printed in order: weighted
# density, FRAUDAR score, edge-percentage score (vs outer, vs shell) and
# modularity score (vs outer, vs shell).
# NOTE(review): in this 4 x 5 csr_b rows 1-3 are identical, and the matching
# csr_a cell passes [1, 2, 3]; only [1, 2] is passed here, so row 3 is treated
# as a non-cluster ad. Confirm this is intentional before comparing outputs.
s, o, c = get_all_subgraphs(csr_b, [1, 2])
ads_count, bigrams_count = get_counts(c)
total_edges_weighted = csr_b.sum()

print(
    calculate_weighted_density(c, ads_count, bigrams_count),
    calculate_weighted_fraudar_score(c, ads_count, bigrams_count),
    *(calculate_weighted_edge_per_score(c, ref, bigrams_count) for ref in (o, s)),
    *(
        calculate_weighted_modularity_score(c, ref, total_edges_weighted, ads_count, bigrams_count)
        for ref in (o, s)
    ),
)

1.1218875824868202 1.1218875824868202 1.0986122886681098 0.7713074907396755 0.27202293609527134 0.10650429805609705


### 