## HW 2 - Algorithms and Applications in Social Networks

Names & IDs:

Yonatan Voikhansky, 315398339

Ariel Ireni, 313914970

In [25]:
import networkx as nx
import numpy as np
from networkx.algorithms import community
import requests
from networkx.algorithms.community import k_clique_communities

## Question 1
### Part A

In [33]:
def Newman_Girvan(G, k):
    """Partition ``G`` into ``k`` communities with the Girvan-Newman algorithm.

    Works on a copy of ``G`` and repeatedly removes the edge with the highest
    edge betweenness until the copy has exactly ``k`` connected components.

    Parameters
    ----------
    G : networkx.Graph
        Input graph. It is copied, so the caller's graph is not modified.
    k : int
        Desired number of communities.

    Returns
    -------
    list of set or None
        The node sets of the ``k`` connected components, or ``None`` when no
        stage of the removal process has exactly ``k`` components (``k`` is
        smaller than the initial number of components, or larger than the
        number of nodes).
    """
    graph = G.copy()

    while True:
        n_components = nx.number_connected_components(graph)
        if n_components == k:
            return list(nx.connected_components(graph))
        # Removing edges can only increase the component count, and a graph
        # without edges cannot be split any further.
        if n_components > k or graph.number_of_edges() == 0:
            return None

        # Betweenness must be recomputed after every removal: deleting an
        # edge reroutes the shortest paths that used to cross it.
        betweenness = nx.edge_betweenness_centrality(graph)
        busiest_edge = max(betweenness, key=betweenness.get)
        graph.remove_edge(*busiest_edge)

def test_q1a():
    """Check Newman_Girvan against NetworkX's girvan_newman on the karate club graph."""
    G = nx.karate_club_graph()
    our_result = Newman_Girvan(G, 2)
    # girvan_newman is a generator whose first item is the two-community
    # split, so the rest of the hierarchy does not need to be computed.
    their_result = next(community.girvan_newman(G))
    # Compare as sets of frozensets so the order of communities is irrelevant.
    assert {frozenset(c) for c in our_result} == {frozenset(c) for c in their_result}

test_q1a()

### Part B

In [27]:
def get_biggest_connected_component(
    url="http://slavanov.com/teaching/sn1718b/data/communities.txt",
    timeout=30,
):
    """Download an edge list and return the largest connected component.

    The resource is expected to be plain text with one edge per line, written
    as two whitespace-separated node labels. Labels are kept as strings.

    Parameters
    ----------
    url : str, optional
        Location of the edge-list file. Defaults to the course data set.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Returns
    -------
    networkx.Graph
        Subgraph view of the full graph induced by the nodes of its largest
        connected component. Call ``.copy()`` on it for an independent graph.

    Raises
    ------
    requests.RequestException
        If the download fails, times out, or returns an HTTP error status.
    ValueError
        If a non-blank line does not hold exactly two labels, or if the file
        contains no edges at all.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
    }

    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an HTML error page as edges.
    response.raise_for_status()

    edges = []
    # splitlines() copes with both LF and CRLF endings, and skipping blank
    # lines means a missing or extra trailing newline never drops a real edge.
    for line_number, line in enumerate(response.text.splitlines(), start=1):
        fields = line.split()
        if not fields:
            continue
        if len(fields) != 2:
            raise ValueError(
                f"line {line_number}: expected two node labels, got {line!r}"
            )
        edges.append((fields[0], fields[1]))

    if not edges:
        raise ValueError(f"no edges found in {url}")

    G = nx.from_edgelist(edges)
    largest_cc = max(nx.connected_components(G), key=len)
    G_comp = nx.subgraph(G, list(largest_cc))
    return G_comp

def Newman_Girvan_largest_component():
    """Run Newman_Girvan with k=3 on the largest connected component.

    Returns the resulting communities as a list of node lists.
    """
    largest_component = get_biggest_connected_component()
    partition = Newman_Girvan(largest_component, 3)
    return list(map(list, partition))

# Compute the communities once, then print them numbered from 1.
lst = Newman_Girvan_largest_component()
for number, members in enumerate(lst, start=1):
    print(f"Community {number}: \n{members}\n")

Community 1: 
['288', '85', '246', '204', '291', '64', '107', '21', '303', '249', '24', '301', '105', '340', '228', '234', '92', '106', '271', '284', '176', '222', '156', '30', '238', '57', '164', '194', '250', '34', '122', '320', '50', '277', '191', '187', '300', '274', '217', '51', '232', '94', '166', '54', '180', '185', '231', '136', '224', '123', '266', '38', '269', '190', '3', '148', '126', '183', '184', '329', '1', '5', '314', '84', '121', '280', '178', '16', '87', '25', '206', '103', '196', '142', '242', '31', '83', '251', '189', '169', '208', '247', '117', '128', '62', '173', '252', '45', '309', '239', '304', '324', '272', '257', '302', '108', '261', '135', '69', '211', '53', '66', '76', '313', '318', '120', '9', '283', '82', '168', '134', '132', '268', '56', '344', '153', '73', '163', '270', '79', '276', '338', '330', '315', '342', '258', '198', '10', '317', '165', '345', '80', '172', '186', '295', '316', '26', '39', '308', '88', '223', '299', '113', '101', '100', '129', '260'

### Part C - in the PDF file

## Question 2
### Part A

In [28]:
def get_k_clique_communities(G, k):
    """Find the k-clique communities of ``G`` by clique percolation.

    Two maximal cliques with at least ``k`` nodes each are linked when they
    share at least ``k - 1`` nodes. Every connected group of linked cliques
    forms one community, made of all nodes of the cliques in that group.

    Parameters
    ----------
    G : networkx.Graph
        Graph to analyse.
    k : int
        Clique size used for the percolation.

    Returns
    -------
    list of set
        One node set per community, ordered by the first maximal clique (in
        ``nx.find_cliques`` order) that belongs to it. Communities may
        overlap. The list is empty when no maximal clique has ``k`` nodes.
    """
    # Sets make each overlap test a single intersection.
    maximal_cliques = [frozenset(clique) for clique in nx.find_cliques(G)]

    # Only cliques with at least k nodes can take part in a community.
    # Indices are kept in discovery order so the output order is stable.
    eligible = [
        index for index, clique in enumerate(maximal_cliques) if len(clique) >= k
    ]

    # Clique graph: one node per eligible clique, one edge per pair of
    # cliques that overlap in at least k - 1 nodes.
    clique_graph = nx.Graph()
    clique_graph.add_nodes_from(eligible)
    for position, first in enumerate(eligible):
        for second in eligible[position + 1:]:
            shared = maximal_cliques[first] & maximal_cliques[second]
            if len(shared) >= k - 1:
                clique_graph.add_edge(first, second)

    # Communities are the connected components of the clique graph, mapped
    # back from clique indices to the union of the cliques' nodes.
    communities = []
    for component in nx.connected_components(clique_graph):
        members = set()
        for index in component:
            members |= maximal_cliques[index]
        communities.append(members)

    return communities

def test_2a():
    """Check get_k_clique_communities on a small hand-built graph with k=4."""
    G = nx.from_edgelist([(1,2), (1,3), (1,4), (2,4), (3,4), (2,3), (2,5), (2,6), (5,6), (4,6), (6,7), (4,7), (6,8), (7,8), (4,8), (3,9), (9,10), (10,4), (10,8), (10,6), (9,6), (3,10), (4,9), (8,9)])
    expected = [{1, 2, 3, 4}, {3, 4, 6, 7, 8, 9, 10}]
    found = get_k_clique_communities(G, 4)
    assert len(found) == len(expected)
    # Compare as sets of frozensets: the order in which cliques (and hence
    # communities) are enumerated is an implementation detail.
    assert {frozenset(c) for c in found} == {frozenset(c) for c in expected}

test_2a()

### Part B

In [29]:
def k_clique_largest_component():
    """Return the 4-clique communities of the largest connected component."""
    return get_k_clique_communities(get_biggest_connected_component(), 4)

def test_2b():
    """Check our 4-clique communities against NetworkX's k_clique_communities."""
    our_communities = k_clique_largest_component()
    G_comp = get_biggest_connected_component()
    their_communities = list(k_clique_communities(G_comp, 4))
    assert len(our_communities) == len(their_communities)
    # Neither implementation promises a particular community order, so
    # compare the two collections as sets of frozensets.
    ours = {frozenset(c) for c in our_communities}
    theirs = {frozenset(c) for c in their_communities}
    assert ours == theirs


test_2b()

### Part C - in the PDF file

## Questions 3, 4, 5, 6 - in the PDF file