# Minimum k-cut Algorithm

### High-level Algorithm Specification:

1. Create vertex list and an edges list, e.g.:

    ```javascript
    vertices = {1: [2,4,5], 2: [3,4,5], 3: [2,4], 4: [1,2,3], 5: [1,2]}
    edges = [[1,2], [1,4], [1,5], [2,3], [2,4], [2,5], [3,4]]
    ```

2. Keep track of the minimum cut so far:

    ```javascript
    // really this could be the max degree of all vertices, I believe
    min_edges_so_far = len(edges)
    min_vertex_sets = {1:[], 2:[], 3:[], 4:[], 5:[]}
    ```

3. *Iterate at least `n^2 log n` times (where n is the original number of vertices)*
    
    **Intiate:**
    
    ```javascript
    temp_vertex_sets = copy(min_vertex_sets)
    temp_vertices = copy(vertices)
    temp_edges = copy(edges)
    ```

    **While num_vertices > k:**

    1. Pick an edge at random: the first vertex (`v1`) will absorb the second (`v2`). Add `v2` and `temp_vertex_sets[v2]` to `temp_vertex_sets[v1]` and delete `temp_vertex_sets[v2]`.
    2. All vertices adjacent to `v2` are added to `temp_vertices[v1]` unless already present. Remove `v2` from `temp_vertices[v1]`.
    3. Replace all instances of `v2` in `temp_edges` with `v1`, unless the other vertex of the edge is itself `v1`. In the latter case, delete the edge (e.g. remove self-loops). **Note:** Parallel edges are allowed; there may be multiple instances of an edge comprised the same vertex pair.

    **Finally:** The number of final edges is the number of edges across the final cut in this iteration. If it is less than min_edges_so_far, update `min_edges_so_far = len(temp_edges)` and `min_vertex_sets = temp_vertex_sets`.
        

## Setup: Select all measurements and document ids from the database

In [16]:
import psycopg2
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import numpy as np
import json
from copy import deepcopy
import random

database = 'fomc'
conn = psycopg2.connect("dbname=" + database + " user=abarciauskas")
cur = conn.cursor()

year = 2007
cosine_thresh = 0.25
cur.execute("SELECT Doc1Id,Doc2Id,CosineSimilarity FROM alignments WHERE Year = '" + str(year) + "'"
           " AND CosineSimilarity >= " + str(cosine_thresh) + " ORDER BY random() LIMIT 2577")
cosine_sims = cur.fetchall()
len(cosine_sims)

2577

## Step 1: Create the graph

The graph is comprised a list of edges (a vertex tuple) and a dictionary of vertices.

In [17]:
def create_graph(alignments):
    edges = [tuple([x[0],x[1]]) for x in alignments]
    vertices = {}
    for edge in edges:
        v1 = edge[0]
        v2 = edge[1]
        if v1 in vertices.keys():
            vertices[v1].add(v2)
        else:
            vertices[v1] = {v2}
        if v2 in vertices.keys():
            vertices[v2].add(v1)
        else:
            vertices[v2] = {v1}
    return [edges, vertices]

edges, vertices = create_graph(cosine_sims)
print len(vertices)
print len(edges)

1195
2577


In [18]:
# need to find disconnected graphs
graphs = []
unvisited = set(vertices.keys())

#Detect distinct graphs
while len(unvisited) > 0:
    # for every vertex, find all of its connected components and recurse on those vertices
    visited = []
    current_vertex = unvisited.pop()
    visited.append(current_vertex)
    stack_to_visit = list(vertices[current_vertex])
    while len(stack_to_visit) > 0:
        current_vertex = stack_to_visit.pop()
        current_adj_vtcs = vertices[current_vertex]
        if current_vertex not in visited: visited.append(current_vertex)
        if current_vertex in unvisited: unvisited.remove(current_vertex)        
        for v in current_adj_vtcs:
            if v not in visited:
                stack_to_visit.insert(0, v)
    graphs.append(visited)

# print len(unvisited)
# print len(visited)
# print len(graphs)
# print ''

graph_lengths = [len(graph) for graph in graphs]
fc_graph = graphs[graph_lengths.index(max(graph_lengths))]
print 'Number vertices fully connected graph: ' + str(len(fc_graph))

Number vertices fully connected graph: 1140


In [19]:
# Remove loner graphs
# fc = fully connected
set_fc_graph_vertices = set(fc_graph)
loners = set_fc_graph_vertices ^ set(vertices.keys())

fc_vertices = deepcopy(vertices)
fc_edges = deepcopy(edges)

print(len(fc_vertices))
for loner in loners:
    fc_vertices.pop(loner, None)
print(len(fc_vertices))

print(len(fc_edges))
fc_edges = filter(lambda x: not list(x)[0] in loners and not list(x)[1] in loners, fc_edges)    
print(len(fc_edges))

1195
1140
2577
2546


## Step 2: Keep track of minimum so far

## Step 3: Random iterations

In [20]:
import random
def run(niters):
    min_fc_edges_so_far = len(fc_edges)
    min_vertex_sets = {key: set() for key in fc_vertices.keys()}
    for iteridx in range(niters):
        if iteridx % 50 == 0: print 'Running iter: ' + str(iteridx)        
        temp_vertex_sets = {key: set() for key in fc_vertices.keys()}
        temp_fc_vertices = deepcopy(fc_vertices)
        temp_fc_edges = fc_edges[:]        
        while len(temp_fc_vertices) > k:
            # pick an edge at random and delete it
            rand_idx = int(random.random()*len(temp_fc_edges))
            random_edge = temp_fc_edges.pop(rand_idx)
            # Add v2 and temp_vertex_sets[v2] to temp_vertex_sets[v1] and delete temp_vertex_sets[v2]
            v1 = list(random_edge)[0]
            v2 = list(random_edge)[1]
            temp_vertex_sets[v1] = temp_vertex_sets[v1].union(temp_vertex_sets[v2])
            temp_vertex_sets[v1].add(v2)
            temp_vertex_sets.pop(v2, None)

            # All fc_vertices adjacent to v2 are added to temp_fc_vertices[v1] unless already present.
            # Remove v2 from temp_fc_vertices[v1].
            adj_v2 = temp_fc_vertices[v2]
            temp_fc_vertices[v1] = temp_fc_vertices[v1].union(adj_v2)
            temp_fc_vertices[v1].remove(v2)
            temp_fc_vertices.pop(v2, None)

            # Replace all instances of v2 in temp_fc_edges with v1, unless the other vertex of the edge is itself v1.
            # In the latter case, delete the edge (e.g. remove self-loops).
            # Note: Parallel fc_edges are allowed; there may be multiple instances of an edge comprised the same vertex pair.
            remove_fc_edges = []
            for i,cur_edge in enumerate(temp_fc_edges):
                if len(cur_edge) > 1:
                    cur_edge_v1 = list(cur_edge)[0]
                    cur_edge_v2 = list(cur_edge)[1]
                    if (cur_edge == random_edge):
                        remove_fc_edges.append(i)
                    elif cur_edge_v1 == v2:
                        temp_fc_edges[i] = {v1, cur_edge_v2}
                        # remove this edge from temp_fc_vertices
                        # it may have already been removed because we keep parallel fc_edges around
                        if v2 in temp_fc_vertices[cur_edge_v2]: temp_fc_vertices[cur_edge_v2].remove(v2)
                    elif cur_edge_v2 == v2:
                        temp_fc_edges[i] = {cur_edge_v1, v1}
                        # it may have already been removed because we keep parallel fc_edges around
                        if v2 in temp_fc_vertices[cur_edge_v1]: temp_fc_vertices[cur_edge_v1].remove(v2)
            # work around for delete
            temp_fc_edges = [set(i) for j, i in enumerate(temp_fc_edges) if j not in remove_fc_edges]
            #Finally: The number of final fc_edges is the number of fc_edges across the final cut in this iteration.
            #If it is less than min_fc_edges_so_far, update min_fc_edges_so_far = len(temp_fc_edges) and min_vertex_sets = temp_vertex_sets.
            if len(temp_fc_edges) < min_fc_edges_so_far:
                min_fc_edges_so_far = len(temp_fc_edges)
                min_vertex_sets = temp_vertex_sets
    return min_fc_edges_so_far, min_vertex_sets

In [21]:
k = 100
n = len(fc_vertices)
niters = 1

import time

t0 = time.time()
min_fc_edges_so_far, min_vertex_sets = run(niters)
t1 = time.time()

total = t1-t0
print 'Total time for ' + str(niters) + ': ' + str(total)

Running iter: 0
Total time for 1: 3.62939500809


In [22]:
niters = int(np.ceil(n**2*np.log(n)))
niters = 5000
total_seconds = niters*total
minutes = total_seconds/60
hours = minutes/60
print hours

5.04082640012


In [None]:
niters = 10000
t0 = time.time()
min_edges_so_far, min_vertex_sets = run(niters)
t1 = time.time()
total = t1-t0
print 'Total time for ' + str(niters) + ': ' + str(total)

In [62]:
print 'Num crossing edges: ' + str(min_edges_so_far)
total = t1-t0
print 'Total time for ' + str(niters) + ' iterations: ' + str(total/60/60) + ' hours'
super_nodes = min_vertex_sets.keys()
super_nodes = filter(lambda x: len(min_vertex_sets[x]) > 4, super_nodes)
nclusters = len(super_nodes)
print 'Number of actual clusters: ' + str(nclusters)

Num crossing edges: 126
Total time for 10000 iterations: 4.75294949664 hours
Number of actual clusters: 2


In [55]:
from itertools import izip

# find the relative frequency for each super node
cur.execute("SELECT TermVector FROM corpii WHERE Year = '" + str(year) + "'")
terms = cur.fetchall()[0][0]

nterms = len(terms)
overall_frequencies = [0]*nterms
cluster_frequencies = [[0]*nterms]*nclusters

for cidx, supernode in enumerate(super_nodes):
    vertices_in_cluster = min_vertex_sets[supernode]
    all_nodes = list(vertices_in_cluster)
    all_nodes.append(supernode)
    for docid in all_nodes:
        cur.execute("SELECT TfIdfVector FROM processed_documents WHERE Id = " + str(docid))
        result = cur.fetchone()[0]
        tf_idf_vector = [float(x) for x in result]
        cluster_frequencies[cidx] = map(sum, izip(cluster_frequencies[cidx], tf_idf_vector))
        overall_frequencies = map(sum, izip(overall_frequencies, tf_idf_vector))


In [56]:
cluster_freqs_normalized = []
for cidx in range(nclusters):
    curr_cluster = cluster_frequencies[cidx]
    cluster_freqs_normalized.append(
        [curr_cluster[i]/overall_frequencies[i] if overall_frequencies[i] > 0 else 0 for i in range(nterms)]
    )

num_terms = 20
curr_cluster = cluster_freqs_normalized[3]
# workaround for reversing the array
sorted_frequency_idcs = np.argsort(curr_cluster)[::-1]
print [curr_cluster[sorted_frequency_idcs[i]] for i in range(num_terms)]
print ''
print [terms[sorted_frequency_idcs[i]] for i in range(num_terms)]

IndexError: list index out of range

In [34]:
clusters = [[supernode] + list(min_vertex_sets[supernode]) for supernode in super_nodes]

for cluster in clusters:
    degrees = [len(vertices[vertex]) for vertex in cluster]
    centroid_id = cluster[degrees.index(max(degrees))]
    cur.execute("SELECT Original FROM processed_documents WHERE Id = " + str(centroid_id))
    centroid = cur.fetchall()
    print centroid
    print ''

[928, 1024, 1, 4, 5, 6, 8, 24, 25, 26, 28, 33, 36, 37, 38, 49, 50, 51, 52, 58, 60, 61, 62, 66, 67, 72, 75, 76, 78, 79, 80, 81, 85, 88, 89, 90, 91, 92, 93, 94, 95, 97, 98, 99, 100, 101, 107, 108, 109, 117, 120, 125, 127, 137, 146, 147, 148, 152, 153, 154, 155, 157, 164, 165, 166, 167, 168, 169, 170, 171, 172, 176, 177, 178, 183, 187, 189, 190, 191, 196, 202, 203, 208, 209, 213, 214, 215, 216, 218, 220, 222, 223, 224, 225, 226, 229, 230, 231, 233, 236, 237, 238, 244, 250, 251, 252, 257, 258, 264, 267, 271, 277, 284, 292, 295, 296, 297, 300, 307, 308, 309, 310, 314, 321, 323, 324, 326, 327, 332, 333, 334, 336, 340, 341, 342, 343, 344, 345, 347, 349, 352, 354, 355, 358, 359, 360, 361, 362, 363, 371, 377, 381, 383, 387, 389, 404, 415, 418, 419, 421, 422, 429, 430, 431, 432, 434, 436, 437, 438, 445, 446, 447, 450, 454, 455, 456, 458, 460, 461, 462, 463, 465, 466, 469, 471, 472, 475, 479, 480, 481, 482, 483, 484, 487, 488, 489, 495, 497, 500, 503, 507, 510, 512, 514, 515, 525, 527, 535, 536, 