In [None]:
# Clone the spectral-clustering repository; later cells unpack data into its
# code/graphs_processed folder and change into its code/ directory.
!git clone https://github.com/joonalillfors/spectral-clustering.git

Cloning into 'spectral-clustering'...
remote: Enumerating objects: 56, done.[K
remote: Counting objects: 100% (56/56), done.[K
remote: Compressing objects: 100% (44/44), done.[K
remote: Total 56 (delta 17), reused 44 (delta 10), pack-reused 0[K
Receiving objects: 100% (56/56), 15.18 MiB | 8.39 MiB/s, done.
Resolving deltas: 100% (17/17), done.


In [None]:
# Mount Google Drive at /content/drive/ so the data archive stored there can
# be read by the next cell.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import zipfile
# Unpack the project archive from Drive into the cloned repository's
# graphs_processed folder.
with zipfile.ZipFile('/content/drive/MyDrive/da324dataminingproject2.zip', mode='r') as archive:
    archive.extractall(path='/content/spectral-clustering/code/graphs_processed')

In [None]:
import numpy as np
import pandas as pd

# Read the adjacency matrix CSV that was extracted into the repository's
# graphs_processed folder and keep it as a plain NumPy array.
# NOTE(review): read_csv treats the first line as a header by default —
# confirm the CSV really has a header row.
ADJACENCY_CSV = '/content/spectral-clustering/code/graphs_processed/adjacency.csv'
adj_matrix = np.array(pd.read_csv(ADJACENCY_CSV))

# E = []

# for i in range(adj_matrix.shape[0]):
#     for j in range(adj_matrix.shape[1]):
#       if adj_matrix[i, j] == 1:
#         E.append((i, j))

# with open('/content/spectral-clustering/code/graphs_processed/dm.txt', 'w') as f:
#   for i in range(adj_matrix.shape[0]):
#     for j in range(adj_matrix.shape[1]):
#       if adj_matrix[i, j] == 1:
#         f.write(f'{i} {j}\n')


In [None]:
import sys
import numpy as np
from scipy.sparse.linalg import eigsh
from sklearn.cluster import KMeans


def objective(clustering, edges, k):
    """Score a clustering by summing (cut edges / cluster size) over clusters.

    An edge whose endpoints lie in different clusters counts once for each
    of the two clusters it touches. The cluster sizes are printed as a side
    effect.

    Args:
        clustering: sequence mapping vertex index -> cluster label in
            ``range(k)``.
        edges: iterable of ``(u, v)`` vertex-index pairs.
        k: number of clusters.

    Returns:
        The summed ratio. Empty clusters contribute 0 instead of the
        ``0 / 0 = nan`` they would otherwise produce.
    """
    clusters = np.zeros(k, dtype=int)
    for label in clustering:
        clusters[label] += 1

    outgoing = np.zeros(k, dtype=int)
    for u, v in edges:
        cu = clustering[u]
        cv = clustering[v]
        if cu != cv:
            outgoing[cu] += 1
            outgoing[cv] += 1

    total = 0.0
    for size, crossing in zip(clusters, outgoing):
        # An empty cluster has no incident edges; skip it so that 0 / 0
        # does not turn the whole score into nan.
        if size > 0:
            total += crossing / size
    print(clusters)
    return total


# Spectral clustering with Fiedler vector
def fiedler(A, D, k):
    """Cluster vertices with k-means on the Fiedler vector.

    Builds the unnormalized Laplacian ``D - A``, takes the eigenvector that
    belongs to its second-smallest eigenvalue and runs k-means on that
    single coordinate per vertex.

    Args:
        A: square adjacency matrix (assumed symmetric — TODO confirm for the
            input data; ``eigh`` only reads one triangle of the matrix).
        D: diagonal degree matrix with the same shape as ``A``.
        k: number of clusters.

    Returns:
        1-D array with one cluster label per vertex.
    """
    UL = D - A
    # eigh is the solver for symmetric matrices: it returns real eigenvalues
    # in ascending order, so no separate argsort is needed and no complex
    # round-off can leak into k-means.
    eigenvalues, eigenvectors = np.linalg.eigh(UL)
    # Eigenvectors are the COLUMNS of the returned matrix; column 1 belongs
    # to the second-smallest eigenvalue.
    x2 = eigenvectors[:, 1]
    res = KMeans(n_clusters=k).fit_predict(x2.reshape(-1, 1))
    return res


# Basic normalized spectral clustering
def spectral(A, D, k):
    """Cluster vertices with k-means on a normalized spectral embedding.

    Forms ``L = I - D A D``, embeds every vertex with the ``k`` eigenvectors
    of ``L`` that have the smallest eigenvalues, scales each embedded row to
    unit length and runs k-means on the rows.

    Args:
        A: square adjacency matrix.
        D: diagonal matrix applied to ``A`` from both sides; the caller in
            this notebook passes ``diag(1 / sqrt(degree))``.
        k: number of clusters (also the number of eigenvectors used).

    Returns:
        1-D array with one cluster label per vertex.
    """
    L = np.identity(A.shape[0]) - D @ A @ D
    # "SA" selects the smallest algebraic eigenvalues. eigsh defaults to the
    # largest-magnitude ones, which is the wrong end of the spectrum for a
    # Laplacian-based embedding.
    _, U = eigsh(L, k, which="SA")

    # Scale each row to unit Euclidean length. Dividing by the signed row
    # sum instead can flip signs or divide by zero.
    row_norms = np.linalg.norm(U, axis=1, keepdims=True)
    row_norms[row_norms == 0] = 1.0  # leave all-zero rows untouched
    U = U / row_norms

    res = KMeans(n_clusters=k).fit_predict(U)
    return res

# Spectral clustering without the first eigenvector
def ogSpectral(A, D, k):
    """Cluster vertices with k-means on eigenvectors of ``D - A``.

    Requests the ``k + 1`` smallest-algebraic eigenpairs of ``D - A`` from
    ``eigsh``, discards the first returned eigenvector and runs k-means on
    the remaining ``k`` columns.

    Args:
        A: square adjacency matrix.
        D: matrix of the same shape from which ``A`` is subtracted; the
            caller in this notebook passes the diagonal degree matrix.
        k: number of clusters.

    Returns:
        1-D array with one cluster label per vertex.
    """
    laplacian = D - A
    _, vectors = eigsh(laplacian, k + 1, which="SA")
    # Keep every column except column 0.
    embedding = vectors[:, 1:]
    return KMeans(n_clusters=k).fit_predict(embedding)

def writeRes(alg, name, nofV, nofE, k, clustering):
    """Write a clustering to ``../results/{alg}/{name}``.

    The file starts with a ``# name nofV nofE k`` header line, followed by
    one ``vertex cluster`` line per vertex. Vertex numbers start at 1 and
    every label is written as ``label + 1``.

    Failures are reported on stdout rather than raised, so one failed write
    does not stop the rest of the notebook cell.

    Args:
        alg: name of the sub-directory under ``../results``.
        name: output file name.
        nofV: vertex count written to the header.
        nofE: edge count written to the header.
        k: cluster count written to the header.
        clustering: iterable of numeric cluster labels, one per vertex.
    """
    import os

    path = f"../results/{alg}/{name}"
    try:
        # Create the per-algorithm directory on first use; without it every
        # write fails with "No such file or directory".
        os.makedirs(os.path.dirname(path), exist_ok=True)
        # `with` closes the file even when a write fails part-way through.
        with open(path, "w") as f:
            f.write(f"# {name} {nofV} {nofE} {k}\n")
            for v, c in enumerate(clustering, start=1):
                f.write(f"{v} {c+1}\n")
    except Exception as e:
        print(e)
        print("write failed")


In [None]:
# Graph metadata.
# An earlier version of this cell parsed graphs_processed/<filename>, a text
# file with a "# name nofVertices nofEdges k" header line followed by one
# "u v" edge per line. The graph now comes from `adj_matrix`, so the values
# that parser provided are derived here instead.
filename = "dm.txt"
k = 10
nofVertices = adj_matrix.shape[0]
# assumes adj_matrix is a symmetric 0/1 matrix, so every undirected edge
# shows up twice — TODO confirm against the CSV
nofEdges = int(np.count_nonzero(adj_matrix)) // 2

# Calculate degrees
degree = adj_matrix.sum(axis=1).astype(float)
# D holds degree^(-1/2) on its diagonal. Isolated vertices (degree 0) get a
# scale of 0 instead of inf so that D @ A @ D stays finite.
inv_sqrt_degree = np.zeros_like(degree)
np.divide(1.0, np.sqrt(degree), out=inv_sqrt_degree, where=degree > 0)
D = np.diag(inv_sqrt_degree)
DD = np.diag(degree)

# Compute clusterings
# fiedler and ogSpectral both compute "degree matrix minus A", so they need
# the plain degree matrix DD; only `spectral` takes the scaling matrix D.
kmeans_fiedler = fiedler(adj_matrix, DD, k)
spec = spectral(adj_matrix, D, k)
og_spec = ogSpectral(adj_matrix, DD, k)

# Calculate scores
# Disabled: objective() needs the edge list E, which this notebook no longer
# builds.
# kmeans_fiedler_score = objective(kmeans_fiedler, E, k)
# spec_score = objective(spec, E, k)
# og_spec_score = objective(og_spec, E, k)

# print("kmeans fiedler:", kmeans_fiedler_score)
# print("spec:", spec_score)
# print("og spec:", og_spec_score)

writeRes("fiedler", filename, nofVertices, nofEdges, k, kmeans_fiedler)
writeRes("normalized-spectral", filename, nofVertices, nofEdges, k, spec)
writeRes("without-first-eigvec", filename, nofVertices, nofEdges, k, og_spec)



NameError: name 'filename' is not defined

In [None]:
# Show the distinct cluster labels returned by `spectral`.
np.unique(spec)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)

In [None]:
# Build the submission table from the ogSpectral labels: the row position
# becomes the ID column and the label column is renamed to LABEL.
sub = (
    pd.DataFrame(og_spec)
    .reset_index()
    .rename(columns={'index': 'ID', 0: 'LABEL'})
)
sub

Unnamed: 0,ID,LABEL
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
11947,11947,0
11948,11948,0
11949,11949,0
11950,11950,0


In [None]:
# Save the ID/LABEL table to submission.csv in the current working
# directory, without the DataFrame index column.
sub.to_csv("submission.csv", index = False)

In [None]:
# Change into the cloned repository's code/ directory so partition.py can be
# run from there (the path is relative to the current working directory).
%cd spectral-clustering/code

/content/spectral-clustering/code
no such file dm


In [None]:
# Run the repository's partition.py script on dm.txt.
# NOTE(review): the recorded output reports graphs_processed/dm.txt as
# missing; the only code in this notebook that writes dm.txt is commented
# out in the adjacency-loading cell.
!python partition.py dm.txt

graphs_processed/dm.txt
no such file dm.txt


In [None]:
# Go back up two directory levels, undoing the earlier
# `%cd spectral-clustering/code`.
%cd ../..