In [185]:
K = 10

In [1]:
import findspark
findspark.init()
import pyspark
sc = pyspark.SparkContext(appName='spectral')

In [39]:
spark = pyspark.sql.SparkSession(sc)

In [139]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix, MatrixEntry, CoordinateMatrix

In [190]:
from pyspark.ml.clustering import KMeans

In [57]:
# spark.conf.set('spark.sql.pivotMaxValues', 335000)

## Get adjacency matrix

In [142]:
txt = sc.textFile('./data/com-amazon.ungraph.txt')
txt.take(5)

['# Undirected graph: ../../data/output/amazon.ungraph.txt',
 '# Amazon',
 '# Nodes: 334863 Edges: 925872',
 '# FromNodeId\tToNodeId',
 '1\t88160']

In [143]:
txt = txt.sample(False, 0.001, 1)

In [144]:
txt = txt.zipWithIndex().filter(lambda x: int(x[1]) >= 4).map(lambda x: x[0].split('\t'))

In [None]:
txt.take(10)

In [145]:
entries = txt.map(lambda x: MatrixEntry(int(x[0]), int(x[1]), 1.0))
type(entries) # rdd

pyspark.rdd.PipelinedRDD

In [146]:
N = txt.flatMap(lambda x: [int(xx) for xx in x]).max()
N

548091

In [150]:
W = CoordinateMatrix(entries, numCols=N, numRows=N)
print(W.numCols())
print(W.numRows())
print(type(W))

548091
548091
<class 'pyspark.mllib.linalg.distributed.CoordinateMatrix'>


In [151]:
# def toSparseRow(N):
#     return lambda val: Vectors.sparse(N, [(int(ii), 1) for ii in val])

# txt.flatMap(lambda x: [int(xx) for xx in x]).max()
# N = txt.flatMap(lambda x: [int(xx) for xx in x]).max()
# rows = txt.map(lambda x: tuple(x)).groupByKey().mapValues(toSparse(N))
# W = IndexedRowMatrix(rows)
# W.numCols()
# W.numRows()

## Graph Laplacian

In [179]:
degrees = W.entries.map(lambda entry: (entry.i, entry.value)).reduceByKey(lambda a, b: a + b)
entries = degrees.map(lambda x: MatrixEntry(x[0], x[0], x[1]))
D = CoordinateMatrix(entries, numCols=N, numRows=N)

pyspark.mllib.linalg.distributed.CoordinateMatrix

- Ordinay: $$L = D - W$$
- Norlaized: $$L = I - D^{-1}W$$
- Symmetric: $$L = I - D^{-1/2}WD^{-1/2}$$

In [186]:
L = D.toBlockMatrix().subtract(W.toBlockMatrix()).toCoordinateMatrix()
type(L)

pyspark.mllib.linalg.distributed.CoordinateMatrix

## First k eigenvalues and eigen vectors

In [187]:
L = L.toRowMatrix()

In [189]:
## The PCA method is not that scalable and has a fixed limit of columns (65535)
V = L.computePrincipalComponents(k=K)

## K-means on rows of transformed data

In [None]:
kmeans = KMeans().setK(K).setSeed(1)
model = kmeans.fit(V)
clusters = model.transform(V)

In [None]:
sc.stop()