In [1]:
import time
import pyspark
import numpy as np
from itertools import permutations
from scipy.sparse import coo_matrix, save_npz
from google.cloud import storage
from datetime import datetime
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Wall-clock timestamp taken before any work starts.
start_time = datetime.now()

# List every object in the GCS bucket (an empty prefix applies no filtering)
# and turn each object name into a gs:// URI.
client = storage.Client()
bucket = client.bucket('bgse-datawarehousing-random-tweets')
bucket_blobs = bucket.list_blobs(prefix='')
files_list = []
for blob in bucket_blobs:
    files_list.append('gs://bgse-datawarehousing-random-tweets/' + str(blob.name))

# Load every listed file as JSON into a single DataFrame, then count how
# often each ordered pair of (lower-cased) hashtags appears in the same tweet.
# `result` ends up on the driver as a list of (tag_a, tag_b, count) tuples.
tweets = spark.read.json(files_list)
result = (
    tweets.rdd
    # Keep only records carrying at least two hashtags. `hashtags` is checked
    # for None as well as `entities`: calling len() on a null field would
    # raise TypeError inside the executors and fail the whole job.
    .filter(lambda r: r.entities is not None
            and r.entities.hashtags is not None
            and len(r.entities.hashtags) > 1)
    # One list of lower-cased tag texts per tweet.
    .map(lambda r: [tag.text.lower() for tag in r.entities.hashtags])
    # Emit both (a, b) and (b, a) for every pair, so the counts are symmetric.
    # NOTE(review): permutations() distinguishes elements by position, so a
    # tweet repeating the same tag (after lower-casing) also emits (tag, tag)
    # self-pairs -- confirm whether that is intended.
    .flatMap(lambda tags: permutations(tags, 2))
    .map(lambda pair: (pair, 1))
    .reduceByKey(lambda a, b: a + b)
    # Flatten ((tag_a, tag_b), count) into (tag_a, tag_b, count).
    .map(lambda kv: (kv[0][0], kv[0][1], kv[1]))
    .collect()
)

# Build the tag -> matrix index mapping.
# The vocabulary is taken from BOTH positions of every pair, so the column
# lookup below cannot hit a missing key, and indices are assigned in sorted
# tag order so the matrix layout is reproducible from the same tag set
# (plain set iteration order is arbitrary).
unique_values = set(entry[0] for entry in result) | set(entry[1] for entry in result)
words_dictionary = {tag: index for index, tag in enumerate(sorted(unique_values))}

# Assemble the square co-occurrence matrix in COO form and write it to disk.
# NOTE(review): only the matrix is saved; `words_dictionary` is not persisted,
# so the tag behind each row/column index is lost once the kernel stops.
# With the sorted ordering above, saving sorted(unique_values) would be
# enough to label the matrix later.
row  = np.array([words_dictionary[entry[0]] for entry in result])
col  = np.array([words_dictionary[entry[1]] for entry in result])
data = np.array([entry[2] for entry in result])
matrix_size = len(words_dictionary)
sparse_matrix = coo_matrix((data, (row, col)), shape=(matrix_size, matrix_size))
save_npz("sparse_matrix.npz", sparse_matrix, compressed=True)

# Report the wall-clock time elapsed since `start_time` was taken.
end_time = datetime.now()
elapsed = end_time - start_time
print(elapsed)

1:35:23.822104


In [None]:
# Show the matrix summary (shape, dtype, number of stored elements, format).
sparse_matrix

<260019x260019 sparse matrix of type '<type 'numpy.int64'>'
	with 2561237 stored elements in COOrdinate format>