In [None]:
from itertools import permutations
from pyspark.sql import SparkSession
from scipy.sparse import coo_matrix, save_npz
from datetime import datetime

# Obtain the Spark session for this notebook (an already-running session is
# reused by getOrCreate) and note the wall-clock start for the timing report
# printed in the last cell.
spark = (
    SparkSession.builder
    .appName("PySpark Intro")
    .getOrCreate()
)

start_time = datetime.now()

In [None]:
# Load the tweet JSON from the bucket into a DataFrame. `samplingRatio` is the
# JSON reader option controlling what fraction of the input is scanned to
# infer the schema (1% here); it is not expected to subsample the rows loaded.
tweets = (
    spark.read
    .options(samplingRatio=0.01)
    .json('gs://bgse-datawarehousing-random-tweets')
)

In [None]:
# Build an RDD with one list per tweet, keeping only tweets that carry at
# least two hashtags:
#   1. take the `entities` field of every tweet and drop the missing ones,
#   2. take the `hashtags` collection and keep those with more than one entry,
#   3. keep the value at position 1 of each hashtag record
#      (presumably the hashtag text - confirm against the inferred schema).
# The result is cached because it feeds the pair-counting job.
list_hashtags = (
    tweets.rdd
    .map(lambda tweet: tweet.entities)
    .filter(lambda entities: entities is not None)
    .map(lambda entities: entities.hashtags)
    .filter(lambda hashtags: len(hashtags) > 1)
    .map(lambda hashtags: [hashtag[1] for hashtag in hashtags])
    .cache()
)

In [None]:
# Count how often every ordered pair of hashtags appears in the same tweet.
# `permutations(tags, 2)` yields both (a, b) and (b, a), so the counts are
# symmetric. `permutations` comes from the import cell at the top of the
# notebook; it is not re-imported here.
#
# flatMap consumes the permutations iterator directly, so no per-tweet list
# of pairs is materialised first.
# The result is collected to the driver as a list of ((tag_a, tag_b), count).
count_hashtags = (
    list_hashtags
    .flatMap(lambda tags: permutations(tags, 2))
    .map(lambda pair: (pair, 1))
    .reduceByKey(lambda left, right: left + right)
    .collect()
)

In [None]:
# Create a dictionary with an index number for each unique hashtag.
#
# Comprehensions are used instead of map()/reduce() so that:
#   * `list_reduced` and `list_pairs` are real lists that can be read more than
#     once (on Python 3 `map` returns a one-shot iterator),
#   * no `reduce` builtin is needed (it does not exist on Python 3),
#   * collecting the unique hashtags is linear rather than repeated list
#     concatenation, and an empty `count_hashtags` yields an empty mapping
#     instead of raising.
#
# list_reduced : [hashtag_a, hashtag_b, count] for every counted pair
# list_pairs   : [hashtag_a, hashtag_b] for every counted pair
list_reduced = [[entry[0][0], entry[0][1], entry[1]] for entry in count_hashtags]
list_pairs = [[row[0], row[1]] for row in list_reduced]
unique_hashtags = {hashtag for pair in list_pairs for hashtag in pair}

# Indices start at 0 and follow the iteration order of the set, so which
# hashtag receives which index is arbitrary.
dict_hashtags = {hashtag: index for index, hashtag in enumerate(unique_hashtags)}

In [None]:
# Create the inputs for the COO matrix constructor.
# row_index    - matrix row index of the first hashtag of every counted pair
# column_index - matrix column index of the second hashtag of every counted pair
# count        - number of times each pair was counted
#
# Everything is built as a concrete list, read straight from the collected
# `count_hashtags` entries of the form ((hashtag_a, hashtag_b), count). Lists
# (unlike lazy `map` objects on Python 3) can be handed to scipy and re-read
# when the cell is re-run.
list_1 = [entry[0][0] for entry in count_hashtags]
list_2 = [entry[0][1] for entry in count_hashtags]

# Strict lookups: a hashtag missing from the index is a bug and should raise
# KeyError here rather than put None into the index arrays.
row_index = [dict_hashtags[hashtag] for hashtag in list_1]
column_index = [dict_hashtags[hashtag] for hashtag in list_2]
count = [entry[1] for entry in count_hashtags]


In [None]:
# Build the hashtag co-occurrence matrix in COO format.
#
# coo_matrix((data, (i, j)), [shape=(M, N)]) constructs from three arrays:
#   data[:] the entries of the matrix, in any order
#   i[:]    the row indices of the matrix entries
#   j[:]    the column indices of the matrix entries
#
# The three arrays must be passed together as ONE tuple argument; passing the
# index pair as a second positional argument makes scipy treat it as `shape`.
# list(...) materialises the inputs in case they are lazy iterables.
n_hashtags = len(dict_hashtags)  # one row and one column per unique hashtag

matrix = coo_matrix(
    (list(count), (list(row_index), list(column_index))),
    shape=(n_hashtags, n_hashtags),
)

In [None]:
# Write the sparse matrix to disk as a compressed .npz archive.
save_npz("sparse_matrix_big.npz", matrix, compressed=True)

# Report wall-clock timing for the whole notebook run.
end_time = datetime.now()
timing_report = (
    ("Process start time: ", start_time),
    ("Process end time: ", end_time),
    ("Total run time: ", end_time - start_time),
)
for label, value in timing_report:
    print(label + str(value))