In [None]:
from pyspark.sql import SparkSession
import numpy as np
import itertools as it
from collections import Counter
from scipy.sparse import coo_matrix, save_npz
from datetime import datetime

# Session handle used by the cells below to read data; getOrCreate() hands back
# the already-running session when one exists instead of building a second one.
spark = (
    SparkSession.builder
    .appName("PySpark Intro")
    .getOrCreate()
)

In [None]:
# Wall-clock reference point; read again when the elapsed time is printed.
start_time = datetime.now()

# Load the tweet JSON from the bucket into a DataFrame.
# NOTE: samplingRatio is a schema-inference option of Spark's JSON reader (the
# fraction of records inspected to infer the schema) -- it is not expected to
# subsample the rows that end up in `tweets`.
tweets_reader = spark.read.options(samplingRatio=0.01)
tweets = tweets_reader.json('gs://bgse-datawarehousing-random-tweets')


In [None]:
# Extract, per tweet, the list of its hashtag values and bring the result back
# to the driver as a plain Python list of lists.
#   1. take each row's `entities` field and drop rows where it is missing
#   2. take the `hashtags` collection and keep only tweets with 2+ hashtags
#   3. keep field 1 of every hashtag record
#      (presumably the hashtag text -- verify against the inferred schema)
tweets_hashtags = (
    tweets.rdd
    .map(lambda tweet: tweet.entities)
    .filter(lambda entities: entities is not None)
    .map(lambda entities: entities.hashtags)
    .filter(lambda hashtags: len(hashtags) > 1)
    .map(lambda hashtags: [hashtag[1] for hashtag in hashtags])
    .collect()
)
    
# organize the data: count ordered hashtag pairs and store them as a sparse
# co-occurrence matrix (row = first hashtag, column = second hashtag,
# value = number of times the ordered pair was seen).

# Every ordered pair of hashtags that appear in the same tweet, lower-cased.
# Built in one pass: folding the per-tweet lists together with
# reduce(lambda a, b: a + b) re-copies the accumulated list at every step, and
# `reduce` is not a builtin on Python 3.
# NOTE(review): pairs are formed by position, so a tweet that repeats a hashtag
# (or uses two spellings that differ only in case) produces self-pairs on the
# diagonal -- confirm whether that is intended.
tweet_pairs = [
    pair
    for hashtags in tweets_hashtags
    for pair in it.permutations([tag.lower() for tag in hashtags], 2)
]

# (first, second) -> number of occurrences
pair_counts = Counter(tweet_pairs)

# [first, second, count] triples, one per distinct ordered pair
counts = [[first, second, count] for (first, second), count in pair_counts.items()]

unique_pairs = set(pair_counts)

# create dictionary of unique individual hashtags

unique_hashtags = {tag for pair in unique_pairs for tag in pair}

# Indices are assigned in sorted order so the same input always produces the
# same matrix layout; raw set iteration order is not guaranteed to be stable
# between runs, and the labels are not stored in the saved file.
unique_hashtags_index = {
    tag: position for position, tag in enumerate(sorted(unique_hashtags))
}

# map each item in pairs to match value in dictionary
# (materialised as lists: on Python 3 map() is lazy and np.array() would wrap
# the iterator as a single object instead of building an index vector)

row_index = [unique_hashtags_index[first] for first, second, count in counts]
column_index = [unique_hashtags_index[second] for first, second, count in counts]
counts_list = [count for first, second, count in counts]

# Constructing a sparse matrix
row = np.array(row_index, dtype=int)
col = np.array(column_index, dtype=int)
data = np.array(counts_list, dtype=int)

# One row and one column per hashtag; the shape is passed explicitly rather
# than inferred from the largest index present in row/col.
n_hashtags = len(unique_hashtags_index)
final_matrix = coo_matrix((data, (row, col)), shape=(n_hashtags, n_hashtags))

# save file

save_npz("final_matrix.npz", final_matrix, compressed=True)

# Report how much wall-clock time has passed since start_time was recorded.

end_time = datetime.now()
elapsed = end_time - start_time
print(elapsed)

In [None]:
# Echo final_matrix as the cell's output so the result can be inspected.
final_matrix