In [1]:
#Import pyspark and start session
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session (getOrCreate returns the existing one when a
# session is already active, otherwise it builds a new one under this app name).
spark = (
    SparkSession.builder
    .appName("PySpark Intro")
    .getOrCreate()
)
    

In [2]:
import time
from datetime import datetime
# Wall-clock start of the pipeline; the elapsed time is printed at the very end.
start_time = datetime.now()

# Read the tweet dump as JSON and show the inferred schema.
# samplingRatio=0.001 asks the JSON reader to infer the schema from a small
# fraction of the records instead of scanning all of them.
json_reader = spark.read.options(samplingRatio=0.001)
tweeties = json_reader.json('gs://bgse-datawarehousing-random-tweets')
# Single-file alternative for quick experiments:
#tweeties = spark.read.json('gs://bgse-datawarehousing-random-tweets/2019-02-26T00:00:30.657Z')

tweeties.printSchema()

root
 |-- contributors: string (nullable = true)
 |-- coordinates: struct (nullable = true)
 |    |-- coordinates: array (nullable = true)
 |    |    |-- element: double (containsNull = true)
 |    |-- type: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- display_text_range: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- entities: struct (nullable = true)
 |    |-- hashtags: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- text: string (nullable = true)
 |    |-- media: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- additional_media_info: struct (nullable = true)
 |    |    |    |    |-- description: string (nullable = true)
 |    |    |    |    |-- embeddable: boolean (nullable = true)
 |    |    |    |    |-- monetizable: bo

In [None]:
#Import libraries
import itertools as it
import numpy as np

#Keep only tweets with at least two hashtags and reduce each to the list of its hashtag texts.
#Rows without an `entities` struct are dropped. Rows whose `hashtags` array is null are
#dropped too: the inferred schema marks the array nullable and len(None) would abort the job.
#The hashtag text is read by field name rather than by position, so the result does not
#depend on the order in which the inferred struct happens to list its fields.
just_hashtags = tweeties.rdd \
      .map(lambda tweet: tweet.entities) \
      .filter(lambda entities: entities is not None) \
      .map(lambda entities: entities.hashtags) \
      .filter(lambda tags: tags is not None and len(tags) > 1) \
      .map(lambda tags: [tag.text for tag in tags])
        

#print(type(just_hashtags))
#print(just_hashtags) 

In [None]:
# For every tweet, enumerate each 2-element combination of its hashtags (in the order
# the hashtags appear in the tweet) and bring the per-tweet lists back to the driver.
hashtag_combs = (
    just_hashtags
    .map(lambda tags: list(it.combinations(tags, 2)))
    .collect()
)

#print(type(hashtag_combs))

In [None]:
def explode(row):
    """Yield the elements of ``row`` one at a time, in order.

    Used as the ``flatMap`` callback so that an RDD of hashtag lists becomes
    an RDD of individual hashtags.
    """
    for element in row:
        yield element

# Flatten the per-tweet hashtag lists, drop repeated hashtags and collect the
# distinct values on the driver.
unique_hashtags = (
    just_hashtags
    .flatMap(explode)
    .distinct()
    .collect()
)

#print(unique_hashtags)

In [None]:
#Create all possible combinations of hashtags within each tweet and unique list of all hashtags
import numpy as np 
import itertools as it
from itertools import combinations

#Flatten the per-tweet lists of hashtag pairs into one flat list of pairs.
#chain.from_iterable does this in a single linear pass and yields [] when no tweet
#qualified; the previous reduce(a + b) re-copied the accumulated list on every step,
#raised TypeError on an empty input, and relied on `reduce` being a builtin
#(it is not on Python 3).
hashtag_combs_list = list(it.chain.from_iterable(hashtag_combs))

#Sorted copy of all pairs
hashtag_combs_list_sorted = sorted(hashtag_combs_list)

#Create list of all hashtags (dict.fromkeys drops any duplicate entries)
hashtag_combs_list_unique = list(dict.fromkeys(unique_hashtags))

#print(hashtag_combs_list_unique)

In [None]:
#Create dictionary mapping each hashtag to its unique integer key
from collections import OrderedDict

# Number the unique hashtags 0..n-1 in list order; `d` keeps the ordered mapping and
# `hashtag_combs_keys_ids` is the plain-dict copy used for lookups.
d = OrderedDict(
    (hashtag, position)
    for position, hashtag in enumerate(hashtag_combs_list_unique)
)

hashtag_combs_keys_ids = dict(d)

#print(hashtag_combs_keys_ids)

In [None]:
#Count number of times that each hashtag combination occurs and convert this into a list of lists (lols)
from collections import Counter

# Tally how often each (hashtag, hashtag) pair occurs across all tweets.
counts_hashtag_combs = Counter(hashtag_combs_list)

# Same tallies as a list of ((first, second), count) items ...
counts_hashtag_combs_list = list(counts_hashtag_combs.items())
# ... and flattened to [first, second, count] rows.
counts_hashtag_combs_lols = [
    [pair[0], pair[1], occurrences]
    for pair, occurrences in counts_hashtag_combs_list
]

#print(counts_hashtag_combs_lols)

In [None]:
#Look up the unique key of both hashtags of every pair in the hashtag -> key dictionary,
#producing [key_1, key_2, count] rows.
#Built as a real list (not map()): on Python 3 map() returns a one-shot iterator, and the
#result is iterated several times further down, so every pass after the first would be empty.
hashtag_id_count_lols = [
    [hashtag_combs_keys_ids.get(row[0]), hashtag_combs_keys_ids.get(row[1]), row[2]]
    for row in counts_hashtag_combs_lols
]

#print(hashtag_id_count_lols)
#print(hashtag_id_count_lols)
#print(hashtag_combs_keys_ids.get("uk"))

In [None]:
#Create arrays of hashtag 1, hashtag 2 and their count in preparation for loading into the coo sparse matrix.
#The rows are materialised once first: they are read three times below, which would leave
#the second and third arrays empty if hashtag_id_count_lols were a lazy iterator (e.g. a
#Python 3 map object).
id_count_rows = list(hashtag_id_count_lols)
hashtag_1_array = np.array([row[0] for row in id_count_rows])
hashtag_2_array = np.array([row[1] for row in id_count_rows])
hashtag_count_array = np.array([row[2] for row in id_count_rows])
                   
#print(hashtag_1_array)

In [None]:
#Load arrays into scipy sparse matrix
from scipy.sparse import coo_matrix, save_npz

#Build the hashtag co-occurrence matrix in COO form.
#The shape is passed explicitly so the matrix is always square with one row and one column
#per hashtag id. Without it scipy infers each dimension from the largest index present on
#that axis, so the two axes could end up with different sizes and no longer line up with
#the hashtag -> id dictionary.
n_hashtags = len(hashtag_combs_keys_ids)
coo_hashtag_matrix = coo_matrix(
    (hashtag_count_array, (hashtag_1_array, hashtag_2_array)),
    shape=(n_hashtags, n_hashtags),
)
#print(coo_hashtag_matrix)

In [None]:
# Persist the sparse matrix to a compressed .npz file in the working directory.
save_npz("coo_hashtag_matrix.npz", coo_hashtag_matrix, compressed=True)

# Report the wall-clock time elapsed since start_time was taken.
end_time = datetime.now()
elapsed = end_time - start_time
print(elapsed)