In [1]:
import numpy as np
import pandas as pd
import scipy

from google.cloud import storage
from scipy.sparse import coo_matrix

import pyspark

## Reading JSON files from the bucket

In [2]:
bucket_name = "bgse-datawarehousing-random-tweets"

storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)

# Build one gs:// URI per object listed in the bucket.
# NOTE(review): nothing filters by extension here; this assumes every object
# in the bucket is a JSON file -- confirm.
json_file_list = [
    "gs://" + bucket_name + "/" + blob.name
    for blob in bucket.list_blobs()
]


In [3]:
# Number of object URIs collected from the bucket.
len(json_file_list)

6109

In [4]:
# Only the first ten files are loaded, not the whole bucket listing.
# `spark` is not defined anywhere in this notebook; presumably the kernel
# provides a SparkSession under that name -- confirm.
sample_files = json_file_list[:10]
json_df = spark.read.json(sample_files)

## Extracting Hashtags

In [23]:
from pyspark.sql.functions import col, lower, size
from itertools import combinations

def lowercase_list(lst):
    """Return a new list holding the lowercased form of every item in ``lst``.

    Each item must provide a ``.lower()`` method (e.g. strings).
    """
    return [entry.lower() for entry in lst]

hashtags = (
    json_df
        # the nested hashtag texts come back as a single array column named "text"
        .select("entities.hashtags.text")
        # keep only rows that carry at least two hashtags (a pair needs two)
        .where(size(col("text")) >= 2)
        .rdd
        # pull the hashtag list out of each row and lowercase every entry
        .map(lambda row: lowercase_list(row['text']))
)

In [24]:
# Displaying the RDD shows only its handle, not the hashtag lists themselves.
hashtags

PythonRDD[15] at RDD at PythonRDD.scala:52

## Getting combinations of every hashtag

In [25]:
def pair_combinations(lst):
    """Return an iterator over every 2-element combination of ``lst``.

    Pairs follow the order of ``lst``; an input with fewer than two items
    yields nothing.
    """
    pair_size = 2
    return combinations(lst, pair_size)

In [26]:
# Count how many times each hashtag pair occurs across all rows. The counts
# are returned to the driver as a dict-like mapping of pair -> count.
hashtag_combinations = hashtags.flatMap(pair_combinations).countByValue()

In [27]:
def flatten_dict(d):
    """Turn a ``{(a, b): value}`` mapping into a list of ``(a, b, value)`` triples.

    Every key must unpack into exactly two elements.
    """
    triples = []
    for pair, value in d.items():
        first, second = pair
        triples.append((first, second, value))
    return triples

# Despite its name, flattened_dict is a list of (hashtag, hashtag, count) triples.
flattened_dict = flatten_dict(hashtag_combinations)


## Creating both halves of the matrix

In [28]:
# Transpose the triples into three parallel tuples: first tag, second tag, count.
# The three-way unpacking raises ValueError if flattened_dict is empty.
key1, key2, count = zip(*flattened_dict)

In [29]:
# First half: the (tag_a, tag_b, count) triples exactly as counted.
matrix_first_half = flattened_dict
# Second half: the same triples with the two tags swapped, so the matrix
# built from both halves is symmetric. This is derived from flattened_dict
# rather than from key1/key2/count because a later cell rebinds those names
# to the doubled lists; re-running this cell afterwards would otherwise
# produce a second half twice as long.
matrix_second_half = [(tag_b, tag_a, n) for (tag_a, tag_b, n) in flattened_dict]


In [30]:
# Concatenate the two halves first, then transpose the combined triples into
# parallel tuples. This rebinds key1/key2/count to cover both halves.
key1, key2, count = zip(*(matrix_first_half + matrix_second_half))

In [31]:
# zip() hands back tuples; turn each parallel sequence into a list.
key1, key2, count = list(key1), list(key2), list(count)

## Checking the matrix

In [32]:
# Show a handful of the original triples for a visual spot-check.
for triple in flattened_dict[25:30]:
    print(triple)


(u'leftwing', u'blair', 47)
(u'usa', u'us', 2)
(u'marxist', u'bbc', 1)
(u'followme', u'pussy', 1)
(u'trump', u'nasdaq', 1)


In [33]:
# Spot-check the same positions in the parallel lists. The first half of the
# combined data is flattened_dict itself, so indices 25-29 should show the
# same triples as the previous cell. This does not exercise the mirrored half.
for i in range(25,30):
    print(key1[i], key2[i], count[i])


(u'leftwing', u'blair', 47)
(u'usa', u'us', 2)
(u'marxist', u'bbc', 1)
(u'followme', u'pussy', 1)
(u'trump', u'nasdaq', 1)


## Building a sparse matrix from the hashtag-pair counts

In [34]:
# Long-format table of counts indexed by (key1, key2). Built without
# inplace=True so re-running the cell always starts from a fresh frame.
df = pd.DataFrame({'key1': key1, 'key2': key2, 'count': count}).set_index(['key1', 'key2'])

# Give every hashtag ONE integer position shared by rows and columns, so that
# entry (i, j) and entry (j, i) always refer to the same pair of hashtags.
# The previous `df.index.labels` attribute no longer exists in pandas >= 1.0,
# and per-level codes only line up when both levels hold identical value sets.
hashtag_labels = sorted(set(key1) | set(key2))
hashtag_position = {tag: pos for pos, tag in enumerate(hashtag_labels)}
row_idx = [hashtag_position[tag] for tag in key1]
col_idx = [hashtag_position[tag] for tag in key2]

# State the square shape explicitly instead of letting scipy infer it from
# the largest index seen. set_index keeps row order, so df['count'] is still
# aligned with key1/key2.
n_hashtags = len(hashtag_labels)
matrix = coo_matrix(
    (df['count'].values, (row_idx, col_idx)),
    shape=(n_hashtags, n_hashtags),
)
matrix

<1791x1791 sparse matrix of type '<type 'numpy.int64'>'
	with 11272 stored elements in COOrdinate format>

In [35]:
# Write the matrix to disk, then read it straight back from the same file.
npz_path = 'sparse_matrix.npz'
scipy.sparse.save_npz(npz_path, matrix, compressed=True)
sparse_matrix = scipy.sparse.load_npz(npz_path)

In [36]:
# Display the reloaded matrix; its summary should match the one shown before saving.
sparse_matrix

<1791x1791 sparse matrix of type '<type 'numpy.int64'>'
	with 11272 stored elements in COOrdinate format>