In [1]:
import numpy as np
import pandas as pd
import scipy
import time
from google.cloud import storage
from scipy.sparse import coo_matrix
import pyspark

## Reading json files from the bucket 

In [2]:
start_time = time.time()
start_time

1572618195.780909

In [3]:

bucket_name = "bgse-datawarehousing-random-tweets"

storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)

json_file_list = []
for blob in bucket.list_blobs():
    json_file_list.append("gs://bgse-datawarehousing-random-tweets" + "/" + blob.name)


In [4]:
len(json_file_list)

6109

In [5]:
json_df = spark.read.json(json_file_list[0:1])


In [6]:
json_df.printSchema()

root
 |-- contributors: string (nullable = true)
 |-- coordinates: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- display_text_range: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- entities: struct (nullable = true)
 |    |-- hashtags: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- text: string (nullable = true)
 |    |-- media: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- additional_media_info: struct (nullable = true)
 |    |    |    |    |-- description: string (nullable = true)
 |    |    |    |    |-- embeddable: boolean (nullable = true)
 |    |    |    |    |-- monetizable: boolean (nullable = true)
 |    |    |    |    |-- title: string (nullable = true)
 |    |    |    |-- display_url: string (nullable = true)
 |

## Extracting Hashtags

In [7]:
from pyspark.sql.functions import col, lower, size
from itertools import combinations

def lowercase_list(lst):
    return list(map(lambda item: item.lower(), lst))

def identity(x):
    return x

hashtags = (
    json_df
        .select("entities.hashtags.text")
        # filter out any rows without more than one hashtag
        .where(size(col("text")) > 1)
        .limit(10)
        .rdd
        # select the text from the rows
        .map(lambda r: r['text'])
        # lowercase all the hashtags
        .map(lowercase_list)
)

In [8]:
hashtags

PythonRDD[12] at RDD at PythonRDD.scala:52

In [9]:
#sometimes there is more than one hashtag so we have a list of lists
#use flatmap to reduce it all into one list
hashtags.flatMap(identity).collect()

[u'republican',
 u'trump',
 u'america',
 u'clinton',
 u'bush',
 u'learning',
 u'living',
 u'hoping',
 u'owleyes',
 u'disheveled',
 u'trade',
 u'china',
 u'donaldtrump',
 u'xijinping',
 u'brexit',
 u'tonightvmtv',
 u'russianasset',
 u'putinspuppet',
 u'traitortrump',
 u'trumpcolluded',
 u'trumpconspired',
 u'impeachthemf',
 u'eu',
 u'brexit',
 u'labour',
 u'stopbrexit',
 u'brexit',
 u'eusupergirl',
 u'peoplesvote',
 u'eeuu',
 u'venezuela',
 u'publiclands',
 u'drilling']

## Getting combinations of every hashtag

In [10]:
def pair_combinations(lst):
    return combinations(lst, 2)

In [11]:
hashtag_combinations = (
    hashtags.flatMap(pair_combinations)
        .map(lambda tup: (tup, 1))
        .countByKey()
)

In [12]:
def flatten_dict(d):
    return [(x, y, z) for ((x, y), z) in d.items()]

flattened_dict = flatten_dict(hashtag_combinations)
flattened_dict

[(u'news', u'justdoit', 1),
 (u'justdoit', u'oscar2019', 1),
 (u'haraam', u'brexit', 1),
 (u'supportsmallstreamers', u'justdoit', 1),
 (u'snowday', u'teamgodvek', 1),
 (u'comedy', u'teamgodvek', 1),
 (u'oscars', u'snowday', 1),
 (u'oscar2019', u'teamgodvek', 1),
 (u'trump', u'hanoi', 1),
 (u'comedy', u'news', 1),
 (u'news', u'supportsmallstreamers', 1),
 (u'comedy', u'supportsmallstreamers', 1),
 (u'comedy', u'snowday', 1),
 (u'disgusting', u'notlegal', 1),
 (u'halal', u'food', 1),
 (u'talk', u'oscar2019', 1),
 (u'oscar2019', u'twitch', 1),
 (u'comedy', u'oscars', 1),
 (u'talk', u'teamgodvek', 1),
 (u'trump', u'humantrafficking', 1),
 (u'twitch', u'teamgodvek', 1),
 (u'talk', u'comedy', 1),
 (u'sosprisiones', u'tuabandonomepuedematar', 1),
 (u'oscar2019', u'snowday', 1),
 (u'news', u'oscars', 1),
 (u'justdoit', u'oscars', 1),
 (u'lechuguinos', u'25feb', 1),
 (u'halal', u'haraam', 1),
 (u'comedy', u'twitch', 1),
 (u'halal', u'brexit', 1),
 (u'president', u'hanoi', 1),
 (u'music', u'snow

## Creating both halves of matrix

In [13]:
key1, key2, count = zip(*flattened_dict)

In [14]:
matrix_first_half = flattened_dict
matrix_second_half = list(zip(key2,key1,count))
matrix_first_half + matrix_second_half

[(u'news', u'justdoit', 1),
 (u'justdoit', u'oscar2019', 1),
 (u'haraam', u'brexit', 1),
 (u'supportsmallstreamers', u'justdoit', 1),
 (u'snowday', u'teamgodvek', 1),
 (u'comedy', u'teamgodvek', 1),
 (u'oscars', u'snowday', 1),
 (u'oscar2019', u'teamgodvek', 1),
 (u'trump', u'hanoi', 1),
 (u'comedy', u'news', 1),
 (u'news', u'supportsmallstreamers', 1),
 (u'comedy', u'supportsmallstreamers', 1),
 (u'comedy', u'snowday', 1),
 (u'disgusting', u'notlegal', 1),
 (u'halal', u'food', 1),
 (u'talk', u'oscar2019', 1),
 (u'oscar2019', u'twitch', 1),
 (u'comedy', u'oscars', 1),
 (u'talk', u'teamgodvek', 1),
 (u'trump', u'humantrafficking', 1),
 (u'twitch', u'teamgodvek', 1),
 (u'talk', u'comedy', 1),
 (u'sosprisiones', u'tuabandonomepuedematar', 1),
 (u'oscar2019', u'snowday', 1),
 (u'news', u'oscars', 1),
 (u'justdoit', u'oscars', 1),
 (u'lechuguinos', u'25feb', 1),
 (u'halal', u'haraam', 1),
 (u'comedy', u'twitch', 1),
 (u'halal', u'brexit', 1),
 (u'president', u'hanoi', 1),
 (u'music', u'snow

In [15]:
key1, key2, count = zip(*matrix_first_half + matrix_second_half)

In [16]:
key1 = list(key1)
key2 = list(key2)
count = list(count)

## Checking the matrix

In [17]:
for i in flattened_dict[25:30]:
    print(i)


(u'justdoit', u'oscars', 1)
(u'lechuguinos', u'25feb', 1)
(u'halal', u'haraam', 1)
(u'comedy', u'twitch', 1)
(u'halal', u'brexit', 1)


In [18]:
for i in range(25,30):
    print(key1[i], key2[i], count[i])


(u'justdoit', u'oscars', 1)
(u'lechuguinos', u'25feb', 1)
(u'halal', u'haraam', 1)
(u'comedy', u'twitch', 1)
(u'halal', u'brexit', 1)


## Taking my dictionary and making a sparse matrix from it

In [19]:
df = pd.DataFrame({'key1': key1, 'key2': key2, 'count': count})
df.set_index(['key1', 'key2'], inplace=True)
matrix = coo_matrix((df['count'],(df.index.labels[0], df.index.labels[1])))
matrix

<32x32 sparse matrix of type '<type 'numpy.int64'>'
	with 156 stored elements in COOrdinate format>

In [21]:
end_time = time.time()
end_time - start_time

41.14583992958069

In [23]:
scipy.sparse.save_npz('sparse_matrix.npz', matrix, compressed=True)
sparse_matrix = scipy.sparse.load_npz('sparse_matrix.npz')

In [24]:
sparse_matrix

<32x32 sparse matrix of type '<type 'numpy.int64'>'
	with 156 stored elements in COOrdinate format>