In [1]:
import numpy as np
import pandas as pd
from google.cloud import storage
from scipy.sparse import coo_matrix
import pyspark

## Reading json files from the bucket 

In [2]:
bucket_name = "bgse-datawarehousing-random-tweets"

storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)

json_file_list = []
for blob in bucket.list_blobs():
    json_file_list.append("gs://bgse-datawarehousing-random-tweets" + "/" + blob.name)


In [3]:
json_file_list[0]


u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:00:30.657Z'

In [None]:
json_df = spark.read.json(json_file_list)
#json_df.printSchema()

## Extracting Hashtags

In [None]:
from pyspark.sql.functions import col, lower, size
from itertools import combinations

def lowercase_list(lst):
    return list(map(lambda item: item.lower(), lst))

def identity(x):
    return x

hashtags = (
    json_df
        .select("entities.hashtags.text")
        # filter out any rows without more than one hashtag
        .where(size(col("text")) > 1)
        .limit(10)
        .rdd
        # select the text from the rows
        .map(lambda r: r['text'])
        # lowercase all the hashtags
        .map(lowercase_list)
)

In [None]:
hashtags

PythonRDD[18] at RDD at PythonRDD.scala:52

In [None]:
#sometimes there is more than one hashtag so we have a list of lists
#use flatmap to reduce it all into one list
hashtags.flatMap(identity).collect()

[u'france',
 u'ford',
 u'happy_taeyang_day',
 u'narcissus',
 u'\uc608\ubed0\uc9c0\uc9c0\ub9c8',
 u'\ud0dc\uc591',
 u'\uc720\ud0dc\uc591',
 u'taeyang',
 u'sf9\ud0dc\uc591',
 u'\u30c6\u30e4\u30f3',
 u'nowplaying',
 u'onairnow',
 u'edm',
 u'redvelvet',
 u'\ub808\ub4dc\ubca8\ubcb3',
 u'redvelvet_redmare',
 u'redvelvet_redmareinusa',
 u'redvelvet_redmareincanada',
 u'\uc9c4\uc601',
 u'\ubc15\uc9c4\uc601',
 u'got7',
 u'\uc0ac\uc774\ucf54\uba54\ud2b8\ub9ac\uadf8\ub140\uc11d',
 u'redvelvet',
 u'seulgi',
 u'\ub808\ub4dc\ubca8\ubcb3',
 u'\uc2ac\uae30',
 u'pinkmarket',
 u'eiikleaw',
 u'mamamoo',
 u'\ub9c8\ub9c8\ubb34',
 u'\ud558\uc796\uc544_\ub9c8\ub9c8\ubb34_\ucef4\ubc31',
 u'twbdimash',
 u'worldsbest',
 u'nailagda',
 u'notebook']

## Getting combinations of every hashtag

In [16]:
def pair_combinations(lst):
    return combinations(lst, 2)

In [17]:
hashtag_combinations = (
    hashtags.flatMap(pair_combinations)
        .map(lambda tup: (tup, 1))
        .countByKey()
)

In [19]:
def flatten_dict(d):
    return [(x, y, z) for ((x, y), z) in d.items()]

flattened_dict = flatten_dict(hashtag_combinations)
flattened_dict

[(u'news', u'justdoit', 1),
 (u'justdoit', u'oscar2019', 1),
 (u'haraam', u'brexit', 1),
 (u'supportsmallstreamers', u'justdoit', 1),
 (u'snowday', u'teamgodvek', 1),
 (u'comedy', u'teamgodvek', 1),
 (u'oscars', u'snowday', 1),
 (u'oscar2019', u'teamgodvek', 1),
 (u'trump', u'hanoi', 1),
 (u'comedy', u'news', 1),
 (u'news', u'supportsmallstreamers', 1),
 (u'comedy', u'supportsmallstreamers', 1),
 (u'comedy', u'snowday', 1),
 (u'disgusting', u'notlegal', 1),
 (u'halal', u'food', 1),
 (u'talk', u'oscar2019', 1),
 (u'oscar2019', u'twitch', 1),
 (u'comedy', u'oscars', 1),
 (u'talk', u'teamgodvek', 1),
 (u'trump', u'humantrafficking', 1),
 (u'twitch', u'teamgodvek', 1),
 (u'talk', u'comedy', 1),
 (u'sosprisiones', u'tuabandonomepuedematar', 1),
 (u'oscar2019', u'snowday', 1),
 (u'news', u'oscars', 1),
 (u'justdoit', u'oscars', 1),
 (u'lechuguinos', u'25feb', 1),
 (u'halal', u'haraam', 1),
 (u'comedy', u'twitch', 1),
 (u'halal', u'brexit', 1),
 (u'president', u'hanoi', 1),
 (u'music', u'snow

## Creating both halves of matrix

In [20]:
key1, key2, count = zip(*flattened_dict)

In [28]:
matrix_first_half = flattened_dict
matrix_second_half = list(zip(key2,key1,count))
matrix_first_half + matrix_second_half

In [24]:
key1, key2, count = zip(*matrix_first_half + matrix_second_half)

In [25]:
key1 = list(key1)
key2 = list(key2)
count = list(count)

## Checking the matrix

In [32]:
for i in flattened_dict[25:30]:
    print(i)


(u'justdoit', u'oscars', 1)
(u'lechuguinos', u'25feb', 1)
(u'halal', u'haraam', 1)
(u'comedy', u'twitch', 1)
(u'halal', u'brexit', 1)


In [33]:
for i in range(25,30):
    print(key1[i], key2[i], count[i])


(u'justdoit', u'oscars', 1)
(u'lechuguinos', u'25feb', 1)
(u'halal', u'haraam', 1)
(u'comedy', u'twitch', 1)
(u'halal', u'brexit', 1)


## Taking my dictionary and making a sparse matrix from it

In [26]:
df = pd.DataFrame({'key1': key1, 'key2': key2, 'count': count})
df.set_index(['key1', 'key2'], inplace=True)
matrix = coo_matrix((df['count'],(df.index.labels[0], df.index.labels[1])))
matrix

<32x32 sparse matrix of type '<type 'numpy.int64'>'
	with 156 stored elements in COOrdinate format>

In [27]:
matrix.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 1],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0]])