In [1]:
import numpy as np
import pandas as pd
import scipy

from google.cloud import storage
from scipy.sparse import coo_matrix

import pyspark

## Reading json files from the bucket 

In [2]:
bucket_name = "bgse-datawarehousing-random-tweets"

storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)

json_file_list = []
for blob in bucket.list_blobs():
    json_file_list.append("gs://" + bucket_name + "/" + blob.name)


In [3]:
len(json_file_list)

6109

In [4]:
json_df = spark.read.json(json_file_list[0:10])

In [5]:
json_df.printSchema()

root
 |-- contributors: string (nullable = true)
 |-- coordinates: struct (nullable = true)
 |    |-- coordinates: array (nullable = true)
 |    |    |-- element: double (containsNull = true)
 |    |-- type: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- display_text_range: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- entities: struct (nullable = true)
 |    |-- hashtags: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- text: string (nullable = true)
 |    |-- media: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- additional_media_info: struct (nullable = true)
 |    |    |    |    |-- description: string (nullable = true)
 |    |    |    |    |-- embeddable: boolean (nullable = true)
 |    |    |    |    |-- monetizable: bo

## Extracting Hashtags

In [6]:
from pyspark.sql.functions import col, lower, size
from itertools import combinations

def lowercase_list(lst):
    return list(map(lambda item: item.lower(), lst))

def identity(x):
    return x

hashtags = (
    json_df
        .select("entities.hashtags.text")
        # filter out any rows without more than one hashtag
        .where(size(col("text")) > 1)
        .rdd
        # select the text from the rows
        .map(lambda r: r['text'])
        # lowercase all the hashtags
        .map(lowercase_list)
)

In [7]:
hashtags

PythonRDD[8] at RDD at PythonRDD.scala:52

In [8]:
#sometimes there is more than one hashtag so we have a list of lists
#use flatmap to reduce it all into one list
hashtags.flatMap(identity).collect()

[u'seguridadporlapaz',
 u'asambleadelospueblos',
 u'veterans',
 u'fuckyou',
 u'mtpdaily',
 u'thebeat',
 u'inners',
 u'brexit',
 u'diergezondheid',
 u'crisisenvenezuela',
 u'misiles',
 u'individualone',
 u'25thamendment',
 u'eeuu',
 u'venezuela',
 u'eeuu',
 u'venezuela',
 u'corbyn',
 u'brexit',
 u'restart',
 u'trump',
 u'genepool',
 u'traitorous',
 u'rino',
 u'childtrafficking',
 u'obstructionist',
 u'weareremain',
 u'fbpe',
 u'brexit',
 u'finalsayforall',
 u'justmakeitstop',
 u'secondreferendum',
 u'peoplesvote',
 u'genepool',
 u'traitorous',
 u'rino',
 u'childtrafficking',
 u'obstructionist',
 u'brexit',
 u'sexybrexit',
 u'trump',
 u'racism',
 u'brexit',
 u'leavers',
 u'sosprisiones',
 u'tuabandonomepuedematar',
 u'trump',
 u'notmypresident',
 u'brexit',
 u'pme',
 u'iran',
 u'etatsunis',
 u'zarif',
 u'trump',
 u'fakenews',
 u'fakenewswapo',
 u'bolshevickies',
 u'trotsky',
 u'tb',
 u'doglover',
 u'remain',
 u'reman',
 u'roman',
 u'trump',
 u'black',
 u'americans',
 u'maga',
 u'clonewar

## Getting combinations of every hashtag

In [9]:
def pair_combinations(lst):
    return combinations(lst, 2)

In [10]:
hashtag_combinations = (
    hashtags.flatMap(pair_combinations)
        .map(lambda tup: (tup, 1))
        .countByKey()
)

In [11]:
def flatten_dict(d):
    return [(x, y, z) for ((x, y), z) in d.items()]

flattened_dict = flatten_dict(hashtag_combinations)
flattened_dict

[(u'sigueme', u'sigatop10hojesdv', 1),
 (u'may', u'conservatives', 1),
 (u'fucking', u'coward', 1),
 (u'politics', u'sports', 2),
 (u'complicitgop', u'unamerican', 1),
 (u'impeachtrump', u'childrenstolen', 2),
 (u'backstop', u'eu', 1),
 (u'supplychain', u'retail', 1),
 (u'heslying', u'unfittoserve', 2),
 (u'corruption', u'greed', 1),
 (u'belgium', u'italexit', 1),
 (u'bbcnews', u'may', 1),
 (u'maga', u'adolphhitler', 1),
 (u'conservatives', u'ucla', 1),
 (u'hostage', u'maga', 2),
 (u'referendum', u'regrexit', 1),
 (u'li', u'dailydrawing', 1),
 (u'deepstaterino', u'peoplespresident', 1),
 (u'immigration', u'donaldtrump', 1),
 (u'facts', u'being', 1),
 (u'austin', u'vw', 1),
 (u'trumpmoron', u'politicslive', 1),
 (u'msnbc', u'trump', 1),
 (u'russianasset', u'collusion', 1),
 (u'rutte3moetoprotten', u'brexit', 3),
 (u'leftwing', u'blair', 47),
 (u'usa', u'us', 2),
 (u'marxist', u'bbc', 1),
 (u'followme', u'pussy', 1),
 (u'trump', u'nasdaq', 1),
 (u'vietnam', u'fighting', 1),
 (u'chc', u'c

## Creating both halves of matrix

In [12]:
key1, key2, count = zip(*flattened_dict)

In [13]:
matrix_first_half = flattened_dict
matrix_second_half = list(zip(key2,key1,count))
matrix_first_half + matrix_second_half

[(u'sigueme', u'sigatop10hojesdv', 1),
 (u'may', u'conservatives', 1),
 (u'fucking', u'coward', 1),
 (u'politics', u'sports', 2),
 (u'complicitgop', u'unamerican', 1),
 (u'impeachtrump', u'childrenstolen', 2),
 (u'backstop', u'eu', 1),
 (u'supplychain', u'retail', 1),
 (u'heslying', u'unfittoserve', 2),
 (u'corruption', u'greed', 1),
 (u'belgium', u'italexit', 1),
 (u'bbcnews', u'may', 1),
 (u'maga', u'adolphhitler', 1),
 (u'conservatives', u'ucla', 1),
 (u'hostage', u'maga', 2),
 (u'referendum', u'regrexit', 1),
 (u'li', u'dailydrawing', 1),
 (u'deepstaterino', u'peoplespresident', 1),
 (u'immigration', u'donaldtrump', 1),
 (u'facts', u'being', 1),
 (u'austin', u'vw', 1),
 (u'trumpmoron', u'politicslive', 1),
 (u'msnbc', u'trump', 1),
 (u'russianasset', u'collusion', 1),
 (u'rutte3moetoprotten', u'brexit', 3),
 (u'leftwing', u'blair', 47),
 (u'usa', u'us', 2),
 (u'marxist', u'bbc', 1),
 (u'followme', u'pussy', 1),
 (u'trump', u'nasdaq', 1),
 (u'vietnam', u'fighting', 1),
 (u'chc', u'c

In [14]:
key1, key2, count = zip(*matrix_first_half + matrix_second_half)

In [15]:
key1 = list(key1)
key2 = list(key2)
count = list(count)

## Checking the matrix

In [16]:
for i in flattened_dict[25:30]:
    print(i)


(u'leftwing', u'blair', 47)
(u'usa', u'us', 2)
(u'marxist', u'bbc', 1)
(u'followme', u'pussy', 1)
(u'trump', u'nasdaq', 1)


In [17]:
for i in range(25,30):
    print(key1[i], key2[i], count[i])


(u'leftwing', u'blair', 47)
(u'usa', u'us', 2)
(u'marxist', u'bbc', 1)
(u'followme', u'pussy', 1)
(u'trump', u'nasdaq', 1)


## Taking my dictionary and making a sparse matrix from it

In [18]:
df = pd.DataFrame({'key1': key1, 'key2': key2, 'count': count})
df.set_index(['key1', 'key2'], inplace=True)
matrix = coo_matrix((df['count'],(df.index.labels[0], df.index.labels[1])))
matrix

<1791x1791 sparse matrix of type '<type 'numpy.int64'>'
	with 11272 stored elements in COOrdinate format>

In [19]:
scipy.sparse.save_npz('sparse_matrix.npz', matrix, compressed=True)
sparse_matrix = scipy.sparse.load_npz('sparse_matrix.npz')

In [20]:
sparse_matrix

<1791x1791 sparse matrix of type '<type 'numpy.int64'>'
	with 11272 stored elements in COOrdinate format>