In [1]:
# Import packages

from pyspark.sql import SparkSession
import pandas as pd
import numpy as np
from itertools import permutations

from scipy.sparse import coo_matrix
from google.cloud import storage

import json
import datetime as dt
import scipy.sparse

In [3]:
# Build the gs:// URI of every object stored in the public bucket

bucket_name = "bgse-datawarehousing-random-tweets"

client = storage.Client()
bucket = client.bucket(bucket_name)
json_file_list = [
    "gs://" + bucket_name + "/" + blob.name
    for blob in bucket.list_blobs(prefix="")
]

# Sanity check: show the first two URIs and the total number of files found
print(json_file_list[0:2])
print(len(json_file_list))

[u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:00:30.657Z', u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:00:38.681Z']
6109


In [10]:
# Load the data
# `spark` is never defined in this notebook, so the cell only worked when the
# kernel happened to inject it. getOrCreate() returns the already-active
# session when there is one and creates a session otherwise.
spark = SparkSession.builder.getOrCreate()

# Development run: read only the first `stop` files of the bucket listing.
# Pass the whole `json_file_list` instead to process every file.
stop = 3
data = spark.read.json(json_file_list[0:stop])

In [5]:
# Inspect the data
# Print the schema of the loaded DataFrame as a tree of column names and types
data.printSchema()

root
 |-- contributors: string (nullable = true)
 |-- coordinates: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- display_text_range: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- entities: struct (nullable = true)
 |    |-- hashtags: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- text: string (nullable = true)
 |    |-- media: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- additional_media_info: struct (nullable = true)
 |    |    |    |    |-- description: string (nullable = true)
 |    |    |    |    |-- embeddable: boolean (nullable = true)
 |    |    |    |    |-- monetizable: boolean (nullable = true)
 |    |    |    |    |-- title: string (nullable = true)
 |    |    |    |-- display_url: string (nullable = true)
 |

In [11]:
# Create value list and measure time
#
# Each element of value_list is a tuple (hashtag_a, hashtag_b, count), where
# count is how many times the ordered pair of lower-cased hashtags was emitted
# across all tweets.

start = dt.datetime.now()

value_list = (
    data.rdd
    # Keep only records that can produce a pair. The schema marks both
    # `entities` and `hashtags` as nullable, and len(None) raises TypeError,
    # so both are checked before counting the hashtags.
    .filter(lambda row: row.entities is not None
            and row.entities.hashtags is not None
            and len(row.entities.hashtags) > 1)
    .map(lambda row: [tag.text.lower() for tag in row.entities.hashtags])
    # permutations() yields ordered pairs, so both (a, b) and (b, a) are counted.
    .flatMap(lambda tags: permutations(tags, 2))
    .map(lambda pair: (pair, 1))
    .reduceByKey(lambda left, right: left + right)
    # Flatten ((a, b), count) into (a, b, count).
    .map(lambda kv: (kv[0][0], kv[0][1], kv[1]))
    .collect()
)

end = dt.datetime.now()
print(end - start)

0:00:06.182455


In [12]:
# Create column list: map every hashtag to its position in sorted order
unique_hashtags = list({triple[0] for triple in value_list})
sorted_unique_hashtags = sorted(unique_hashtags)
dictionary_column_names = {
    hashtag: position
    for position, hashtag in enumerate(sorted_unique_hashtags)
}

In [13]:
# Creating the scipy sparse matrix
# Translate every (hashtag_a, hashtag_b, count) triple into COO coordinates:
# the two hashtags give the row and column index, the count is the cell value.
rows = np.array([dictionary_column_names[triple[0]] for triple in value_list])
cols = np.array([dictionary_column_names[triple[1]] for triple in value_list])
dataframe = np.array([triple[2] for triple in value_list])  # the pair counts (a numpy array, despite the name)

matrix_side = len(dictionary_column_names)
scipy_sparse_matrix = coo_matrix(
    (dataframe, (rows, cols)),
    shape=(matrix_side, matrix_side),
)

In [14]:
# Inspect the results: the first ten (hashtag_a, hashtag_b, count) triples ...
print(value_list[0:10])

# ... and ten entries of the hashtag -> matrix index mapping
for hashtag in list(dictionary_column_names)[10:20]:
    print("key {}, value {} ".format(hashtag, dictionary_column_names[hashtag]))

[(u'rino', u'childtrafficking', 1), (u'brexit', u'newsnight', 7), (u'red', u'florida', 1), (u'trumpholiday', u'independence', 1), (u'breadlinebernie', u'mondaymotivation', 2), (u'uk', u'regrexit', 1), (u'uk', u'may', 2), (u'mondaythoughts', u'qanon', 1), (u'maydelay', u'2ndreferendum', 1), (u'giletsjaunes', u'brexit', 4)]
key trumpholiday, value 597 
key c4news, value 77 
key religion, value 467 
key demonicrats, value 131 
key bonespurs, value 50 
key breadlinebernie, value 54 
key shadowban, value 507 
key shadowbanning, value 509 
key the_collapse_of_america_is_near, value 563 
key ivankatrump, value 284 


In [10]:
# Save out the files
# The pair counts and the hashtag -> index mapping are written as JSON,
# the co-occurrence matrix as a compressed .npz archive.
json_outputs = (
    ('value_list.json', value_list),
    ('dictionary_column_names.json', dictionary_column_names),
)
for json_path, payload in json_outputs:
    with open(json_path, 'w') as handle:
        json.dump(payload, handle)

scipy.sparse.save_npz('scipy_sparse_matrix.npz', scipy_sparse_matrix, compressed=True)

In [11]:
# Copy files from the cluster to the bucket
# This step was done manually, in a terminal opened from one of the available web interfaces:
# cd /
# ls -hl *.json
# ls -hl *.npz
# gsutil cp *.json gs://sparktweetbucket1996
# gsutil cp *.npz gs://sparktweetbucket1996

In [None]:
# RUNNING EVERYTHING ALL TOGETHER
# Import packages

from pyspark.sql import SparkSession
import pandas as pd
import numpy as np
from itertools import permutations

from scipy.sparse import coo_matrix
from google.cloud import storage

import json
import datetime as dt
import scipy.sparse

# Build the gs:// URI of every object stored in the public bucket

bucket_name = "bgse-datawarehousing-random-tweets"

client = storage.Client()
bucket = client.bucket(bucket_name)
json_file_list = [
    "gs://" + bucket_name + "/" + blob.name
    for blob in bucket.list_blobs(prefix="")
]

# Sanity check: show the first two URIs and the total number of files found
print(json_file_list[0:2])
print(len(json_file_list))


# Load the data
# `spark` is never defined in this notebook, so the cell only worked when the
# kernel happened to inject it. getOrCreate() returns the already-active
# session when there is one and creates a session otherwise.
spark = SparkSession.builder.getOrCreate()

# Full run: read every file in the bucket listing. Slice `json_file_list`
# (e.g. json_file_list[0:1]) to try the pipeline on a small subset first.
data = spark.read.json(json_file_list)

# Inspect the data: print the schema of the loaded DataFrame as a tree
data.printSchema()

# Create value list and measure time
#
# Each element of value_list is a tuple (hashtag_a, hashtag_b, count), where
# count is how many times the ordered pair of lower-cased hashtags was emitted
# across all tweets.

start = dt.datetime.now()

value_list = (
    data.rdd
    # Keep only records that can produce a pair. The schema marks both
    # `entities` and `hashtags` as nullable, and len(None) raises TypeError,
    # so both are checked before counting the hashtags.
    .filter(lambda row: row.entities is not None
            and row.entities.hashtags is not None
            and len(row.entities.hashtags) > 1)
    .map(lambda row: [tag.text.lower() for tag in row.entities.hashtags])
    # permutations() yields ordered pairs, so both (a, b) and (b, a) are counted.
    .flatMap(lambda tags: permutations(tags, 2))
    .map(lambda pair: (pair, 1))
    .reduceByKey(lambda left, right: left + right)
    # Flatten ((a, b), count) into (a, b, count).
    .map(lambda kv: (kv[0][0], kv[0][1], kv[1]))
    .collect()
)

end = dt.datetime.now()
print(end - start)


# Create column list: map every hashtag to its position in sorted order
unique_hashtags = list({triple[0] for triple in value_list})
sorted_unique_hashtags = sorted(unique_hashtags)
dictionary_column_names = {
    hashtag: position
    for position, hashtag in enumerate(sorted_unique_hashtags)
}

# Creating the scipy sparse matrix
# Translate every (hashtag_a, hashtag_b, count) triple into COO coordinates:
# the two hashtags give the row and column index, the count is the cell value.
rows = np.array([dictionary_column_names[triple[0]] for triple in value_list])
cols = np.array([dictionary_column_names[triple[1]] for triple in value_list])
dataframe = np.array([triple[2] for triple in value_list])  # the pair counts (a numpy array, despite the name)

matrix_side = len(dictionary_column_names)
scipy_sparse_matrix = coo_matrix(
    (dataframe, (rows, cols)),
    shape=(matrix_side, matrix_side),
)

# Save out the files
# The pair counts and the hashtag -> index mapping are written as JSON,
# the co-occurrence matrix as a compressed .npz archive.
json_outputs = (
    ('value_list.json', value_list),
    ('dictionary_column_names.json', dictionary_column_names),
)
for json_path, payload in json_outputs:
    with open(json_path, 'w') as handle:
        json.dump(payload, handle)

scipy.sparse.save_npz('scipy_sparse_matrix.npz', scipy_sparse_matrix, compressed=True)


[u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:00:30.657Z', u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:00:38.681Z']
6109
root
 |-- contributors: string (nullable = true)
 |-- coordinates: struct (nullable = true)
 |    |-- coordinates: array (nullable = true)
 |    |    |-- element: double (containsNull = true)
 |    |-- type: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- delete: struct (nullable = true)
 |    |-- status: struct (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- id_str: string (nullable = true)
 |    |    |-- user_id: long (nullable = true)
 |    |    |-- user_id_str: string (nullable = true)
 |    |-- timestamp_ms: string (nullable = true)
 |-- display_text_range: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- entities: struct (nullable = true)
 |    |-- hashtags: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- indices: 