In [21]:
import numpy as np
from scipy.sparse import coo_matrix
from itertools import permutations
from itertools import combinations
import scipy.sparse

from google.cloud import storage
import json
import datetime as dt

In [22]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("PySpark Intro") \
    .getOrCreate()

In [23]:
# Spark lets us read JSON files and create dataframes with nested items!

tw1 = spark.read.json('gs://bgse-datawarehousing-random-tweets/2019-02-26T02:15:33.028Z')

tw1.printSchema()

root
 |-- contributors: string (nullable = true)
 |-- coordinates: struct (nullable = true)
 |    |-- coordinates: array (nullable = true)
 |    |    |-- element: double (containsNull = true)
 |    |-- type: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- display_text_range: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- entities: struct (nullable = true)
 |    |-- hashtags: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- text: string (nullable = true)
 |    |-- media: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- additional_media_info: struct (nullable = true)
 |    |    |    |    |-- description: string (nullable = true)
 |    |    |    |    |-- embeddable: boolean (nullable = true)
 |    |    |    |    |-- monetizable: bo

In [24]:
#With certain methods I get a list of tuples, which contain two hashtags and the numeber which they co-occure.
hashtags = tw1.limit(10000).rdd\
    .filter(lambda x: x.entities is not None)\
    .map(lambda x: x.entities.hashtags)\
    .filter(lambda x: len(x)>1)\    #len(x)>1 because I want a list that contains more than two hashtags
    .map(lambda x: [i.text.lower() for i in x])\
    .flatMap(lambda x: permutations(x,2))\
    .map(lambda x: (x,1)) \
    .reduceByKey(lambda x,y: x+y)\
    .map(lambda x: (x[0][0], x[0][1], x[1]))\
    .collect()
hashtags

[(u'buildthewall', u'maga', 1),
 (u'government', u'medium', 1),
 (u'trump', u'kag', 8),
 (u'trump', u'gofuckme', 5),
 (u'pvv', u'ruttemoetweg', 1),
 (u'prajapati', u'politics', 1),
 (u'vincentfusca', u'darktolight', 1),
 (u'putin', u'china', 1),
 (u'draftdodger', u'coward', 2),
 (u'maga', u'ccot', 10),
 (u'amazing', u'loo', 1),
 (u'abortion', u'plannedparenthood', 1),
 (u'trump', u'party', 1),
 (u'wwg1wga', u'wwg1wgaworldwide', 1),
 (u'america', u'usa', 1),
 (u'people', u'prajapati', 1),
 (u'qarmy', u'wwg1wgaworldwide', 1),
 (u'resist', u'fakepresident', 1),
 (u'mueller', u'fakenews', 2),
 (u'singapore', u'prosper', 1),
 (u'maga', u'americasnewsroom', 3),
 (u'fakenews', u'mueller', 2),
 (u'troops', u'veteran', 2),
 (u'putin', u'ccp', 1),
 (u'impeachtrump', u'impeach45', 2),
 (u'fakenationalemergency', u'resistance', 1),
 (u'yolo', u'\uc5f4\ud608\ub0a8\uc544', 1),
 (u'wtf', u'trump', 5),
 (u'yolo', u'\ub9c8\ub974\ucf54', 1),
 (u'whitehouse', u'resist', 2),
 (u'rutte3wegermee', u'ruttemo

In [25]:
#Take only the unique hashtags and create dictionary
unique = list(set([i[0] for i in hashtags]))
diction = {key: i for i, key in enumerate(unique)}

#Make matrix's rows,columns and values
rows = np.asarray([diction[i[0]] for i in hashtags])
columns = np.asarray([diction[i[1]] for i in hashtags])
values = np.array([i[2] for i in hashtags])

In [26]:
#Sparse matrix
sm = coo_matrix((values, (rows, columns)), shape=(len(unique),len(unique)))
sm.shape

(296, 296)

In [13]:
#Save matrix in a file
scipy.sparse.save_npz("sparse_matrix.npz", sm, compressed=True)


## Run the WHOLE Bucket

In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix
from itertools import permutations
from itertools import combinations
import scipy.sparse
from pyspark.sql import SparkSession
from google.cloud import storage
import json
import datetime as dt

start = dt.datetime.now()

# List all files in the public bucket

kouvas = "bgse-datawarehousing-random-tweets"
client = storage.Client()
bucket = client.bucket(kouvas)
json_file_list = []
for iterator, blob in enumerate(bucket.list_blobs(prefix="")):
    json_file_list.append("gs://" + kouvas + "/" + blob.name)

# Load the data

teras = spark.read.json(json_file_list)
teras.printSchema()

# Create list of tuples

hashtags = teras.rdd\
    .filter(lambda x: x.entities is not None)\
    .map(lambda x: x.entities.hashtags)\
    .filter(lambda x: len(x)>1)\
    .map(lambda x: [i.text.lower() for i in x])\
    .flatMap(lambda x: permutations(x,2))\
    .map(lambda x: (x,1)) \
    .reduceByKey(lambda x,y: x+y)\
    .map(lambda x: (x[0][0], x[0][1], x[1]))\
    .collect()
print(hashtags)

#Take only the unique hashtags and make dictionaries

unique = list(set([i[0] for i in hashtags]))
diction = {key: i for i, key in enumerate(unique)}

#Matrix's rows, columns and values
rows = np.asarray([diction[i[0]] for i in hashtags])
columns = np.asarray([diction[i[1]] for i in hashtags])
values = np.array([i[2] for i in hashtags])

#Sparse matrix
sm = coo_matrix((values, (rows, columns)), shape=(len(unique),len(unique)))
sm.shape

#Save matrix in a file

scipy.sparse.save_npz("sparse_great_matrix.npz", sm, compressed=True)

end = dt.datetime.now()
print(end-start)

root
 |-- contributors: string (nullable = true)
 |-- coordinates: struct (nullable = true)
 |    |-- coordinates: array (nullable = true)
 |    |    |-- element: double (containsNull = true)
 |    |-- type: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- delete: struct (nullable = true)
 |    |-- status: struct (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- id_str: string (nullable = true)
 |    |    |-- user_id: long (nullable = true)
 |    |    |-- user_id_str: string (nullable = true)
 |    |-- timestamp_ms: string (nullable = true)
 |-- display_text_range: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- entities: struct (nullable = true)
 |    |-- hashtags: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- text: string (nullable = true)
 |    |-

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



0:50:28.785271
