# Tweet Data Analysis Using PySpark

In [19]:
import pyspark
from itertools import permutations
import numpy as np
from scipy import sparse
from google.cloud import storage

## Importing data

Lets start by importing one individual JSON file and developing the whole extraction process.

In [2]:
raw_tweets = spark.read.json("gs://bgse-datawarehousing-random-tweets/2019-02-26T00:00:30.657Z")

In [3]:
raw_tweets.printSchema()

root
 |-- contributors: string (nullable = true)
 |-- coordinates: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- display_text_range: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- entities: struct (nullable = true)
 |    |-- hashtags: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- text: string (nullable = true)
 |    |-- media: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- additional_media_info: struct (nullable = true)
 |    |    |    |    |-- description: string (nullable = true)
 |    |    |    |    |-- embeddable: boolean (nullable = true)
 |    |    |    |    |-- monetizable: boolean (nullable = true)
 |    |    |    |    |-- title: string (nullable = true)
 |    |    |    |-- display_url: string (nullable = true)
 |

In [4]:
h_list = raw_tweets.rdd \
   .filter(lambda r: r.entities is not None) \
   .map(lambda r: r.entities) \
   .filter(lambda e: len(e.hashtags) > 1) \
   .map(lambda e:[h.text.lower() for h in e.hashtags]) \
   .flatMap(lambda h:permutations(h,2)) \
   .map(lambda l: (l,1)) \
   .reduceByKey(lambda a, b: a+b) \
   .map(lambda l:(l[0][0], l[0][1], l[1])) \
   .collect()

In [5]:
h_list

[(u'giletsjaunes', u'brexit', 1),
 (u'brexiteers', u'brexitshambles', 2),
 (u'palestine', u'usa', 1),
 (u'brexit', u'brexitvote', 1),
 (u'gop', u'republicans', 1),
 (u'owleyes', u'living', 1),
 (u'uk', u'may', 1),
 (u'bbc', u'yellowvestsuk', 1),
 (u'brexit', u'bbc', 1),
 (u'brexitshambles', u'despitebrexit', 2),
 (u'lechuguinos', u'25feb', 3),
 (u'canada', u'switzerland', 1),
 (u'kuwait', u'oman', 1),
 (u'validate', u'brexit', 1),
 (u'northkorea', u'kimjongun', 1),
 (u'france', u'unhr', 1),
 (u'leave', u'eu', 2),
 (u'taxcutsandjobsact', u'taxscam', 1),
 (u'fbpe', u'despitebrexit', 2),
 (u'pornopresident', u'prostitutionpatriots', 1),
 (u'marxist', u'ukip', 1),
 (u'switzerland', u'scandinavia', 1),
 (u'eu', u'us', 1),
 (u'trumpcolluded', u'russianasset', 1),
 (u'eu', u'cleanbrexit', 2),
 (u'despitebrexit', u'ukip', 2),
 (u'canada', u'germany', 1),
 (u'impeachthemf', u'trumpconspired', 1),
 (u'referendum', u'peoplesvote', 1),
 (u'yellowvestsuk', u'liberal', 1),
 (u'haraam', u'food', 1),


In [6]:
len(h_list)

1348

## Structure the Scipy sparse matrix

To create my sparse matrix I need to start by organizing the data into columns, rows and the values that will populate the matrix

In [8]:
(h_list[22][0], h_list[22][1], h_list[22][2])

(u'eu', u'us', 1)

I can access each element of the tuples I have extracted from the JSON files. I can turn these tuples into organized data that I can feed into the sparse matrix.

In [9]:
data = np.array([h_list[i][2] for i in range(len(h_list))])
columns = np.array([h_list[i][1] for i in range(len(h_list))])
rows = np.array([h_list[i][0] for i in range(len(h_list))])

In [10]:
col_key = np.array([i for i in range(len(h_list))])
row_key = np.array([i for i in range(len(h_list))])

In [15]:
(rows[row_key[22]], columns[col_key[22]], h_list[22][2])

(u'eu', u'us', 1)

In [17]:
coo_mat = sparse.coo_matrix((data, (row_key, col_key)), shape =(len(h_list),len(h_list)))

In [18]:
coo_mat.todense()

matrix([[1, 0, 0, ..., 0, 0, 0],
        [0, 2, 0, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 0, 3, 0],
        [0, 0, 0, ..., 0, 0, 2]])

## Operating with several JSON files

Lets try getting data from several JSON files at a time

In [20]:
# Get the names from all the files stored in the publoc bucket
client = storage.Client()
bucket = client.bucket("bgse-datawarehousing-random-tweets")
json_files_list = []
for b in bucket.list_blobs(prefix=""):
    json_files_list.append("gs://bgse-datawarehousing-random-tweets/" + b.name)

In [21]:
len(json_files_list)

6109

In [22]:
json_files_list[0:4]

[u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:00:30.657Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:00:38.681Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:05:33.941Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:05:37.498Z']

Lets now try to get the data for the first four files

In [23]:
agg_list = []

for f in json_files_list[0:4]:
    raw_tweets = spark.read.json(f)
    
    h_list = raw_tweets.rdd \
   .filter(lambda r: r.entities is not None) \
   .map(lambda r: r.entities) \
   .filter(lambda e: len(e.hashtags) > 1) \
   .map(lambda e:[h.text.lower() for h in e.hashtags]) \
   .flatMap(lambda h:permutations(h,2)) \
   .map(lambda l: (l,1)) \
   .reduceByKey(lambda a, b: a+b) \
   .map(lambda l:(l[0][0], l[0][1], l[1])) \
   .collect()

    agg_list.append(h_list)

In [24]:
agg_list

[[(u'giletsjaunes', u'brexit', 1),
  (u'brexiteers', u'brexitshambles', 2),
  (u'palestine', u'usa', 1),
  (u'brexit', u'brexitvote', 1),
  (u'gop', u'republicans', 1),
  (u'owleyes', u'living', 1),
  (u'uk', u'may', 1),
  (u'bbc', u'yellowvestsuk', 1),
  (u'brexit', u'bbc', 1),
  (u'brexitshambles', u'despitebrexit', 2),
  (u'lechuguinos', u'25feb', 3),
  (u'canada', u'switzerland', 1),
  (u'kuwait', u'oman', 1),
  (u'validate', u'brexit', 1),
  (u'northkorea', u'kimjongun', 1),
  (u'france', u'unhr', 1),
  (u'leave', u'eu', 2),
  (u'taxcutsandjobsact', u'taxscam', 1),
  (u'fbpe', u'despitebrexit', 2),
  (u'pornopresident', u'prostitutionpatriots', 1),
  (u'marxist', u'ukip', 1),
  (u'switzerland', u'scandinavia', 1),
  (u'eu', u'us', 1),
  (u'trumpcolluded', u'russianasset', 1),
  (u'eu', u'cleanbrexit', 2),
  (u'despitebrexit', u'ukip', 2),
  (u'canada', u'germany', 1),
  (u'impeachthemf', u'trumpconspired', 1),
  (u'referendum', u'peoplesvote', 1),
  (u'yellowvestsuk', u'liberal', 

In [29]:
len(agg_list)

4

First lets flatten this list

In [30]:
#List of lists
h_tuples = []

#flatten the list
for l in agg_list:
    for t in l:
        h_tuples.append(t)

In [31]:
h_tuples

[(u'giletsjaunes', u'brexit', 1),
 (u'brexiteers', u'brexitshambles', 2),
 (u'palestine', u'usa', 1),
 (u'brexit', u'brexitvote', 1),
 (u'gop', u'republicans', 1),
 (u'owleyes', u'living', 1),
 (u'uk', u'may', 1),
 (u'bbc', u'yellowvestsuk', 1),
 (u'brexit', u'bbc', 1),
 (u'brexitshambles', u'despitebrexit', 2),
 (u'lechuguinos', u'25feb', 3),
 (u'canada', u'switzerland', 1),
 (u'kuwait', u'oman', 1),
 (u'validate', u'brexit', 1),
 (u'northkorea', u'kimjongun', 1),
 (u'france', u'unhr', 1),
 (u'leave', u'eu', 2),
 (u'taxcutsandjobsact', u'taxscam', 1),
 (u'fbpe', u'despitebrexit', 2),
 (u'pornopresident', u'prostitutionpatriots', 1),
 (u'marxist', u'ukip', 1),
 (u'switzerland', u'scandinavia', 1),
 (u'eu', u'us', 1),
 (u'trumpcolluded', u'russianasset', 1),
 (u'eu', u'cleanbrexit', 2),
 (u'despitebrexit', u'ukip', 2),
 (u'canada', u'germany', 1),
 (u'impeachthemf', u'trumpconspired', 1),
 (u'referendum', u'peoplesvote', 1),
 (u'yellowvestsuk', u'liberal', 1),
 (u'haraam', u'food', 1),


In [32]:
data = np.array([h_tuples[i][2] for i in range(len(h_tuples))])
columns = np.array([h_tuples[i][1] for i in range(len(h_tuples))])
rows = np.array([h_tuples[i][0] for i in range(len(h_tuples))])
col_key = np.array([i for i in range(len(h_tuples))])
row_key = np.array([i for i in range(len(h_tuples))])

In [33]:
data

array([1, 2, 1, ..., 1, 1, 1])

In [35]:
coo_mat = sparse.coo_matrix((data, (row_key, col_key)), shape =(len(h_tuples),len(h_tuples)))

In [36]:
coo_mat.todense()

matrix([[1, 0, 0, ..., 0, 0, 0],
        [0, 2, 0, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 0, 1]])

How can we know which hashtags are we referencing in the table?

In [37]:
columns

array([u'brexit', u'brexitshambles', u'usa', ..., u'bbcqt', u'marr',
       u'\u5b9a\u5ef6'], dtype='<U31')

In [42]:
def get_col_name(col_index):
    return(columns[col_index])

def get_row_name(row_index):
    return(rows[row_index])

In [44]:
(get_col_name(1), get_row_name(1), data[1])

(u'brexitshambles', u'brexiteers', 2)

# The final test

Now lets make this work for all the JSON files in the bucket

In [45]:
json_files_list

[u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:00:30.657Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:00:38.681Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:05:33.941Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:05:37.498Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:10:26.830Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:10:40.612Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:15:30.403Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:15:37.573Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:20:28.787Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:20:36.044Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:25:27.894Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:25:29.916Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:30:31.307Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:30:40.196Z',
 u'gs://bgse-datawar

In [None]:
agg_list = []

for f in json_files_list:
    raw_tweets = spark.read.json(f)
    
    h_list = raw_tweets.rdd \
   .filter(lambda r: r.entities is not None) \
   .map(lambda r: r.entities) \
   .filter(lambda e: len(e.hashtags) > 1) \
   .map(lambda e:[h.text.lower() for h in e.hashtags]) \
   .flatMap(lambda h:permutations(h,2)) \
   .map(lambda l: (l,1)) \
   .reduceByKey(lambda a, b: a+b) \
   .map(lambda l:(l[0][0], l[0][1], l[1])) \
   .collect()

    agg_list.append(h_list)

In [None]:
#List of lists
h_tuples = []

#flatten the list
for l in agg_list:
    for t in l:
        h_tuples.append(t)

In [None]:
data = np.array([h_tuples[i][2] for i in range(len(h_tuples))])
columns = np.array([h_tuples[i][1] for i in range(len(h_tuples))])
rows = np.array([h_tuples[i][0] for i in range(len(h_tuples))])
col_key = np.array([i for i in range(len(h_tuples))])
row_key = np.array([i for i in range(len(h_tuples))])

In [None]:
coo_mat = sparse.coo_matrix((data, (row_key, col_key)), shape =(len(h_tuples),len(h_tuples)))

Now lets save the matrix

In [None]:
scipy.sparse.save.npz("sparse_matrix.npz", coot_mat, compressed = True)