# Tweet Data Analysis Using PySpark
## Part 1: *Problem exploration*

In [1]:
import pyspark
from itertools import permutations
import numpy as np
from scipy import sparse
from google.cloud import storage

## Importing data

Lets start by importing one individual JSON file and developing the whole extraction process.

In [2]:
raw_tweets = spark.read.json("gs://bgse-datawarehousing-random-tweets/2019-02-26T00:00:30.657Z")

Now lest print the schema to understand the location of the hashtags within the tweets data

In [3]:
raw_tweets.printSchema()

root
 |-- contributors: string (nullable = true)
 |-- coordinates: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- display_text_range: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- entities: struct (nullable = true)
 |    |-- hashtags: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- text: string (nullable = true)
 |    |-- media: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- additional_media_info: struct (nullable = true)
 |    |    |    |    |-- description: string (nullable = true)
 |    |    |    |    |-- embeddable: boolean (nullable = true)
 |    |    |    |    |-- monetizable: boolean (nullable = true)
 |    |    |    |    |-- title: string (nullable = true)
 |    |    |    |-- display_url: string (nullable = true)
 |

In [4]:
h_list = raw_tweets.rdd \
   .filter(lambda r: r.entities is not None) \
   .map(lambda r: r.entities) \
   .filter(lambda e: len(e.hashtags) > 1) \
   .map(lambda e:[h.text.lower() for h in e.hashtags]) \
   .flatMap(lambda h:permutations(h,2)) \
   .map(lambda l: (l,1)) \
   .reduceByKey(lambda a, b: a+b) \
   .map(lambda l:(l[0][0], l[0][1], l[1])) \
   .collect()

In [5]:
h_list

[(u'giletsjaunes', u'brexit', 1),
 (u'brexiteers', u'brexitshambles', 2),
 (u'palestine', u'usa', 1),
 (u'brexit', u'brexitvote', 1),
 (u'gop', u'republicans', 1),
 (u'owleyes', u'living', 1),
 (u'uk', u'may', 1),
 (u'bbc', u'yellowvestsuk', 1),
 (u'brexit', u'bbc', 1),
 (u'brexitshambles', u'despitebrexit', 2),
 (u'lechuguinos', u'25feb', 3),
 (u'canada', u'switzerland', 1),
 (u'kuwait', u'oman', 1),
 (u'validate', u'brexit', 1),
 (u'northkorea', u'kimjongun', 1),
 (u'france', u'unhr', 1),
 (u'leave', u'eu', 2),
 (u'taxcutsandjobsact', u'taxscam', 1),
 (u'fbpe', u'despitebrexit', 2),
 (u'pornopresident', u'prostitutionpatriots', 1),
 (u'marxist', u'ukip', 1),
 (u'switzerland', u'scandinavia', 1),
 (u'eu', u'us', 1),
 (u'trumpcolluded', u'russianasset', 1),
 (u'eu', u'cleanbrexit', 2),
 (u'despitebrexit', u'ukip', 2),
 (u'canada', u'germany', 1),
 (u'impeachthemf', u'trumpconspired', 1),
 (u'referendum', u'peoplesvote', 1),
 (u'yellowvestsuk', u'liberal', 1),
 (u'haraam', u'food', 1),


In [6]:
len(h_list)

1348

## Structuring the Scipy sparse matrix

To create my sparse matrix I need to start by organizing the data into a valid format to populate the matrix. To do this I will explore the data as I currently have it.

In [7]:
(h_list[22][0], h_list[22][1], h_list[22][2])

(u'eu', u'us', 1)

I can now easily access each element of the tuples I have extracted from the JSON file. I can then turn these tuples into organized data that I can feed into the sparse matrix. My first idea is to transform the list of tuples into a dictionary where each combination of hashtagas is a key and the value is the number of coocurrences.

In [8]:
h_dict = {}

In [9]:
h_dict[(h_list[22][0], h_list[22][1])] = h_list[22][2]

In [10]:
h_dict[(h_list[22][0], h_list[22][1])]

1

It works for one element, so lets now make it work for all the elements of the list

In [11]:
h_dict = {}

In [12]:
for h in h_list:
    h_dict[(h[0], h[1])] = h[2]

In [13]:
h_dict

{(u'potus', u'trump'): 1,
 (u'thegreatawakening', u'qanon'): 1,
 (u'un', u'austria'): 1,
 (u'giletsjaunes', u'brexit'): 1,
 (u'vi\xf1a2019', u'video'): 2,
 (u'brexiteers', u'brexitshambles'): 2,
 (u'palestine', u'usa'): 1,
 (u'brexit', u'corbyn'): 2,
 (u'music', u'comedy'): 3,
 (u'politicalcartoon', u'uk'): 1,
 (u'marxist', u'brexit'): 1,
 (u'brexit', u'brexitvote'): 1,
 (u'leave', u'despitebrexit'): 2,
 (u'owleyes', u'living'): 1,
 (u'uk', u'may'): 1,
 (u'talk', u'oscar2019'): 3,
 (u'despitebrexit', u'brexit'): 2,
 (u'qanon', u'trump2020'): 1,
 (u'dutchpm', u'darwinism'): 1,
 (u'brexitshambles', u'despitebrexit'): 2,
 (u'unhr', u'us'): 1,
 (u'tucker', u'thestory'): 1,
 (u'switzerland', u'unhr'): 1,
 (u'eu', u'unsc'): 1,
 (u'lechuguinos', u'25feb'): 3,
 (u'politicalcartoon', u'brexit'): 1,
 (u'canada', u'switzerland'): 1,
 (u'yolo', u'\uc5f4\ud608\ub0a8\uc544'): 2,
 (u'kuwait', u'oman'): 1,
 (u'brexitshambles', u'brexit'): 2,
 (u'validate', u'brexit'): 1,
 (u'un', u'eu'): 1,
 (u'oscar2

To create a **Coordinate Format (COO)** sparse matrix, I need to rearrange my data a little bit to create columns, rows and the data that will populate the matrix

In [14]:
h_unique = np.empty((0,1), str)
h_unique

array([], shape=(0, 1), dtype='|S1')

In [15]:
h_unique = [np.append(h_unique, h[0]) for h in h_list]

In [16]:
h_unique = [np.append(h_unique, h[1]) for h in h_list]

In [17]:
h_unique = np.unique(h_unique)

In [18]:
len(h_unique)

274

Now that we have all the unique hashtags lets create a dictionary where we store them and give them a value to later use in the matrix. This dictionary will then allow me to easily identify words with a value.

In [19]:
words_dict = {}

In [20]:
for i, h in enumerate(h_unique):
    words_dict[h] = i

In [21]:
words_dict

{u'2019oscarsin5words': 0,
 u'25feb': 1,
 u'alliance': 2,
 u'america': 3,
 u'amerika': 4,
 u'artvsartist': 5,
 u'artvsartist2019': 6,
 u'asambleadelospueblos': 7,
 u'atheist': 8,
 u'austria': 9,
 u'barr': 10,
 u'bbc': 11,
 u'bernietownhall': 12,
 u'bigots': 13,
 u'blockchain': 14,
 u'bonespurs': 15,
 u'bregret': 16,
 u'brent': 17,
 u'brexit': 18,
 u'brexitbetrayal': 19,
 u'brexitbritain': 20,
 u'brexitchaos': 21,
 u'brexiteers': 22,
 u'brexitshambles': 23,
 u'brexitvote': 24,
 u'buildthewall': 25,
 u'bush': 26,
 u'canada': 27,
 u'cartoon': 28,
 u'ccot': 29,
 u'childsexrings': 30,
 u'china': 31,
 u'cleanbrexit': 32,
 u'clinton': 33,
 u'cnn': 34,
 u'comedy': 35,
 u'congress': 36,
 u'conservative': 37,
 u'coquimbo': 38,
 u'corbyn': 39,
 u'corbynsolvesbrexit': 40,
 u'covingtoncatholic': 41,
 u'crazyusa': 42,
 u'ctl': 43,
 u'darwinism': 44,
 u'demand': 45,
 u'democracy': 46,
 u'democrats': 47,
 u'deplorables': 48,
 u'despitebrexit': 49,
 u'disgusting': 50,
 u'disheveled': 51,
 u'doj': 52,
 

In [22]:
data = np.empty((0,1), str)

In [23]:
for i in words_dict:
    for j in words_dict:
        if i != j:
            try:
                data = np.append(data, h_dict[i,j])
            except KeyError:
                data = np.append(data, 0)

In [24]:
data

array(['0', '0', '0', ..., '1', '0', '0'], dtype='|S21')

### *But there is no need to do this, the *Scipy* library provides us with functions that can make the process easier*

In [25]:
data = np.array([h_list[i][2] for i in range(len(h_list))])
columns = np.array([h_list[i][1] for i in range(len(h_list))])
rows = np.array([h_list[i][0] for i in range(len(h_list))])

In [26]:
columns

array([u'brexit', u'brexitshambles', u'usa', ..., u'iamaskingforafriend',
       u'teamgodvek', u'maga'], dtype='<U22')

Now we just have to translate the hashtag into its value from the dictionary

In [27]:
def to_value(hashtag):
    return words_dict[hashtag]

In [28]:
to_value(columns[1])

23

In [29]:
col_key = np.array([to_value(c) for c in columns])
row_key = np.array([to_value(r) for r in rows])

In [30]:
col_key

array([ 18,  23, 253, ...,  92, 214, 123])

In [31]:
coo_mat = sparse.coo_matrix((data, (row_key, col_key)), shape =(len(words_dict),len(words_dict)))
coo_mat

<274x274 sparse matrix of type '<type 'numpy.int64'>'
	with 1348 stored elements in COOrdinate format>

In [32]:
coo_mat.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 2],
        [0, 0, 0, ..., 0, 2, 0]])

### Lets try to see if the matrix makes sense

In [33]:
h_dict

{(u'potus', u'trump'): 1,
 (u'thegreatawakening', u'qanon'): 1,
 (u'un', u'austria'): 1,
 (u'giletsjaunes', u'brexit'): 1,
 (u'vi\xf1a2019', u'video'): 2,
 (u'brexiteers', u'brexitshambles'): 2,
 (u'palestine', u'usa'): 1,
 (u'brexit', u'corbyn'): 2,
 (u'music', u'comedy'): 3,
 (u'politicalcartoon', u'uk'): 1,
 (u'marxist', u'brexit'): 1,
 (u'brexit', u'brexitvote'): 1,
 (u'leave', u'despitebrexit'): 2,
 (u'owleyes', u'living'): 1,
 (u'uk', u'may'): 1,
 (u'talk', u'oscar2019'): 3,
 (u'despitebrexit', u'brexit'): 2,
 (u'qanon', u'trump2020'): 1,
 (u'dutchpm', u'darwinism'): 1,
 (u'brexitshambles', u'despitebrexit'): 2,
 (u'unhr', u'us'): 1,
 (u'tucker', u'thestory'): 1,
 (u'switzerland', u'unhr'): 1,
 (u'eu', u'unsc'): 1,
 (u'lechuguinos', u'25feb'): 3,
 (u'politicalcartoon', u'brexit'): 1,
 (u'canada', u'switzerland'): 1,
 (u'yolo', u'\uc5f4\ud608\ub0a8\uc544'): 2,
 (u'kuwait', u'oman'): 1,
 (u'brexitshambles', u'brexit'): 2,
 (u'validate', u'brexit'): 1,
 (u'un', u'eu'): 1,
 (u'oscar2

In [34]:
print(h_dict.keys()[5][0])
row_ex = to_value(h_dict.keys()[5][0])
row_ex

brexiteers


22

In [35]:
print(h_dict.keys()[5][1])
col_ex = to_value(h_dict.keys()[5][1])
col_ex

brexitshambles


23

In [36]:
h_dict[h_dict.keys()[5][0],h_dict.keys()[5][1]]

2

In [37]:
print(coo_mat.getcol(row_ex).getrow(col_ex))

  (0, 0)	2


It gives the correct answer!

## Operating with several JSON files

Lets try getting data from several JSON files at a time

In [38]:
# Get the names from all the files stored in the publoc bucket
client = storage.Client()
bucket = client.bucket("bgse-datawarehousing-random-tweets")
json_files_list = []
for b in bucket.list_blobs(prefix=""):
    json_files_list.append("gs://bgse-datawarehousing-random-tweets/" + b.name)

In [39]:
len(json_files_list)

6109

In [40]:
json_files_list[0:4]

[u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:00:30.657Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:00:38.681Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:05:33.941Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:05:37.498Z']

Lets now try to get the data for the first four files

In [41]:
agg_list = []

for f in json_files_list[0:4]:
    raw_tweets = spark.read.json(f)
    
    h_list = raw_tweets.rdd \
   .filter(lambda r: r.entities is not None) \
   .map(lambda r: r.entities) \
   .filter(lambda e: len(e.hashtags) > 1) \
   .map(lambda e:[h.text.lower() for h in e.hashtags]) \
   .flatMap(lambda h:permutations(h,2)) \
   .map(lambda l: (l,1)) \
   .reduceByKey(lambda a, b: a+b) \
   .map(lambda l:(l[0][0], l[0][1], l[1])) \
   .collect()

    agg_list.append(h_list)

In [42]:
agg_list

[[(u'giletsjaunes', u'brexit', 1),
  (u'brexiteers', u'brexitshambles', 2),
  (u'palestine', u'usa', 1),
  (u'brexit', u'brexitvote', 1),
  (u'gop', u'republicans', 1),
  (u'owleyes', u'living', 1),
  (u'uk', u'may', 1),
  (u'bbc', u'yellowvestsuk', 1),
  (u'brexit', u'bbc', 1),
  (u'brexitshambles', u'despitebrexit', 2),
  (u'lechuguinos', u'25feb', 3),
  (u'canada', u'switzerland', 1),
  (u'kuwait', u'oman', 1),
  (u'validate', u'brexit', 1),
  (u'northkorea', u'kimjongun', 1),
  (u'france', u'unhr', 1),
  (u'leave', u'eu', 2),
  (u'taxcutsandjobsact', u'taxscam', 1),
  (u'fbpe', u'despitebrexit', 2),
  (u'pornopresident', u'prostitutionpatriots', 1),
  (u'marxist', u'ukip', 1),
  (u'switzerland', u'scandinavia', 1),
  (u'eu', u'us', 1),
  (u'trumpcolluded', u'russianasset', 1),
  (u'eu', u'cleanbrexit', 2),
  (u'despitebrexit', u'ukip', 2),
  (u'canada', u'germany', 1),
  (u'impeachthemf', u'trumpconspired', 1),
  (u'referendum', u'peoplesvote', 1),
  (u'yellowvestsuk', u'liberal', 

In [43]:
len(agg_list)

4

First lets flatten this list

In [44]:
#List of lists
h_tuples = []

#flatten the list
for l in agg_list:
    for t in l:
        h_tuples.append(t)

In [45]:
h_tuples

[(u'giletsjaunes', u'brexit', 1),
 (u'brexiteers', u'brexitshambles', 2),
 (u'palestine', u'usa', 1),
 (u'brexit', u'brexitvote', 1),
 (u'gop', u'republicans', 1),
 (u'owleyes', u'living', 1),
 (u'uk', u'may', 1),
 (u'bbc', u'yellowvestsuk', 1),
 (u'brexit', u'bbc', 1),
 (u'brexitshambles', u'despitebrexit', 2),
 (u'lechuguinos', u'25feb', 3),
 (u'canada', u'switzerland', 1),
 (u'kuwait', u'oman', 1),
 (u'validate', u'brexit', 1),
 (u'northkorea', u'kimjongun', 1),
 (u'france', u'unhr', 1),
 (u'leave', u'eu', 2),
 (u'taxcutsandjobsact', u'taxscam', 1),
 (u'fbpe', u'despitebrexit', 2),
 (u'pornopresident', u'prostitutionpatriots', 1),
 (u'marxist', u'ukip', 1),
 (u'switzerland', u'scandinavia', 1),
 (u'eu', u'us', 1),
 (u'trumpcolluded', u'russianasset', 1),
 (u'eu', u'cleanbrexit', 2),
 (u'despitebrexit', u'ukip', 2),
 (u'canada', u'germany', 1),
 (u'impeachthemf', u'trumpconspired', 1),
 (u'referendum', u'peoplesvote', 1),
 (u'yellowvestsuk', u'liberal', 1),
 (u'haraam', u'food', 1),


In [46]:
len(h_tuples)

5663

Now we have to create a dictionary with the unique words in the hashtags

In [48]:
h_tuples[1][0]

u'brexiteers'

In [57]:
words = [h[0] for h in h_tuples]

In [58]:
words = [h[1] for h in h_tuples]

In [59]:
h_unique = np.array(words)

In [60]:
h_unique = np.unique(h_unique)

In [61]:
len(h_unique)

873

Now that we have all the unique hashtags lets create a dictionary where we store them and give them a value to later use in the matrix. This dictionary will then allow me to easily identify words with a value.

In [62]:
words_dict = {}

In [63]:
for i, h in enumerate(h_unique):
    words_dict[h] = i

In [64]:
words_dict

{u'1hour': 0,
 u'1u': 1,
 u'2019oscarsin5words': 2,
 u'2020elections': 3,
 u'23favalanchahumanitaria': 4,
 u'25feb': 5,
 u'25thamendment': 6,
 u'25thamendmentnow': 7,
 u'2ndamendment': 8,
 u'2ndreferendum': 9,
 u'4thofjuly': 10,
 u'6g': 11,
 u'abcnews': 12,
 u'abortionisnothealthcare': 13,
 u'acosta': 14,
 u'act': 15,
 u'adamschiff': 16,
 u'afp': 17,
 u'aid': 18,
 u'algeria': 19,
 u'algerie': 20,
 u'alliance': 21,
 u'amazon': 22,
 u'amdg': 23,
 u'america': 24,
 u'americafirst': 25,
 u'americans': 26,
 u'amerika': 27,
 u'animal': 28,
 u'anotherreferendum': 29,
 u'arresttrump': 30,
 u'artvsartist': 31,
 u'artvsartist2019': 32,
 u'asambleadelospueblos': 33,
 u'asilodijo': 34,
 u'ask': 35,
 u'assholeone': 36,
 u'ass\xe9dio': 37,
 u'atheist': 38,
 u'attorneys': 39,
 u'ausopen': 40,
 u'austin': 41,
 u'austria': 42,
 u'avello': 43,
 u'badrinos': 44,
 u'bankruptcy': 45,
 u'barr': 46,
 u'batman': 47,
 u'bbc': 48,
 u'bbcnews': 49,
 u'bbcpl': 50,
 u'bbcqt': 51,
 u'bce': 52,
 u'bedroomtax': 53,
 u

In [65]:
data = np.array([h_tuples[i][2] for i in range(len(h_tuples))])
columns = np.array([h_tuples[i][1] for i in range(len(h_tuples))])
rows = np.array([h_tuples[i][0] for i in range(len(h_tuples))])

In [66]:
def to_identifier(hashtag):
    return words_dict[hashtag]

In [67]:
def to_word(identifier):
    for word, iden in words_dict.items():
        if iden == identifier:
            print(word)

In [69]:
col_key = np.array([to_identifier(c) for c in columns])
row_key = np.array([to_identifier(r) for r in rows])

In [70]:
print(col_key)
print(row_key)

[ 83  91 802 ...  51 724 417]
[298  88 536 ...  48  83 743]


In [71]:
to_word(row_key[0])

giletsjaunes


In [72]:
coo_mat = sparse.coo_matrix((data, (row_key, col_key)), shape =(len(words_dict),len(words_dict)))

In [73]:
coo_mat

<873x873 sparse matrix of type '<type 'numpy.int64'>'
	with 5663 stored elements in COOrdinate format>