# Tweet Data Analysis Using PySpark
## Part 2: *Final code*

Now that we understand the problem (see the previous notebook, *tweets data exploration*), we can run this notebook to create the sparse matrix for all the JSON files.

In [7]:
import pyspark
from itertools import permutations
import numpy as np
from scipy import sparse
from google.cloud import storage

## Importing data

Let's start by importing one individual JSON file and developing the whole extraction process.

## Operating with several JSON files

Let's try getting data from several JSON files at a time.

In [8]:
# Build the gs:// path of every object stored in the public bucket
client = storage.Client()
bucket = client.bucket("bgse-datawarehousing-random-tweets")
json_files_list = [
    "gs://bgse-datawarehousing-random-tweets/" + blob.name
    for blob in bucket.list_blobs(prefix="")
]

In [None]:
# Display the collected file paths as a sanity check
json_files_list

[u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:00:30.657Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:00:38.681Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:05:33.941Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:05:37.498Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:10:26.830Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:10:40.612Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:15:30.403Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:15:37.573Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:20:28.787Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:20:36.044Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:25:27.894Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:25:29.916Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:30:31.307Z',
 u'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:30:40.196Z',
 u'gs://bgse-datawar

In [None]:
# One entry per JSON file: a list of (hashtag_a, hashtag_b, count) tuples.
# Counts are reduced per file, so the same pair can show up in several entries.
agg_list = []

for f in json_files_list:
    raw_tweets = spark.read.json(f)

    h_list = (
        raw_tweets.rdd
        # keep only rows that carry an `entities` record
        .filter(lambda row: row.entities is not None)
        .map(lambda row: row.entities)
        # a co-occurrence needs at least two hashtags
        .filter(lambda ent: len(ent.hashtags) > 1)
        .map(lambda ent: [tag.text.lower() for tag in ent.hashtags])
        # every ordered pair of hashtags, so both (a, b) and (b, a) are emitted
        .flatMap(lambda tags: permutations(tags, 2))
        .map(lambda pair: (pair, 1))
        .reduceByKey(lambda left, right: left + right)
        # ((a, b), count) -> (a, b, count)
        .map(lambda kv: (kv[0][0], kv[0][1], kv[1]))
        # bring this file's result back to the driver
        .collect()
    )

    agg_list.append(h_list)

In [None]:
# agg_list is a list of per-file lists; flatten it into one list of tuples
h_tuples = [entry for per_file in agg_list for entry in per_file]

In [None]:
# Gather the hashtags found in BOTH positions of every (row, column, count)
# tuple. The original second assignment overwrote the first, so only the
# column-position hashtags were kept; concatenating the two lists makes the
# vocabulary complete even if the pairs are not perfectly symmetric.
words = [h[0] for h in h_tuples] + [h[1] for h in h_tuples]
words = np.array(words)

# np.unique returns the sorted unique values of the array
h_unique = np.unique(words)

Now that we have all the unique hashtags, let's create a dictionary that maps each of them to an integer value for later use in the matrix. This dictionary will then allow us to easily translate a hashtag into its value.

In [None]:
# Hashtag -> integer identifier mapping (filled in by the next cell)
words_dict = dict()

In [None]:
# Give every unique hashtag its position in h_unique as identifier
words_dict.update((tag, idx) for idx, tag in enumerate(h_unique))

In [None]:
# Split the (row hashtag, column hashtag, count) tuples into parallel arrays
rows = np.array([entry[0] for entry in h_tuples])
columns = np.array([entry[1] for entry in h_tuples])
data = np.array([entry[2] for entry in h_tuples])

In [None]:
def to_identifier(hashtag):
    """Return the value stored in ``words_dict`` for ``hashtag``.

    Raises ``KeyError`` when ``hashtag`` is not a key of ``words_dict``.
    """
    identifier = words_dict[hashtag]
    return identifier

In [None]:
def to_word(identifier):
    """Print every key of ``words_dict`` whose value equals ``identifier``.

    The whole dictionary is scanned; nothing is returned.
    """
    matches = [word for word, iden in words_dict.items() if iden == identifier]
    for word in matches:
        print(word)

In [None]:
# Translate the hashtag arrays into arrays of identifiers
row_key = np.array(list(map(to_identifier, rows)))
col_key = np.array(list(map(to_identifier, columns)))

In [None]:
# Square matrix with one row and one column per entry of words_dict;
# entry (row_key[k], col_key[k]) receives data[k]
coo_mat = sparse.coo_matrix(
    (data, (row_key, col_key)),
    shape=(len(words_dict), len(words_dict)),
)

In [None]:
# Display the sparse matrix's repr as a quick check
coo_mat

Now let's save the matrix.

In [None]:
# Write the matrix to disk in SciPy's .npz format.
# Fixes three errors in the original call: `scipy` itself is never imported
# (only `sparse` is), the function is `save_npz` (not `save.npz`), and the
# variable is `coo_mat` (not `coot_mat`).
sparse.save_npz("sparse_matrix.npz", coo_mat, compressed=True)