In [84]:
import numpy as np
import pandas as pd
import scipy
from google.cloud import storage
from scipy.sparse import coo_matrix
import pyspark

In [2]:
from pyspark.sql import SparkSession

spark = (
    SparkSession
        .builder
        .appName("pyspark")
        .getOrCreate()
    )

### Making the sparse matrix using one json file from the bucket (locally)

In [3]:
json_df = spark.read.json("data1/data.json")
json_df.printSchema()
#How to see datatype of columns


root
 |-- contributors: string (nullable = true)
 |-- coordinates: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- display_text_range: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- entities: struct (nullable = true)
 |    |-- hashtags: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- text: string (nullable = true)
 |    |-- media: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- additional_media_info: struct (nullable = true)
 |    |    |    |    |-- description: string (nullable = true)
 |    |    |    |    |-- embeddable: boolean (nullable = true)
 |    |    |    |    |-- monetizable: boolean (nullable = true)
 |    |    |    |    |-- title: string (nullable = true)
 |    |    |    |-- display_url: string (nullable = true)
 |

In [4]:
from pyspark.sql.functions import col, lower, size
from itertools import combinations

def lowercase_list(lst):
    return list(map(lambda item: item.lower(), lst))

def identity(x):
    return x

hashtags = (
    json_df
        .select("entities.hashtags.text")
        # filter out any rows without more than one hashtag
        .where(size(col("text")) > 1)
        .limit(10)
        .rdd
        # select the text from the rows
        .map(lambda r: r['text'])
        # lowercase all the hashtags
        .map(lowercase_list)
)

* the reason we use .rdd is because it's a 'lower level API' so it exposes more features and power to you


* because we do .rdd, we need to write the function and give it to map

    in rdd, we work with the actual data

    whereas without .rdd, we just tell spark what we want to execute.

    and it will execute it if we give it an 'action' (e.g. collect(), or show() will actually do the execution)


* if we didn't use .rdd, spark will optimize the transformation of our data for us behind the scenes and we don't need to think about it


In [5]:
hashtags

PythonRDD[11] at RDD at PythonRDD.scala:53

In [6]:
hashtags.collect()
#using .collect() will return it as a list of rows

[['talk',
  'comedy',
  'music',
  'news',
  'supportsmallstreamers',
  'justdoit',
  'oscar2019',
  'oscars',
  'twitch',
  'snowday',
  'teamgodvek'],
 ['disgusting', 'notlegal'],
 ['trump', 'humantrafficking'],
 ['nicollewallace', 'trump'],
 ['leadership', 'democrats'],
 ['sosprisiones', 'tuabandonomepuedematar'],
 ['halal', 'haraam', 'food', 'brexit'],
 ['lechuguinos', '25feb'],
 ['president', 'trump', 'hanoi', 'northkorea', 'kimjongun'],
 ['emergency', 'potus']]

In [7]:
#sometimes there is more than one hashtag so we have a list of lists
#use flatmap to reduce it all into one list
hashtags.flatMap(identity).collect()

['talk',
 'comedy',
 'music',
 'news',
 'supportsmallstreamers',
 'justdoit',
 'oscar2019',
 'oscars',
 'twitch',
 'snowday',
 'teamgodvek',
 'disgusting',
 'notlegal',
 'trump',
 'humantrafficking',
 'nicollewallace',
 'trump',
 'leadership',
 'democrats',
 'sosprisiones',
 'tuabandonomepuedematar',
 'halal',
 'haraam',
 'food',
 'brexit',
 'lechuguinos',
 '25feb',
 'president',
 'trump',
 'hanoi',
 'northkorea',
 'kimjongun',
 'emergency',
 'potus']

### Extra: I wanted to assign a unique number to each hashtag  (afterwards to make a dictionary with it)


In [8]:
#I got zipWithIndex() from this website
#https://spark.apache.org/docs/1.1.1/api/python/pyspark.rdd.RDD-class.html#zipWithIndex
hashtags.flatMap(identity).zipWithIndex().collect()

[('talk', 0),
 ('comedy', 1),
 ('music', 2),
 ('news', 3),
 ('supportsmallstreamers', 4),
 ('justdoit', 5),
 ('oscar2019', 6),
 ('oscars', 7),
 ('twitch', 8),
 ('snowday', 9),
 ('teamgodvek', 10),
 ('disgusting', 11),
 ('notlegal', 12),
 ('trump', 13),
 ('humantrafficking', 14),
 ('nicollewallace', 15),
 ('trump', 16),
 ('leadership', 17),
 ('democrats', 18),
 ('sosprisiones', 19),
 ('tuabandonomepuedematar', 20),
 ('halal', 21),
 ('haraam', 22),
 ('food', 23),
 ('brexit', 24),
 ('lechuguinos', 25),
 ('25feb', 26),
 ('president', 27),
 ('trump', 28),
 ('hanoi', 29),
 ('northkorea', 30),
 ('kimjongun', 31),
 ('emergency', 32),
 ('potus', 33)]

In [9]:
#taking our list of tuples, we now put it into a dictionary 
indexed_hashtags = hashtags.flatMap(identity).zipWithIndex().collectAsMap()
indexed_hashtags

{'talk': 0,
 'comedy': 1,
 'music': 2,
 'news': 3,
 'supportsmallstreamers': 4,
 'justdoit': 5,
 'oscar2019': 6,
 'oscars': 7,
 'twitch': 8,
 'snowday': 9,
 'teamgodvek': 10,
 'disgusting': 11,
 'notlegal': 12,
 'trump': 28,
 'humantrafficking': 14,
 'nicollewallace': 15,
 'leadership': 17,
 'democrats': 18,
 'sosprisiones': 19,
 'tuabandonomepuedematar': 20,
 'halal': 21,
 'haraam': 22,
 'food': 23,
 'brexit': 24,
 'lechuguinos': 25,
 '25feb': 26,
 'president': 27,
 'hanoi': 29,
 'northkorea': 30,
 'kimjongun': 31,
 'emergency': 32,
 'potus': 33}

### Now I have all the hashtags, I want to find how the combinations of every hashtag

* instead of using permutations, I look at combinations

    because the matrix will be symmetric anyway, 
    
* later I will make the flip combinations and merge it to get a whole symmetric matrix

In [10]:
def pair_combinations(lst):
    return combinations(lst, 2)

In [11]:
hashtag_combinations = (
    hashtags.flatMap(pair_combinations)
        .map(lambda tup: (tup, 1))
        .countByKey()
)

In [12]:
hashtag_combinations

defaultdict(int,
            {('talk', 'comedy'): 1,
             ('talk', 'music'): 1,
             ('talk', 'news'): 1,
             ('talk', 'supportsmallstreamers'): 1,
             ('talk', 'justdoit'): 1,
             ('talk', 'oscar2019'): 1,
             ('talk', 'oscars'): 1,
             ('talk', 'twitch'): 1,
             ('talk', 'snowday'): 1,
             ('talk', 'teamgodvek'): 1,
             ('comedy', 'music'): 1,
             ('comedy', 'news'): 1,
             ('comedy', 'supportsmallstreamers'): 1,
             ('comedy', 'justdoit'): 1,
             ('comedy', 'oscar2019'): 1,
             ('comedy', 'oscars'): 1,
             ('comedy', 'twitch'): 1,
             ('comedy', 'snowday'): 1,
             ('comedy', 'teamgodvek'): 1,
             ('music', 'news'): 1,
             ('music', 'supportsmallstreamers'): 1,
             ('music', 'justdoit'): 1,
             ('music', 'oscar2019'): 1,
             ('music', 'oscars'): 1,
             ('music', 'twitch'): 

In [13]:
def flatten_dict(d):
    return [(x, y, z) for ((x, y), z) in d.items()]

flattened_dict = flatten_dict(hashtag_combinations)
flattened_dict

[('talk', 'comedy', 1),
 ('talk', 'music', 1),
 ('talk', 'news', 1),
 ('talk', 'supportsmallstreamers', 1),
 ('talk', 'justdoit', 1),
 ('talk', 'oscar2019', 1),
 ('talk', 'oscars', 1),
 ('talk', 'twitch', 1),
 ('talk', 'snowday', 1),
 ('talk', 'teamgodvek', 1),
 ('comedy', 'music', 1),
 ('comedy', 'news', 1),
 ('comedy', 'supportsmallstreamers', 1),
 ('comedy', 'justdoit', 1),
 ('comedy', 'oscar2019', 1),
 ('comedy', 'oscars', 1),
 ('comedy', 'twitch', 1),
 ('comedy', 'snowday', 1),
 ('comedy', 'teamgodvek', 1),
 ('music', 'news', 1),
 ('music', 'supportsmallstreamers', 1),
 ('music', 'justdoit', 1),
 ('music', 'oscar2019', 1),
 ('music', 'oscars', 1),
 ('music', 'twitch', 1),
 ('music', 'snowday', 1),
 ('music', 'teamgodvek', 1),
 ('news', 'supportsmallstreamers', 1),
 ('news', 'justdoit', 1),
 ('news', 'oscar2019', 1),
 ('news', 'oscars', 1),
 ('news', 'twitch', 1),
 ('news', 'snowday', 1),
 ('news', 'teamgodvek', 1),
 ('supportsmallstreamers', 'justdoit', 1),
 ('supportsmallstreamer

### I used combinations, so I only have half the matrix.
### Since it's symmetrical, I can fill in the other half of the matrix by stacking the opposite of the info on top of original info

In [17]:
key1, key2, count = zip(*flattened_dict)

In [18]:
matrix_first_half = flattened_dict
matrix_first_half

[('talk', 'comedy', 1),
 ('talk', 'music', 1),
 ('talk', 'news', 1),
 ('talk', 'supportsmallstreamers', 1),
 ('talk', 'justdoit', 1),
 ('talk', 'oscar2019', 1),
 ('talk', 'oscars', 1),
 ('talk', 'twitch', 1),
 ('talk', 'snowday', 1),
 ('talk', 'teamgodvek', 1),
 ('comedy', 'music', 1),
 ('comedy', 'news', 1),
 ('comedy', 'supportsmallstreamers', 1),
 ('comedy', 'justdoit', 1),
 ('comedy', 'oscar2019', 1),
 ('comedy', 'oscars', 1),
 ('comedy', 'twitch', 1),
 ('comedy', 'snowday', 1),
 ('comedy', 'teamgodvek', 1),
 ('music', 'news', 1),
 ('music', 'supportsmallstreamers', 1),
 ('music', 'justdoit', 1),
 ('music', 'oscar2019', 1),
 ('music', 'oscars', 1),
 ('music', 'twitch', 1),
 ('music', 'snowday', 1),
 ('music', 'teamgodvek', 1),
 ('news', 'supportsmallstreamers', 1),
 ('news', 'justdoit', 1),
 ('news', 'oscar2019', 1),
 ('news', 'oscars', 1),
 ('news', 'twitch', 1),
 ('news', 'snowday', 1),
 ('news', 'teamgodvek', 1),
 ('supportsmallstreamers', 'justdoit', 1),
 ('supportsmallstreamer

In [19]:
matrix_second_half = list(zip(key2,key1,count))
matrix_second_half


[('comedy', 'talk', 1),
 ('music', 'talk', 1),
 ('news', 'talk', 1),
 ('supportsmallstreamers', 'talk', 1),
 ('justdoit', 'talk', 1),
 ('oscar2019', 'talk', 1),
 ('oscars', 'talk', 1),
 ('twitch', 'talk', 1),
 ('snowday', 'talk', 1),
 ('teamgodvek', 'talk', 1),
 ('music', 'comedy', 1),
 ('news', 'comedy', 1),
 ('supportsmallstreamers', 'comedy', 1),
 ('justdoit', 'comedy', 1),
 ('oscar2019', 'comedy', 1),
 ('oscars', 'comedy', 1),
 ('twitch', 'comedy', 1),
 ('snowday', 'comedy', 1),
 ('teamgodvek', 'comedy', 1),
 ('news', 'music', 1),
 ('supportsmallstreamers', 'music', 1),
 ('justdoit', 'music', 1),
 ('oscar2019', 'music', 1),
 ('oscars', 'music', 1),
 ('twitch', 'music', 1),
 ('snowday', 'music', 1),
 ('teamgodvek', 'music', 1),
 ('supportsmallstreamers', 'news', 1),
 ('justdoit', 'news', 1),
 ('oscar2019', 'news', 1),
 ('oscars', 'news', 1),
 ('twitch', 'news', 1),
 ('snowday', 'news', 1),
 ('teamgodvek', 'news', 1),
 ('justdoit', 'supportsmallstreamers', 1),
 ('oscar2019', 'support

In [20]:
matrix_first_half + matrix_second_half

[('talk', 'comedy', 1),
 ('talk', 'music', 1),
 ('talk', 'news', 1),
 ('talk', 'supportsmallstreamers', 1),
 ('talk', 'justdoit', 1),
 ('talk', 'oscar2019', 1),
 ('talk', 'oscars', 1),
 ('talk', 'twitch', 1),
 ('talk', 'snowday', 1),
 ('talk', 'teamgodvek', 1),
 ('comedy', 'music', 1),
 ('comedy', 'news', 1),
 ('comedy', 'supportsmallstreamers', 1),
 ('comedy', 'justdoit', 1),
 ('comedy', 'oscar2019', 1),
 ('comedy', 'oscars', 1),
 ('comedy', 'twitch', 1),
 ('comedy', 'snowday', 1),
 ('comedy', 'teamgodvek', 1),
 ('music', 'news', 1),
 ('music', 'supportsmallstreamers', 1),
 ('music', 'justdoit', 1),
 ('music', 'oscar2019', 1),
 ('music', 'oscars', 1),
 ('music', 'twitch', 1),
 ('music', 'snowday', 1),
 ('music', 'teamgodvek', 1),
 ('news', 'supportsmallstreamers', 1),
 ('news', 'justdoit', 1),
 ('news', 'oscar2019', 1),
 ('news', 'oscars', 1),
 ('news', 'twitch', 1),
 ('news', 'snowday', 1),
 ('news', 'teamgodvek', 1),
 ('supportsmallstreamers', 'justdoit', 1),
 ('supportsmallstreamer

In [21]:
len(matrix_first_half + matrix_second_half)

156

In [22]:
key1, key2, count = zip(*matrix_first_half + matrix_second_half)

In [23]:
key1 = list(key1)
key2 = list(key2)
count = list(count)

In [49]:
df = pd.DataFrame({'key1': key1, 'key2': key2, 'count': count})
df.set_index(['key1', 'key2'], inplace=True)
matrix = coo_matrix((df['count'],(df.index.labels[0], df.index.labels[1])))
matrix

  This is separate from the ipykernel package so we can avoid doing imports until


<32x32 sparse matrix of type '<class 'numpy.int64'>'
	with 156 stored elements in COOrdinate format>

## Checking the matrix

In [106]:
for i in flattened_dict[25:30]:
    print(i)


('music', 'snowday', 1)
('music', 'teamgodvek', 1)
('news', 'supportsmallstreamers', 1)
('news', 'justdoit', 1)
('news', 'oscar2019', 1)


In [107]:
for i in range(25,30):
    print(key1[i], key2[i], count[i])


music snowday 1
music teamgodvek 1
news supportsmallstreamers 1
news justdoit 1
news oscar2019 1


In [108]:
matrix.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 1],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0]])

## Now looking at working on all Json Files


In [26]:
from google.cloud import storage

# Explicitly use service account credentials by specifying the private key
# file.
storage_client = storage.Client.from_service_account_json('shirley spark homework-44a3d7102220.json')
storage_client

<google.cloud.storage.client.Client at 0x7fc8515b4e48>

In [27]:
# Make an authenticated API request
bucket = storage_client.bucket("bgse-datawarehousing-random-tweets")
bucket

<Bucket: bgse-datawarehousing-random-tweets>

In [28]:
json_file_list=[]
for blob in bucket.list_blobs():
    json_file_list.append("gs://bgse-datawarehousing-random-tweets" + "/" + blob.name)
    
print(json_file_list[5])
print(len(json_file_list))

gs://bgse-datawarehousing-random-tweets/2019-02-26T00:10:40.612Z
6109


In [29]:
json_file_list[0]


'gs://bgse-datawarehousing-random-tweets/2019-02-26T00:00:30.657Z'