## [DEFINITION] Métodos para tratamento dos dados com o Spark

In [2]:
import pandas as pd
import json
import requests

def aggregate_tags_count(new_values, total_sum):
    """Combine the counts seen in the current batch with the running total.

    Args:
        new_values: iterable of counts collected for one key in this batch.
        total_sum: total accumulated so far for that key; a falsy value
            (e.g. ``None`` on the first occurrence) is treated as 0.

    Returns:
        The sum of ``new_values`` added to the previous total.
    """
    batch_total = sum(new_values)
    previous_total = total_sum if total_sum else 0
    return batch_total + previous_total


def get_sql_context_instance(spark_context):
    """Return the module-wide SQLContext, creating it on first use.

    The instance is stored in this module's globals under the key
    'sqlContextSingletonInstance'; later calls return that stored object
    and ignore ``spark_context``.

    Args:
        spark_context: Spark context handed to ``SQLContext`` when the
            instance has to be created.

    Returns:
        The stored SQLContext instance.
    """
    registry = globals()

    # Fast path: the instance was already created by an earlier call.
    if 'sqlContextSingletonInstance' in registry:
        return registry['sqlContextSingletonInstance']

    registry['sqlContextSingletonInstance'] = SQLContext(spark_context)
    return registry['sqlContextSingletonInstance']

def get_hashtags(words):
    """Collect the distinct hashtags found in a piece of text.

    The text is split on whitespace; every token that starts with '#'
    contributes the token without its leading '#'.

    Args:
        words: text to scan.

    Returns:
        A set with the hashtag names (empty when no token starts with '#').
    """
    found = set()
    for token in words.split():
        if token.startswith('#'):
            found.add(token[1:])
    return found


def process_rdd(time, rdd):
    """Send the hashtag totals of one streaming batch to the HTTP endpoint.

    Each element of ``rdd`` is expected to be a ``(hashtag, count)`` pair.
    The pairs are converted to ``Row`` objects, turned into a DataFrame,
    serialised to JSON and POSTed one by one to
    http://localhost:5001/v1/twits.

    Args:
        time: batch time, only used for the log header.
        rdd: RDD holding the ``(hashtag, count)`` pairs of the batch.

    Any exception raised while processing is caught and printed so that a
    failing batch does not stop the stream.
    """
    print("----------- %s -----------" % str(time))

    try:
        # Convert the RDD into a Row RDD
        row_rdd = rdd.map(lambda w: Row(hashtag=w[0], hashtag_count=w[1]))

        print("================================")
        print("row_rdd: ")
        sample = row_rdd.take(1)
        print(sample)

        # Nothing to send for an empty batch; stop here instead of letting
        # DataFrame creation fail and hiding the failure afterwards.
        if not sample:
            return

        # Get the shared Spark SQL context. The original left this call
        # commented out, so `sql_context` was undefined and every non-empty
        # batch failed with a NameError.
        sql_context = get_sql_context_instance(rdd.context)

        # Create a DataFrame from the mapped Row RDD
        hashtags_df = sql_context.createDataFrame(row_rdd)

        print("Convertendo para JSON: ")
        data = hashtags_df.toJSON().collect()
        print(data)

        for line in data:
            result = requests.post(url = "http://localhost:5001/v1/twits", json = json.loads(line))
            print(result)

    except Exception as ex:
        # Best-effort: report the problem and keep the stream running.
        print("Error: %s" % ex)

## [DEFINITION] Contexto do Spark

In [3]:
from pyspark import SparkConf,SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import Row,SQLContext
import sys
import requests

# Build the configuration (give your application a name).
# SparkConf's first positional parameter is `loadDefaults`, not the master
# URL, so the master has to be set explicitly with setMaster().
conf = SparkConf().setMaster("local[*]")
conf.setAppName("twitterAppForDev")

# Create the Spark context with the configuration above
sc = SparkContext(conf = conf)
sc.setLogLevel("ERROR")  # avoid too much noise in the log

# Create the streaming context on top of the Spark context above, using a 3-second interval
ssc = StreamingContext(sc, 3)

# Set the checkpoint directory to allow RDD recovery
ssc.checkpoint("checkpoint_TwitterApp")

# Read data from port 9009
dataStream = ssc.socketTextStream("localhost", 9009)

dataStream: 
<pyspark.streaming.dstream.DStream object at 0x0B003510>


## [TRANSFORMATION] Tratamento dos tweets

In [None]:
# Wire the streaming pipeline: split tweets into words, keep the hashtags,
# keep a running count per hashtag and hand every batch to process_rdd.

# Split each tweet into words
print("dataStream: ")
print(dataStream)

print("================================")
words = dataStream.flatMap(lambda line: line.split(" "))

print("================================")
print("words: ")
print(words)

# Filter the words to keep only the hashtags, then map each hashtag to a pair with 1: (hashtag, 1)
# get_hashtags returns a set, so it acts as the predicate here: a non-empty set is truthy.
# The word itself is kept, so the keys still carry the leading '#'.
hashtags = words.filter(get_hashtags).map(lambda x: (x, 1))

# Aggregate the counts per hashtag across batches
tags_totals = hashtags.updateStateByKey(aggregate_tags_count)

print("================================")
print("Processar tags_totals: ")
print(tags_totals)

# Process the totals of each tag, one RDD per batch
tags_totals.foreachRDD(process_rdd)

# Start the streaming computation
ssc.start()

# Wait for the streaming to finish
ssc.awaitTermination()

dataStream: 
<pyspark.streaming.dstream.DStream object at 0x0B003510>
words: 
<pyspark.streaming.dstream.TransformedDStream object at 0x0B003CD0>
Processar tags_totals: 
<pyspark.streaming.dstream.DStream object at 0x0B003BD0>
----------- 2020-06-04 17:24:54 -----------
row_rdd: 
[]
----------- 2020-06-04 17:24:57 -----------
row_rdd: 
[]
----------- 2020-06-04 17:25:00 -----------
row_rdd: 
[]
----------- 2020-06-04 17:25:03 -----------
row_rdd: 
----------- 2020-06-04 17:25:06 -----------
row_rdd: 
[]
----------- 2020-06-04 17:25:09 -----------
row_rdd: 
