In [0]:
from pyspark.sql import SparkSession

# Obtain the SparkSession used by every later cell; getOrCreate() returns the
# already-running session when one exists instead of starting a new one.
session = SparkSession.builder.appName('sparksession').getOrCreate()

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

import uuid

In [0]:
%run /Users/tha075bei026@tcioe.edu.np/actionLogger

In [0]:
# Make `weather` the current database for the SQL statements that follow.
session.sql('use weather')

Out[8]: DataFrame[]

In [0]:
# Sanity check: print the active schema and the tables registered in it.
session.sql('select current_schema()').show()
session.sql('show tables').show()

+------------------+
|current_database()|
+------------------+
|           weather|
+------------------+

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
| weather|log_table|      false|
+--------+---------+-----------+



In [0]:

# Column layout of a log record: one (name, type) pair per column, in order.
log_schema = StructType([
    StructField(column, dtype)
    for column, dtype in (
        ("id", StringType()),
        ("load_type", StringType()),
        ("table_name", StringType()),
        ("process_start_time", TimestampType()),
        ("process_end_time", TimestampType()),
        ("status", StringType()),
        ("comments", StringType()),
        ("start_date_time", TimestampType()),
        ("end_date_time", TimestampType()),
        ("created_on", TimestampType()),
        ("created_by", StringType()),
    )
])


In [0]:
def flatten_json(dict_col):
    """Yield every leaf value of a nested JSON-like structure, depth first.

    Dict values are visited in the dict's iteration order and list items in
    list order; keys are discarded. Any value that is neither a dict nor a
    list is treated as a leaf and yielded unchanged.

    Lists may contain dicts, scalars, or further lists. (Previously a scalar
    or nested list inside a list was indexed as if it were a dict and raised
    ``TypeError``.)

    Args:
        dict_col: A dict (or list) produced by decoding JSON.

    Yields:
        The leaf values in traversal order.
    """
    # A dict contributes its values; a list contributes its items directly.
    children = dict_col.values() if isinstance(dict_col, dict) else dict_col
    for value in children:
        if isinstance(value, (dict, list)):
            yield from flatten_json(value)
        else:
            yield value


In [0]:

# Flat column layout for one weather reading.
# data_from_api() fills these columns positionally from the values yielded by
# flatten_json(), so the order below must match the order of leaf values in
# the decoded response.
# NOTE(review): the blank-line groups presumably follow the sections of the
# API response (coordinates, weather description, main readings, wind, ...);
# confirm against a sample payload.
source_weather_schema = StructType([
    StructField(column, dtype)
    for column, dtype in (
        ('lon', DoubleType()),
        ('lat', DoubleType()),

        ('description_id', IntegerType()),
        ('main', StringType()),
        ('description', StringType()),
        ('icon', StringType()),

        ('base', StringType()),

        ('temp', StringType()),
        ('feels_like', DoubleType()),
        ('temp_min', StringType()),
        ('temp_max', StringType()),
        ('pressure', IntegerType()),
        ('humidity', IntegerType()),
        ('sea_level', IntegerType()),
        ('grnd_level', IntegerType()),

        ('visibility', StringType()),

        ('wind_speed', DoubleType()),
        ('wind_deg', IntegerType()),
        ('wind_gust', DoubleType()),

        ('all', IntegerType()),

        ('dt', IntegerType()),

        ('country', StringType()),
        ('sunrise', IntegerType()),
        ('sunset', IntegerType()),

        ('timezone', IntegerType()),
        ('id', IntegerType()),
        ('name', StringType()),
        ('cod', IntegerType()),
    )
])

In [0]:
import json
import requests
from requests.auth import HTTPDigestAuth
from pyspark.sql import Row

def data_from_api(city_id, api_key=None, timeout=30):
    """Fetch the current weather for one city and return it as a one-row DataFrame.

    The decoded JSON response is flattened with ``flatten_json`` and the
    resulting values are loaded positionally into ``source_weather_schema``.

    Args:
        city_id: City identifier sent as the ``id`` query parameter.
        api_key: API key sent as ``appid``. When omitted, the
            ``OPENWEATHER_API_KEY`` environment variable is used, falling back
            to the key that was previously embedded in the URL.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A single-row Spark DataFrame on success; otherwise the integer HTTP
        status code of the failed response (after printing ``Error!``).

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
    """
    import os

    if api_key is None:
        # NOTE(review): the literal fallback keeps existing runs working, but
        # a key committed to source should be rotated and supplied only via
        # the environment or a secret store.
        api_key = os.environ.get('OPENWEATHER_API_KEY',
                                 'ae0bb98bc50300db4831762db7096897')

    # `params` lets requests build and escape the query string; `timeout`
    # prevents a stalled connection from hanging the whole load.
    response = requests.get(
        'https://api.openweathermap.org/data/2.5/weather',
        params={'id': str(city_id), 'appid': api_key},
        timeout=timeout,
    )
    if not response.ok:
        print('Error!')
        return response.status_code

    payload = json.loads(response.content)
    # Values are matched to schema columns by position, not by key, so the
    # number and order of leaf values must line up with source_weather_schema.
    values = list(flatten_json(payload))
    return session.createDataFrame(data=[values], schema=source_weather_schema)

In [0]:
# Take five rows from the cities Delta table and collect their `id` values
# into a plain Python list. No ordering is applied before the limit.
city_data = (
    session.read
    .format('delta')
    .load('dbfs:/user/hive/warehouse/cities')
    .limit(5)
)
city_id = [row[0] for row in city_data.select(city_data['id']).collect()]
# city_id

In [0]:


def _current_timestamp():
    """Return the result of Spark's ``current_timestamp()`` as collected by the driver."""
    return session.sql("SELECT current_timestamp()").collect()[0][0]


def load_raw_data():
    """Fetch weather for every id in ``city_id`` and append it to the raw Delta table.

    Steps:
      1. Open a log entry through ``action_logger``.
      2. Call ``data_from_api`` once per city id.
      3. If there is nothing to load, or any call did not return a DataFrame,
         record an ``error`` log action and stop.
      4. Otherwise union the per-city frames, stamp them with ``load_run_id``,
         ``created_on`` and ``created_by``, de-duplicate, log ``extracting``,
         append to ``dbfs:/databases/weather/raw_weather_table`` and log
         ``completed``.

    Any exception raised while building or writing the frame is recorded as an
    ``error`` log action rather than propagated.

    Returns:
        None.
    """
    # Imported locally so these names resolve regardless of what the
    # notebook-level star imports expose (they also shadow builtin min/max).
    from pyspark.sql import DataFrame
    from pyspark.sql import functions as F

    log_dict = {'id': uuid.uuid4().hex,
                'load_type': 'raw',
                'table_name': 'raw_weather_table',

                'process_start_time': _current_timestamp(),
                'process_end_time': _current_timestamp(),

                'start_date_time': _current_timestamp(),
                'end_date_time': _current_timestamp()}

    log = action_logger(log_dict)

    created_on = _current_timestamp()
    created_by = session.sql('select current_user()').collect()[0][0]

    process_start_time = _current_timestamp()

    # Ids are normalised through float() then int() before being sent to the API.
    city_data = [data_from_api(int(float(cid))) for cid in city_id]

    # data_from_api returns an HTTP status code instead of a DataFrame when a
    # request fails; check every result, not just the first.
    failures = [result for result in city_data if not isinstance(result, DataFrame)]
    if failures or not city_data:
        if failures:
            comments = 'error with status code ' + str(failures[0])
        else:
            comments = 'no city ids to load'
        process_end_time = _current_timestamp()
        log.action({
            'process_start_time': process_start_time,
            'process_end_time': process_end_time,
            'status': 'error',
            'error_data': comments,
            'start_date_time': _current_timestamp(),
            'end_date_time': _current_timestamp(),
        })
        return

    try:
        raw_city_all = city_data[0]
        for city_frame in city_data[1:]:
            raw_city_all = raw_city_all.union(city_frame)

        raw_city_all = (
            raw_city_all
            .withColumn("load_run_id", F.lit(uuid.uuid4().hex))
            .withColumn("created_on", F.lit(created_on))
            .withColumn("created_by", F.lit(created_by))
            .distinct()
        )

        process_end_time = _current_timestamp()

        # One aggregation gives both bounds; the frame is not modified after
        # this point, so the same values are reused for the final log entry.
        first_created, last_created = raw_city_all.agg(
            F.min('created_on'), F.max('created_on')
        ).first()

        log.action({
            'process_start_time': process_start_time,
            'process_end_time': process_end_time,
            'status': 'extracting',
            'start_date_time': first_created,
            'end_date_time': last_created,
        })

        # NOTE(review): append mode is combined with the overwriteSchema
        # option here; confirm that combination is what is intended.
        raw_city_all.write.option("overwriteSchema", "true")\
            .format('delta')\
            .mode('append')\
            .save('dbfs:/databases/weather/raw_weather_table')

        process_end_time = _current_timestamp()

        log.action({
            'process_start_time': process_start_time,
            'process_end_time': process_end_time,
            'status': 'completed',
            'start_date_time': first_created,
            'end_date_time': last_created,
        })

    except Exception as e:
        # Best-effort: record the failure in the log instead of raising.
        # The message is passed as text, matching the other error branch.
        log.action({
            'process_start_time': process_start_time,
            'process_end_time': _current_timestamp(),
            'status': 'error',
            'error_data': str(e),
            'start_date_time': _current_timestamp(),
            'end_date_time': _current_timestamp(),
        })
    


In [0]:
# load_raw_data()