# Data Imports via API
#### Historical and forecast weather data used as an example

## Step 1: Import Required Libraries

In [2]:
#Libraries specifically required for API imports
import yaml
from yaml import load, dump

import requests
from pandas.io.json import json_normalize

#Other libraries that are always good to have
import pandas as pd
import numpy as np
from datetime import date, timedelta, time, datetime
import matplotlib.pyplot as plt


#Note: if a library is not installed yet, install it with pip as shown below. Keep the leading exclamation point when running it from a notebook cell.

# ! pip install library_name



## Optional - Step 2: connect to Celonis
#### complete only if you need to integrate the API data with Celonis data / analyses

In [None]:
from pycelonis import get_celonis, pql

# Replace both placeholders with your own values before running this cell.
# - team: the address of your Celonis team, something like
#   https://berkshirehathawayenergy.us-2.celonis.cloud/
# - key: an API key, which can be created by going to Edit Profile (under the
#   circular button in the top right) --> create API key
team_url = "team"
api_key = "key"

# Connection object used by the later Celonis cells
celonis = get_celonis(team_url, api_key)

## Optional - Step 3: load data from Celonis data model
#### Complete only if you need to integrate API data with Celonis data/analyses. All code is sample code and should be adjusted to fit your data.

In [None]:

#use the code below if you are querying directly from a data model
    # dm_id = 'data_model_id'
    # datamodel = celonis.datamodels.find(dm_id)

#use the code below to query from studio
    # package = celonis.packages.find('package_id')
    # source_analysis = package.analyses.find('analysis_id')

    
#use the code below to create your PQL query (examples of aggregate functions and filter statements are provided)
    # q1 = pql.PQL()
    # q1 += pql.PQLColumn("ROUND_DAY(table.field1)", "Date")
    # q1 += pql.PQLColumn("SUM(table.field2)", "Actual")
    # q1 += pql.PQLColumn("table.field3", "Region")
    # q1 += pql.PQLFilter("table.field2 IS NOT NULL; ")
    # q1 += pql.PQLFilter("table.field1 > TO_DATE ( '2019-08-17 00:00:00' , FORMAT ( '%Y-%m-%d %H:%M:%S' )); ")

#use the code below to create your dataframe
    # df1 = datamodel.get_data_frame(q1)



## Step 4: Import Data via API
#### To request your own API token for NOAA weather data, go here: https://www.ncdc.noaa.gov/cdo-web/token
#### To find the dataset and station ID you need, go here: https://www.ncdc.noaa.gov/cdo-web/webservices/v2

In [None]:
# ADJUSTMENT REQUIRED - Define the amount of time that you will pull data for (in days)
timeframe = 30

# Read the clock once so both ends of the window come from the same instant
# (two separate calls could straddle midnight and skew the range)
today = datetime.today()

# usually the max date will be today's date
max_date = today.strftime("%Y-%m-%d")

# the min date is today's date moved back by the timeframe indicated above
min_date = (today - timedelta(days=timeframe)).strftime("%Y-%m-%d")

# ADJUSTMENT REQUIRED - define the data set ID (refer to link above to find dataset IDs)
datasetid = 'GHCND'

# ADJUSTMENT REQUIRED - define the station ID (refer to link above to find station IDs)
stationid = 'GHCND:USW00023169'

# ADJUSTMENT REQUIRED - define the token
token = 'xyz'

# Endpoint of the request. The query parameters are kept in a separate dict so
# that requests builds and URL-encodes the query string instead of it being
# concatenated by hand.
url = 'https://www.ncdc.noaa.gov/cdo-web/api/v2/data'
payload = {
    'datasetid': datasetid,
    'stationid': stationid,
    'startdate': min_date,
    'enddate': max_date,
    'units': 'standard',
    'limit': 1000,
}

# the token is sent as a request header
headers = {
    'token': token
}

# Run the query to get the raw data.
# - params= puts the payload in the query string (a GET should not carry a JSON body)
# - timeout= stops the cell from hanging forever if the server does not answer
response = requests.get(url, headers=headers, params=payload, timeout=30)

# Fail here with the HTTP error (e.g. invalid token) instead of failing later
# with a confusing error when the JSON is processed
response.raise_for_status()

# put the raw data into JSON format (parsed into Python objects)
data = response.json()


data

## Step 5: create a dataframe from your JSON data using json_normalize function
#### 'results' should be replaced with the key of the JSON object that holds the records you need

In [8]:
# Pull the entry stored under the 'results' key out of the parsed JSON,
# then flatten it into a dataframe
records = data['results']
df = pd.json_normalize(records)

# display the dataframe
df

Unnamed: 0,date,datatype,station,attributes,value
0,2021-08-25T00:00:00,AWND,GHCND:USW00023169,",,W,",7.4
1,2021-08-25T00:00:00,PRCP,GHCND:USW00023169,",,W,2400",0.0
2,2021-08-25T00:00:00,SNOW,GHCND:USW00023169,",,W,",0.0
3,2021-08-25T00:00:00,SNWD,GHCND:USW00023169,",,W,",0.0
4,2021-08-25T00:00:00,TAVG,GHCND:USW00023169,"H,,S,",92.0
...,...,...,...,...,...
279,2021-09-20T00:00:00,SNOW,GHCND:USW00023169,",,D,",0.0
280,2021-09-20T00:00:00,TAVG,GHCND:USW00023169,"H,,S,",81.0
281,2021-09-20T00:00:00,TMAX,GHCND:USW00023169,",,D,2400",88.0
282,2021-09-20T00:00:00,TMIN,GHCND:USW00023169,",,D,2400",73.0


## Optional - Step 6: Helpful Formatting Functions
#### This step provides example functions that can be used to manipulate your API output. These sample functions refer to a generic dataframe titled "df". Replace "df" with the name of your dataframe and the placeholder column names with your own column names.


In [None]:
# Display setting that applies to the whole workbook: a max_rows value of None
# removes the row limit, so every row of a dataframe is shown when displayed.
# Use a number instead of None to cap how many rows are shown.
pd.set_option('display.max_rows', None)

# order the rows by the values of one column (for example a date column)
df = df.sort_values(by=["column_name"])

# discard the final two rows, selected through their index labels
last_two_labels = df.tail(2).index
df = df.drop(index=last_two_labels)

# renumber the index from 0 and throw away the old index values
df = df.reset_index(drop=True)

# give a column a new name
df = df.rename(columns={"current_column_name": "new_column_name"})

# convert a column to the datetime datatype
df['date_column_name'] = pd.to_datetime(df['date_column_name'])

# keep only the rows whose date is later than the current time
now = pd.Timestamp(datetime.now())
df = df[df['date_column_name'] > now]

# remove columns that are not needed
df = df.drop(columns=['column1', 'column2', 'column3'])

## Optional - Step 7: push data to Celonis
#### Only use this step if you need to push API data to your Celonis data pool (usually API data is just used as an input to a model in MLWB and does not need to be pushed to the Celonis data pool)

In [None]:
# look up the data pool that will receive the data
data_pool = celonis.pools.find("data_pool_id")

# name of the destination table inside the data pool
target_table = "table_name_in_data_pool"

# Options 1 and 2 are alternatives: keep the one you need and remove the other.

# option 1 - replace the existing table with the contents of the dataframe
data_pool.push_table(df, target_table, if_exists='replace')

# option 2 - upsert the data (similar to a delta load) using the primary key of the table
key_columns = ['primary_key']
data_pool.upsert_table(
    table_name=target_table,
    df_or_path=df,
    primary_keys=key_columns,
)

