In [47]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sys
sys.path.append("../")
import boto3


from solardatatools import DataHandler, get_pvdaq_data

In [48]:
"""
Demo script for reading a CSV file from S3 into a pandas data frame using the boto3 library
"""

import os

import boto3
import pandas as pd

from dotenv import load_dotenv
# Load settings from a local .env file (python-dotenv) before reading the environment.
load_dotenv()

# The bucket name and the credentials are taken from environment variables so
# that no secret is written into the notebook itself.
AWS_S3_BUCKET, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY = (
    os.getenv(variable)
    for variable in ("AWS_S3_BUCKET", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
)

# Module-level S3 client used by the read_csv_from_s3_* helpers defined below.
# No session token is supplied; only the key id / secret pair is passed.
s3_client = boto3.client(
    "s3",
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
)



In [76]:
def read_csv_from_s3_meta(
    bucket_name=None,
    path=None,
    client=None,
):
    """Read a CSV object from S3 into a pandas DataFrame using default parsing.

    Parameters
    ----------
    bucket_name : str or None
        Name of the S3 bucket. If None, nothing is fetched.
    path : str or None
        Key of the CSV object inside the bucket. If None, nothing is fetched.
    client : object, optional
        Any object exposing ``get_object(Bucket=..., Key=...)``. When omitted,
        the module-level ``s3_client`` is used, which keeps existing calls
        working unchanged.

    Returns
    -------
    pandas.DataFrame or None
        The parsed CSV on an HTTP 200 response. ``None`` when ``bucket_name``
        or ``path`` is missing, or when the response status is not 200.
    """
    print(bucket_name, path)
    if bucket_name is None or path is None:
        print("no bucket name or path")
        return None

    # Resolve the client lazily so the global is only required when the caller
    # did not inject one.
    s3 = s3_client if client is None else client
    response = s3.get_object(Bucket=bucket_name, Key=path)

    status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

    if status != 200:
        # Previously this branch fell through to `return result_df` with the
        # name never assigned, raising UnboundLocalError. Return None instead,
        # mirroring the missing-argument branch above.
        print(f"Unsuccessful S3 get_object response. Status - {status}")
        return None

    print(f"Successful S3 get_object response. Status - {status}")
    return pd.read_csv(response.get("Body"))

In [82]:
def read_csv_from_s3_pvo(
    bucket_name=None,
    path=None,
    index_col=0,
    parse_dates=[0],
    usecols=[1, 3],
    client=None,
):
    """Read a CSV object from S3 into a DataFrame with explicit column handling.

    Parameters
    ----------
    bucket_name : str or None
        Name of the S3 bucket. If None, nothing is fetched.
    path : str or None
        Key of the CSV object inside the bucket. If None, nothing is fetched.
    index_col, parse_dates, usecols
        Forwarded unchanged to ``pandas.read_csv`` under the same keyword
        names. The list defaults are shared between calls; this function only
        passes them along and never mutates them.
    client : object, optional
        Any object exposing ``get_object(Bucket=..., Key=...)``. When omitted,
        the module-level ``s3_client`` is used, which keeps existing calls
        working unchanged.

    Returns
    -------
    pandas.DataFrame or None
        The parsed CSV on an HTTP 200 response. ``None`` when ``bucket_name``
        or ``path`` is missing, or when the response status is not 200.
    """
    print(bucket_name, path)
    if bucket_name is None or path is None:
        print("no bucket name or path")
        return None

    # Resolve the client lazily so the global is only required when the caller
    # did not inject one.
    s3 = s3_client if client is None else client
    response = s3.get_object(Bucket=bucket_name, Key=path)

    status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

    if status != 200:
        # Previously this branch fell through to `return result_df` with the
        # name never assigned, raising UnboundLocalError. Return None instead,
        # mirroring the missing-argument branch above.
        print(f"Unsuccessful S3 get_object response. Status - {status}")
        return None

    print(f"Successful S3 get_object response. Status - {status}")
    return pd.read_csv(
        response.get("Body"),
        index_col=index_col,
        parse_dates=parse_dates,
        usecols=usecols,
    )

In [94]:
# --- Object keys inside the bucket -----------------------------------------
metadata_fn = "PVO/sys_meta.csv"
data_fn_pattern = "PVO/PVOutput/{}.csv"  # "{}" is filled with a system ID

# --- CSV parsing options (same values as the read_csv_from_s3_pvo defaults) --
index_col = 0
parse_dates = [0]
usecols = [1, 3]

# --- Metadata column names and run options ----------------------------------
id_column = "ID"
tz_column = "TimeZone"
fix_dst = True
verbose = True
file_index = 0  # which row of the metadata table to work with

# Load the site metadata table and list the available system IDs.
meta_df = read_csv_from_s3_meta(AWS_S3_BUCKET, metadata_fn)
print(meta_df["ID"])

pv.insight.nrel PVO/sys_meta.csv
Successful S3 get_object response. Status - 200
0       1095
1       1384
2       1421
3       1519
4       1533
       ...  
568    46375
569    46407
570    46520
571    46548
572    46717
Name: ID, Length: 573, dtype: int64


In [84]:
 id_num = meta_df[id_column][file_index]
print(id_num)

1095


In [None]:
# Fetch the PVOutput data file for the system ID selected above

In [99]:

# Download the time series for the chosen system, using the PVO reader's
# default column selection and date parsing.
df = read_csv_from_s3_pvo(
    bucket_name=AWS_S3_BUCKET,
    path=data_fn_pattern.format(id_num),
)

pv.insight.nrel PVO/PVOutput/1095.csv
Successful S3 get_object response. Status - 200


In [103]:
# Preview the first rows of the frame returned by read_csv_from_s3_pvo.
df.head()

Unnamed: 0_level_0,Power(W)
Time,Unnamed: 1_level_1
2011-08-05 06:20:00,1
2011-08-05 06:30:00,1
2011-08-05 06:35:00,1
2011-08-05 06:45:00,1
2011-08-05 06:50:00,1


In [104]:
# Wrap the downloaded frame in a solardatatools DataHandler for analysis.
dh = DataHandler(df)


In [107]:
# Run the DataHandler pipeline on the 'Power(W)' column. The recorded output
# below warns that two scan rates (300 s and 600 s) were detected in this file.
dh.run_pipeline(power_col='Power(W)')

CAUTION: Multiple scan rates detected!
Scan rates (in seconds): [300, 600]
0 transitions detected.
Suggest splitting data set between:


total time: 13.52 seconds
--------------------------------
Breakdown
--------------------------------
Preprocessing              2.08s
Cleaning                   10.47s
Filtering/Summarizing      0.97s
    Data quality           0.15s
    Clear day detect       0.14s
    Clipping detect        0.03s
    Capacity change detect 0.64s



In [108]:
# Print the pipeline summary (data length, capacity estimate, sampling rate,
# quality/clearness scores, clipping and time-shift flags in the output below).
dh.report()

Length:                5.99 years
Capacity estimate:     1.60 kW
Data sampling:         5 minute
Data quality score:    78.7%
Data clearness score:  19.1%
Inverter clipping:     False
Time shifts corrected: True
Time zone correction:  None


In [30]:
# Build and show the object key for the selected system ID.
# NOTE(review): the recorded output below ("PVOutput/1095.csv") does not match
# the pattern assigned earlier ("PVO/PVOutput/{}.csv"), and this cell's
# execution count (30) is lower than that cell's (94) — the output looks stale;
# re-run with a fresh kernel to confirm.
file = data_fn_pattern.format(id_num)
print(file)

PVOutput/1095.csv


index: 0; system ID: 1095
