In [1]:
import pandas as pd
import requests
import json
import numpy as np
from datetime import datetime as dt
import matplotlib.pyplot as plt
import gzip
import re

#add the access token you got from NOAA
Token = 'lwBmDJlPCkqeMyurcljRSVQHVFVhhHDt'

**Get latitude and longitude of all plants in PJM**

In [2]:
# Get latitude and longitude of all plants in PJM
# Only find data of plants that are in the most recent egrid data 
df = pd.read_csv("egrid2018_plant.csv", usecols=["ISORTO", "LAT", "LON"])
df = df.dropna()
df = df.loc[df['ISORTO'] == "PJM"]
df = df[['LAT','LON']]
df = df.round(1)
df = df.groupby(df.columns.tolist()).size().reset_index().rename(columns={0:'count'})
df.head()

Unnamed: 0,LAT,LON,count
0,34.7,-78.0,1
1,35.1,-76.0,1
2,35.3,-75.5,1
3,35.6,-76.8,1
4,35.8,-77.6,1


**Get a list of all relevant weather stations**

In [3]:
# Keep ones that are in the U.S. 
df_stations = pd.read_csv('gsod-stations.csv')
df_stations = df_stations.dropna()
df_stations = df_stations.loc[df_stations['CTRY'] == "US"]

# Keep only stations that have data from 2012 - 2018
df_stations['BEGIN']= pd.to_datetime(df_stations['BEGIN'], format="%Y%m%d") 
df_stations['END']= pd.to_datetime(df_stations['END'], format="%Y%m%d") 
df_stations = df_stations[(df_stations['BEGIN'] <= '2012-1-1') & (df_stations['END'] >= '2017-12-31')]

df_stations.head()

Unnamed: 0,USAF,WBAN,STATION NAME,CTRY,STATE,ICAO,LAT,LON,ELEV(M),BEGIN,END
14474,690150,93121,TWENTY NINE PALMS,US,CA,KNXP,34.3,-116.167,625.1,1990-01-02,2020-03-02
15082,700001,26492,PORTAGE GLACIER,US,AK,PATO,60.785,-148.839,31.4,2006-01-01,2020-03-02
15084,700197,26558,SELAWIK,US,AK,PASK,66.6,-159.986,7.6,2006-01-01,2020-03-02
15086,700260,27502,W POST-WILL ROGERS MEMORIAL AIRPORT,US,AK,PABR,71.283,-156.782,9.5,1945-01-01,2020-03-02
15090,700300,27503,WAINWRIGHT AIRPORT,US,AK,PAWI,70.639,-159.995,9.1,1999-11-02,2020-03-02


**Merge the two datasets to find the closest weather station for each plant in PJM**
- Start by rounding lat long data to one decimal place and merging
- Then round to the nearest whole number and merge

In [4]:
# STEP 1: Round lat long data to 1 decimal place, merge

df_stations_rounded = df_stations.round(1)
df_stations_rounded = df_stations_rounded.drop_duplicates(subset=['LAT', 'LON'])

merged = pd.merge(df, df_stations_rounded, on=['LAT','LON'], how='left', indicator='Exist')
print(merged['Exist'].value_counts(normalize=True) * 100)

# Drop ones that are only in one df, then use the station 
merged = merged.dropna()
merged.head()

left_only     91.313559
both           8.686441
right_only     0.000000
Name: Exist, dtype: float64


Unnamed: 0,LAT,LON,count,USAF,WBAN,STATION NAME,CTRY,STATE,ICAO,ELEV(M),BEGIN,END,Exist
12,35.9,-77.5,1,720864,290.0,TARBORO EDGECOMBE AIRPORT,US,NC,KETC,16.2,2010-11-22,2020-03-02,both
21,36.0,-76.6,1,723074,3703.0,NORTHEASTERN REGIONAL ARPT,US,NC,KEDE,6.1,2006-01-01,2020-03-02,both
34,36.3,-77.6,3,720649,231.0,HALIFAX NORTHAMPTON REGIONAL AIRPORT,US,NC,KIXA,44.2,2009-07-22,2020-03-02,both
35,36.3,-77.2,2,723079,93796.0,TRI-COUNTY AIRPORT,US,NC,KASJ,20.7,2006-01-01,2020-03-02,both
77,36.7,-76.9,4,723083,13763.0,FRANKLIN MUNICIPAL-JOHN BEVERLY ROSE AIRPORT,US,VA,KFKN,12.5,1994-10-16,2020-03-02,both


In [5]:
# STEP 2: Round lat long data to the nearest whole number, merge

# Remove plants that were already included in a weather station 
df1 = pd.concat([df[['LAT', 'LON']],merged[['LAT', 'LON']]]).drop_duplicates(keep=False)
df1 = df1.groupby(df1.columns.tolist()).size().reset_index().rename(columns={0:'count'})

# Now round to the nearest whole number and add to df 
df1 = df1.round()
df_stations_rounded1 = df_stations_rounded.round()
df_stations_rounded1 = df_stations_rounded1.drop_duplicates(subset=['LAT', 'LON'])

# repeat steps to merge with previous df 
merged1 = pd.merge(df1, df_stations_rounded1, on=['LAT','LON'], how='left', indicator='Exist')

print(merged1['Exist'].value_counts(normalize=True) * 100)

merged1 = merged1.dropna()
merged1.head()

both          99.187935
left_only      0.812065
right_only     0.000000
Name: Exist, dtype: float64


Unnamed: 0,LAT,LON,count,USAF,WBAN,STATION NAME,CTRY,STATE,ICAO,ELEV(M),BEGIN,END,Exist
0,35.0,-78.0,1,722073,3727.0,SAMPSON COUNTY AIRPORT,US,NC,KCTZ,45.0,2006-01-01,2020-03-02,both
1,35.0,-76.0,1,723062,99999.0,USMC BOMB RANGE BT-11,US,NC,KNBT,5.0,2009-09-03,2020-03-01,both
2,35.0,-76.0,1,723062,99999.0,USMC BOMB RANGE BT-11,US,NC,KNBT,5.0,2009-09-03,2020-03-01,both
3,36.0,-77.0,1,723065,13783.0,PITT-GREENVILLE AIRPORT,US,NC,KPGV,8.0,2005-01-01,2020-03-02,both
4,36.0,-78.0,1,720288,3711.0,HENDERSON-OXFORD AIRPORT,US,NC,KHNZ,161.0,2006-01-01,2020-03-02,both


In [6]:
# These are the weather stations we need to find a weighted average of 
weather_data = merged.append(merged1) 
weather_data.head()

Unnamed: 0,LAT,LON,count,USAF,WBAN,STATION NAME,CTRY,STATE,ICAO,ELEV(M),BEGIN,END,Exist
12,35.9,-77.5,1,720864,290.0,TARBORO EDGECOMBE AIRPORT,US,NC,KETC,16.2,2010-11-22,2020-03-02,both
21,36.0,-76.6,1,723074,3703.0,NORTHEASTERN REGIONAL ARPT,US,NC,KEDE,6.1,2006-01-01,2020-03-02,both
34,36.3,-77.6,3,720649,231.0,HALIFAX NORTHAMPTON REGIONAL AIRPORT,US,NC,KIXA,44.2,2009-07-22,2020-03-02,both
35,36.3,-77.2,2,723079,93796.0,TRI-COUNTY AIRPORT,US,NC,KASJ,20.7,2006-01-01,2020-03-02,both
77,36.7,-76.9,4,723083,13763.0,FRANKLIN MUNICIPAL-JOHN BEVERLY ROSE AIRPORT,US,VA,KFKN,12.5,1994-10-16,2020-03-02,both


In [7]:
# Reformat df to correspond to file storage in ftp 
weather_data.WBAN = weather_data.WBAN.astype(int)
weather_data.WBAN = weather_data.WBAN.astype(str)
weather_data.WBAN = weather_data.WBAN.str.zfill(5)

weather_data['station_id'] = weather_data.USAF + '-' + weather_data.WBAN

weather_data

Unnamed: 0,LAT,LON,count,USAF,WBAN,STATION NAME,CTRY,STATE,ICAO,ELEV(M),BEGIN,END,Exist,station_id
12,35.9,-77.5,1,720864,00290,TARBORO EDGECOMBE AIRPORT,US,NC,KETC,16.2,2010-11-22,2020-03-02,both,720864-00290
21,36.0,-76.6,1,723074,03703,NORTHEASTERN REGIONAL ARPT,US,NC,KEDE,6.1,2006-01-01,2020-03-02,both,723074-03703
34,36.3,-77.6,3,720649,00231,HALIFAX NORTHAMPTON REGIONAL AIRPORT,US,NC,KIXA,44.2,2009-07-22,2020-03-02,both,720649-00231
35,36.3,-77.2,2,723079,93796,TRI-COUNTY AIRPORT,US,NC,KASJ,20.7,2006-01-01,2020-03-02,both,723079-93796
77,36.7,-76.9,4,723083,13763,FRANKLIN MUNICIPAL-JOHN BEVERLY ROSE AIRPORT,US,VA,KFKN,12.5,1994-10-16,2020-03-02,both,723083-13763
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
857,42.0,-90.0,1,722082,04876,ALBERTUS AIRPORT,US,IL,KFEP,262.0,2006-01-01,2020-03-02,both,722082-04876
858,42.0,-88.0,1,722126,04879,LANSING MUNICIPAL AP,US,IL,KIGQ,188.0,2006-01-01,2020-03-03,both,722126-04879
859,42.0,-90.0,1,722082,04876,ALBERTUS AIRPORT,US,IL,KFEP,262.0,2006-01-01,2020-03-02,both,722082-04876
860,42.0,-89.0,1,722075,04871,DE KALB TAYLOR MUNI ARPT,US,IL,KDKB,279.0,2006-01-01,2020-03-03,both,722075-04871


In [8]:
### Someone else's code

def getData(station, year):
    '''
    Get weather data from the internet as memory stream
    '''
    big_df = pd.DataFrame()


    # Define URL
    url = 'http://www1.ncdc.noaa.gov/pub/data/gsod/' + str(year) + '/' + str(station) \
        + '-' + str(year) + '.op.gz'

    # Define data stream
    stream = requests.get(url)

    # Unzip on-the-fly
    decomp_bytes = gzip.decompress(stream.content)
    data = decomp_bytes.decode('utf-8').split('\n')

    '''
    Data manipulations and ordering
    '''
    # Remove start and end
    data.pop(0) # Remove first line header
    data.pop()  # Remove last element

    # Define lists
    (stn, wban, date, temp, temp_c, dewp, dewp_c,
     slp, slp_c, stp, stp_c, visib, visib_c,
     wdsp, wdsp_c, mxspd, gust, max, max_f, min, min_f,
     prcp, prcp_f, sndp, f, r, s, h, th, tr) = ([] for i in range(30))

    # Fill in lists
    for i in range(0, len(data)):
        stn.append(data[i][0:6])
        wban.append(data[i][7:12])
        date.append(data[i][14:22])         
        temp.append(data[i][25:30])
        dewp.append(data[i][36:41])
        slp.append(data[i][46:52])      # Mean sea level pressure
        wdsp.append(data[i][78:83])
        mxspd.append(data[i][88:93])
        gust.append(data[i][95:100])
        max.append(data[i][103:108])
        min.append(data[i][111:116])
        prcp.append(data[i][118:123])
        sndp.append(data[i][125:130])   # Snow depth in inches to tenth

    '''
    Replacements
    min_f & max_f
    blank   : explicit => e
    *       : derived => d
    '''
    max_f = [re.sub(pattern=' ', repl='e', string=x) for x in max_f] # List comprenhension
    max_f = [re.sub(pattern='\*', repl='d', string=x) for x in max_f]

    min_f = [re.sub(pattern=' ', repl='e', string=x) for x in min_f]
    min_f = [re.sub(pattern='\*', repl='d', string=x) for x in min_f]

    #Create dataframe & cleanse data
    # Create intermediate matrix
    mat = np.matrix(data=[stn, wban, date, temp, dewp, 
           slp, wdsp, mxspd, gust, max, min,
           prcp, sndp]).T

    # Define header names
    headers = ['stn', 'wban', 'date', 'temp', 'dewp', 
           'slp', 'wdsp', 'mxspd', 'gust', 'max', 'min',
           'prcp', 'sndp']

    # Set precision
    pd.set_option('precision', 3)

    # Create dataframe from matrix object
    df = pd.DataFrame(data=mat, columns=headers)

    # Replace missing values with NAs
    df = df.where(df != ' ', 9999.9)

    # Create station ids
    df['station_id'] = df['stn'].map(str) + '-' + df['wban'].map(str)
    df = df.drop(['stn', 'wban'], axis=1)

    # Convert to numeric
    df[['temp', 'dewp', 'slp', 'wdsp',
        'mxspd',  'gust', 'max', 'min', 'prcp', 'sndp']] = df[['temp', 'dewp', 'slp', 'wdsp', 'mxspd',  
                                                               'gust', 'max', 'min', 'prcp', 'sndp']].apply(pd.to_numeric)

    # Replace missing weather data with NaNs
    df = df.replace(to_replace=[99.99, 99.9,999.9,9999.9], value=np.nan)

    # Convert to date format
    df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')

    big_df = pd.concat([big_df, df])

    return big_df

In [9]:
avged_data = pd.DataFrame()

# Need to replace this loop with apply 
for index, row in weather_data.iterrows():
    print(row.station_id)
    plant_data = pd.DataFrame()
    
    # two years for now 
    for curr in range(2012, 2014):
        try: 
            result = getData(station=str(row.station_id), year=curr)
            plant_data = plant_data.append(result)
        except:
            pass
    
    # repeat this for the number of values in count 
    avged_data = pd.concat([avged_data, plant_data])

avged_data

720864-00290
723074-03703
720649-00231
723079-93796
723083-13763
724007-03719
720499-00154
723080-13737
722322-00361
745980-13702
724113-53881
724014-93714
720297-03730
724110-13741
720498-00153
720285-03734
723114-03715
724035-13773
722218-00356
724036-03710
724037-93728
724093-13764
724050-13743
745940-13705
724043-03756
742079-63876
722244-00358
745966-03726
724053-03717
724065-99999
724088-13707
724273-03804
745944-93784
724067-93744
725217-53855
722081-13730
724075-13735
722249-63887
724057-13701
724276-53859
724066-93706
724180-13781
724286-93824
725114-93778
724080-13739
724074-93780
724096-14706
720304-64752
720713-00251
725118-14751
723990-14711
725109-54782
724084-54760
724095-14792
725205-14762
725144-54723
722247-54785
725020-14734
725128-54739
725105-14770
724097-54738
725025-94741
725053-94728
724094-54743
720378-00122
725140-14778
740001-54793
725214-04849
720137-04867
725245-04853
725348-04831
724287-04848
725247-04805
725354-04806
725244-99999
725340-14819
725383-54827

724176-13736
720355-03732
720355-03732
720355-03732
722823-03749
722823-03749
722823-03749
720408-00136
720304-64752
720304-64752
720304-64752
720304-64752
720407-00462
720407-00462
720407-00462
720593-00187
720593-00187
720593-00187
720593-00187
724298-04850
725208-04855
725210-14895
725210-14895
720378-00122
725125-04787
725125-04787
725125-04787
725125-04787
725105-14770
725130-14777
725130-14777
725130-14777
725130-14777
722247-54785
722247-54785
722247-54785
722247-54785
722247-54785
724094-54743
724094-54743
724094-54743
724094-54743
720593-00187
725208-04855
725208-04855
724200-14891
720378-00122
720378-00122
720378-00122
720378-00122
725125-04787
725125-04787
725119-54792
725130-14777
725130-14777
725130-14777
722247-54785
722247-54785
722247-54785
722247-54785
722247-54785
724094-54743
724094-54743
724094-54743
720543-00167
724298-04850
724298-04850
725208-04855
724200-14891
725119-54792
725105-14770
725130-14777
725130-14777
722247-54785
722247-54785
722247-54785
722247-54785

Unnamed: 0,date,temp,dewp,slp,wdsp,mxspd,gust,max,min,prcp,sndp,station_id
0,2012-01-01,47.8,36.4,,3.2,15.0,20.0,75.2,26.1,0.00,,720864-00290
1,2012-01-02,49.8,29.1,,7.4,13.0,18.1,75.2,26.1,0.31,,720864-00290
2,2012-01-03,34.2,11.1,,10.2,22.0,28.0,41.0,26.6,0.00,,720864-00290
3,2012-01-04,29.1,9.9,,4.5,13.0,20.0,42.8,17.6,0.00,,720864-00290
4,2012-01-05,41.1,18.8,,5.6,11.1,19.0,55.4,32.0,0.00,,720864-00290
...,...,...,...,...,...,...,...,...,...,...,...,...
360,2013-12-27,33.5,26.9,,7.1,14.0,19.0,43.2,21.0,0.00,,722126-04879
361,2013-12-28,39.2,30.7,,10.8,17.1,22.0,49.5,30.7,0.00,,722126-04879
362,2013-12-29,34.3,28.5,,9.6,15.9,22.9,42.1,24.3,,,722126-04879
363,2013-12-30,13.9,5.1,,6.5,12.0,18.1,24.3,5.0,,,722126-04879


In [10]:
avged_data.to_csv('avged_data.csv')

In [3]:
# Get IDs of all stations that have temperature data 
df = pd.DataFrame()

# make the api call
for offset in range(1, 41000, 1000):
    print(offset) 
    r = requests.get('https://www.ncdc.noaa.gov/cdo-web/api/v2/stations?datacategoryid=TEMP&offset='+str(offset)+'&limit=1000&datasetid=GHCND', headers={'token':Token})
    json_result = json.loads(r.text)
    df = pd.concat([df, pd.DataFrame(json_result['results'])])

1
1001
2001
3001
4001
5001
6001
7001
8001
9001
10001
11001
12001
13001
14001
15001
16001
17001
18001
19001
20001
21001
22001
23001
24001
25001
26001
27001
28001
29001
30001
31001
32001
33001
34001
35001
36001
37001
38001
39001
40001


In [4]:
# make formatting consistent with bit beforehand 
df.id = df.id.replace({'GHCND:':''}, regex=True)

In [4]:
# only keep columns in date range 
select_dates = df[(df['mindate'] <= '2012-1-1') & (df['maxdate'] >= '2017-12-31')]
select_dates

Unnamed: 0,datacoverage,elevation,elevationUnit,id,latitude,longitude,maxdate,mindate,name
2,0.7282,34.0,METERS,AE000041196,25.333,55.517,2020-02-04,1944-03-20,"SHARJAH INTER. AIRP, AE"
3,0.9971,10.4,METERS,AEM00041194,25.255,55.364,2020-02-04,1983-01-01,"DUBAI INTERNATIONAL, AE"
4,0.9957,26.8,METERS,AEM00041217,24.433,54.651,2020-02-04,1983-01-02,"ABU DHABI INTERNATIONAL, AE"
5,0.9919,264.9,METERS,AEM00041218,24.262,55.609,2020-02-04,1994-03-23,"AL AIN INTERNATIONAL, AE"
7,0.4729,977.2,METERS,AFM00040938,34.210,62.228,2020-02-04,1973-01-02,"HERAT, AF"
...,...,...,...,...,...,...,...,...,...
562,0.4562,641.0,METERS,WZ004455110,-26.533,31.300,2020-01-24,1910-02-01,"MANZINI MATSAPA AIR, WZ"
567,0.5840,986.0,METERS,ZA000067743,-17.817,25.817,2020-01-04,1950-01-01,"LIVINGSTONE, ZA"
572,0.8179,1480.0,METERS,ZI000067775,-17.917,31.133,2020-02-04,1956-07-01,"HARARE KUTSAGA, ZI"
585,0.8441,1095.0,METERS,ZI000067975,-20.067,30.867,2020-01-27,1951-07-01,"MASVINGO, ZI"


In [53]:
stations = pd.read_csv('ghcnd-stations (1).txt', delimiter=r"\s+", usecols=[0, 1, 2, 3, 4, 5], engine = "python", header=None)
stations.columns = ['id', 'lat', 'long', '1', 'state', '2']
states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA", 
          "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", 
          "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", 
          "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", 
          "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]
stations

Unnamed: 0,id,lat,long,1,state,2
0,ACW00011604,17.1167,-61.7833,10.1,ST,JOHNS
1,ACW00011647,17.1333,-61.7833,19.2,ST,JOHNS
2,AE000041196,25.3330,55.5170,34.0,SHARJAH,INTER.
3,AEM00041194,25.2550,55.3640,10.4,DUBAI,INTL
4,AEM00041217,24.4330,54.6510,26.8,ABU,DHABI
...,...,...,...,...,...,...
114340,VQC00678621,17.7500,-64.6000,9.1,VI,TAGUE
114341,VQC00679222,18.1500,-64.9500,30.5,VI,WATER
114342,VQC00679450,18.3503,-64.9167,196.6,VI,WINTBERG
114343,VQW00011624,17.7028,-64.8056,18.6,VI,CHRISTIANSTED


In [54]:
# Show stations IDs that start with U.S. 
stations = stations[stations.id.str.startswith(('US'))]

In [55]:
# Only keep temperature stations 
stations = stations.loc[stations['id'].isin(select_dates.id)]
stations

Unnamed: 0,id,lat,long,1,state,2
87690,USC00010063,34.2553,-87.1814,249.3,AL,ADDISON
87696,USC00010160,32.9453,-85.9481,195.1,AL,ALEXANDER
87698,USC00010178,33.1272,-88.1550,59.4,AL,ALICEVILLE
87700,USC00010252,31.3072,-86.5225,76.2,AL,ANDALUSIA
87701,USC00010260,34.9628,-87.3719,231.6,AL,LEXINGTON
...,...,...,...,...,...,...
114292,USW00094993,45.6689,-96.9914,353.9,SD,SISSETON
114293,USW00094994,43.1561,-90.6775,204.8,WI,BOSCOBEL
114294,USW00094995,40.8483,-96.5650,362.4,NE,LINCOLN
114295,USW00094996,40.6953,-96.8542,418.2,NE,LINCOLN


In [56]:
# Remove lat long that are one degree apart 
stations['lat'] = stations['lat'].round()
stations['long'] = stations['long'].round()
stations = stations.groupby(['lat', 'long']).first().reset_index()
stations

Unnamed: 0,lat,long,id,1,state,2
0,19.0,-156.0,USC00513977,451.1,HI,KEALAKEKUA
1,19.0,-155.0,USC00517421,256.0,HI,PAHALA
2,20.0,-156.0,USC00511339,176.8,HI,HAWI
3,20.0,-155.0,USC00511856,329.2,HI,HONOKAA
4,21.0,-158.0,USC00510055,74.1,HI,AHUIMANU
...,...,...,...,...,...,...
1073,70.0,-149.0,USS0048V01S,9.1,AK,Prudhoe
1074,70.0,-148.0,USW00027406,18.6,AK,DEADHORSE
1075,70.0,-144.0,USW00027401,11.9,AK,BARTER
1076,71.0,-160.0,USW00027503,9.1,AK,WAINWRIGHT


In [57]:
# remove all hawaii and alaska states
stations = stations[stations.state != 'HI']
stations = stations[stations.state != 'AK']
stations.groupby('state').count()

Unnamed: 0_level_0,lat,long,id,1,2
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AL,19,19,19,19,19
AR,18,18,18,18,18
AZ,37,37,37,37,37
CA,53,53,53,53,53
CO,38,38,38,38,38
CT,5,5,5,5,5
DE,3,3,3,3,3
FL,27,27,27,27,27
GA,16,16,16,16,16
IA,17,17,17,17,17


Use the API to get weather data

In [58]:
# Remove states that have already been processed
stations = stations[stations.state != 'AL']
stations = stations[stations.state != 'FL']
stations = stations[stations.state != 'LA']
stations = stations[stations.state != 'MS']
stations = stations[stations.state != 'TX']
stations = stations[stations.state != 'WV']
stations = stations[stations.state != 'AR']
stations = stations[stations.state != 'CA']
stations = stations[stations.state != 'AZ']
stations = stations[stations.state != 'GA']
stations = stations[stations.state != 'NC']
stations = stations[stations.state != 'NM']
stations = stations[stations.state != 'NV']
stations = stations[stations.state != 'OK']
stations = stations[stations.state != 'SC']
stations = stations[stations.state != 'TN']
stations = stations[stations.state != 'UT']
stations = stations[stations.state != 'IL']
stations = stations[stations.state != 'IN']
stations = stations[stations.state != 'KS']
stations = stations[stations.state != 'KY']
stations = stations[stations.state != 'MO']
stations = stations[stations.state != 'VA']
stations = stations[stations.state != 'CT']
stations = stations[stations.state != 'DE']
stations = stations[stations.state != 'IA']
stations = stations[stations.state != 'ID']
stations = stations[stations.state != 'MA']
stations = stations[stations.state != 'ME']
stations = stations[stations.state != 'MI']
stations = stations[stations.state != 'NE']
stations = stations[stations.state != 'NJ']
stations = stations[stations.state != 'NY']
stations = stations[stations.state != 'OH']
stations = stations[stations.state != 'OR']
stations = stations[stations.state != 'PA']
stations = stations[stations.state != 'SD']
stations = stations[stations.state != 'WI']
stations = stations[stations.state != 'WY']
stations = stations[stations.state != 'MT']
stations = stations[stations.state != 'ND']
stations = stations[stations.state != 'NH']
stations = stations[stations.state != 'VT']
stations = stations[stations.state != 'WA']

# This one isn't done!! get an error at USC00055970
#stations = stations[stations.state != 'CO']

# This one isn't done!! get an error at USC00181790
#stations = stations[stations.state != 'MD']

# This one isn't done!! get an error at USC00210643, 2016
stations = stations[stations.state != 'MN'] 


In [59]:
for state in stations.state.unique():
    print(state)

    curr_state = stations[stations.state == state]
    for id_value in curr_state.id: 
        print(id_value)

CO
USC00052441
USC00056258
USC00050128
USC00058436
USC00058429
USC00051268
USC00058793
USC00055970
USC00051609
USC00050797
USC00051964
USC00051294
USC00051539
USC00052446
USC00054076
USC00051772
USC00050214
USC00050372
USC00050263
USC00050454
USC00052668
USC00052932
USC00050306
USC00055420
USC00051932
USC00053359
USC00051186
USC00050848
USC00051179
USC00050109
USC00054242
USC00055446
USR0000CDRY
USC00053446
USC00051060
USC00050945
USC00051996
USC00054082
MD
USC00185865
USC00187330
USC00188380
USC00186620
USC00188207
USC00180700
USC00182282
USC00181790
USC00182906


In [60]:
for state in stations.state.unique():
    print(state)

    curr_state = stations[stations.state == state]
    state_df = pd.DataFrame()
    for id_value in curr_state.id: 
        max_dates_temp = []
        min_dates_temp = []
        max_temps = []
        min_temps = []
        print(id_value)

        #for each year from 2015-2019 ...
        for year in range(2012, 2018):
            year = str(year)
            print('working on year '+year)

            # API call for max temps
            #r = requests.get('https://www.ncdc.noaa.gov/cdo-web/api/v2/data?datasetid=GHCND&stationid=GHCND:USC00210643&datatypeid=TMAX&limit=1000&startdate=2016-01-01&enddate=2016-12-31', headers={'token':Token})
            r = requests.get('https://www.ncdc.noaa.gov/cdo-web/api/v2/data?datasetid=GHCND&stationid=GHCND:'+id_value+'&datatypeid=TMAX&limit=1000&startdate='+year+'-01-01&enddate='+year+'-12-31', headers={'token':Token})
            d = json.loads(r.text)

            if d!={}:
                max_temps_item = [item for item in d['results'] if item['datatype']=='TMAX']
                max_dates_temp += [item['date'] for item in max_temps_item]
                max_temps += [item['value'] for item in max_temps_item]

            # API call for min temps
            r = requests.get('https://www.ncdc.noaa.gov/cdo-web/api/v2/data?datasetid=GHCND&datatypeid=TMIN&limit=1000&stationid=GHCND:'+id_value+'&startdate='+year+'-01-01&enddate='+year+'-12-31', headers={'token':Token})
            d = json.loads(r.text)
            if d!={}:
                min_temps_item = [item for item in d['results'] if item['datatype']=='TMIN']
                min_dates_temp += [item['date'] for item in min_temps_item]
                min_temps += [item['value'] for item in min_temps_item]

        df_max = pd.DataFrame()
        df_max['date'] = [datetime.strptime(d, "%Y-%m-%dT%H:%M:%S") for d in max_dates_temp]
        df_max['maxTemp'] = [float(v)/10.0*1.8 + 32 for v in max_temps]
        df_max['id'] = id_value
        df_min = pd.DataFrame()
        df_min['date'] = [datetime.strptime(d, "%Y-%m-%dT%H:%M:%S") for d in min_dates_temp]
        df_min['minTemp'] = [float(v)/10.0*1.8 + 32 for v in min_temps]

        state_df = pd.concat([state_df, pd.merge(df_max, df_min, on='date', how='outer')])
    state_df.groupby(['date']).median().reset_index()

    state_df.to_csv(state + '.csv')

CO
USC00052441
working on year 2012
working on year 2013
working on year 2014
working on year 2015
working on year 2016
working on year 2017
USC00056258
working on year 2012
working on year 2013
working on year 2014
working on year 2015
working on year 2016
working on year 2017


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




USC00050128
working on year 2012
working on year 2013
working on year 2014
working on year 2015
working on year 2016
working on year 2017
USC00058436
working on year 2012
working on year 2013
working on year 2014
working on year 2015
working on year 2016
working on year 2017
USC00058429
working on year 2012
working on year 2013
working on year 2014
working on year 2015
working on year 2016
working on year 2017
USC00051268
working on year 2012
working on year 2013
working on year 2014
working on year 2015
working on year 2016
working on year 2017
USC00058793
working on year 2012
working on year 2013
working on year 2014
working on year 2015
working on year 2016
working on year 2017
USC00055970
working on year 2012
working on year 2013
working on year 2014
working on year 2015
working on year 2016
working on year 2017
USC00051609
working on year 2012
working on year 2013
working on year 2014
working on year 2015
working on year 2016
working on year 2017
USC00050797
working on year 2012
w

In [2]:
# Find temperature by region 
states = {'FRCC': ['fl'], 
                  'WECC': ['ca','or','wa', 'nv','mt','id','wy','ut','co','az','nm','tx'],
                  'SPP' : ['nm','ks','tx','ok','la','ar','mo'],
                  'RFC' : ['wi','mi','il','in','oh','ky','wv','va','md','pa','nj'],
                  'NPCC' : ['ny','ct','de','ri','ma','vt','nh','me'],
                  'SERC' : ['mo','ar','tx','la','ms','tn','ky','il','va','al','fl','ga','sc','nc'],
                  'MRO': ['ia','il','mi','mn','mo','mt','nd','ne','sd','wi','wy'], 
                  'TRE': ['ok','tx']}

In [33]:
for region in states:
    print("Calculating for " + region)
    region_df = pd.DataFrame()
    for s in states[region]:
        try:
            df1 = pd.read_csv("results/" + s.upper() + ".csv")
            print("processing " + s)
            df1 = df1.groupby("date").median()
            df1 = df1[['maxTemp','minTemp']]
            
            # combine with what the file contains
            region_df = pd.concat((region_df, df1))
            region_df.groupby(region_df.index).median()
        except IOError:
            print(s)
    
    # save as csv 
    region_df.to_csv("nerc_results/" + region + ".csv")

Calculating for FRCC
processing fl
Calculating for WECC
processing ca
processing or
processing wa
processing nv
processing mt
processing id
processing wy
processing ut
processing co
processing az
processing nm
processing tx
Calculating for SPP
processing nm
processing ks
processing tx
processing ok
processing la
processing ar
processing mo
Calculating for RFC
processing wi
processing mi
processing il
processing in
processing oh
processing ky
processing wv
processing va
processing md
processing pa
processing nj
Calculating for NPCC
processing ny
processing ct
processing de
ri
processing ma
processing vt
processing nh
processing me
Calculating for SERC
processing mo
processing ar
processing tx
processing la
processing ms
processing tn
processing ky
processing il
processing va
processing al
processing fl
processing ga
processing sc
processing nc
Calculating for MRO
processing ia
processing il
processing mi
processing mn
processing mo
processing mt
processing nd
processing ne
processing sd

In [22]:
df1 = pd.read_csv("results/AL.csv")
df2 = pd.read_csv("results/AR.csv")

df1 = df1.groupby("date").median()
df1 = df1[['maxTemp','minTemp']]

df2 = df2.groupby("date").median()
df2 = df2[['maxTemp','minTemp']]

df3 = pd.concat((df1, df2))
df3.groupby(df3.index).median()


Unnamed: 0_level_0,maxTemp,minTemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-01-01,66.290,38.525
2012-01-02,64.040,34.520
2012-01-03,48.245,23.540
2012-01-04,44.510,20.930
2012-01-05,57.020,25.970
...,...,...
2017-12-27,45.500,26.060
2017-12-28,38.030,23.000
2017-12-29,38.480,23.450
2017-12-30,47.795,26.240
