In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string

# NOAA weather data
This script investigates two datasets: The Local Climatological Data and the Daily Summaries Data
## LCD dataset
Key column is REPORT_TYPE - this states what time frame the weather is reported over

In [4]:
# Folder holding the raw NOAA weather extracts (user-specific shared-drive path).
PATH = '~/Dropbox (GaTech)/CDS-2019-AlbanyHub/Raw-Data/Weather/'
# The LCD export is split across two CSV files; the trailing notes give the
# date range the author recorded for each file.
df = pd.read_csv(PATH+"noaa_LCD.csv", sep=',') # 1-1-2010 TO 5-29-2019
df2 = pd.read_csv(PATH+"noaa_LCD_2.csv", sep=',') #1-1-2001 TO 12-31-2009
# Stack the 2001-2009 file ahead of the 2010-2019 file and renumber the rows
# 0..n-1 so the combined frame has one continuous index.
df = pd.concat([df2,df], ignore_index=True)
#df.loc[21861:22361, ['DATE', 'REPORT_TYPE']]
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,STATION,DATE,REPORT_TYPE,SOURCE,AWND,BackupDirection,BackupDistance,BackupDistanceUnit,BackupElements,BackupElevation,...,ShortDurationPrecipitationValue060,ShortDurationPrecipitationValue080,ShortDurationPrecipitationValue100,ShortDurationPrecipitationValue120,ShortDurationPrecipitationValue150,ShortDurationPrecipitationValue180,Sunrise,Sunset,TStorms,WindEquipmentChangeDate
0,72216013869,2001-01-01T23:59:00,SOD,6,,,,,,,...,,,,,,,736.0,1745.0,,2007-05-31
1,72216013869,2001-01-02T23:59:00,SOD,6,,,,,,,...,,,,,,,737.0,1745.0,,2007-05-31
2,72216013869,2001-01-03T23:59:00,SOD,6,,,,,,,...,,,,,,,737.0,1746.0,,2007-05-31
3,72216013869,2001-01-04T23:59:00,SOD,6,,,,,,,...,,,,,,,737.0,1747.0,,2007-05-31
4,72216013869,2001-01-05T23:59:00,SOD,6,,,,,,,...,,,,,,,737.0,1748.0,,2007-05-31


## Report type Summary of Day (SOD)
Column Types: 

In [5]:
# Keep only the Summary-of-Day rows. REPORT_TYPE is stripped before comparing
# because the raw values carry trailing padding.
daysum = df.loc[df['REPORT_TYPE'].str.strip().eq('SOD')].copy()
# Retain the columns that hold at least one truthy value in the SOD rows;
# columns that are entirely missing there are dropped.
columns = [name for name in daysum.columns if daysum[name].any()]
daysum = daysum[columns]
# Show the second SOD record as an example of the remaining fields.
daysum.iloc[1, :]

STATION                                                                             72216013869
DATE                                                                        2001-01-02T23:59:00
REPORT_TYPE                                                                               SOD  
SOURCE                                                                                        6
DailyAverageDryBulbTemperature                                                               32
DailyAverageStationPressure                                                               30.24
DailyAverageWindSpeed                                                                       5.5
DailyCoolingDegreeDays                                                                        0
DailyDepartureFromNormalAverageTemperature                                                -17.1
DailyHeatingDegreeDays                                                                       33
DailyMaximumDryBulbTemperature          

#### Explanation of some of these fields
<ul>
 <li>DailyAverageDryBulbTemperature - Average Temperature
 <li>DailyAverageStationPressure - Daily average station pressure (in inches of mercury, to hundredths)  
 <li>DailyCoolingDegreeDays - Measure of how hard cooling systems work
     <blockquote>https://www.weather.gov/key/climate_heat_cool — Degree days are based on the assumption that when the outside temperature is 65°F, we don't need heating or cooling to be comfortable. Degree days are the difference between the daily temperature mean (high temperature plus low temperature, divided by two) and 65°F.</blockquote>
 <li>DailyDepartureFromNormalAverageTemperature - Average temperature’s departure from (1981-2010) normal temperature (in whole
Fahrenheit degrees using “-” to indicate below normal)
<li>DailyHeatingDegreeDays - Measure of how hard heating systems work
<li>DailyPrecipitation - Water equivalent amount of precipitation for the day (in inches
to hundredths). This is all types of precipitation (melted and frozen). T indicates trace amount of precipitation.
If left blank, precipitation amount is unreported.
<li>DailySnowfall: Daily amount of snowfall (in inches to the tenths). T indicates trace amount.


In [8]:
#reformat with month and day as separate columns
def formatdateD(series):
    """Return the day-of-month of a 'YYYY-MM-DD[Thh:mm:ss]' string as an int.

    Despite its name, `series` is a single date string; the function is
    applied element-wise through ``Series.apply``.
    """
    day_field = series.split('-')[2]
    # Discard an optional 'T<time>' suffix before converting.
    return int(day_field.partition('T')[0])
def formatdateM(series):
    """Return the month of a 'YYYY-MM-DD[Thh:mm:ss]' string as an int.

    Despite its name, `series` is a single date string; the function is
    applied element-wise through ``Series.apply``.
    """
    month_field = series.split('-')[1]
    return int(month_field)
def formatdateY(series):
    """Return the year of a 'YYYY-MM-DD[Thh:mm:ss]' string as an int.

    Despite its name, `series` is a single date string; the function is
    applied element-wise through ``Series.apply``.
    """
    year_field = series.split('-')[0]
    return int(year_field)
#Strip uncertainty value 's' from daily temp records
def asciistrip(series):
    """Convert one raw record value to ``float``, tolerating a trailing flag.

    Despite its name, `series` is a single cell value (the function is applied
    element-wise). Values that ``float()`` accepts directly -- numbers, numeric
    strings, NaN -- are returned as floats. Otherwise the last character is
    assumed to be a one-letter flag such as the suspect marker 's' and the
    remainder is converted.

    Raises:
        ValueError: if the value is still not numeric once the last character
            is removed (for example a lone letter).
        TypeError: if the value is neither numeric nor sliceable text.
    """
    try:
        return float(series)
    except (TypeError, ValueError):
        # Only conversion failures trigger the retry; a bare ``except`` would
        # also swallow KeyboardInterrupt and SystemExit.
        return float(series[:-1])
# Split the DATE string into integer MONTH, YEAR and DAY columns so individual
# dates can be looked up with simple equality filters.
for new_col, extract in (('MONTH', formatdateM), ('YEAR', formatdateY), ('DAY', formatdateD)):
    daysum[new_col] = daysum['DATE'].apply(extract)

In [9]:
#day = daysum.loc[(daysum['YEAR']==y)&(daysum['MONTH']==m)&(daysum['DAY']==d), :]
#np.isnan(day['DailyAverageDryBulbTemperature'].apply(asciistrip)).iloc[0]

#### Check to see how many SOD are missing

In [10]:
# Month lengths for ordinary and leap years (index 0 = January).
days_per_month = [31,28,31,30,31,30,31,31,30,31,30,31]
days_per_month_ly =[31,29,31,30,31,30,31,31,30,31,30,31]

# Collect every (month, day, year) in 2001-2018 that either has no SOD row or
# whose daily average temperature is NaN.
missing_dates = []
for y in range(2001, 2019):
    # Full Gregorian leap-year rule, so the scan stays correct if the year
    # range is ever widened past a century boundary.
    is_leap_year = (y % 4 == 0) and (y % 100 != 0 or y % 400 == 0)
    month_lengths = days_per_month_ly if is_leap_year else days_per_month
    for m in range(1,13):
        # range() excludes its stop value, so +1 is required to include the
        # last day of the month (previously that day was never checked).
        for d in range(1, month_lengths[m-1] + 1):
            day = daysum.loc[(daysum['YEAR']==y)&(daysum['MONTH']==m)&(daysum['DAY']==d), :]
            # A date is missing when no row exists, or when the first matching
            # row's temperature parses to NaN.
            if (len(day) == 0) or np.isnan(day['DailyAverageDryBulbTemperature'].apply(asciistrip)).iloc[0]:
                missing_dates.append((m,d,y))
                

In [18]:
# Display the (month, day, year) tuples collected by the SOD scan.
missing_dates

[(2, 4, 2001),
 (2, 5, 2001),
 (3, 27, 2001),
 (7, 27, 2001),
 (8, 23, 2001),
 (8, 28, 2001),
 (4, 1, 2002),
 (4, 2, 2002),
 (4, 20, 2002),
 (7, 10, 2003),
 (9, 4, 2003),
 (9, 5, 2003),
 (9, 18, 2003),
 (9, 19, 2003),
 (2, 28, 2004),
 (5, 15, 2004),
 (5, 16, 2004),
 (5, 26, 2004),
 (6, 8, 2004),
 (7, 7, 2004),
 (7, 8, 2004),
 (7, 9, 2004),
 (9, 5, 2004),
 (6, 7, 2006),
 (8, 20, 2010),
 (10, 20, 2010),
 (10, 21, 2010),
 (2, 28, 2012),
 (2, 12, 2013),
 (2, 13, 2013),
 (9, 3, 2014),
 (9, 6, 2014),
 (10, 10, 2018),
 (10, 11, 2018),
 (10, 12, 2018)]

## Report Type Summary of Month (SOM):
Column Types

In [11]:
# Keep only the Summary-of-Month rows (REPORT_TYPE is stripped before the
# comparison, as for the SOD subset).
monthsum0 = df.loc[df['REPORT_TYPE'].str.strip().eq('SOM')].copy()
# Retain the columns holding at least one truthy value in the SOM rows.
# Note: this rebinds the notebook-level `columns` list.
columns = [name for name in monthsum0.columns if monthsum0[name].any()]
monthsum = monthsum0[columns].copy()

In [12]:
# Show one raw value from column position 36 of the second SOM row; the
# author's inline note labels that column REM.
# NOTE(review): the lookup is positional -- confirm position 36 is still REM
# if the set of retained columns changes.
monthsum.iloc[1, 36] #REM

'SOM639PCP MTOT: 0.75 DEPNRM: 0.00 PCP GT 24HRS: 0.35 DATE(S):10 DAYS W/PCP >=.01:15 DAYS W/PCP >=.10: 2 DAYS W/PCP >=1.00:  MSDP AMTS W/DATE-TIME TAGS:MIN:5  .08  21/2357 MIN:10  .11  21/2359 MIN:15  .13  22/0003 MIN:20  .14  22/0009 MIN:30  .14  22/0009 MIN:45  .17  10/0811 MIN:60  .17  10/0811 MIN:80  .20  10/0811 MIN:100  .21  10/0742 MIN:120  .28  10/0811 MIN:150  .34  10/0811 MIN:180  .35  10/0811 SN GT DP ON GRND:0 DATE(S):0  AVG DLY MIN: 47.3 AVG DLY MAX: 69.8 AVG MLY TMP: 58.6 AVG STP:29.96  AVG SLP:30.23  LWST SLP:29.90 DATE/TIME:170149 HGST SLP:30.50 DATE/TIME:181010 HDD MTH TOT: 182 DEP NORM:-255 CDD MTH TOT:  20 DEP NORM:-255'

#### Summary of Month columns
Key:
<br>s = suspect value (appears with value)
<br>T = trace precipitation amount or snow depth (an amount too small to measure, usually < 0.005 inches water
    equivalent) (appears instead of numeric value)
<br>M = missing value (appears instead of value)
<br>Blank = value is unreported (appears instead of value)
<br>Columns:
<ul> 
    <li><b>AWND</b> - Average wind speed in mph
    <li><b>CDSD</b> - Cooling Degree days Season to Date
    <li><b>CLDD</b> - Monthly Total Cooling Degree Days
    <li><b>HDSD</b> - Heating Degree days Season to Date
    <li><b>HTDD</b> - Monthly Total Heating Degree Days
    <li><b>MonthlyDaysWithGT001Precip</b> - Number of days with >=0.01 inches of precipitation
    <li><b>MonthlyDaysWithGT010Precip</b> - Number of days with >=0.1 inches of precipitation
    <li><b>MonthlyDaysWithGT32Temp</b> - uncertain (Number of days where the min temperature is less than 32 ?)
    <li><b>MonthlyDaysWithGT90Temp</b> - Number of days where the max temperature is greater than 90
    <li><b>MonthlyDaysWithLT32Temp</b> - Number of days where the max temperature is less than 32
    <li><b>MonthlyDepartureFromNormalAverageTemperature</b> - the average temperature for the month minus the average for 1981-2010
    <li><b>MonthlyDepartureFromNormalCoolingDegreeDays</b> - the number of cooling days for this month minus the average for 1981-2010
        <li><b>MonthlyDepartureFromNormalHeatingDegreeDays</b> - the number of heating days for this month minus the average for 1981-2010 <li><b>MonthlyDepartureFromNormalMaximumTemperature</b> - the average max temperature for this month minus the average for 1981-2010
<li><b>MonthlyDepartureFromNormalMinimumTemperature</b> - the average min temperature for this month minus the average for 1981-2010
    <li><b>MonthlyDepartureFromNormalPrecipitation</b> - the average precipitation (melted and frozen, in inches) for this month minus the average for 1981-2010
    <li><b>MonthlyGreatestPrecip</b> - Greatest 24 hour precipitation in inches
    <li><b>MonthlyGreatestPrecipDate</b> - Day of month when greatest precipitation occurred
 <li>'MonthlyMaxSeaLevelPressureValue - (self explanatory)
 <li>'MonthlyMaxSeaLevelPressureValueDate',
 <li>'MonthlyMaxSeaLevelPressureValueTime',
 <li>'MonthlyMaximumTemperature',
 <li>'MonthlyMeanTemperature',
 <li>'MonthlyMinSeaLevelPressureValue',
 <li>'MonthlyMinSeaLevelPressureValueDate',
 <li>'MonthlyMinSeaLevelPressureValueTime',
 <li>'MonthlyMinimumTemperature',
 <li>'MonthlySeaLevelPressure',
 <li>'MonthlyStationPressure',
    <li><b>MonthlyTotalLiquidPrecipitation</b> - Total precipitation (melted and frozen, in inches) over the month
    <li><b>NormalsCoolingDegreeDay</b> - Reference Cooling Degree Days (ie avg number of cooling degree days for this month from 1981-2010)
    <li><b>NormalsHeatingDegreeDay</b> - Reference Heating Degree Days (ie avg number of heating degree days for this month from 1981-2010)
    <li><b>ShortDurationPrecipitationValue_XX_</b> - Maximum amount of precipitation occurring within XX minutes during the month
    <li><b>ShortDurationEndDate_XX_</b> - Time at which this large burst of precipitation ended
    <li><b>REM</b> - ?

In [14]:

# Work on a positional slice of the retained SOM columns.
SOM = monthsum[columns[1:35]].copy()

SOM['MONTH'] = SOM['DATE'].apply(formatdateM)
SOM['YEAR'] = SOM['DATE'].apply(formatdateY)

# Find every (month, year) in 2001-2018 that has no SOM report. The list is
# (re)created here so the cell runs on a fresh kernel and gives the same
# result when re-run.
reported_months = set(zip(SOM['MONTH'], SOM['YEAR']))
missing_months = [
    (m, y)
    for y in range(2001, 2019)
    for m in range(1, 13)
    if (m, y) not in reported_months
]

# Fix missing values: add one otherwise-empty placeholder report per missing
# month. All placeholders are built first and concatenated once, rather than
# growing the frame row by row with DataFrame.append.
if missing_months:
    placeholders = pd.DataFrame(
        [{'YEAR': y, 'MONTH': m, 'REPORT_TYPE': 'SOM'} for m, y in missing_months]
    )
    SOM = pd.concat([SOM, placeholders], ignore_index=True)

# Renumber rows 0..n-1 regardless of whether placeholders were added.
SOM = SOM.reset_index(drop=True)

#### Fix missing monthly weather values using SOD

In [16]:

#Get Cooling Degree Days
def CDD_monthly(series):
    """Return the cooling degree days for one daily mean temperature.

    Despite its name, `series` is a single temperature value. The result is
    the amount by which it exceeds the 65-degree base, or 0 when it does not.
    """
    return 0 if series <= 65 else series - 65
#Get Heating Degree Days
def HDD_monthly(series):
    """Return the heating degree days for one daily mean temperature.

    Despite its name, `series` is a single temperature value. The result is
    the amount by which it falls short of the 65-degree base, or 0 when it
    does not.
    """
    return 0 if series >= 65 else 65 - series

In [23]:
# Sanity check: rebuild the January 2001 monthly figures from the daily (SOD)
# records and print them next to the values in the official SOM report.
days = daysum[(daysum['MONTH']==1) & (daysum['YEAR']==2001)]
# Parse the daily average temperature once; it feeds three of the statistics.
daily_avg_temp = days['DailyAverageDryBulbTemperature'].apply(asciistrip)
vals = [
    days['DailyMaximumDryBulbTemperature'].apply(asciistrip).mean(),
    days['DailyMinimumDryBulbTemperature'].apply(asciistrip).mean(),
    daily_avg_temp.mean(),
    days['DailyAverageWindSpeed'].apply(asciistrip).mean(),
    # Degree days are computed per day and then summed over the month.
    daily_avg_temp.apply(CDD_monthly).sum(),
    daily_avg_temp.apply(HDD_monthly).sum(),
]
print(vals)

temp = SOM[(SOM['MONTH']==1) & (SOM['YEAR']==2001)]
som_columns = ['MonthlyMaximumTemperature', 'MonthlyMinimumTemperature', 'MonthlyMeanTemperature', 'AWND', 'CLDD', 'HTDD']
# Take the first matching report by position; a label lookup (.loc[0]) only
# works when that report happens to carry row label 0.
print(list(temp[som_columns].iloc[0]))

[59.25806451612903, 32.903225806451616, 46.25806451612903, 5.6096774193548375, 5.0, 586.0]
[59.3, 32.9, 46.1, 5.6, 4.0, 591.0]


In [24]:
# Number of monthly reports that lack a maximum temperature.
missing_ct = SOM['MonthlyMaximumTemperature'].isnull().sum()
# Snapshot of those reports (one row per month to rebuild from SOD data).
missing_cols = SOM.loc[SOM['MonthlyMaximumTemperature'].isnull(), :]

# SOM column -> SOD column whose monthly mean replaces it.
daily_mean_sources = {
    'MonthlyMaximumTemperature': 'DailyMaximumDryBulbTemperature',
    'MonthlyMinimumTemperature': 'DailyMinimumDryBulbTemperature',
    'MonthlyMeanTemperature': 'DailyAverageDryBulbTemperature',
    'AWND': 'DailyAverageWindSpeed',
}
for vals in missing_cols.itertuples():
    # Daily records belonging to the month being rebuilt.
    days = daysum[(daysum['MONTH'] == vals.MONTH) & (daysum['YEAR'] == vals.YEAR)]
    for som_col, sod_col in daily_mean_sources.items():
        SOM.loc[vals.Index, som_col] = days[sod_col].apply(asciistrip).mean()
    # Degree days: per-day values from the daily average, summed over the month.
    daily_avg = days['DailyAverageDryBulbTemperature'].apply(asciistrip)
    SOM.loc[vals.Index, 'CLDD'] = daily_avg.agg(CDD_monthly).sum()
    SOM.loc[vals.Index, 'HTDD'] = daily_avg.agg(HDD_monthly).sum()

# Order chronologically, renumber the rows, and keep only the columns used in
# the rest of the analysis.
SOM = SOM.sort_values(by=['YEAR', 'MONTH'])
SOM.index = range(len(SOM))
SOM = SOM[['YEAR', 'MONTH', 'DATE', 'MonthlyMaximumTemperature', 'MonthlyMinimumTemperature', 'MonthlyMeanTemperature', 'AWND', 'CLDD', 'HTDD']]

In [26]:
# Write the cleaned monthly table to LCD_month.csv in the working directory
# (the row index is written as the first, unnamed column).
SOM.to_csv("LCD_month.csv")

Unnamed: 0,DATE,REPORT_TYPE,SOURCE,AWND,CDSD,CLDD,HDSD,HTDD,MonthlyDaysWithGT001Precip,MonthlyDaysWithGT010Precip,...,MonthlyMinSeaLevelPressureValue,MonthlyMinSeaLevelPressureValueDate,MonthlyMinSeaLevelPressureValueTime,MonthlyMinimumTemperature,MonthlySeaLevelPressure,MonthlyStationPressure,MonthlyTotalLiquidPrecipitation,NormalsCoolingDegreeDay,MONTH,YEAR
114,2010-09-30T23:59:00,SOM,6.0,3.8,2726.0,454.0,0.0,0.0,,,...,,,,,,,,383.0,9,2010
141,2013-01-31T23:59:00,SOM,6.0,6.0,26.0,26.0,943.0,276.0,,,...,,,,,,,,4.0,1,2013
166,2015-02-28T23:59:00,SOM,6.0,6.5,2.0,0.0,1753.0,485.0,,,...,,,,,,,,5.0,2,2015
217,,SOM,,,,,,,,,...,,,,,,,,,12,2003
218,,SOM,,,,,,,,,...,,,,,,,,,12,2005
219,,SOM,,,,,,,,,...,,,,,,,,,12,2011


In [32]:
# Spot-check one month (January 2013) in the cleaned table, then display the
# snapshot of reports that were missing a maximum temperature.
print(SOM[(SOM['MONTH']==1)&(SOM['YEAR']==2013)])
missing_cols

     YEAR  MONTH                 DATE  MonthlyMaximumTemperature  \
144  2013      1  2013-01-31T23:59:00                  68.774194   

     MonthlyMinimumTemperature  MonthlyMeanTemperature      AWND  CLDD   HTDD  
144                  45.129032                57.16129  6.070968  27.0  270.0  


Unnamed: 0,DATE,REPORT_TYPE,SOURCE,AWND,CDSD,CLDD,HDSD,HTDD,MonthlyDaysWithGT001Precip,MonthlyDaysWithGT010Precip,...,MonthlyMinSeaLevelPressureValue,MonthlyMinSeaLevelPressureValueDate,MonthlyMinSeaLevelPressureValueTime,MonthlyMinimumTemperature,MonthlySeaLevelPressure,MonthlyStationPressure,MonthlyTotalLiquidPrecipitation,NormalsCoolingDegreeDay,MONTH,YEAR
114,2010-09-30T23:59:00,SOM,6.0,3.8,2726.0,454.0,0.0,0.0,,,...,,,,,,,,383.0,9,2010
141,2013-01-31T23:59:00,SOM,6.0,6.0,26.0,26.0,943.0,276.0,,,...,,,,,,,,4.0,1,2013
166,2015-02-28T23:59:00,SOM,6.0,6.5,2.0,0.0,1753.0,485.0,,,...,,,,,,,,5.0,2,2015
217,,SOM,,,,,,,,,...,,,,,,,,,12,2003
218,,SOM,,,,,,,,,...,,,,,,,,,12,2005
219,,SOM,,,,,,,,,...,,,,,,,,,12,2011


## Daily summaries data
The documentation for the columns is available at https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/GHCND_documentation.pdf
The most important columns are:
<ul>
<li>PRCP = Precipitation (mm or inches as per user preference, inches to hundredths on Daily Form pdf file)
<li>SNOW = Snowfall (mm or inches as per user preference, inches to tenths on Daily Form pdf file)
<li>SNWD = Snow depth (mm or inches as per user preference, inches on Daily Form pdf file)
<li>TMAX = Maximum temperature (Fahrenheit or Celsius as per user preference, Fahrenheit to tenths on Daily Form pdf file)
<li>TMIN = Minimum temperature 

In [20]:
# Load the Daily Summaries export. Note: this rebinds `df`, which held the
# LCD data earlier in the notebook.
df = pd.read_csv("./data/noaa_daily_summaries.csv", sep=',')
core_attributes = ['STATION', 'NAME', 'LATITUDE', 'LONGITUDE','ELEVATION', 'DATE','PRCP', 'SNOW', 'SNWD', 'TMAX', 'TMIN']
# Keep only the rows that report a maximum or a minimum temperature (or both),
# restricted to the core attribute columns.
has_temperature = df['TMAX'].notnull() | df['TMIN'].notnull()
df = df.loc[has_temperature, core_attributes]
df.shape

  interactivity=interactivity, compiler=compiler, result=result)


(20962, 11)

#### Number of unique stations and dates

In [21]:
# Number of retained rows per station ID (despite the name, this is a
# per-station row count, not just the list of stations).
unique_stations = df['STATION'].value_counts()
unique_stations

USC00090140    8505
USW00013869    6683
USC00095061    5774
Name: STATION, dtype: int64

In [22]:
# Count of distinct DATE values across all retained stations.
dates_represented = df['DATE'].nunique()
dates_represented

8863

In [23]:
# One independent copy of the daily records per station ID.
stat1, stat2, stat3 = (
    df[df['STATION'] == station_id].copy()
    for station_id in ('USC00090140', 'USW00013869', 'USC00095061')
)

In [24]:
# Look up the station-2 record for a single date (2011-12-22).
stat2[stat2['DATE']=='2011-12-22']

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,PRCP,SNOW,SNWD,TMAX,TMIN
29025,USW00013869,"ALBANY SW GEORGIA REGIONAL AIRPORT, GA US",31.53556,-84.19444,57.9,2011-12-22,0.43,0.0,0.0,80.0,66.0


Locations of these weather stations: (looks like station 3 may not be useful!)
<br>Station 2 may be the same as from the previous data set
<img src="weather_stations.png">

#### Check the daily summaries data for missing values

In [26]:
# Split DATE into integer MONTH, YEAR and DAY columns, mirroring the layout
# used for the SOD records.
for new_col, extract in (('MONTH', formatdateM), ('YEAR', formatdateY), ('DAY', formatdateD)):
    stat2[new_col] = stat2['DATE'].apply(extract)
stat2.tail()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,PRCP,SNOW,SNWD,TMAX,TMIN,MONTH,YEAR,DAY
31732,USW00013869,"ALBANY SW GEORGIA REGIONAL AIRPORT, GA US",31.53556,-84.19444,57.9,2019-05-24,0.0,0.0,0.0,96.0,67.0,5,2019,24
31733,USW00013869,"ALBANY SW GEORGIA REGIONAL AIRPORT, GA US",31.53556,-84.19444,57.9,2019-05-25,0.0,0.0,0.0,99.0,75.0,5,2019,25
31734,USW00013869,"ALBANY SW GEORGIA REGIONAL AIRPORT, GA US",31.53556,-84.19444,57.9,2019-05-26,0.0,0.0,0.0,101.0,72.0,5,2019,26
31735,USW00013869,"ALBANY SW GEORGIA REGIONAL AIRPORT, GA US",31.53556,-84.19444,57.9,2019-05-27,0.0,0.0,0.0,99.0,75.0,5,2019,27
31736,USW00013869,"ALBANY SW GEORGIA REGIONAL AIRPORT, GA US",31.53556,-84.19444,57.9,2019-05-28,0.0,0.0,0.0,99.0,74.0,5,2019,28


In [27]:
# Month lengths for ordinary and leap years (index 0 = January).
days_per_month = [31,28,31,30,31,30,31,31,30,31,30,31]
days_per_month_ly =[31,29,31,30,31,30,31,31,30,31,30,31]

# Collect every (month, day, year) in 2001-2018 for which station 2 has no
# record or a NaN maximum temperature.
missing_dates2 = []
for y in range(2001, 2019):
    # Full Gregorian leap-year rule, so the scan stays correct if the year
    # range is ever widened past a century boundary.
    is_leap_year = (y % 4 == 0) and (y % 100 != 0 or y % 400 == 0)
    month_lengths = days_per_month_ly if is_leap_year else days_per_month
    for m in range(1,13):
        # range() excludes its stop value, so +1 is required to include the
        # last day of the month (previously that day was never checked).
        for d in range(1, month_lengths[m-1] + 1):
            day = stat2.loc[(stat2['YEAR']==y)&(stat2['MONTH']==m)&(stat2['DAY']==d), :]
            if (len(day) == 0) or np.isnan(day['TMAX'].apply(asciistrip)).iloc[0]:
                missing_dates2.append((m,d,y))
missing_dates2

[(2, 4, 2001),
 (2, 5, 2001),
 (3, 27, 2001),
 (7, 27, 2001),
 (8, 23, 2001),
 (8, 28, 2001),
 (4, 1, 2002),
 (4, 2, 2002),
 (4, 20, 2002),
 (7, 10, 2003),
 (9, 4, 2003),
 (9, 5, 2003),
 (9, 18, 2003),
 (9, 19, 2003),
 (5, 15, 2004),
 (5, 16, 2004),
 (5, 26, 2004),
 (6, 8, 2004),
 (7, 7, 2004),
 (7, 8, 2004),
 (7, 9, 2004),
 (9, 5, 2004),
 (6, 7, 2006),
 (8, 20, 2010),
 (10, 20, 2010),
 (10, 21, 2010),
 (2, 28, 2012),
 (2, 12, 2013),
 (2, 13, 2013),
 (9, 3, 2014),
 (9, 6, 2014),
 (10, 10, 2018),
 (10, 11, 2018),
 (10, 12, 2018)]

In [None]:

# Group station-2 daily records by calendar month and average the daily
# maximum and minimum temperatures within each (YEAR, MONTH) group.
gb_month = stat2.groupby(['YEAR', 'MONTH'])
mns = gb_month[['TMAX', 'TMIN']].mean()

In [None]:
# Non-null TMAX observations per (YEAR, MONTH) group.
cts = gb_month['TMAX'].count() #Number of records for each month

In [None]:
# Number of (year, month) groups and number of distinct years in the means.
record_count = len(mns)
year_count = len(mns.index.levels[0])
# Read the year and month labels straight from the groupby index instead of
# synthesising twelve months for every year; the frame then stays aligned
# with `mns` even if a month is absent or the first year does not start in
# January.
years = mns.index.get_level_values(0).tolist()
months = mns.index.get_level_values(1).tolist()
# Skeleton table with one row per month; temperature columns are added later.
stat2_df = pd.DataFrame(data={'YEAR':years,'MONTH':months})

In [None]:
# Look up the monthly means for every (year, month) row of stat2_df in one
# step. This replaces a per-row loop that first seeded the columns with
# integer zeros and then overwrote them with floats one cell at a time.
month_keys = list(zip(years, months))
monthly_means = mns.loc[month_keys, ['TMAX', 'TMIN']]
# to_numpy() drops the (YEAR, MONTH) index so values are assigned by position,
# matching the row order of stat2_df.
stat2_df['TMAX'] = monthly_means['TMAX'].to_numpy()
stat2_df['TMIN'] = monthly_means['TMIN'].to_numpy()
# Restrict to 2001 onwards and renumber rows 0..n-1.
stat2_df_2001 = stat2_df[stat2_df['YEAR']>=2001].reset_index(drop=True)

### Compare the daily summaries month averages to the SOM LCD

In [None]:
# Pair each LCD monthly report with the station-2 monthly mean for the SAME
# (YEAR, MONTH). Joining on the keys avoids relying on both tables having
# identical length and row order; months absent from the station data get NaN.
station2_monthly_max = (
    stat2_df_2001[['YEAR', 'MONTH', 'TMAX']]
    .rename(columns={'TMAX': 'STATION2_MAX'})
)
SOM_means = SOM[['YEAR', 'MONTH', 'MonthlyMaximumTemperature']].merge(
    station2_monthly_max, on=['YEAR', 'MONTH'], how='left'
)
# Absolute gap between the two sources' monthly maximum temperature.
SOM_means['DIFF'] = (SOM_means['STATION2_MAX'] - SOM_means['MonthlyMaximumTemperature']).abs()

In [None]:
# Months where the two sources disagree by more than 0.1 degrees.
SOM_means[SOM_means['DIFF']>0.1]