# Intro

## Data Details

### Columns
'DATE, REPORT_TYPE, SOURCE, AWND, CDSD, CLDD, DSNW, DYHF, DYTS, DailyAverageDryBulbTemperature, DailyAverageStationPressure, DailyAverageWindSpeed, DailyCoolingDegreeDays, DailyDepartureFromNormalAverageTemperature, DailyHeatingDegreeDays, DailyMaximumDryBulbTemperature, DailyMinimumDryBulbTemperature, DailyPeakWindDirection, DailyPeakWindSpeed, DailyPrecipitation, DailySnowDepth, DailySnowfall, DailySustainedWindDirection, DailySustainedWindSpeed, DailyWeather, HDSD, HTDD, HourlyAltimeterSetting, HourlyDewPointTemperature, HourlyDryBulbTemperature, HourlyPrecipitation, HourlyPresentWeatherType, HourlyPressureChange, HourlyPressureTendency, HourlyRelativeHumidity, HourlySeaLevelPressure, HourlySkyConditions, HourlyStationPressure, HourlyVisibility, HourlyWetBulbTemperature, HourlyWindDirection, HourlyWindGustSpeed, HourlyWindSpeed, MonthlyDaysWithGT001Precip, MonthlyDaysWithGT010Precip, MonthlyDaysWithGT32Temp, MonthlyDaysWithGT90Temp, MonthlyDaysWithLT0Temp, MonthlyDaysWithLT32Temp, MonthlyDepartureFromNormalAverageTemperature, MonthlyDepartureFromNormalCoolingDegreeDays, MonthlyDepartureFromNormalHeatingDegreeDays, MonthlyDepartureFromNormalMaximumTemperature, MonthlyDepartureFromNormalMinimumTemperature, MonthlyDepartureFromNormalPrecipitation, MonthlyGreatestPrecip, MonthlyGreatestPrecipDate, MonthlyMaxSeaLevelPressureValue, MonthlyMaxSeaLevelPressureValueDate, MonthlyMaxSeaLevelPressureValueTime, MonthlyMaximumTemperature, MonthlyMeanTemperature, MonthlyMinSeaLevelPressureValue, MonthlyMinSeaLevelPressureValueDate, MonthlyMinSeaLevelPressureValueTime, MonthlyMinimumTemperature, MonthlySeaLevelPressure, MonthlyStationPressure, MonthlyTotalLiquidPrecipitation, NormalsCoolingDegreeDay, NormalsHeatingDegreeDay, REM, REPORT_TYPE.1, SOURCE.1, ShortDurationEndDate005, ShortDurationEndDate010, ShortDurationEndDate015, ShortDurationEndDate020, ShortDurationEndDate030, ShortDurationEndDate045, ShortDurationEndDate060, ShortDurationEndDate080, ShortDurationEndDate100, ShortDurationEndDate120, ShortDurationEndDate150, ShortDurationEndDate180, ShortDurationPrecipitationValue005, ShortDurationPrecipitationValue010, ShortDurationPrecipitationValue015, ShortDurationPrecipitationValue020, ShortDurationPrecipitationValue030, ShortDurationPrecipitationValue045, ShortDurationPrecipitationValue060, ShortDurationPrecipitationValue080, ShortDurationPrecipitationValue100, ShortDurationPrecipitationValue120, ShortDurationPrecipitationValue150, ShortDurationPrecipitationValue180, Sunrise, Sunset, MonthlyInd'

## Analysis Code

In [210]:
import pandas as pd
import numpy as np
from plotnine import *

pd.set_option('display.max_columns', None)


In [211]:
def DropNaCols(df):
    '''given a dataframe, drop all the columns with nothing but NaN'''
    naColList = []
    for ele in df.columns:
        uniqueVals = list(df[ele].unique())
        if len(uniqueVals) == 1:
            naColList.append(ele)
    return(df.drop(columns=naColList))

In [216]:
def ReadingType(df):
    '''indicates whether the given row provides a monthly, daily, or hourly reading'''
    
    #get the different column types into separate lists
    monthlyCols = [ele for ele in list(df.columns) if 'month' in ele.lower()]
    dailyCols = [ele for ele in list(df.columns) if 'dai' in ele.lower()]
    hourlyCols = [ele for ele in list(df.columns) if 'hour' in ele.lower()]

    #create columns indicating whether a row contains monthly, daily, or hourly readings
    boolMask_monthly = ~df[monthlyCols].isna()
    df['monthlyInd'] = boolMask_monthly.sum(axis=1)

    boolMask_daily = ~df[dailyCols].isna()
    df['dailyInd'] = boolMask_daily.sum(axis=1)

    boolMask_hourly = ~df[hourlyCols].isna()
    df['hourlyInd'] = boolMask_hourly.sum(axis=1)

    #if a row has more than one non-NaN value in a monthly column, the reading is monthly, else if more than one non-NaN value in a daily column, the reading is daily, else if more than one non-Nan value in an hourly column, the reading is hourly else indicate no valid readings for the row
    df['ReadingType'] = ['Monthly' if df.iloc[ele]['monthlyInd'] > 0 else 'Daily' if df.iloc[ele]['dailyInd'] > 0 else 'Hourly' if df.iloc[ele]['hourlyInd'] > 0 else 'noValidReading' for ele in range(df.shape[0])]

    return df

In [214]:
def SplitDataframes(df):
    '''given a dataframe of labeled monthly, daily, and hourly readings, split the dataframe by those labels into component dataframes and load those to a dictionary labeled according to reading type'''
    dfDict = {}
    for ele in df['ReadingType'].unique():
        dfDict[ele] = DropNaCols(df[df['ReadingType'] == ele]).reset_index(drop=True)
    return dfDict

In [217]:
#read in data
df = pd.read_csv('3063831.csv')

#add reading type column
df = ReadingType(df)

#clean up data
df = DropNaCols(df)

#next, put dataframe into separate dataframes depending on the type of reading
dfDict = SplitDataframes(df)

In [None]:
#get all the unique values in group of columns
uniqueVal = []
for ele in dfDict['Monthly'][monthlyCols].columns:
    vals = list(df[ele].unique())
    for i in vals:
        if i not in uniqueVals:
            uniqueVal.append(i)

uniqueVal

[nan,
 '14',
 '8s',
 '11',
 '10',
 '1s',
 '12',
 '8',
 '14s',
 '3',
 '9',
 '13',
 '4s',
 '7s',
 '5',
 '6',
 '7',
 '9s',
 '17',
 '4',
 '11s',
 6.0,
 7.0,
 nan,
 '8',
 '6s',
 '6',
 '1s',
 '10',
 '5',
 '9s',
 '7',
 '2',
 '12',
 '3s',
 '5s',
 '3',
 '4',
 '9',
 '8s',
 3.0,
 6.0,
 nan,
 20.0,
 15.0,
 10.0,
 2.0,
 0.0,
 21.0,
 18.0,
 4.0,
 1.0,
 23.0,
 '23',
 '22',
 '4',
 '0s',
 '0',
 8.0,
 3.0,
 nan,
 0.0,
 11.0,
 12.0,
 9.0,
 21.0,
 14.0,
 '0',
 '12s',
 '12',
 '9s',
 '16',
 1.0,
 16.0,
 28.0,
 nan,
 0.0,
 3.0,
 nan,
 1.0,
 2.0,
 0.0,
 '0',
 '0s',
 '1',
 10.0,
 5.0,
 nan,
 2.8,
 3.2,
 -0.5,
 2.1,
 5.3,
 0.0,
 -21.2,
 -23.5,
 -15.2,
 1.1,
 6.3,
 -1.1,
 3.5,
 2.5,
 -0.1,
 2.3,
 5.0,
 3.1,
 -8.7,
 4.5,
 19.5,
 3.3,
 4.3,
 2.9,
 6.6,
 5.1,
 2.2,
 14.1,
 0.5,
 -1.6,
 1.3,
 1.5,
 3.6,
 7.5,
 nan,
 0.0,
 -3.0,
 6.0,
 35.0,
 27.0,
 1.0,
 23.0,
 229.0,
 18.0,
 14.0,
 -2.0,
 -11.0,
 99.0,
 77.0,
 -6.0,
 17.0,
 5.0,
 -14.0,
 98.0,
 34.0,
 87.0,
 157.0,
 44.0,
 -1.0,
 11.0,
 100.0,
 127.0,
 230.0,
 nan,