## Analysis Code

In [476]:
import pandas as pd
import numpy as np
from plotnine import *
import re
import plotly.express as px
import datetime as dt
import plotly.io as pio
pio.renderers.default = "notebook_connected"

from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', None)


In [241]:
def DropNaCols(df):
    '''given a dataframe, drop all the columns with nothing but NaN'''
    naColList = []
    for ele in df.columns:
        uniqueVals = list(df[ele].unique())
        if len(uniqueVals) == 1:
            naColList.append(ele)
    return(df.drop(columns=naColList))

In [306]:
def ReadingType(df):
    '''indicates whether the given row provides a monthly, daily, or hourly reading'''
    
    #get the different column types into separate lists
    monthlyCols = [ele for ele in list(df.columns) if 'month' in ele.lower()]
    dailyCols = [ele for ele in list(df.columns) if 'dai' in ele.lower()]
    hourlyCols = [ele for ele in list(df.columns) if 'hour' in ele.lower()]

    #create columns indicating whether a row contains monthly, daily, or hourly readings
    boolMask_monthly = ~df[monthlyCols].isna()
    df['monthlyInd'] = boolMask_monthly.sum(axis=1)

    boolMask_daily = ~df[dailyCols].isna()
    df['dailyInd'] = boolMask_daily.sum(axis=1)

    boolMask_hourly = ~df[hourlyCols].isna()
    df['hourlyInd'] = boolMask_hourly.sum(axis=1)

    #if a row has more than one non-NaN value in a monthly column, the reading is monthly, else if more than one non-NaN value in a daily column, the reading is daily, else if more than one non-Nan value in an hourly column, the reading is hourly else indicate no valid readings for the row
    df['ReadingType'] = ['monthly' if df.iloc[ele]['monthlyInd'] > 0 else 'daily' if df.iloc[ele]['dailyInd'] > 0 else 'hourly' if df.iloc[ele]['hourlyInd'] > 0 else 'noValidReading' for ele in range(df.shape[0])]

    return df

In [243]:
def SplitDataframes(df):
    '''given a dataframe of labeled monthly, daily, and hourly readings, split the dataframe by those labels into component dataframes and load those to a dictionary labeled according to reading type'''
    dfDict = {}
    for ele in df['ReadingType'].unique():
        dfDict[ele] = DropNaCols(df[df['ReadingType'] == ele]).reset_index(drop=True)
    return dfDict

In [479]:
def CleanWeatherDF(df):
    '''given a dataframe of local weather data, do some cosmetic cleaning including...
    1. remove uppercase from column names and add underscores between words
    2. convert date column to datetime'''

    #1. remove uppercase from column names and add underscores between words
    new_column_names = []
    for ele in list(df.columns):
        if ele.isupper():
            new_column_names.append(re.sub('([.])','_',ele.lower()))
        else:
            new_column_names.append(re.sub('([.])','_',re.sub('(?<!^)(?=[A-Z])', '_',ele).lower()))
    df.columns = new_column_names

    # convert date column to datetime
    df['date'] = pd.to_datetime(df['date'])

    return df

In [428]:
#clean up numerican columns, converting strings to digits, removing unnecessary strings, converting to floats
def CleanHourlyColumns(df):
    '''given a dataframe of hourly weather values, clean up the column values'''
    numerical_columns = ['hourly_altimeter_setting','hourly_dew_point_temperature','hourly_dry_bulb_temperature','hourly_precipitation','hourly_pressure_change','hourly_pressure_tendency','hourly_relative_humidity','hourly_sea_level_pressure','hourly_station_pressure','hourly_visibility','hourly_wet_bulb_temperature','hourly_wind_direction','hourly_wind_gust_speed','hourly_wind_speed']

    for ele in numerical_columns:
        df[ele] = df[ele].replace('[A-Za-z*]','',regex=True).replace('',0).astype(float).fillna(0)

    return df

In [542]:
#read in data
df = pd.read_csv('3063831.csv')

#add reading type column
df = ReadingType(df)

#clean up data
df = DropNaCols(df)

#next, put dataframe into separate dataframes depending on the type of reading
dfDict = SplitDataframes(df)

#clean up dataframes
for ele in dfDict.values():
    ele = CleanWeatherDF(ele)

#for numerical columns in the hourly data, make them truly numerical
dfDict['hourly'] = CleanHourlyColumns(dfDict['hourly'])


Columns (21,27,28,29,30,31,36,37,38,42,43,49,50,52,54,57,59,60,61,62,64,115,118,120) have mixed types.Specify dtype option on import or set low_memory=False.



In [547]:
df = dfDict['hourly']

Parse current weather data into something categorical and analytical useful.

In [600]:
weather_au, weather_aw, weather_mw = [], [], []
notable_weather_present_mask = df['hourly_present_weather_type'].isna()
for ele in range(df.shape[0]):
    if notable_weather_present_mask[ele]:
        weather_au.append(['na'])
        weather_aw.append(['na'])
        weather_mw.append(['na'])
    else:
        ele_list = df['hourly_present_weather_type'][ele].split('|')
        weather_au.append(ele_list[0].strip().split(' '))
        weather_aw.append(ele_list[1].strip().split(' '))
        weather_mw.append(ele_list[2].strip().split(' '))

df['weather_au'], df['weather_aw'], df['weather_mw'] = weather_au, weather_aw, weather_mw

In [595]:
au_codes = {
    'DZ:01':'drizzle','RA:02':'rain','SN:03':'snow','SG:04':'snow grains','IC:05':'ice crystals','PL:06':'ice pellets','GR:07':'hail','GS:08':'small hail and/or snow pellets','UP:09':'unknown precipitation','BR:1':'mist','FG:2':'fog','FU:3':'smoke','VA:4':'volcanic ash','DU:5':'widespread dust','SA:6':'sand','HZ:7':'haze','PY:8':'spray','PO:1':'well develed dust/sand whirls','SQ:2':'squalls','FC:3':'funnel cloud, waterspout, or tornado','SS:4':'sandstorm','DS:5':'duststorm','na':'no reading'
}

au_code_modifiers = {
    '-':'light','+','heavy','VC':'vicinity','MI':'shallow','PR':'partial','BC':'patches','DR':'low drifting','BL':'blowing', 'SH':'showers','TS':'thunderstorm','FZ':'freezing'
}

In [604]:
converted = []
for ele in weather_au:
    weather_conditions = []
    for i in ele:
        weather_conditions.append(au_codes[i])
    converted.append(weather_conditions)

KeyError: '-FZ:8'