In [1]:
#import libraries
from meteostat import Point, Daily, Hourly, Stations
from datetime import datetime, timedelta
import pandas as pd
import csv
import numpy as np

In [2]:
# Load the tab-separated observation export; its 'Unnamed: 0' column becomes the row index.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
noga_obsv_path = "C:/Users/Dovekie/Documents/PelagicBirdProgram/datasets/noga/noga_obsv_200601-202312_masscoast.csv"
NOGA_obsv_df = pd.read_csv(
    noga_obsv_path,
    sep='\t',
    index_col=["Unnamed: 0"],
)

In [3]:
# sanity check: (rows, columns) of the observations as loaded
NOGA_obsv_df.shape

(565392, 4)

In [4]:
# split date column into three: year, month, day
# Parse the dates once. The earlier datetime -> string -> datetime round trip did not change
# the values and depended on the '%#m' / '%#d' strftime flags, which are specific to the
# Windows C runtime. Parsing is strict, so an unparseable date raises instead of becoming NaT.
observation_dates = pd.to_datetime(NOGA_obsv_df['OBSERVATION DATE'])
# Rename in place so 'year' keeps the position of the old date column and 'month' / 'day'
# are appended after it (same column order as before for the cells that follow).
NOGA_obsv_df.rename(columns={'OBSERVATION DATE': 'year'}, inplace=True)
NOGA_obsv_df['year'] = observation_dates.dt.year
NOGA_obsv_df['month'] = observation_dates.dt.month
NOGA_obsv_df['day'] = observation_dates.dt.day

In [5]:
# Snap coordinates to 2 decimal places (modifies NOGA_obsv_df in place).
coordinate_columns = ['LATITUDE', 'LONGITUDE']
NOGA_obsv_df[coordinate_columns] = NOGA_obsv_df[coordinate_columns].round(decimals=2)
# For every rounded location and calendar day keep only the row with the largest count:
# sort ascending by count, keep the last row of each duplicate group, then restore index order.
location_day_key = ['LATITUDE', 'LONGITUDE', 'year', 'month', 'day']
NOGA_obsv_df_rounded = (
    NOGA_obsv_df
    .sort_values('OBSERVATION COUNT')
    .drop_duplicates(subset=location_day_key, keep='last')
    .sort_index()
)

In [6]:
# (rows, columns) left after keeping one row per rounded location and day
NOGA_obsv_df_rounded.shape

(310792, 6)

In [7]:
# Thin out the zero-count rows: keep every 50th one (by position among the zeros) and
# drop the rest by index label, then renumber the surviving rows from 0.
# NOTE(review): the earlier comment said this keeps 5019 zeros, which does not follow
# from a fixed stride of 50 — confirm the intended sample size.
zero_rows_mask = NOGA_obsv_df_rounded['OBSERVATION COUNT'] == 0
all_zeros = NOGA_obsv_df_rounded[zero_rows_mask]
print(all_zeros.shape)
kept_zero_labels = all_zeros.index[::50]
many_zeros = all_zeros.drop(kept_zero_labels)  # the zero rows that get discarded
NOGA_obsv_df_50 = NOGA_obsv_df_rounded.drop(many_zeros.index)
NOGA_obsv_df_50 = NOGA_obsv_df_50.reset_index(drop=True)
NOGA_obsv_df_50.shape

(284734, 6)


(31753, 6)

In [8]:
# for checking number of zeros 0-5695 1-5046 2-3159 3-1992 4-1297 5-1160 10-704
# (presumably "count value - number of rows" pairs noted by hand — TODO confirm)
zero_count_mask = NOGA_obsv_df_50['OBSERVATION COUNT'].eq(0)
all_zeros = NOGA_obsv_df_50.loc[zero_count_mask]
all_zeros.shape

(5695, 6)

In [9]:
# define function that converts date format to 366 days in a year
def days_in_a_year(index):
    """Return the day number (1-366) of row ``index`` of ``NOGA_obsv_df_50``.

    The number is counted on a leap-year calendar (February has 29 days), so
    a given month/day always maps to the same number regardless of the year.

    Parameters
    ----------
    index : int
        Row position (as used by ``.iloc``) in the module-level
        ``NOGA_obsv_df_50`` frame, which must have 'month' and 'day' columns.

    Returns
    -------
    int
        Days preceding the row's month in a 366-day year, plus its day.

    Raises
    ------
    ValueError
        If the row's month is not in 1..12.
    """
    # days that precede each month in a 366-day year, keyed by month number
    days_before_month = {
        1: 0, 2: 31, 3: 60, 4: 91, 5: 121, 6: 152,
        7: 182, 8: 213, 9: 244, 10: 274, 11: 305, 12: 335,
    }
    # look the columns up by name so adding or reordering columns cannot silently break this
    month = int(NOGA_obsv_df_50['month'].iloc[index])
    day = int(NOGA_obsv_df_50['day'].iloc[index])
    if month not in days_before_month:
        raise ValueError("month must be in 1..12, got %r" % (month,))
    return days_before_month[month] + day
# make days column into cos and sin components and add to existing dataframe
# days_in_a_year takes a row *position*, so walk positions explicitly and call it once per row
# (the earlier loop called it twice per row and passed index labels).
day_numbers = np.array(
    [days_in_a_year(position) for position in range(len(NOGA_obsv_df_50))],
    dtype=float,
)
# 366 day numbers * (60/61) degrees each = 360 degrees: one full turn per (leap) year
day_angles = np.deg2rad(day_numbers * (60/61))
list_of_days_cos = np.cos(day_angles).tolist()
list_of_days_sin = np.sin(day_angles).tolist()
# plain lists are assigned by position, so the result does not depend on the frame's index labels
NOGA_obsv_df_50['days_cos'] = list_of_days_cos
NOGA_obsv_df_50['days_sin'] = list_of_days_sin

In [10]:
# preview the first rows, which now carry the days_cos / days_sin columns
NOGA_obsv_df_50.head()

Unnamed: 0,OBSERVATION COUNT,LATITUDE,LONGITUDE,year,month,day,days_cos,days_sin
0,0,41.67,-70.08,2006,1,1,0.999853,0.017166
1,1,42.8,-70.81,2006,1,2,0.999411,0.034328
2,2,41.77,-70.5,2006,1,2,0.999411,0.034328
3,0,41.57,-70.51,2006,1,2,0.999411,0.034328
4,3,41.77,-70.49,2006,1,2,0.999411,0.034328


In [None]:
# Save the thinned observation table as a tab-separated file.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
noga_obsv_50_path = "C:/Users/Dovekie/Documents/PelagicBirdProgram/datasets/noga/noga_obsv_200601-202312_masscoast_50.csv"
NOGA_obsv_df_50.to_csv(noga_obsv_50_path, sep='\t')

In [11]:
# new dataframe with wdir and wspd combined
def weather_for_day(latitude, longitude, month, day, year):
    """Return one row of 3-hourly weather features for a location and date.

    Finds the closest meteostat station that is not on the defective list,
    fetches its hourly observations from 00:00 to 12:00 on the given date and
    samples them at hours 0, 3, 6, 9 and 12. Hours with no observation are
    treated as missing: precipitation becomes 0; temperature, wind direction
    and wind speed are linearly interpolated across the five samples.

    Parameters
    ----------
    latitude, longitude : float
        Position passed to ``Stations().nearby``.
    month, day, year : int
        Calendar date (note the month, day, year argument order).

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with 30 columns: ``temp``, ``prcp``, ``wdir``,
        ``wspd``, ``windx`` and ``windy``, each suffixed ``000``, ``003``,
        ``006``, ``009`` and ``012`` for the sample hour. Wind components that
        cannot be computed are 0.

    Raises
    ------
    IndexError
        If no usable station is found near the location.
    """
    defective_stations = ('72506', '74492', '74494', 'KNZW0')
    sample_hours = (0, 3, 6, 9, 12)
    suffixes = ['%03d' % hour for hour in sample_hours]  # '000', '003', ...

    # find nearest station: ask for one more candidate than there are defective ids so the
    # fallback can never land on a second defective station
    nearby_ids = Stations().nearby(latitude, longitude).fetch(len(defective_stations) + 1).index
    station = next((sid for sid in nearby_ids if sid not in defective_stations), None)
    if station is None:
        raise IndexError(
            "no usable weather station near (%s, %s)" % (latitude, longitude)
        )

    # hourly data for station
    # NOTE(review): no timezone is passed to Hourly; presumably the timestamps are UTC —
    # confirm that 00:00-12:00 is the intended window for the observations.
    start = datetime(year, month, day, 0, 0)
    end = datetime(year, month, day, 12, 0)
    hourly = Hourly(station, start, end).fetch()
    hourly = hourly[~hourly.index.duplicated(keep='first')]  # reindex needs unique labels

    # keep every 3rd hour; hours (or columns) the station did not report come back as NaN
    sample_times = [datetime(year, month, day, hour) for hour in sample_hours]
    stn = hourly.reindex(
        index=sample_times, columns=['temp', 'prcp', 'wdir', 'wspd']
    ).astype(float)

    # replace NaNs
    stn['prcp'] = stn['prcp'].fillna(0)
    # NOTE(review): wdir is interpolated as a plain number, so a gap between e.g. 350 and 10
    # degrees is filled through 180 — confirm this is acceptable.
    for column in ('temp', 'wdir', 'wspd'):
        stn[column] = stn[column].interpolate(method='linear', limit_direction='both')

    # convert wind speed and wind direction into x and y components
    # NOTE(review): cos/sin are rounded to 1 decimal before scaling (kept from the original);
    # this loses precision in the components — confirm it is intended.
    direction = np.deg2rad(stn['wdir'].to_numpy())
    speed = stn['wspd'].to_numpy()
    cosine = np.around(np.cos(direction), decimals=1)
    sine = np.around(np.sin(direction), decimals=1)
    wind_x = pd.Series(np.around(speed * cosine, decimals=1)).fillna(0)
    wind_y = pd.Series(np.around(speed * sine, decimals=1)).fillna(0)

    # create a dataframe: one column per (parameter, sample hour), parameters grouped together
    sampled = (
        ('temp', stn['temp']),
        ('prcp', stn['prcp']),
        ('wdir', stn['wdir']),
        ('wspd', stn['wspd']),
        ('windx', wind_x),
        ('windy', wind_y),
    )
    features = {
        prefix + suffix: value
        for prefix, values in sampled
        for suffix, value in zip(suffixes, values)
    }
    return pd.DataFrame([features])

In [None]:
# Spot-check weather_for_day on a few dates at one fixed location.
# The trailing numbers are the original annotations (presumably the gannet counts
# observed on those dates — TODO confirm).
spot_check_dates = [
    (10, 2, 2022),   #1500
    (1, 28, 2024),   #1
    (1, 29, 2024),   #0
    (11, 22, 2023),  #848
]
for check_month, check_day, check_year in spot_check_dates:
    print(weather_for_day(42.691015, -70.631701, check_month, check_day, check_year))

In [12]:
def weather(df):
    """Build the weather feature table for every observation row in ``df``.

    Calls ``weather_for_day`` once per row, in row order, and stacks the
    one-row results.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have 'LATITUDE', 'LONGITUDE', 'month', 'day' and 'year' columns.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` with a fresh 0..n-1 index; an empty frame
        when ``df`` has no rows.
    """
    # collect the per-row frames and concatenate once; concatenating inside the loop
    # re-copies the accumulated table on every iteration
    daily_frames = [
        weather_for_day(latitude, longitude, int(month), int(day), int(year))
        for latitude, longitude, month, day, year in zip(
            df['LATITUDE'], df['LONGITUDE'], df['month'], df['day'], df['year']
        )
    ]
    if not daily_frames:
        # pd.concat rejects an empty list
        return pd.DataFrame({})
    return pd.concat(daily_frames, axis=0, ignore_index=True)

In [13]:
# build the weather feature table: weather() calls weather_for_day once per observation row
weather_df=weather(NOGA_obsv_df_50)

In [14]:
# inspect the last 50 rows of the weather table
weather_df.tail(50)

Unnamed: 0,temp000,temp003,temp006,temp009,temp012,prcp000,prcp003,prcp006,prcp009,prcp012,...,windx000,windx003,windx006,windx009,windx012,windy000,windy003,windy006,windy009,windy012
31703,6.1,6.7,6.7,6.7,4.4,0.0,0.0,0.0,0.0,0.0,...,0.0,8.5,8.5,6.8,0.0,0.0,-4.7,-2.8,2.3,0.0
31704,5.0,5.0,5.0,5.0,6.7,0.0,0.0,0.0,0.0,0.0,...,-4.9,0.0,0.0,0.0,0.0,-1.6,0.0,0.0,0.0,0.0
31705,5.6,5.6,4.4,3.3,4.4,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31706,5.6,5.6,4.4,4.4,3.9,0.0,0.0,0.0,0.0,0.0,...,0.0,-4.7,0.0,0.0,0.0,0.0,-8.5,0.0,0.0,0.0
31707,5.0,5.0,5.0,5.0,5.6,0.0,0.0,0.0,0.0,0.0,...,0.0,-7.6,0.0,0.0,-4.9,0.0,0.0,0.0,0.0,1.6
31708,5.0,5.0,5.0,5.0,5.6,0.0,0.0,0.0,0.0,0.0,...,0.0,-7.6,0.0,0.0,-4.9,0.0,0.0,0.0,0.0,1.6
31709,5.6,5.6,4.4,4.4,3.9,0.0,0.0,0.0,0.0,0.0,...,0.0,-4.7,0.0,0.0,0.0,0.0,-8.5,0.0,0.0,0.0
31710,6.9,6.8,6.7,6.1,5.7,0.0,0.2,0.3,0.3,0.8,...,0.0,4.9,7.6,11.7,14.8,0.0,1.6,1.5,6.5,3.0
31711,7.8,8.3,8.3,8.3,8.9,0.3,1.1,0.4,0.5,0.5,...,0.0,0.0,10.1,7.8,4.5,0.0,11.2,5.6,10.4,22.3
31712,7.8,8.3,10.0,10.0,10.0,1.6,1.5,0.8,0.8,2.3,...,4.6,7.5,7.8,8.9,2.6,6.1,5.6,10.4,11.8,13.0


In [15]:
# Save the weather feature table as a tab-separated file.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
noga_weather_path = "C:/Users/Dovekie/Documents/PelagicBirdProgram/datasets/noga/noga_weather_200601-202312_masscoast_50_000-012_3.csv"
weather_df.to_csv(noga_weather_path, sep='\t')