# Add Weather Indicators to Sakura Bloom Data Frame

For each location, we have 14 types of weather data by month.

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Show the kernel's working directory: every relative path used in this
# notebook (e.g. '../data/...') is resolved against it.
!pwd

/home/snoopy/sakura/processing


### Import CSVs for converting between romaji and kanji location representations

In [5]:
# Distinct values of the `l_name` column of the flowering records, as an array.
# NOTE(review): these are presumably the kanji location names - confirm against the CSV.
location_kanji = pd.read_csv('../data/flowering.csv')['l_name'].unique()

Romaji translations (collected from Google) for each kanji location name referenced in the sakura bloom dataset.

In [6]:
# Column labels are supplied through `names`, so no header row is read from the
# file; the first column is requested as the index via `index_col=0`.
# NOTE(review): assumes the file has one index column followed by a single
# romaji column - confirm against the CSV.
location_romaji = pd.read_csv(
    '../data/location_romaji.csv',
    index_col=0,
    names=['location'],
)

Create a dictionary to look up the romaji for every location name (from the bloom data) written in kanji.
The weather data references romaji location names, so we'll use this dictionary to map names later on.

In [9]:
# kanji name -> romaji name lookup.
# The two sequences are paired purely by position, and zip() stops at the
# shorter one without warning.
# NOTE(review): this assumes location_romaji.csv lists its rows in the same
# order as the unique kanji names appear in flowering.csv - confirm.
location_dict = {
    kanji: romaji
    for kanji, romaji in zip(location_kanji, location_romaji.location)
}

### Grab weather station codes for use in reading the weather csv files

In [7]:
# Station metadata re-indexed by its 'location' column. The CSV's first column
# is read as the initial index and is replaced (not kept) by set_index.
weather_station_df = (
    pd.read_csv('../data/station_info.csv', index_col=0)
    .set_index('location')
)

Locations that are in our sakura data but don't have weather stations

In [10]:
# Sorted, de-duplicated romaji names that occur in the bloom lookup but are
# absent from the weather-station index. Left as the cell's last expression so
# the resulting array is displayed.
np.setdiff1d(
    [romaji for romaji in location_dict.values()],
    weather_station_df.index.values,
)

array(['IKUHARA', 'IRIOMOTE ISLAND', 'KOTOKO', 'KUSHIMOTO', 'KUTYASU',
       'MINAMI DAITOJIMA', 'MONBETSU', 'NASE', 'NISHIGO', 'OBARI',
       'SHIGA', 'SHIROKAWA', 'TAKATA', 'TANESHIMA', 'TIDE', 'TOYOKA',
       'YONEKO'], dtype='<U16')

Locations that have weather stations but no sakura data (not relevant here so it's commented out)
In the future, could look at matching stations that are 'nearby', even if the exact name perhaps doesn't match

In [11]:
# Intentionally disabled. Uncommenting the call below would list the station
# locations that have no matching romaji name in `location_dict`.
# np.setdiff1d(weather_station_df.index.values,list(location_dict.values()))

What we want our data frame to look like (after all of the processing work is done)

In [13]:
# Empty frame shown only to illustrate the intended column layout; nothing is
# assigned, so it has no effect on later cells.
# '{WEATHER}' in the column name is a literal placeholder, not an f-string.
pd.DataFrame(
    data=[],
    columns=[
        'bloom_days',
        'location',
        'year',
        'last_yr_{WEATHER}_month',
        'last_yr_bloom_days',
        '2_yr_bloom_days',
    ],
)

Unnamed: 0,bloom_days,location,year,last_yr_{WEATHER}_month,last_yr_bloom_days,2_yr_bloom_days


### Map bloom locations to romaji

In [35]:
# Load the bloom records and append `l_name_romaji`, the romaji form of each
# row's `l_name`, so rows can be matched against the weather-station data.
# Names missing from `location_dict` map to NaN.
# NOTE(review): every other cell reads from '../data/', but this path goes
# through '../sakura/data/' - confirm that the extra directory is intended.
df = pd.read_csv('../sakura/data/bloom.csv').assign(
    l_name_romaji=lambda bloom: bloom['l_name'].map(location_dict)
)

### Merge weather data and bloom data

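The script below will take a while to run. It's recommended to convert the notebook to a Python script (with `jupyter nbconvert`) and then run it in the background using a utility like `dtach`. Note that the cell is currently commented out, so it must be uncommented before it will do anything.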

In [None]:
# NOTE(review): this entire cell is commented out, so running it is a no-op.
# In the visible cells nothing else adds lag columns, so `df` would reach the
# final `to_csv` cell without any weather data. To enable the loop, strip one
# leading "# " from each line below.
#
# What the disabled code does, for each romaji location found in `df`:
#   1. for each weather view 1..14, read '{location}_{number}_{view}.csv'
#      (re-indexed by its 'Year' column), where `number` is looked up in
#      `weather_station_df`; a failed lookup/read skips to the next view;
#   2. for each bloom year of that location and each lag in 1..max_lag, fetch
#      the weather row for `yr - lag`; a failed fetch skips that lag;
#   3. write each fetched value into `df` under '{col}_{view}_lag_{lag}' for the
#      rows matching that (year, location).
#
# NOTE(review): `max_lag` is not assigned in any visible cell - define it first.
# NOTE(review): the weather CSV path has no directory prefix, so it is resolved
# against the working directory - confirm that is where the files live.
# NOTE(review): the bare `except:` clauses swallow every error; consider
# narrowing them (e.g. KeyError / FileNotFoundError) before re-enabling.
# # for a given view and location and lag value this populates the lag data in the locations data frame
# for location in df.l_name_romaji.unique(): # loop through all romaji locations
    
#     for view in range(1,15): # try to loop through all weather parameters
#         try:
#             # attempt to lookup weather station code for reading weather csv later
#             number = weather_station_df.loc[location,'number']
#             w = pd.read_csv(f'{location}_{number}_{view}.csv',index_col=0).set_index('Year')
#         except:
#             # if there is no weather for this location, break out to next view
#             print(f'exception raised: no view {view} for location {location}, going to next view {view+1} if exists')
#             continue
        
#         # utility function to create a mask, we'll use later
#         def locindex(yr,location): 
#             return (df.year==yr) & (df.l_name_romaji==location)
        
#         # loop over all years from one location
#         for yr in df.loc[(df.l_name_romaji==location)].year.unique():
#             #get col names from weather data that we're importing
#             col_names = getattr(w.columns,'values') 
#             # iterate over how many lags to create
#             for lag in range(1,max_lag+1):
#                 #create list (or array?) of cols to add
#                 new_cols = [f'{col}_{view}_lag_{lag}' for col in col_names]

#                 # try to lookup the lagged vals in the weather import frame @ row index = yr - lag
#                 try:
#                     vals = [i for i in w.loc[yr-lag,:]]
#                 except:

#                     print(f'attempt to lookup lagged {lag} vals in {view} for location {location} at year {yr} failed, continue to next year')

#                     continue # if something fails, continue to next yr

#                 # try to set values in main df
#                 for col,val in zip(new_cols,vals):
#                     try:
#                         df.loc[locindex(yr,location),col] = val
#                     except:
#                         print('raised exception at attempt to set col val in main df,continue to next year')
#                         continue # go to the next column if there's an issue


1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968


1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1953
attempt to lookup lagged vals in 14 for location ASAHIKAWA at year 1953 failed, continue to next year
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1953
1954
1955
1956
1957
1958
1959
196

1973
1974
1975
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1953
attempt to lookup lagged vals in 11 for location OBIHIRO at year 1953 failed, continue to next year
1954
attempt to lookup lagged vals in 11 for location OBIHIRO at year 1954 failed, continue to next year
1955
attempt to lookup lagged vals in 11 for location OBIHIRO at year 1955 failed, continue to next year
1956
attempt to lookup lagged vals in 11 for location OBIHIRO at year 1956 failed, continue to next year
1957
attempt to lookup lagged vals in 11 for location OBIHIRO at year 1957 failed, continue to next year
1958
attempt to lookup lagged vals in 11 for location OBIHIRO at year 1958 failed, continue to next year
1959
attem

1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1953
attempt to lookup lagged vals in 14 for location MURORAN at year 1953 failed, continue to next year
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971


1971
1972
1973
1974
1975
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1953
attempt to lookup lagged vals in 14 for location HAKODATE at year 1953 failed, continue to next year
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
exception raised: no view 1 for location KUTYASU, going to next view 2 if exists
exception raised: no view 2 for location KUTYASU, going to next view 3 if exists
exception raised: no view 3 for location KUTYASU, going to next view 4 if exists
exception raised: no view 4 for location KUTYASU, going to next view 5 if exists
exception raised: no view 5 for location KUTYASU, going to next view 6 if exists
exception raised: no view 6 for location KUTYASU, going to next view 7 if exists
exception raised: no view 7 for location KUTYASU, going to next view 8 if exists
exception raised: no view 8 for location KUTYASU, going to next view 9 if exi

1968
1969
1970
1971
1972
1973
1974
1975
1953
attempt to lookup lagged vals in 9 for location HIROO at year 1953 failed, continue to next year
1954
attempt to lookup lagged vals in 9 for location HIROO at year 1954 failed, continue to next year
1955
attempt to lookup lagged vals in 9 for location HIROO at year 1955 failed, continue to next year
1956
attempt to lookup lagged vals in 9 for location HIROO at year 1956 failed, continue to next year
1957
attempt to lookup lagged vals in 9 for location HIROO at year 1957 failed, continue to next year
1958
attempt to lookup lagged vals in 9 for location HIROO at year 1958 failed, continue to next year
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1953
attempt to lookup lagged vals in 10 for location HIROO at year 1953 failed, continue to next year
1954
attempt to lookup lagged vals in 10 for location HIROO at year 1954 failed, continue to next year
1955
attempt to lookup lagged vals in 10 for location HIR

1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1953
attempt to lookup lagged vals in 9 for location SHINJO at year 1953 failed, continue to next year
1954
attempt to lookup lagged vals in 9 for location SHINJO at year 1954 failed, continue to next year
1955
attempt to lookup lagged vals in 9 for location SHINJO at year 1955 failed, continue to next year
1956
attempt to lookup lagged vals in 9 for location SHINJO at year 1956 failed, continue to next year
1957
attempt to lookup lagged vals in 9 for location SHINJO at year 1957 failed, continue to next year
1958
attempt to lookup lagged vals in 9 for location SHINJO at year 1958 failed, continue to next year
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1953
attempt to lookup lagged vals in 10 for location SHINJO at year 1953 failed, continue to next year
1954
attempt to lookup lagged vals in 10 for location SHINJO at year 1954 failed, continue to next year
1955
attempt to lookup l

In [None]:
# Persist the frame to disk. With to_csv's defaults the row index is written
# as an unnamed first column.
# NOTE(review): the merge cell above is commented out, so unless it was run
# some other way this writes `df` without any weather lag columns - confirm.
df.to_csv(path_or_buf='../data/df_with_weather_data.csv')