## 2022-07-13 IMD Gridded data extraction sample flow

by Nikhil VJ, https://nikhilvj.co.in

Processing weather data from https://imdpune.gov.in/Clim_Pred_LRF_New/Grided_Data_Download.html

Using the imdlib package, https://imdlib.readthedocs.io/en/latest/Usage.html#processing  

Got the imdlib reference from here: https://www.youtube.com/watch?v=40Dvj6MwGTs  

In [1]:
import imdlib as imd
import pandas as pd

In [15]:
# Per-variable metadata for the IMD gridded products used in this notebook.
#   start_yr / end_yr : year range recorded for the variable
#   junk              : threshold separating real readings from no-data cells
#   valid_criteria    : side of `junk` on which valid readings lie; it mirrors
#                       the comparisons in the filter cells below
#                       ('lt' -> value < junk, 'gt' -> value > junk)
settings = {
    'tmax': dict(start_yr=1951, end_yr=2020, junk=99, valid_criteria='lt'),
    'tmin': dict(start_yr=1951, end_yr=2020, junk=99, valid_criteria='lt'),
    'rain': dict(start_yr=1901, end_yr=2020, junk=-900, valid_criteria='gt'),
}

In [27]:
# Download window: the cells below request years start_yr .. start_yr + limit.
# NOTE(review): the recorded temperature outputs further down cover 1951-1961,
# which corresponds to limit = 10 -- this cell looks like it was edited after
# those cells ran. Confirm the intended value before a full re-run.
start_yr, limit = 1951, 9

In [26]:
# tmax
# Fetch the year-wise maximum-temperature files, then open them and flatten
# the gridded data into a DataFrame.
variable = 'tmax'
res = imd.get_data(variable, start_yr, start_yr + limit, fn_format='yearwise')
data = imd.open_data(variable, start_yr, start_yr + limit, fn_format='yearwise')
df_tmax1 = (
    data
    .get_xarray()
    .to_dataframe()
)

Downloading: maxtemp for year 1951
Downloading: maxtemp for year 1952
Downloading: maxtemp for year 1953
Downloading: maxtemp for year 1954
Downloading: maxtemp for year 1955
Downloading: maxtemp for year 1956
Downloading: maxtemp for year 1957
Downloading: maxtemp for year 1958
Downloading: maxtemp for year 1959
Downloading: maxtemp for year 1960
Downloading: maxtemp for year 1961
Download Successful !!!


In [19]:
# Keep only readings below the tmax 'junk' threshold; everything else is
# treated as junk. reset_index() turns the index levels into ordinary columns.
# The column and settings key are spelled out as 'tmax' rather than read from
# the notebook-wide `variable`, which later cells rebind to 'tmin' / 'rain'.
# That keeps this cell correct when it is re-run out of order.
df_tmax2 = df_tmax1[df_tmax1['tmax'] < settings['tmax']['junk']].reset_index()

In [21]:
# Report the share of raw tmax rows that the junk filter removed.
print(
    "{}% of {}k rows data was junk, discarded.".format(
        round(100 * (len(df_tmax1) - len(df_tmax2)) / len(df_tmax1), 1),
        len(df_tmax1) // 1000,
    )
)

65.1% of 3861k rows data was junk, discarded.


In [20]:
# Preview the filtered tmax frame (columns: time, lat, lon, tmax).
df_tmax2

Unnamed: 0,time,lat,lon,tmax
0,1951-01-01,8.5,73.5,31.450001
1,1951-01-01,8.5,76.5,30.020000
2,1951-01-01,8.5,77.5,31.190001
3,1951-01-01,8.5,78.5,30.170000
4,1951-01-01,9.5,76.5,30.180000
...,...,...,...,...
1346944,1961-12-31,33.5,78.5,9.960000
1346945,1961-12-31,33.5,79.5,11.090000
1346946,1961-12-31,34.5,76.5,8.970000
1346947,1961-12-31,34.5,77.5,8.500000


In [22]:
# tmin
# Fetch the year-wise minimum-temperature files, then open them and flatten
# the gridded data into a DataFrame.
variable = 'tmin'
res = imd.get_data(variable, start_yr, start_yr + limit, fn_format='yearwise')
data = imd.open_data(variable, start_yr, start_yr + limit, fn_format='yearwise')
df_tmin1 = (
    data
    .get_xarray()
    .to_dataframe()
)

Downloading: mintemp for year 1951
Downloading: mintemp for year 1952
Downloading: mintemp for year 1953
Downloading: mintemp for year 1954
Downloading: mintemp for year 1955
Downloading: mintemp for year 1956
Downloading: mintemp for year 1957
Downloading: mintemp for year 1958
Downloading: mintemp for year 1959
Downloading: mintemp for year 1960
Downloading: mintemp for year 1961
Download Successful !!!


In [23]:
# Keep only readings below the tmin 'junk' threshold; everything else is
# treated as junk. reset_index() turns the index levels into ordinary columns.
# The column and settings key are spelled out as 'tmin' rather than read from
# the notebook-wide `variable`, which other cells rebind to 'tmax' / 'rain'.
# That keeps this cell correct when it is re-run out of order.
df_tmin2 = df_tmin1[df_tmin1['tmin'] < settings['tmin']['junk']].reset_index()

In [24]:
# Report the share of raw tmin rows that the junk filter removed.
print(
    "{}% of {}k rows data was junk, discarded.".format(
        round(100 * (len(df_tmin1) - len(df_tmin2)) / len(df_tmin1), 1),
        len(df_tmin1) // 1000,
    )
)

65.1% of 3861k rows data was junk, discarded.


In [25]:
# Preview the filtered tmin frame (columns: time, lat, lon, tmin).
df_tmin2

Unnamed: 0,time,lat,lon,tmin
0,1951-01-01,8.5,73.5,22.469999
1,1951-01-01,8.5,76.5,21.760000
2,1951-01-01,8.5,77.5,22.930000
3,1951-01-01,8.5,78.5,22.990000
4,1951-01-01,9.5,76.5,20.879999
...,...,...,...,...
1346944,1961-12-31,33.5,78.5,0.450000
1346945,1961-12-31,33.5,79.5,1.270000
1346946,1961-12-31,34.5,76.5,-0.720000
1346947,1961-12-31,34.5,77.5,-1.650000


In [None]:
# combine tmax and tmin data

In [30]:
# Join tmax and tmin on the shared (time, lat, lon) key. The left join keeps
# every tmax row even if no tmin row matches it.
dft1 = df_tmax2.merge(df_tmin2, on=['time', 'lat', 'lon'], how='left')
dft1

Unnamed: 0,time,lat,lon,tmax,tmin
0,1951-01-01,8.5,73.5,31.450001,22.469999
1,1951-01-01,8.5,76.5,30.020000,21.760000
2,1951-01-01,8.5,77.5,31.190001,22.930000
3,1951-01-01,8.5,78.5,30.170000,22.990000
4,1951-01-01,9.5,76.5,30.180000,20.879999
...,...,...,...,...,...
1346944,1961-12-31,33.5,78.5,9.960000,0.450000
1346945,1961-12-31,33.5,79.5,11.090000,1.270000
1346946,1961-12-31,34.5,76.5,8.970000,-0.720000
1346947,1961-12-31,34.5,77.5,8.500000,-1.650000


In [51]:
# nr pune: at .5 res, 18.5, 73.5 (tried for 74.0 which is closer, but no data available on tht)
dft1_pune = dft1[(dft1['lat']==18.5) & (dft1['lon']==73.5)].copy()
dft1_pune

Unnamed: 0,time,lat,lon,tmax,tmin
70,1951-01-01,18.5,73.5,28.090000,17.250000
404,1951-01-02,18.5,73.5,28.629999,16.719999
738,1951-01-03,18.5,73.5,29.160000,15.470000
1072,1951-01-04,18.5,73.5,29.020000,15.430000
1406,1951-01-05,18.5,73.5,28.770000,15.010000
...,...,...,...,...,...
1345344,1961-12-27,18.5,73.5,28.730000,15.690000
1345679,1961-12-28,18.5,73.5,29.260000,14.600000
1346014,1961-12-29,18.5,73.5,28.379999,13.380000
1346349,1961-12-30,18.5,73.5,29.219999,13.370000


In [33]:
# Write the Pune temperature rows to CSV; index=False leaves out the row labels.
dft1_pune.to_csv(path_or_buf='temp_pune.csv', index=False)

## Rain data

In [34]:
# rain
limit = 2 # seems its a lot of data - one yr's file is 24.2MB
variable = 'rain'
res = imd.get_data(variable, start_yr, start_yr+limit, fn_format='yearwise')

Downloading: rain for year 1951
Downloading: rain for year 1952
Downloading: rain for year 1953
Download Successful !!!


In [35]:
# Open the downloaded year-wise rain files and flatten the grid into a DataFrame.
data = imd.open_data(variable, start_yr, start_yr + limit, fn_format='yearwise')
df_rain1 = (
    data
    .get_xarray()
    .to_dataframe()
)

In [36]:
# Keep only readings above the rain 'junk' threshold; everything else is
# treated as junk. reset_index() turns the index levels into ordinary columns.
# The column and settings key are spelled out as 'rain' rather than read from
# the notebook-wide `variable`, which the temperature cells rebind. That keeps
# this cell correct when it is re-run out of order.
df_rain2 = df_rain1[df_rain1['rain'] > settings['rain']['junk']].reset_index()

In [48]:
# ensuring that the discarded data is junk only
df_rain1[df_rain1[variable] < settings[variable]['junk']][variable].describe()

count    13646296.0
mean         -999.0
std             0.0
min          -999.0
25%          -999.0
50%          -999.0
75%          -999.0
max          -999.0
Name: rain, dtype: float64

In [43]:
# Report the share of raw rain rows that the junk filter removed (in millions).
print(
    "{}% of {}M rows data was junk, discarded.".format(
        round(100 * (len(df_rain1) - len(df_rain2)) / len(df_rain1), 1),
        len(df_rain1) // pow(10, 6),
    )
)

71.5% of 19M rows data was junk, discarded.


In [44]:
# Preview the filtered rain frame (columns: time, lat, lon, rain).
df_rain2

Unnamed: 0,time,lat,lon,rain
0,1951-01-01,8.25,77.00,0.000000
1,1951-01-01,8.25,77.25,0.000000
2,1951-01-01,8.25,77.50,5.228102
3,1951-01-01,8.25,77.75,2.281837
4,1951-01-01,8.50,76.75,0.000000
...,...,...,...,...
5440539,1953-12-31,37.00,75.50,6.498086
5440540,1953-12-31,37.25,74.50,6.639433
5440541,1953-12-31,37.25,74.75,7.242740
5440542,1953-12-31,37.25,75.00,7.362216


In [53]:
# pune: if .25 res, then 18.5, 73.75
# nr pune
dfr1_pune = df_rain2[(df_rain2['lat']==18.5) & (df_rain2['lon']==73.75)].copy()
dfr1_pune

Unnamed: 0,time,lat,lon,rain
954,1951-01-01,18.5,73.75,0.0
5918,1951-01-02,18.5,73.75,0.0
10882,1951-01-03,18.5,73.75,0.0
15846,1951-01-04,18.5,73.75,0.0
20810,1951-01-05,18.5,73.75,0.0
...,...,...,...,...
5416678,1953-12-27,18.5,73.75,0.0
5421642,1953-12-28,18.5,73.75,0.0
5426606,1953-12-29,18.5,73.75,0.0
5431570,1953-12-30,18.5,73.75,0.0


In [54]:
# Write the Pune rain rows to CSV; index=False leaves out the row labels.
dfr1_pune.to_csv(path_or_buf='rain_pune.csv', index=False)

## Visualized here:
https://docs.google.com/spreadsheets/d/13j3kMvGdr52_dX-4nG9YDFCu3ZO_sYadFmYeiZJVgWo/edit?usp=sharing