## Data Preparation for Rice Yield Modeling

In this notebook, we will compile data from multiple sources into a single `.csv` file to facilitate modeling.

In [1]:
# Load required packages.
import numpy as np
import pandas as pd
from datetime import datetime

In [2]:
# Raw data files
#
# Each path below is read with pd.read_csv, in the order listed, and the
# resulting tables are bound (in that same order) to the names on the
# left-hand side of the assignment.
raw_csv_paths = (
    '../Raw_data/ICRISAT_rice.csv',                       # df_rice: rice yield
    '../Raw_data/ICRISAT_rice_irrigation.csv',            # df_irrigation: rice-irrigated area
    '../Raw_data/ICRISAT_prec.csv',                       # df_prec: monthly precipitation (mm)
    '../Raw_data/ICRISAT_evapotranspiration_actual.csv',  # df_et: monthly evapotranspiration (mm)
    '../Raw_data/ICRISAT_max_temp.csv',                   # df_maxT: monthly max temperature (celsius)
    '../Raw_data/ICRISAT_min_temp.csv',                   # df_minT: monthly min temperature (celsius)
    '../Raw_data/ICRISAT_runoff.csv',                     # df_runoff: surface runoff (mm)
    '../Raw_data/ICRISAT_windspeed.csv',                  # df_windspeed: month-averaged wind speed (m/s)
    '../Raw_data/ICRISAT_npk.csv',                        # df_npk: fertilizer use
)

(df_rice, df_irrigation, df_prec, df_et, df_maxT,
 df_minT, df_runoff, df_windspeed, df_npk) = map(pd.read_csv, raw_csv_paths)

ICRISAT databases use the key `-1` to denote missing values. We will first replace all instances of `-1` with NaNs, and then use the `dropna()` method of Pandas DataFrame to drop all rows with missing values.

In [3]:
# Data cleaning
#
# ICRISAT marks missing entries with -1. For each table listed below, every -1
# is turned into NaN and then any row containing a NaN is discarded. Both steps
# modify the frames in place, so later cells see the cleaned tables under the
# same names.
#
# `np.nan` is used rather than the `np.NaN` alias, which was removed in
# NumPy 2.0 and raises AttributeError there.
#
# NOTE(review): df_maxT and df_minT are not cleaned here -- presumably because
# -1 can be a genuine temperature reading; confirm that the omission is intended.
frames_to_clean = (
    df_rice,        # area, production, and yield of rice
    df_irrigation,  # rice-irrigated area
    df_prec,        # precipitation
    df_et,          # evapotranspiration
    df_runoff,      # surface runoff
    df_windspeed,   # wind speed
    df_npk,         # NPK fertilizer data
)

for frame in frames_to_clean:
    frame.replace(-1, np.nan, inplace=True)
    frame.dropna(inplace=True)

In [4]:
# Remove (district, year) rows whose rice area is zero or negative.
# The rows are dropped from df_rice in place, by index label.
nonpositive_area = df_rice['RICE AREA (1000 ha)'] <= 0
df_rice.drop(index=df_rice.index[nonpositive_area], inplace=True)

## Average quantities over growing and harvesting seasons.

Rice is a Kharif (monsoon) crop that is typically sown during June &ndash; July and harvested during November &ndash; December every year.

For convenience, let us define the following seasons for our analysis.
1. Rice growing season: June &ndash; October
2. Rice harvesting season: November &ndash; December

We will now construct averages of different variables over the growing and harvesting seasons separately.

In [5]:
# Precipitation: seasonal means of the monthly columns.
# Positional slices: columns 10-14 are June-October, columns 15-16 are
# November-December. The print shows the column count and the selected labels
# so the slices can be checked by eye.
grow_months, harvest_months = slice(10, 15), slice(15, 17)
print(len(df_prec.columns), '\n', df_prec.columns[grow_months], '\n', df_prec.columns[harvest_months])

# Row-wise mean over the growing-season months
df_prec['prec_grow'] = df_prec.iloc[:, grow_months].mean(axis=1)

# Row-wise mean over the harvesting-season months
df_prec['prec_harvest'] = df_prec.iloc[:, harvest_months].mean(axis=1)

df_prec[['prec_grow', 'prec_harvest']].head()

17 
 Index(['JUNE PRECIPITATION (Millimeters)', 'JULY PRECIPITATION (Millimeters)',
       'AUGUST PRECIPITATION (Millimeters)',
       'SEPTEMBER PRECIPITATION (Millimeters)',
       'OCTOBER PRECIPITATION (Millimeters)'],
      dtype='object') 
 Index(['NOVEMBER PRECIPITATION (Millimeters)', 'DECEMBER PRECIPITATION (Millimeters)'], dtype='object')


Unnamed: 0,prec_grow,prec_harvest
0,267.152,2.16
1,296.12,0.53
2,230.264,0.0
3,334.662,4.13
4,194.122,131.625


In [6]:
# Evapotranspiration: seasonal means of the monthly columns.
# Positional slices: columns 10-14 are June-October, columns 15-16 are
# November-December. The print shows the column count and the selected labels
# so the slices can be checked by eye.
grow_months, harvest_months = slice(10, 15), slice(15, 17)
print(len(df_et.columns), '\n', df_et.columns[grow_months], '\n', df_et.columns[harvest_months])

# Row-wise mean over the growing-season months
df_et['et_grow'] = df_et.iloc[:, grow_months].mean(axis=1)

# Row-wise mean over the harvesting-season months
df_et['et_harvest'] = df_et.iloc[:, harvest_months].mean(axis=1)

df_et[['et_grow', 'et_harvest']].head()

17 
 Index(['JUNE ACTUAL (Millimeters)', 'JULY ACTUAL (Millimeters)',
       'AUGUST ACTUAL (Millimeters)', 'SEPTEMBER ACTUAL (Millimeters)',
       'OCTOBER ACTUAL (Millimeters)'],
      dtype='object') 
 Index(['NOVEMBER ACTUAL (Millimeters)', 'DECEMBER ACTUAL (Millimeters)'], dtype='object')


Unnamed: 0,et_grow,et_harvest
0,109.302,49.39
1,121.01,45.755
2,121.182,47.015
3,121.95,47.94
4,98.358,53.29


In [7]:
# Maximum temperature: seasonal means of the monthly maxima.
# Positional slices: columns 10-14 are June-October, columns 15-16 are
# November-December. The print shows the column count and the selected labels
# so the slices can be checked by eye.
grow_months, harvest_months = slice(10, 15), slice(15, 17)
print(len(df_maxT.columns), '\n', df_maxT.columns[grow_months], '\n', df_maxT.columns[harvest_months])

# Row-wise mean over the growing-season months
df_maxT['maxT_grow'] = df_maxT.iloc[:, grow_months].mean(axis=1)

# Row-wise mean over the harvesting-season months
df_maxT['maxT_harvest'] = df_maxT.iloc[:, harvest_months].mean(axis=1)

df_maxT[['maxT_grow', 'maxT_harvest']].head()

17 
 Index(['JUNE MAXIMUM (Centigrate)', 'JULY MAXIMUM (Centigrate)',
       'AUGUST MAXIMUM (Centigrate)', 'SEPTEMBER MAXIMUM (Centigrate)',
       'OCTOBER MAXIMUM (Centigrate)'],
      dtype='object') 
 Index(['NOVEMBER MAXIMUM (Centigrate)', 'DECEMBER MAXIMUM (Centigrate)'], dtype='object')


Unnamed: 0,maxT_grow,maxT_harvest
0,32.144,28.7
1,31.34,28.19
2,31.732,28.54
3,30.942,27.26
4,31.382,28.21


In [8]:
# Minimum temperature: seasonal means of the monthly minima.
# Positional slices: columns 10-14 are June-October, columns 15-16 are
# November-December. The print shows the column count and the selected labels
# so the slices can be checked by eye.
grow_months, harvest_months = slice(10, 15), slice(15, 17)
print(len(df_minT.columns), '\n', df_minT.columns[grow_months], '\n', df_minT.columns[harvest_months])

# Row-wise mean over the growing-season months
df_minT['minT_grow'] = df_minT.iloc[:, grow_months].mean(axis=1)

# Row-wise mean over the harvesting-season months
df_minT['minT_harvest'] = df_minT.iloc[:, harvest_months].mean(axis=1)

df_minT[['minT_grow', 'minT_harvest']].head()

17 
 Index(['JUNE MINIMUM (Centigrate)', 'JULY MINIMUM (Centigrate)',
       'AUGUST MINIMUM (Centigrate)', 'SEPTEMBER MINIMUM (Centigrate)',
       'OCTOBER MINIMUM (Centigrate)'],
      dtype='object') 
 Index(['NOVEMBER MINIMUM (Centigrate)', 'DECEMBER MINIMUM (Centigrate)'], dtype='object')


Unnamed: 0,minT_grow,minT_harvest
0,24.084,14.45
1,23.71,13.825
2,23.842,13.6
3,23.126,12.53
4,23.48,13.495


In [9]:
# Surface runoff: seasonal means of the monthly columns.
# This table has one column fewer than the precipitation/temperature tables,
# so the slices sit one position earlier: columns 9-13 are June-October and
# columns 14-15 are November-December. The print shows the column count and
# the selected labels so the slices can be checked by eye.
grow_months, harvest_months = slice(9, 14), slice(14, 16)
print(len(df_runoff.columns), '\n', df_runoff.columns[grow_months], '\n', df_runoff.columns[harvest_months])

# Row-wise mean over the growing-season months
df_runoff['runoff_grow'] = df_runoff.iloc[:, grow_months].mean(axis=1)

# Row-wise mean over the harvesting-season months
df_runoff['runoff_harvest'] = df_runoff.iloc[:, harvest_months].mean(axis=1)

df_runoff[['runoff_grow', 'runoff_harvest']].head()

16 
 Index(['JUNE Q (mm)', 'JULY Q (mm)', 'AUG Q (mm)', 'SEPT Q (mm)',
       'OCT Q (mm)'],
      dtype='object') 
 Index(['NOV Q (mm)', 'DEC Q (mm)'], dtype='object')


Unnamed: 0,runoff_grow,runoff_harvest
0,99.178,30.02
1,40.292,0.05
2,26.236,0.38
3,52.416,0.705
4,137.788,0.585


In [10]:
# Wind speed: seasonal means of the monthly columns.
# This table has 16 columns, so the slices are: columns 9-13 for June-October
# and columns 14-15 for November-December. The print shows the column count
# and the selected labels so the slices can be checked by eye.
grow_months, harvest_months = slice(9, 14), slice(14, 16)
print(len(df_windspeed.columns), '\n', df_windspeed.columns[grow_months], '\n', df_windspeed.columns[harvest_months])

# Row-wise mean over the growing-season months
df_windspeed['windspeed_grow'] = df_windspeed.iloc[:, grow_months].mean(axis=1)

# Row-wise mean over the harvesting-season months
df_windspeed['windspeed_harvest'] = df_windspeed.iloc[:, harvest_months].mean(axis=1)

df_windspeed[['windspeed_grow', 'windspeed_harvest']].head()

16 
 Index(['JUNE', 'JULY', 'AUG', 'SEPT', 'OCT'], dtype='object') 
 Index(['NOV', 'DEC'], dtype='object')


Unnamed: 0,windspeed_grow,windspeed_harvest
0,2.2764,1.479
1,2.1974,1.0875
2,1.88,1.123
3,2.4454,1.238
4,2.2446,1.051


## Mass of NPK fertilizer used

District-level fertilizer data are available separately for nitrogen, phosphorus, and potash in units of kilograms per hectare. For a growing season lasting $N_{\rm growing}$ days, we derive the season-averaged fertilizer use as follows. Because the cropped area is recorded in units of 1000 hectares, the resulting mass is in tonnes.

\begin{align}
\rm{Mass \ of \ fertilizer \ used \ during \ growing \ season} = \rm{(kg/ha \ of \  fertilizer)} \times \rm{(gross\  cropped\ area)} \times \frac{N_{\rm growing}}{365.25}
\end{align}

In [11]:
# Length of the rice growing season in days, counting both 1 June and 31 October.
season_start = datetime(2022, 6, 1)
season_end = datetime(2022, 10, 31)
N_growing = (season_end - season_start).days + 1  # +1 so the end date itself is counted

In [12]:
# Preview the last five rows of the fertilizer table.
df_npk.tail(n=5)

Unnamed: 0,Dist Code,Year,State Code,State Name,Dist Name,NITROGEN CONSUMPTION (tons),NITROGEN SHARE IN NPK (Percent),NITROGEN PER HA OF NCA (Kg per ha),NITROGEN PER HA OF GCA (Kg per ha),PHOSPHATE CONSUMPTION (tons),PHOSPHATE SHARE IN NPK (Percent),PHOSPHATE PER HA OF NCA (Kg per ha),PHOSPHATE PER HA OF GCA (Kg per ha),POTASH CONSUMPTION (tons),POTASH SHARE IN NPK (Percent),POTASH PER HA OF NCA (Kg per ha),POTASH PER HA OF GCA (Kg per ha),TOTAL CONSUMPTION (tons),TOTAL PER HA OF NCA (Kg per ha),TOTAL PER HA OF GCA (Kg per ha)
13901,2117,2017,20,Telangana,Vikarabad,8335.0,56.03,41.09,39.68,5537.0,37.22,27.3,26.36,1005.0,6.76,4.95,4.78,14877.0,73.34,70.83
13902,2118,2017,20,Telangana,Jayashankar Bhuppaly,8692.0,58.29,62.08,48.91,4139.0,27.76,29.56,23.29,2081.0,13.96,14.86,11.71,14912.0,106.5,83.9
13903,2119,2017,20,Telangana,Janagaon,8289.0,50.18,64.46,51.62,6024.0,36.46,46.85,37.51,2207.0,13.36,17.16,13.74,16520.0,128.47,102.88
13904,2120,2017,20,Telangana,Warangal Urban,101516.0,77.24,1582.85,1029.01,18827.0,14.32,293.55,190.84,11086.0,8.43,172.85,112.37,131429.0,2049.26,1332.22
13905,2121,2017,20,Telangana,Mahabubabad,7867.0,61.22,55.9,46.05,4118.0,32.05,29.26,24.1,865.0,6.73,6.15,5.06,12850.0,91.31,75.21


In [13]:
# Preview the last five rows of the rice table.
df_rice.tail(n=5)

Unnamed: 0,Dist Code,Year,State Code,State Name,Dist Name,RICE AREA (1000 ha),RICE PRODUCTION (1000 tons),RICE YIELD (kg per ha)
13960,2119,2017,20,Telangana,Janagaon,51.36,154.53,3008.8
13961,2120,2016,20,Telangana,Warangal Urban,37.97,137.92,3632.45
13962,2120,2017,20,Telangana,Warangal Urban,34.8,117.91,3388.63
13963,2121,2016,20,Telangana,Mahabubabad,47.78,159.52,3338.59
13964,2121,2017,20,Telangana,Mahabubabad,53.0,140.62,2653.18


We will first merge the rice yield and fertilizer use tables into a single data frame through their shared columns.

In [14]:
# Join the rice and fertilizer tables on the identifier columns they share.
# how='inner' keeps only key combinations present in both tables (SQL inner join);
# validate='one_to_one' makes pandas raise if a key is duplicated on either side.
shared_keys = ['Dist Code', 'Year', 'State Code', 'State Name', 'Dist Name']
df_rice_npk = pd.merge(df_rice, df_npk, how='inner', on=shared_keys, validate='one_to_one')

In [15]:
# Growing-season fertilizer mass for each nutrient.
#
# Units: the per-hectare columns are in kg/ha and the rice area is in units of
# 1000 ha, so their product is in units of 1000 kg, i.e. tonnes (not kg).
# The product is then scaled by the growing-season share of a year,
# N_growing / 365.25.
per_ha_columns = {
    'nitrogen': 'NITROGEN PER HA OF GCA (Kg per ha)',
    'phosphate': 'PHOSPHATE PER HA OF GCA (Kg per ha)',
    'potash': 'POTASH PER HA OF GCA (Kg per ha)',
}
rice_area = df_rice_npk['RICE AREA (1000 ha)']

for nutrient, per_ha_column in per_ha_columns.items():
    # Same left-to-right arithmetic for every nutrient: (kg/ha * area) * days / 365.25
    df_rice_npk[nutrient] = df_rice_npk[per_ha_column] * rice_area * N_growing / 365.25

### Building the final data set

`Dist Code`: District code <br>
`Year`: Year <br>
`Dist Name`: District name <br>
`State Name`: State name <br>
`RICE AREA (1000 ha)`: Gross cropped area of rice in units of 1000 hectares <br>
`RICE YIELD (kg per ha)`: Rice yield (kg/ha) <br>
`nitrogen`: tonnes of nitrogen fertilizer used <br>
`phosphate`: tonnes of phosphate fertilizer used <br>
`potash`: tonnes of potash fertilizer used <br>
`prec_grow`: Average monthly precipitation (mm) during growing season <br>
`prec_harvest`: Average monthly precipitation (mm) during harvesting season <br>
`et_grow`: Average monthly evapotranspiration (mm) during growing season <br>
`et_harvest`: Average monthly evapotranspiration (mm) during harvesting season <br>
`maxT_grow`: Average of monthly maximum temperatures (celsius) during growing season <br>
`maxT_harvest`: Average of monthly maximum temperatures (celsius) during harvesting season <br>
`minT_grow`: Average of monthly minimum temperatures (celsius) during growing season <br>
`minT_harvest`: Average of monthly minimum temperatures (celsius) during harvesting season <br>
`runoff_grow`: Mean surface runoff (mm) during growing season <br>
`runoff_harvest`: Mean surface runoff (mm) during harvesting season <br>
`windspeed_grow`: Mean wind speed (m/s) during growing season <br>
`windspeed_harvest`: Mean wind speed (m/s) during harvesting season <br>
`RICE IRRIGATED AREA (1000 ha)`: Rice-irrigated area in units of 1000 hectares <br>
`RA-IA`: Difference between cropped area and irrigated area of rice (1000 hectares) <br>

In [16]:
# Trim every table down to the columns that go into the final data set.
# Each trimmed table is an explicit .copy() and replaces the wider frame
# under the same name.
merge_keys = ['Dist Code', 'Year']

df_rice_npk = df_rice_npk.loc[:, merge_keys + ['Dist Name', 'State Name', 'RICE AREA (1000 ha)',
                                               'RICE YIELD (kg per ha)', 'nitrogen', 'phosphate',
                                               'potash']].copy()

df_prec = df_prec.loc[:, merge_keys + ['prec_grow', 'prec_harvest']].copy()

df_et = df_et.loc[:, merge_keys + ['et_grow', 'et_harvest']].copy()

df_maxT = df_maxT.loc[:, merge_keys + ['maxT_grow', 'maxT_harvest']].copy()

df_minT = df_minT.loc[:, merge_keys + ['minT_grow', 'minT_harvest']].copy()

df_runoff = df_runoff.loc[:, merge_keys + ['runoff_grow', 'runoff_harvest']].copy()

df_windspeed = df_windspeed.loc[:, merge_keys + ['windspeed_grow', 'windspeed_harvest']].copy()

df_irrigation = df_irrigation.loc[:, merge_keys + ['RICE IRRIGATED AREA (1000 ha)']].copy()

In [17]:
# Build the single combined table: start from the rice/fertilizer frame and
# inner-join each remaining table onto it, one after another, on
# (Dist Code, Year). validate='one_to_one' makes pandas raise if a key is
# duplicated on either side of any join.
final_df = df_rice_npk
for frame in (df_prec, df_et, df_maxT, df_minT, df_runoff, df_windspeed, df_irrigation):
    final_df = final_df.merge(frame, on=['Dist Code', 'Year'], validate='one_to_one', how='inner')

In [18]:
# Rice area minus rice-irrigated area (both in units of 1000 ha).
# NOTE(review): the sampled output shows some negative values (irrigated area
# larger than rice area) -- confirm whether those rows should be kept.
final_df['RA-IA'] = final_df['RICE AREA (1000 ha)'].sub(final_df['RICE IRRIGATED AREA (1000 ha)'])

In [19]:
# Show ten randomly chosen rows; the fixed random_state makes the draw repeatable.
final_df.sample(n=10, random_state=570)

Unnamed: 0,Dist Code,Year,Dist Name,State Name,RICE AREA (1000 ha),RICE YIELD (kg per ha),nitrogen,phosphate,potash,prec_grow,...,maxT_grow,maxT_harvest,minT_grow,minT_harvest,runoff_grow,runoff_harvest,windspeed_grow,windspeed_harvest,RICE IRRIGATED AREA (1000 ha),RA-IA
2144,89,2009,Thanjavur,Tamil Nadu,162.94,2621.0,9511.895182,3342.404614,4510.232733,85.166,...,34.956,29.15,26.104,22.695,4.258,27.655,2.3984,1.839,162.94,0.0
5935,526,2002,Rewari,Haryana,0.3,0.0,9.925207,5.54947,0.081684,50.774,...,35.26,26.44,26.05,11.345,2.54,0.355,1.4574,0.5115,0.34,-0.04
1052,44,2013,Srikakulam,Andhra Pradesh,203.14,1749.0,7254.225339,2114.574776,816.047167,255.034,...,31.404,28.085,25.262,17.96,86.736,0.335,1.8844,1.3,175.41,27.73
5747,517,2005,Sindhudurg,Maharashtra,78.7,2849.0,915.815901,342.854045,303.293963,1103.81,...,28.932,30.82,23.142,18.7,936.97,0.05,3.11,1.8115,4.2,74.5
175,8,2009,Balaghat,Madhya Pradesh,247.8,1365.0,4428.160559,2895.016361,233.552772,245.858,...,31.658,27.77,23.57,12.48,92.75,0.49,1.5286,0.615,130.82,116.98
4683,217,2008,Mirzpur,Uttar Pradesh,102.48,2068.0,3271.540376,2013.750939,418.976959,201.284,...,33.062,29.605,25.142,13.4,62.224,0.02,0.915,0.379,83.51,18.97
5247,239,2000,Pithorgarh,Uttarakhand,23.05,1157.0,20.276427,10.234768,8.786452,239.322,...,16.906,6.895,8.37,-2.645,172.952,0.53,2.083,1.324,4.49,18.56
922,39,2011,Vidisha,Madhya Pradesh,0.4,1000.0,3.734834,3.671162,0.234579,233.316,...,32.034,28.475,23.126,13.395,100.286,0.01,1.4346,0.748,0.0,0.4
4561,211,2015,Jhansi,Uttar Pradesh,9.22,1785.0,446.081396,167.425355,31.553983,128.248,...,35.126,28.32,25.712,11.625,29.7,0.05,1.1232,0.555,10.27,-1.05
2538,111,2001,Nanded,Maharashtra,23.9,1021.0,702.306653,343.594645,167.292148,152.586,...,31.914,30.105,22.774,15.63,10.28,0.055,2.0088,1.002,0.0,23.9


In [20]:
# Number of rows in the combined table.
final_df.shape[0]

10480

In [21]:
# Save the combined table as CSV; index=False leaves the row index out of the file.
output_csv = '../Final_data/rice_yield.csv'
final_df.to_csv(output_csv, index=False)