## Data Preparation for Rice Yield Modeling

In this notebook, we will compile data from multiple sources into a single grand `.csv` file to facilitate modeling.

In [118]:
# Load required packages.
import numpy as np
import pandas as pd
from datetime import datetime

import os

In [119]:
 # Show the kernel's working directory (the recorded output reveals a local user path).
 os.path.abspath(os.curdir)

'/Users/darbinyan/Jupyter notebooks/Bootcamp joint project'

In [120]:
# Raw data files
#
# Paths are resolved relative to the notebook's working directory (the project
# root) instead of a hard-coded absolute path under one user's home folder, so
# the notebook runs on any machine that has the same `Raw_data/` layout.
RAW_DATA_DIR = 'Raw_data'

# Rice yield
df_rice = pd.read_csv(os.path.join(RAW_DATA_DIR, 'ICRISAT_rice.csv'))

# Monthly precipitation (mm)
df_prec = pd.read_csv(os.path.join(RAW_DATA_DIR, 'ICRISAT_prec.csv'))

# Monthly evapotranspiration (mm)
df_et = pd.read_csv(os.path.join(RAW_DATA_DIR, 'ICRISAT_evapotranspiration_actual.csv'))

# Monthly max temperature (celsius)
df_maxT = pd.read_csv(os.path.join(RAW_DATA_DIR, 'ICRISAT_max_temp.csv'))

# Monthly min temperature (celsius)
df_minT = pd.read_csv(os.path.join(RAW_DATA_DIR, 'ICRISAT_min_temp.csv'))

# Surface runoff (mm)
df_runoff = pd.read_csv(os.path.join(RAW_DATA_DIR, 'ICRISAT_runoff.csv'))

# Month-averaged wind speed (m/s)
df_windspeed = pd.read_csv(os.path.join(RAW_DATA_DIR, 'ICRISAT_windspeed.csv'))

# Fertilizer use
df_npk = pd.read_csv(os.path.join(RAW_DATA_DIR, 'ICRISAT_npk.csv'))

# Irrigation data (added 05/26)
df_irrig = pd.read_csv(os.path.join(RAW_DATA_DIR, 'ICRISAT_irrig.csv'))

ICRISAT databases use the key `-1` to denote missing values. We will first replace all instances of `-1` with NaNs, and then use the `dropna()` method of Pandas DataFrame to drop all rows with missing values.

In [121]:
# Data cleaning
#
# Replace the -1 missing-value sentinel with NaN and drop every row that then
# contains a NaN. Both steps mutate the tables in place so the variable names
# used by later cells keep pointing at the cleaned data.
#
# `np.nan` is used instead of the `np.NaN` alias, which was removed in
# NumPy 2.0 and raises AttributeError there.
#
# NOTE(review): df_maxT and df_minT are not cleaned here (unchanged from the
# original cell). Presumably -1 can be a genuine temperature reading, but if
# -1 also marks missing values in those tables it will leak into the seasonal
# means — confirm against the ICRISAT documentation.
tables_to_clean = (
    df_rice,       # Area, production, and yield of rice
    df_prec,       # Precipitation
    df_et,         # Evapotranspiration
    df_runoff,     # Surface runoff
    df_windspeed,  # Wind speed
    df_npk,        # NPK fertilizer data
    df_irrig,      # Irrigation data (added 05/26)
)

for table in tables_to_clean:
    table.replace(-1, np.nan, inplace=True)
    table.dropna(inplace=True)

In [122]:
# Discard (district, year) rows whose reported rice area is zero or negative.
has_no_rice_area = df_rice['RICE AREA (1000 ha)'] <= 0
df_rice.drop(index=df_rice.index[has_no_rice_area.to_numpy()], inplace=True)

In [123]:
#df_npk.loc[df_npk['Dist Name']=='Thanjavur']

## Average quantities over growing and harvesting seasons.

Rice is a Kharif (monsoon) crop that is typically sown during June &ndash; July and harvested during November &ndash; December every year.

For convenience, let us define the following seasons for our analysis.
1. Rice growing season: June &ndash; October
2. Rice harvesting season: November &ndash; December

We will now construct averages of different variables over the growing and harvesting seasons separately.

In [124]:
# Precipitation data
# Positional column ranges of the monthly values: 10-14 hold June-October and
# 15-16 hold November-December (checked by printing the column names below).
prec_grow_cols = slice(10, 15)
prec_harvest_cols = slice(15, 17)
print(len(df_prec.columns), '\n', df_prec.columns[prec_grow_cols], '\n', df_prec.columns[prec_harvest_cols])

# Row-wise mean over the growing-season months
df_prec['prec_grow'] = df_prec.iloc[:, prec_grow_cols].mean(axis=1)

# Row-wise mean over the harvesting-season months
df_prec['prec_harvest'] = df_prec.iloc[:, prec_harvest_cols].mean(axis=1)

df_prec[['prec_grow', 'prec_harvest']].head()

17 
 Index(['JUNE PRECIPITATION (Millimeters)', 'JULY PRECIPITATION (Millimeters)',
       'AUGUST PRECIPITATION (Millimeters)',
       'SEPTEMBER PRECIPITATION (Millimeters)',
       'OCTOBER PRECIPITATION (Millimeters)'],
      dtype='object') 
 Index(['NOVEMBER PRECIPITATION (Millimeters)', 'DECEMBER PRECIPITATION (Millimeters)'], dtype='object')


Unnamed: 0,prec_grow,prec_harvest
0,267.152,2.16
1,296.12,0.53
2,230.264,0.0
3,334.662,4.13
4,194.122,131.625


In [125]:
# Evapotranspiration data
# Positional column ranges of the monthly values: 10-14 hold June-October and
# 15-16 hold November-December (checked by printing the column names below).
et_grow_cols = slice(10, 15)
et_harvest_cols = slice(15, 17)
print(len(df_et.columns), '\n', df_et.columns[et_grow_cols], '\n', df_et.columns[et_harvest_cols])

# Row-wise mean over the growing-season months
df_et['et_grow'] = df_et.iloc[:, et_grow_cols].mean(axis=1)

# Row-wise mean over the harvesting-season months
df_et['et_harvest'] = df_et.iloc[:, et_harvest_cols].mean(axis=1)

df_et[['et_grow', 'et_harvest']].head()

17 
 Index(['JUNE ACTUAL (Millimeters)', 'JULY ACTUAL (Millimeters)',
       'AUGUST ACTUAL (Millimeters)', 'SEPTEMBER ACTUAL (Millimeters)',
       'OCTOBER ACTUAL (Millimeters)'],
      dtype='object') 
 Index(['NOVEMBER ACTUAL (Millimeters)', 'DECEMBER ACTUAL (Millimeters)'], dtype='object')


Unnamed: 0,et_grow,et_harvest
0,109.302,49.39
1,121.01,45.755
2,121.182,47.015
3,121.95,47.94
4,98.358,53.29


In [126]:
# Mean maximum temperature
# Positional column ranges of the monthly values: 10-14 hold June-October and
# 15-16 hold November-December (checked by printing the column names below).
# NOTE(review): df_maxT is not passed through the -1 -> NaN cleaning step
# earlier in the notebook; confirm no sentinel values enter these means.
maxT_grow_cols = slice(10, 15)
maxT_harvest_cols = slice(15, 17)
print(len(df_maxT.columns), '\n', df_maxT.columns[maxT_grow_cols], '\n', df_maxT.columns[maxT_harvest_cols])

# Row-wise mean over the growing-season months
df_maxT['maxT_grow'] = df_maxT.iloc[:, maxT_grow_cols].mean(axis=1)

# Row-wise mean over the harvesting-season months
df_maxT['maxT_harvest'] = df_maxT.iloc[:, maxT_harvest_cols].mean(axis=1)

df_maxT[['maxT_grow', 'maxT_harvest']].head()

17 
 Index(['JUNE MAXIMUM (Centigrate)', 'JULY MAXIMUM (Centigrate)',
       'AUGUST MAXIMUM (Centigrate)', 'SEPTEMBER MAXIMUM (Centigrate)',
       'OCTOBER MAXIMUM (Centigrate)'],
      dtype='object') 
 Index(['NOVEMBER MAXIMUM (Centigrate)', 'DECEMBER MAXIMUM (Centigrate)'], dtype='object')


Unnamed: 0,maxT_grow,maxT_harvest
0,32.144,28.7
1,31.34,28.19
2,31.732,28.54
3,30.942,27.26
4,31.382,28.21


In [127]:
# Mean minimum temperature
# Positional column ranges of the monthly values: 10-14 hold June-October and
# 15-16 hold November-December (checked by printing the column names below).
# NOTE(review): df_minT is not passed through the -1 -> NaN cleaning step
# earlier in the notebook; confirm no sentinel values enter these means.
minT_grow_cols = slice(10, 15)
minT_harvest_cols = slice(15, 17)
print(len(df_minT.columns), '\n', df_minT.columns[minT_grow_cols], '\n', df_minT.columns[minT_harvest_cols])

# Row-wise mean over the growing-season months
df_minT['minT_grow'] = df_minT.iloc[:, minT_grow_cols].mean(axis=1)

# Row-wise mean over the harvesting-season months
df_minT['minT_harvest'] = df_minT.iloc[:, minT_harvest_cols].mean(axis=1)

df_minT[['minT_grow', 'minT_harvest']].head()

17 
 Index(['JUNE MINIMUM (Centigrate)', 'JULY MINIMUM (Centigrate)',
       'AUGUST MINIMUM (Centigrate)', 'SEPTEMBER MINIMUM (Centigrate)',
       'OCTOBER MINIMUM (Centigrate)'],
      dtype='object') 
 Index(['NOVEMBER MINIMUM (Centigrate)', 'DECEMBER MINIMUM (Centigrate)'], dtype='object')


Unnamed: 0,minT_grow,minT_harvest
0,24.084,14.45
1,23.71,13.825
2,23.842,13.6
3,23.126,12.53
4,23.48,13.495


In [128]:
# Mean surface runoff
# This table has one column fewer than the precipitation/temperature tables,
# so the monthly values sit one position earlier: 9-13 hold June-October and
# 14-15 hold November-December (checked by printing the column names below).
runoff_grow_cols = slice(9, 14)
runoff_harvest_cols = slice(14, 16)
print(len(df_runoff.columns), '\n', df_runoff.columns[runoff_grow_cols], '\n', df_runoff.columns[runoff_harvest_cols])

# Row-wise mean over the growing-season months
df_runoff['runoff_grow'] = df_runoff.iloc[:, runoff_grow_cols].mean(axis=1)

# Row-wise mean over the harvesting-season months
df_runoff['runoff_harvest'] = df_runoff.iloc[:, runoff_harvest_cols].mean(axis=1)

df_runoff[['runoff_grow', 'runoff_harvest']].head()

16 
 Index(['JUNE Q (mm)', 'JULY Q (mm)', 'AUG Q (mm)', 'SEPT Q (mm)',
       'OCT Q (mm)'],
      dtype='object') 
 Index(['NOV Q (mm)', 'DEC Q (mm)'], dtype='object')


Unnamed: 0,runoff_grow,runoff_harvest
0,99.178,30.02
1,40.292,0.05
2,26.236,0.38
3,52.416,0.705
4,137.788,0.585


In [129]:
# Mean wind speed
# Positional column ranges of the monthly values: 9-13 hold June-October and
# 14-15 hold November-December (checked by printing the column names below).
windspeed_grow_cols = slice(9, 14)
windspeed_harvest_cols = slice(14, 16)
print(len(df_windspeed.columns), '\n', df_windspeed.columns[windspeed_grow_cols], '\n', df_windspeed.columns[windspeed_harvest_cols])

# Row-wise mean over the growing-season months
df_windspeed['windspeed_grow'] = df_windspeed.iloc[:, windspeed_grow_cols].mean(axis=1)

# Row-wise mean over the harvesting-season months
df_windspeed['windspeed_harvest'] = df_windspeed.iloc[:, windspeed_harvest_cols].mean(axis=1)

df_windspeed[['windspeed_grow', 'windspeed_harvest']].head()

16 
 Index(['JUNE', 'JULY', 'AUG', 'SEPT', 'OCT'], dtype='object') 
 Index(['NOV', 'DEC'], dtype='object')


Unnamed: 0,windspeed_grow,windspeed_harvest
0,2.2764,1.479
1,2.1974,1.0875
2,1.88,1.123
3,2.4454,1.238
4,2.2446,1.051


## Mass of NPK fertilizer used

District-level fertilizer data are available separately for nitrogen, phosphorus, and potash in units of kilogram per hectare. For a growing season lasting $N_{\rm growing}$ days, we derive the season-averaged fertilizer use as follows.

\begin{align}
\rm{Mass \ of \ fertilizer \ used \ during \ growing \ season} = \rm{(kg/ha \ of \  fertilizer)} \times \rm{(gross\  cropped\ area)} \times \frac{N_{\rm growing}}{365.25}
\end{align}

In [130]:
# Length of the rice growing season in days, counting both 1 June and
# 31 October (hence the + 1 on the date difference).
season_start = datetime(2022, 6, 1)
season_end = datetime(2022, 10, 31)
N_growing = (season_end - season_start).days + 1

In [131]:
# Peek at the last five rows of the fertilizer table.
df_npk.iloc[-5:]

Unnamed: 0,Dist Code,Year,State Code,State Name,Dist Name,NITROGEN CONSUMPTION (tons),NITROGEN SHARE IN NPK (Percent),NITROGEN PER HA OF NCA (Kg per ha),NITROGEN PER HA OF GCA (Kg per ha),PHOSPHATE CONSUMPTION (tons),PHOSPHATE SHARE IN NPK (Percent),PHOSPHATE PER HA OF NCA (Kg per ha),PHOSPHATE PER HA OF GCA (Kg per ha),POTASH CONSUMPTION (tons),POTASH SHARE IN NPK (Percent),POTASH PER HA OF NCA (Kg per ha),POTASH PER HA OF GCA (Kg per ha),TOTAL CONSUMPTION (tons),TOTAL PER HA OF NCA (Kg per ha),TOTAL PER HA OF GCA (Kg per ha)
13901,2117,2017,20,Telangana,Vikarabad,8335.0,56.03,41.09,39.68,5537.0,37.22,27.3,26.36,1005.0,6.76,4.95,4.78,14877.0,73.34,70.83
13902,2118,2017,20,Telangana,Jayashankar Bhuppaly,8692.0,58.29,62.08,48.91,4139.0,27.76,29.56,23.29,2081.0,13.96,14.86,11.71,14912.0,106.5,83.9
13903,2119,2017,20,Telangana,Janagaon,8289.0,50.18,64.46,51.62,6024.0,36.46,46.85,37.51,2207.0,13.36,17.16,13.74,16520.0,128.47,102.88
13904,2120,2017,20,Telangana,Warangal Urban,101516.0,77.24,1582.85,1029.01,18827.0,14.32,293.55,190.84,11086.0,8.43,172.85,112.37,131429.0,2049.26,1332.22
13905,2121,2017,20,Telangana,Mahabubabad,7867.0,61.22,55.9,46.05,4118.0,32.05,29.26,24.1,865.0,6.73,6.15,5.06,12850.0,91.31,75.21


In [132]:
# Peek at the last five rows of the rice table.
df_rice.iloc[-5:]

Unnamed: 0,Dist Code,Year,State Code,State Name,Dist Name,RICE AREA (1000 ha),RICE PRODUCTION (1000 tons),RICE YIELD (kg per ha)
13960,2119,2017,20,Telangana,Janagaon,51.36,154.53,3008.8
13961,2120,2016,20,Telangana,Warangal Urban,37.97,137.92,3632.45
13962,2120,2017,20,Telangana,Warangal Urban,34.8,117.91,3388.63
13963,2121,2016,20,Telangana,Mahabubabad,47.78,159.52,3338.59
13964,2121,2017,20,Telangana,Mahabubabad,53.0,140.62,2653.18


We will first merge the rice yield and fertilizer use tables into a single data frame through their shared columns.

In [133]:
# SQL-style inner join of rice and fertilizer records on their shared
# identifier columns; validate='one_to_one' makes pandas raise if either
# table has duplicate keys.
rice_npk_keys = ['Dist Code', 'Year', 'State Code', 'State Name', 'Dist Name']
df_rice_npk = pd.merge(
    df_rice,
    df_npk,
    how='inner',
    on=rice_npk_keys,
    validate='one_to_one',
)

In [134]:
# Spot check: did the Thanjavur district survive the inner join?
bool(df_rice_npk['Dist Name'].eq('Thanjavur').any())

True

In [135]:
# Season-scaled fertilizer quantity for each nutrient:
#   (kg per ha of gross cropped area) x (rice area in 1000 ha) x N_growing / 365.25
# Because the area is expressed in units of 1000 ha, the result is in
# thousands of kg (i.e. tonnes), not kg.
rice_area = df_rice_npk['RICE AREA (1000 ha)']
per_ha_columns = {
    'nitrogen': 'NITROGEN PER HA OF GCA (Kg per ha)',
    'phosphate': 'PHOSPHATE PER HA OF GCA (Kg per ha)',
    'potash': 'POTASH PER HA OF GCA (Kg per ha)',
}
for nutrient, per_ha_column in per_ha_columns.items():
    # Evaluated left to right: ((rate * area) * N_growing) / 365.25
    df_rice_npk[nutrient] = df_rice_npk[per_ha_column] * rice_area * N_growing / 365.25

### Building the final data set

`Dist Code`: District code <br>
`Year`: Year <br>
`Dist Name`: District name <br>
`State Name`: State name <br>
`RICE AREA (1000 ha)`: Gross cropped area of rice in units of 1000 hectares <br>
`RICE YIELD (kg per ha)`: Rice yield (kg/ha) <br>
`nitrogen`: tonnes of nitrogen fertilizer used <br>
`phosphate`: tonnes of phosphate fertilizer used <br>
`potash`: tonnes of potash fertilizer used <br>
`prec_grow`: Average monthly precipitation (mm) during growing season <br>
`prec_harvest`: Average monthly precipitation (mm) during harvesting season <br>
`et_grow`: Average monthly evapotranspiration (mm) during growing season <br>
`et_harvest`: Average monthly evapotranspiration (mm) during harvesting season <br>
`maxT_grow`: Average of monthly maximum temperatures (celsius) during growing season <br>
`maxT_harvest`: Average of monthly maximum temperatures (celsius) during harvesting season <br>
`minT_grow`: Average of monthly minimum temperatures (celsius) during growing season <br>
`minT_harvest`: Average of monthly minimum temperatures (celsius) during harvesting season <br>
`runoff_grow`: Mean surface runoff (mm) during growing season <br>
`runoff_harvest`: Mean surface runoff (mm) during harvesting season <br>
`windspeed_grow`: Mean wind speed (m/s) during growing season <br>
`windspeed_harvest`: Mean wind speed (m/s) during harvesting season <br>

`RICE IRRIGATED AREA (1000 ha)`: Irrigated area (1000 ha) per district <br>
`RA-IA`: The difference between rice area (RA) and irrigated area (IA) (in units of 1000 ha) per district <br>

In [136]:
# Trim every table down to the join keys plus the columns that go into the
# final data set. `.copy()` gives each result its own data, independent of
# the wider table it was sliced from.
district_year = ['Dist Code', 'Year']

rice_npk_columns = [
    'Dist Code', 'Year', 'Dist Name', 'State Name',
    'RICE AREA (1000 ha)', 'RICE YIELD (kg per ha)',
    'nitrogen', 'phosphate', 'potash',
]
df_rice_npk = df_rice_npk[rice_npk_columns].copy()

# Seasonal climate summaries
df_prec = df_prec[district_year + ['prec_grow', 'prec_harvest']].copy()
df_et = df_et[district_year + ['et_grow', 'et_harvest']].copy()
df_maxT = df_maxT[district_year + ['maxT_grow', 'maxT_harvest']].copy()
df_minT = df_minT[district_year + ['minT_grow', 'minT_harvest']].copy()
df_runoff = df_runoff[district_year + ['runoff_grow', 'runoff_harvest']].copy()
df_windspeed = df_windspeed[district_year + ['windspeed_grow', 'windspeed_harvest']].copy()

# Irrigated rice area
df_irrig = df_irrig[district_year + ["RICE IRRIGATED AREA (1000 ha)"]].copy()

In [137]:
# Fold every per-(district, year) table into one grand DataFrame.
# Each step is an inner join on the district/year key, so a row survives only
# if it is present in all tables; validate='one_to_one' makes pandas raise if
# a key is duplicated on either side.
merge_keys = ['Dist Code', 'Year']
tables_to_join = (df_prec, df_et, df_maxT, df_minT, df_runoff, df_windspeed, df_irrig)

final_df = df_rice_npk
for table in tables_to_join:
    final_df = final_df.merge(table, on=merge_keys, validate='one_to_one', how='inner')

# Rice area minus irrigated rice area (both in 1000 ha), computed on the raw
# arrays so no index alignment is involved.
rice_area = final_df["RICE AREA (1000 ha)"].to_numpy()
irrigated_area = final_df["RICE IRRIGATED AREA (1000 ha)"].to_numpy()
final_df['RA-IA'] = rice_area - irrigated_area

In [138]:
# Reproducible random peek at ten rows of the merged data set.
final_df.sample(n=10, random_state=570)

Unnamed: 0,Dist Code,Year,Dist Name,State Name,RICE AREA (1000 ha),RICE YIELD (kg per ha),nitrogen,phosphate,potash,prec_grow,...,maxT_grow,maxT_harvest,minT_grow,minT_harvest,runoff_grow,runoff_harvest,windspeed_grow,windspeed_harvest,RICE IRRIGATED AREA (1000 ha),RA-IA
6275,908,2011,Mungair,Bihar,29.37,2122.0,1234.835416,289.731733,136.06934,233.094,...,32.376,28.29,25.532,13.75,83.67,0.01,1.415,0.623,111.55,-82.18
2636,117,2001,Wardha,Maharashtra,0.5,600.0,10.04501,4.626653,3.248501,175.45,...,32.006,29.48,23.214,15.225,23.63,0.005,2.0298,0.9265,0.0,0.5
1924,81,1997,Uttara Kannada,Karnataka,90.31,1527.0,1157.599885,696.451434,373.382708,524.684,...,28.006,29.425,21.318,19.58,365.698,3.445,2.6968,1.54,11.85,78.46
2184,91,1997,Ramananthapuram,Tamil Nadu,144.63,3322.0,3240.650462,638.557784,515.571797,80.48,...,35.146,30.015,26.96,24.63,4.022,31.45,3.7554,1.4395,171.31,-26.68
4298,207,2009,Etawah,Uttar Pradesh,42.98,2594.0,1902.116526,1004.800031,190.661751,125.004,...,35.292,27.375,26.338,12.005,8.03,0.255,1.3574,0.613,80.1,-37.12
4737,227,2006,Rae - Bareily,Uttar Pradesh,130.84,1829.0,5068.618014,1725.347049,231.836659,112.262,...,33.768,26.98,25.804,12.66,29.57,0.03,1.14,0.6935,130.78,0.06
5571,652,2014,Kannur,Kerala,4.96,2250.0,49.802474,24.18443,28.755371,1036.732,...,28.99,30.215,23.108,21.48,855.504,1.98,2.382,1.9375,6.68,-1.72
5188,244,2015,Dehradun,Uttarakhand,10.25,2159.0,311.975483,62.901745,5.882279,325.048,...,27.518,20.235,21.272,10.23,199.074,0.155,1.1372,0.5065,9.84,0.41
6000,709,2001,Mayurbhanja,Orissa,350.63,1535.0,2731.890086,841.598398,619.815923,270.902,...,30.544,27.985,24.232,14.76,122.492,0.61,1.224,0.556,78.77,271.86
4793,229,2010,Hardoi,Uttar Pradesh,141.17,2045.0,6410.81088,1932.527438,527.483009,95.548,...,34.21,26.89,26.462,11.83,4.776,0.12,1.0158,0.5175,140.94,0.23


In [139]:
# Number of (district, year) rows in the merged data set.
final_df.shape[0]

6411

In [142]:
# Write grand data set to file.
# The path is relative to the notebook's working directory (the project root)
# rather than a hard-coded absolute path under one user's home folder. The
# output folder is created if it does not exist yet.
final_data_dir = 'Final_data'
os.makedirs(final_data_dir, exist_ok=True)
final_df.to_csv(os.path.join(final_data_dir, 'rice_yield.csv'), index=False)

In [144]:
# Read the written file back as a sanity check. The path is relative to the
# notebook's working directory (the project root) rather than a hard-coded
# absolute path under one user's home folder.
pd.read_csv(os.path.join('Final_data', 'rice_yield.csv'))

Unnamed: 0,Dist Code,Year,Dist Name,State Name,RICE AREA (1000 ha),RICE YIELD (kg per ha),nitrogen,phosphate,potash,prec_grow,...,maxT_grow,maxT_harvest,minT_grow,minT_harvest,runoff_grow,runoff_harvest,windspeed_grow,windspeed_harvest,RICE IRRIGATED AREA (1000 ha),RA-IA
0,1,1990,Durg,Chhattisgarh,397.90,1210.0,2590.157421,1283.411335,421.692296,298.092,...,31.112,28.905,23.310,15.020,138.024,0.315,1.6064,0.9355,218.29,179.61
1,1,1991,Durg,Chhattisgarh,393.20,1293.0,2692.975934,2142.851187,765.892238,227.654,...,31.796,27.900,23.908,13.965,78.862,0.205,1.7214,0.8190,261.90,131.30
2,1,1992,Durg,Chhattisgarh,398.40,1291.0,3828.370398,2318.049906,904.523433,205.528,...,32.140,28.450,24.106,14.220,72.598,0.040,1.5902,0.6540,273.01,125.39
3,1,1993,Durg,Chhattisgarh,410.20,1387.0,3235.543047,1864.346366,439.882645,242.886,...,32.224,27.970,24.338,13.855,86.954,0.000,1.9704,0.9485,283.19,127.01
4,1,1994,Durg,Chhattisgarh,430.10,1399.0,3967.235335,1871.915310,491.850702,318.698,...,31.282,28.110,23.310,13.975,157.294,0.000,2.0116,0.9465,286.47,143.63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6406,916,2011,Ranchi,Jharkhand,95.51,2590.0,7210.695132,3127.848557,950.197023,296.180,...,30.352,26.340,22.982,12.240,157.024,0.000,1.0998,0.5855,1.49,94.02
6407,916,2012,Ranchi,Jharkhand,114.29,2509.0,6483.720983,2423.436138,254.695382,251.824,...,30.928,26.085,22.712,11.095,117.830,0.460,1.1018,0.6765,6.75,107.54
6408,916,2013,Ranchi,Jharkhand,105.60,2340.0,4056.341027,935.568296,203.480575,274.302,...,30.408,24.665,22.514,11.095,125.126,0.000,1.1202,0.5845,3.06,102.54
6409,916,2014,Ranchi,Jharkhand,87.26,2608.0,5167.418942,2355.073937,414.139185,204.312,...,30.862,24.460,22.424,11.440,69.912,0.000,1.1268,0.6230,2.93,84.33
