In [4]:
import pandas as pd
crop_df = pd.read_csv("../data/processed/crop_clean.csv")
temp_df = pd.read_csv("../data/processed/temperature_clean.csv")
rain_df = pd.read_csv("../data/processed/rainfall_clean.csv")



In [10]:
# Merge rainfall and temperature on Year
climate_df = pd.merge(
    rain_df,
    temp_df,
    on="Year",
    how="inner"
)

# Merge with crop data ONLY on Year
final_df = pd.merge(
    climate_df,
    crop_df,
    on="Year",
    how="inner"
)

print(len(final_df))
final_df.head()



18684


Unnamed: 0,State_x,Year,Annual_Rainfall,Avg_Temperature,State_y,Total_Production
0,ANDAMAN & NICOBAR ISLANDS,1997,2755.1,29.179167,Andhra Pradesh,21093500.0
1,ANDAMAN & NICOBAR ISLANDS,1997,2755.1,29.179167,Arunachal Pradesh,267148.0
2,ANDAMAN & NICOBAR ISLANDS,1997,2755.1,29.179167,Assam,5778334.0
3,ANDAMAN & NICOBAR ISLANDS,1997,2755.1,29.179167,Bihar,19031372.0
4,ANDAMAN & NICOBAR ISLANDS,1997,2755.1,29.179167,Goa,72538.0


In [17]:
# we are taking Average rainfall per rainfall-region (State_x)
final_df["Avg_Rainfall_State"] = final_df.groupby("State_x")[
    "Annual_Rainfall"
].transform("mean")

# Difference from average rainfall
final_df["Rainfall_Deviation"] = (
    final_df["Annual_Rainfall"] - final_df["Avg_Rainfall_State"]
)

final_df.head()




Unnamed: 0,State_x,Year,Annual_Rainfall,Avg_Temperature,State_y,Total_Production,Temp_Level,Avg_Rainfall_State,Rainfall_Deviation
0,ANDAMAN & NICOBAR ISLANDS,1997,2755.1,29.179167,Andhra Pradesh,21093500.0,High,2890.44605,-135.34605
1,ANDAMAN & NICOBAR ISLANDS,1997,2755.1,29.179167,Arunachal Pradesh,267148.0,High,2890.44605,-135.34605
2,ANDAMAN & NICOBAR ISLANDS,1997,2755.1,29.179167,Assam,5778334.0,High,2890.44605,-135.34605
3,ANDAMAN & NICOBAR ISLANDS,1997,2755.1,29.179167,Bihar,19031372.0,High,2890.44605,-135.34605
4,ANDAMAN & NICOBAR ISLANDS,1997,2755.1,29.179167,Goa,72538.0,High,2890.44605,-135.34605


In [20]:
# Simple temperature category
def temp_category(temp):
    if temp < 24:
        return "Low"
    elif temp < 28:
        return "Medium"
    else:
        return "High"

final_df["Temp_Level"] = final_df["Avg_Temperature"].apply(temp_category)

final_df.tail()


Unnamed: 0,State_x,Year,Annual_Rainfall,Avg_Temperature,State_y,Total_Production,Temp_Level,Avg_Rainfall_State,Rainfall_Deviation
18679,LAKSHADWEEP,2014,1395.0,29.723333,Uttar Pradesh,199418844.0,High,1602.065125,-207.065125
18680,LAKSHADWEEP,2014,1395.0,29.723333,Uttarakhand,11873861.0,High,1602.065125,-207.065125
18681,LAKSHADWEEP,2014,1395.0,29.723333,West Bengal,43584403.0,High,1602.065125,-207.065125
18682,LAKSHADWEEP,2015,1642.9,29.895,Odisha,6831648.7,High,1602.065125,40.834875
18683,LAKSHADWEEP,2015,1642.9,29.895,Sikkim,103416.0,High,1602.065125,40.834875


In [21]:
import numpy as np

# we apply log transformations to reduce the skewness of the production data
final_df["Log_Production"] = np.log1p(final_df["Total_Production"])

final_df.head()


Unnamed: 0,State_x,Year,Annual_Rainfall,Avg_Temperature,State_y,Total_Production,Temp_Level,Avg_Rainfall_State,Rainfall_Deviation,Log_Production
0,ANDAMAN & NICOBAR ISLANDS,1997,2755.1,29.179167,Andhra Pradesh,21093500.0,High,2890.44605,-135.34605,16.864476
1,ANDAMAN & NICOBAR ISLANDS,1997,2755.1,29.179167,Arunachal Pradesh,267148.0,High,2890.44605,-135.34605,12.495562
2,ANDAMAN & NICOBAR ISLANDS,1997,2755.1,29.179167,Assam,5778334.0,High,2890.44605,-135.34605,15.569626
3,ANDAMAN & NICOBAR ISLANDS,1997,2755.1,29.179167,Bihar,19031372.0,High,2890.44605,-135.34605,16.761599
4,ANDAMAN & NICOBAR ISLANDS,1997,2755.1,29.179167,Goa,72538.0,High,2890.44605,-135.34605,11.19188


In [22]:

print(final_df.columns)

print( len(final_df))
print( final_df.shape[1])


Index(['State_x', 'Year', 'Annual_Rainfall', 'Avg_Temperature', 'State_y',
       'Total_Production', 'Temp_Level', 'Avg_Rainfall_State',
       'Rainfall_Deviation', 'Log_Production'],
      dtype='object')
18684
10


In [23]:
from pathlib import Path

processed_dir = Path("../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)

final_df.to_csv(processed_dir / "engineered_features.csv", index=False)

