In [1]:
import pandas as pd

# Read the three raw CSV inputs (crop production, temperatures, rainfall)
# in a single pass; the tuple order matches the names being unpacked.
crop_df, temp_df, rain_df = map(
    pd.read_csv,
    (
        "../data/raw/crop_production.csv",
        "../data/raw/temperatures.csv",
        "../data/raw/rainfall.csv",
    ),
)



In [2]:
# Show the column index of each raw frame under its own heading so the
# schemas can be compared before any cleaning is done.
for heading, frame in (
    ("Crop columns:", crop_df),
    ("\nTemperature columns:", temp_df),
    ("\nRainfall columns:", rain_df),
):
    print(heading)
    print(frame.columns)


Crop columns:
Index(['State_Name', 'District_Name', 'Crop_Year', 'Season', 'Crop', 'Area',
       'Production'],
      dtype='object')

Temperature columns:
Index(['YEAR', 'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP',
       'OCT', 'NOV', 'DEC', 'ANNUAL', 'JAN-FEB', 'MAR-MAY', 'JUN-SEP',
       'OCT-DEC'],
      dtype='object')

Rainfall columns:
Index(['SUBDIVISION', 'YEAR', 'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL',
       'AUG', 'SEP', 'OCT', 'NOV', 'DEC', 'ANNUAL', 'Jan-Feb', 'Mar-May',
       'Jun-Sep', 'Oct-Dec'],
      dtype='object')


In [3]:
# Crop table clean-up: rename Crop_Year to Year, drop rows with no
# Production value, and keep only the four columns used downstream.
crop_df = (
    crop_df
    .rename(columns={"Crop_Year": "Year"})
    .dropna(subset=["Production"])
    .loc[:, ["State_Name", "Year", "Crop", "Production"]]
)
crop_df.head()


Unnamed: 0,State_Name,Year,Crop,Production
0,Andaman and Nicobar Islands,2000,Arecanut,2000.0
1,Andaman and Nicobar Islands,2000,Other Kharif pulses,1.0
2,Andaman and Nicobar Islands,2000,Rice,321.0
3,Andaman and Nicobar Islands,2000,Banana,641.0
4,Andaman and Nicobar Islands,2000,Cashewnut,165.0


In [4]:
# Reduce the temperature table to one row per year with a single
# Avg_Temperature value: the row-wise mean of the monthly columns.
#
# - The label slice "JAN":"DEC" is inclusive on both ends and relies on the
#   twelve month columns being contiguous in the raw file.
# - mean() skips missing values by default, so a year with missing months is
#   averaged over the months that are present.
# - Built as a non-mutating chain: calling rename(..., inplace=True) on a
#   frame produced by column selection triggers SettingWithCopyWarning on
#   pandas versions without copy-on-write.
# - temp_df is rebound to the reduced frame, so re-running this cell requires
#   re-running the load cell first (the month columns are gone afterwards).
temp_df = (
    temp_df
    .assign(Avg_Temperature=temp_df.loc[:, "JAN":"DEC"].mean(axis=1))
    .loc[:, ["YEAR", "Avg_Temperature"]]
    .rename(columns={"YEAR": "Year"})
)
temp_df.head()


Unnamed: 0,Year,Avg_Temperature
0,1901,28.995833
1,1902,29.218333
2,1903,28.628333
3,1904,28.4875
4,1905,28.528333


In [5]:
# Reduce the rainfall table to (State, Year, Annual_Rainfall), where
# Annual_Rainfall is the row-wise sum of the monthly columns.
#
# - The label slice "JAN":"DEC" is inclusive on both ends and relies on the
#   twelve month columns being contiguous in the raw file.
# - sum() skips missing values by default, so a row with missing months gets
#   a partial total rather than NaN.
#   NOTE(review): confirm this is acceptable; the raw file also has an ANNUAL
#   column that could be used instead.
# - NOTE(review): SUBDIVISION values are upper-case and use "&" (e.g.
#   "ANDAMAN & NICOBAR ISLANDS"), unlike the crop data's State_Name values;
#   a later join on State would presumably need normalised names.
# - Built as a non-mutating chain: calling rename(..., inplace=True) on a
#   frame produced by column selection triggers SettingWithCopyWarning on
#   pandas versions without copy-on-write.
# - rain_df is rebound to the reduced frame, so re-running this cell requires
#   re-running the load cell first.
rain_df = (
    rain_df
    .assign(Annual_Rainfall=rain_df.loc[:, "JAN":"DEC"].sum(axis=1))
    .loc[:, ["SUBDIVISION", "YEAR", "Annual_Rainfall"]]
    .rename(columns={"SUBDIVISION": "State", "YEAR": "Year"})
)
rain_df.head()


Unnamed: 0,State,Year,Annual_Rainfall
0,ANDAMAN & NICOBAR ISLANDS,1901,3373.2
1,ANDAMAN & NICOBAR ISLANDS,1902,3520.7
2,ANDAMAN & NICOBAR ISLANDS,1903,2957.4
3,ANDAMAN & NICOBAR ISLANDS,1904,3079.6
4,ANDAMAN & NICOBAR ISLANDS,1905,2566.7


In [6]:
# Total production per (state, year): sum Production over every crop row in
# each group, keep the group keys as ordinary columns, then rename the
# columns to the State / Total_Production names used for the output table.
crop_state_year = (
    crop_df
    .groupby(["State_Name", "Year"], as_index=False)["Production"]
    .sum()
    .rename(columns={"State_Name": "State", "Production": "Total_Production"})
)

crop_state_year.head()


Unnamed: 0,State,Year,Total_Production
0,Andaman and Nicobar Islands,2000,89060914.0
1,Andaman and Nicobar Islands,2001,89718700.0
2,Andaman and Nicobar Islands,2002,94387137.67
3,Andaman and Nicobar Islands,2003,95296454.67
4,Andaman and Nicobar Islands,2004,87186497.63


In [None]:
from pathlib import Path

# Write the three cleaned tables to ../data/processed as CSV files.
# The directory (and any missing parents) is created first, and it is not
# an error if it already exists.
processed_dir = Path("../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)

# index=False keeps the positional row index out of the files.
# Fixed: the first call was written as `to/;_csv`, a syntax error that
# prevented this cell from running at all.
crop_state_year.to_csv(processed_dir / "crop_clean.csv", index=False)
temp_df.to_csv(processed_dir / "temperature_clean.csv", index=False)
rain_df.to_csv(processed_dir / "rainfall_clean.csv", index=False)
