### Importing the Dataset

In [6]:
from google.colab import files
files.upload()

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle (1).json to kaggle (1) (1).json
cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory


In [7]:
import kagglehub
import pandas as pd
import os

path = kagglehub.dataset_download("rajeev86/yield-data")
yield_df = pd.read_csv(os.path.join(path, 'Yield-data.csv'))

yield_df.head()

Unnamed: 0,Year,Month,Crop,Region,Temperature,Rainfall,Humidity,Soil_pH,Soil_Nitrogen,Soil_Phosphorus,Soil_Potassium,Fertilizer_Use,Pesticide_Use,Previous_Year_Yield,Sowing_To_Harvest_Days,Predicted_Yield
0,2021,7,Arhar,Raipur Division,21.505078,176.154756,72.445577,6.322492,44.537201,52.550188,154.114109,241.875971,0.941697,1744.11021,160,2517.801925
1,2018,8,Niger,Balod Division,33.825449,79.320037,72.706677,6.46017,46.678445,48.860212,128.354348,91.621999,2.673295,2962.370166,106,1266.410174
2,2025,3,Kulthi,Surguja Division,33.294622,92.265445,83.901317,7.78945,49.872309,23.452611,165.424454,112.23872,6.304583,5431.411536,141,4055.301658
3,2022,6,Niger,Surguja Division,15.229613,137.951252,64.961169,7.906748,16.147365,50.594482,190.483506,203.998456,5.299106,5262.210032,107,1708.575732
4,2019,1,Mustard,Raipur Division,24.002331,240.418969,37.446442,6.655318,44.518562,36.673924,176.818389,223.3528,0.689873,2544.202082,152,2940.90993


### Data Exploration

In [8]:
yield_df.columns

Index(['Year', 'Month', 'Crop', 'Region', 'Temperature', 'Rainfall',
       'Humidity', 'Soil_pH', 'Soil_Nitrogen', 'Soil_Phosphorus',
       'Soil_Potassium', 'Fertilizer_Use', 'Pesticide_Use',
       'Previous_Year_Yield', 'Sowing_To_Harvest_Days', 'Predicted_Yield'],
      dtype='object')

In [9]:
min(yield_df['Predicted_Yield'])

897.2600370277964

In [10]:
yield_df.shape

(1000, 16)

### Preprocessing

In [11]:
yield_df.drop(['Crop', 'Region'], axis=1).corr()

Unnamed: 0,Year,Month,Temperature,Rainfall,Humidity,Soil_pH,Soil_Nitrogen,Soil_Phosphorus,Soil_Potassium,Fertilizer_Use,Pesticide_Use,Previous_Year_Yield,Sowing_To_Harvest_Days,Predicted_Yield
Year,1.0,-0.04562,-0.004184,-0.014567,0.010178,0.021731,-0.003533,-0.060149,0.029526,0.023128,-0.031174,0.013266,0.038321,0.036483
Month,-0.04562,1.0,0.004746,-0.01403,0.017137,-0.054315,-0.007757,-0.06079,0.015935,-0.03768,-0.021785,-0.029244,0.019099,0.003299
Temperature,-0.004184,0.004746,1.0,0.019431,0.014258,-0.020377,0.016601,0.006191,-0.033142,-0.097762,-0.019644,0.036209,0.046339,0.034611
Rainfall,-0.014567,-0.01403,0.019431,1.0,-0.037919,-0.017653,0.010509,-0.004957,-0.038493,-0.018857,-0.050052,0.01958,0.034902,0.070348
Humidity,0.010178,0.017137,0.014258,-0.037919,1.0,-0.019501,-0.001251,-0.027415,-0.005947,-0.023234,-0.026644,-0.028292,0.003739,-0.010551
Soil_pH,0.021731,-0.054315,-0.020377,-0.017653,-0.019501,1.0,-0.015294,0.017954,0.033732,-0.00526,-0.031462,0.063661,-0.020836,0.002417
Soil_Nitrogen,-0.003533,-0.007757,0.016601,0.010509,-0.001251,-0.015294,1.0,-0.053562,-0.052567,0.010696,0.044192,0.02133,-0.016348,0.011306
Soil_Phosphorus,-0.060149,-0.06079,0.006191,-0.004957,-0.027415,0.017954,-0.053562,1.0,0.094624,-0.014931,0.018027,-0.038329,-0.073206,-0.00411
Soil_Potassium,0.029526,0.015935,-0.033142,-0.038493,-0.005947,0.033732,-0.052567,0.094624,1.0,0.005783,0.004902,0.025589,-0.051261,-0.001758
Fertilizer_Use,0.023128,-0.03768,-0.097762,-0.018857,-0.023234,-0.00526,0.010696,-0.014931,0.005783,1.0,-0.035732,-0.028864,-0.007951,0.035479


In [12]:
from sklearn.model_selection import train_test_split

X = yield_df.drop(['Predicted_Yield'], axis=1)
y = yield_df['Predicted_Yield']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler

def fit_preprocessing(X_train):
    """Fits OneHotEncoder and StandardScaler on training data."""

    # Using sin-cos transformation for 'Month' (makes transitions smooth)
    X_train = X_train.copy()
    X_train['Month_sin'] = np.sin(2 * np.pi * X_train['Month'] / 12)
    X_train['Month_cos'] = np.cos(2 * np.pi * X_train['Month'] / 12)
    X_train.drop(columns=['Month'], inplace=True)

    # Converting 'Year' to relative years since 2015
    X_train['Year'] = X_train['Year'] - 2015

    # Identify categorical & numerical columns
    cat_columns = ['Crop', 'Region']
    num_columns = X_train.drop(columns=cat_columns).columns

    # Initialize encoders
    ohe = OneHotEncoder(sparse_output=False, drop='first')
    scaler = StandardScaler()

    # Fit encoders on training data
    ohe.fit(X_train[cat_columns])
    scaler.fit(X_train[num_columns])

    return ohe, scaler  # Return fitted encoders

def transform_preprocessing(X, ohe, scaler):
    """Applies fitted OneHotEncoder and StandardScaler to new data."""

    X = X.copy()

    # Apply same transformations as training set
    X['Month_sin'] = np.sin(2 * np.pi * X['Month'] / 12)
    X['Month_cos'] = np.cos(2 * np.pi * X['Month'] / 12)
    X.drop(columns=['Month'], inplace=True)

    X['Year'] = X['Year'] - 2015

    cat_columns = ['Crop', 'Region']
    num_columns = X.drop(columns=cat_columns).columns

    # Transform using fitted encoders
    cat_data = ohe.transform(X[cat_columns])
    num_data = scaler.transform(X[num_columns])

    # Convert categorical data to DataFrame
    cat_data = pd.DataFrame(cat_data, columns=ohe.get_feature_names_out(cat_columns))
    num_data = pd.DataFrame(num_data, columns=num_columns)

    # Reset index to avoid mismatches during concatenation
    X_transformed = pd.concat([num_data.reset_index(drop=True), cat_data.reset_index(drop=True)], axis=1)

    return X_transformed

In [14]:
ohe, scaler = fit_preprocessing(X_train)

In [15]:
from sklearn.preprocessing import FunctionTransformer
preprocessor = FunctionTransformer(transform_preprocessing, kw_args={'ohe': ohe, 'scaler':scaler})

In [16]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

### Model Training and Evaluation

In [17]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=0.15)
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)

# Evaluate model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"R² Score = {r2:.4f}, Mean Absolute Error = {mae:.4f}, Mean Squared Error: {mse:.4f}")

R² Score = 0.9495, Mean Absolute Error = 184.2401, Mean Squared Error: 52361.8216


### Saving the model and testing

In [18]:
import cloudpickle

with open('preprocessor_YR.pkl', 'wb') as preprocessor_file:
    cloudpickle.dump(preprocessor, preprocessor_file)
with open('model_YR.pkl', 'wb') as model_file:
    cloudpickle.dump(ridge, model_file)
print("Model Saved!")

Model Saved!


In [19]:
import pandas as pd
import cloudpickle

# Define the column names matching your input data
columns = ['Year', 'Month', 'Crop', 'Region', 'Temperature', 'Rainfall',
           'Humidity', 'Soil_pH', 'Soil_Nitrogen', 'Soil_Phosphorus',
           'Soil_Potassium', 'Fertilizer_Use', 'Pesticide_Use',
           'Previous_Year_Yield', 'Sowing_To_Harvest_Days']

# Create the DataFrame with the correct column names
manual_input = pd.DataFrame([[2025, 11, "Maize", "Surguja Division", 31.360535,
                              83.101153, 60.120492, 6.789534, 44.640928,
                              31.812222, 222.420815, 158.644019, 9.709772,
                              4931.249679, 99]], columns=columns)

In [22]:
def predict_yield(new_data):

    with open('preprocessor_YR.pkl', 'rb') as prep_file:
        preprocessor = cloudpickle.load(prep_file)
    with open('model_YR.pkl', 'rb') as model_file:
        model_fit = cloudpickle.load(model_file)

    # Transform input features
    new_data_transformed = preprocessor.transform(new_data)

    # Predict Market Demand (returns scaled values)
    predicted_yield = model_fit.predict(new_data_transformed)

    return float(predicted_yield[0])

In [23]:
predict_yield(manual_input)

2803.3243173281144