In [1]:
# Importing the required modules
import numpy as np # type: ignore
import pandas as pd # type: ignore

In [2]:
# Load the crop-yield table; the relative path is resolved against the
# notebook's current working directory
dataset = pd.read_csv('crop_yield.csv')

In [3]:
# Preview the first rows to sanity-check that the file parsed as expected
dataset.head()

Unnamed: 0,Crop,Crop_Year,Season,State,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Yield
0,Arecanut,1997,Whole Year,Assam,73814.0,56708,2051.4,7024878.38,22882.34,0.796087
1,Arhar/Tur,1997,Kharif,Assam,6637.0,4685,2051.4,631643.29,2057.47,0.710435
2,Castor seed,1997,Kharif,Assam,796.0,22,2051.4,75755.32,246.76,0.238333
3,Coconut,1997,Whole Year,Assam,19656.0,126905000,2051.4,1870661.52,6093.36,5238.051739
4,Cotton(lint),1997,Kharif,Assam,1739.0,794,2051.4,165500.63,539.09,0.420909


In [4]:
# Inspect column dtypes and non-null counts to spot missing values
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19689 entries, 0 to 19688
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Crop             19689 non-null  object 
 1   Crop_Year        19689 non-null  int64  
 2   Season           19689 non-null  object 
 3   State            19689 non-null  object 
 4   Area             19689 non-null  float64
 5   Production       19689 non-null  int64  
 6   Annual_Rainfall  19689 non-null  float64
 7   Fertilizer       19689 non-null  float64
 8   Pesticide        19689 non-null  float64
 9   Yield            19689 non-null  float64
dtypes: float64(5), int64(2), object(3)
memory usage: 1.5+ MB


In [5]:
# Summary statistics of the numeric columns; comparing the max against the
# upper quartile shows how heavy the tails are before any capping
dataset.describe()

Unnamed: 0,Crop_Year,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Yield
count,19689.0,19689.0,19689.0,19689.0,19689.0,19689.0,19689.0
mean,2009.127584,179926.6,16435940.0,1437.755177,24103310.0,48848.35,79.954009
std,6.498099,732828.7,263056800.0,816.909589,94946000.0,213287.4,878.306193
min,1997.0,0.5,0.0,301.3,54.17,0.09,0.0
25%,2004.0,1390.0,1393.0,940.7,188014.6,356.7,0.6
50%,2010.0,9317.0,13804.0,1247.6,1234957.0,2421.9,1.03
75%,2015.0,75112.0,122718.0,1643.7,10003850.0,20041.7,2.388889
max,2020.0,50808100.0,6326000000.0,6552.7,4835407000.0,15750510.0,21105.0


In [6]:
# Function to cap outliers only for numeric columns
def cap_outliers(dataset, column, lower_q=0.01, upper_q=0.99):
    """Clip one numeric column of ``dataset`` to a pair of its own quantiles.

    Values below the ``lower_q`` quantile are raised to that quantile and
    values above the ``upper_q`` quantile are lowered to it. The frame is
    modified in place and nothing is returned.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame that holds the column; it is mutated.
    column : str
        Name of the column to cap.
    lower_q : float, default 0.01
        Quantile (between 0 and 1) used as the lower cap.
    upper_q : float, default 0.99
        Quantile (between 0 and 1) used as the upper cap.

    Raises
    ------
    ValueError
        If the quantiles are not ordered within the interval [0, 1].

    Notes
    -----
    Non-numeric columns are left untouched. Boolean columns are skipped as
    well: pandas reports bool as a numeric dtype, but capping True/False at
    a quantile is not meaningful.
    """
    if not 0 <= lower_q <= upper_q <= 1:
        raise ValueError(
            f"quantiles must satisfy 0 <= lower_q <= upper_q <= 1, "
            f"got lower_q={lower_q}, upper_q={upper_q}"
        )

    values = dataset[column]
    is_cappable = (
        pd.api.types.is_numeric_dtype(values)
        and not pd.api.types.is_bool_dtype(values)
    )
    if not is_cappable:
        return

    lower_bound = values.quantile(lower_q)
    upper_bound = values.quantile(upper_q)
    dataset[column] = values.clip(lower=lower_bound, upper=upper_bound)

# Run the capping over every column; cap_outliers itself ignores the
# non-numeric ones, and `dataset` is modified in place.
# NOTE(review): every numeric column is clipped, which includes Crop_Year
# (a calendar label) and the prediction target Yield - confirm that capping
# those two is intended.
for column in dataset.columns:
    cap_outliers(dataset, column)

In [7]:
# Re-check the summary statistics to see the effect of the capping
dataset.describe()

Unnamed: 0,Crop_Year,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Yield
count,19689.0,19689.0,19689.0,19689.0,19689.0,19689.0,19689.0
mean,2009.125705,159951.8,906777.4,1428.350138,21246460.0,42382.254291,5.10753
std,6.495098,479640.7,4381942.0,769.742365,64014370.0,127797.071394,14.591018
min,1997.0,3.0,2.0,345.6,400.2,0.72,0.081241
25%,2004.0,1390.0,1393.0,940.7,188014.6,356.7,0.6
50%,2010.0,9317.0,13804.0,1247.6,1234957.0,2421.9,1.03
75%,2015.0,75112.0,122718.0,1643.7,10003850.0,20041.7,2.388889
max,2019.0,3299226.0,37571120.0,4472.3,442727000.0,885873.2616,104.273387


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


# One-hot encode the three categorical columns; numeric columns pass through
df_encoded = pd.get_dummies(dataset, columns=['Crop', 'Season', 'State'])

# Target is Yield; the features are every other column except Crop_Year
y = df_encoded['Yield']
X = df_encoded.drop(['Yield', 'Crop_Year'], axis=1)
# NOTE(review): Production remains a feature although Yield looks like it is
# derived from production and area - confirm this is not target leakage.

# Hold out 20% of the rows for evaluation (fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest; fit() returns the fitted estimator itself
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out rows and report the error metrics
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(r2)

Mean Squared Error: 4.4879235051629065
0.9782026029175848


In [9]:
import joblib

# Persist the fitted model to disk with joblib so that it can be reloaded
# later without retraining
model_filename = 'random_forest_model.pkl'
joblib.dump(model, model_filename)

['random_forest_model.pkl']

In [None]:
# Load the trained model back from disk
model = joblib.load('random_forest_model.pkl')

# Feature names recorded on the fitted model, used below to lay out new inputs
# with the same columns in the same order.
# feature_names_in_ needs scikit-learn 1.0 or newer.
encoded_columns = model.feature_names_in_

# Function to preprocess the input data
def preprocess_input(crop, season, state, area, production, annual_rainfall, fertilizer, pesticide,
                     columns=None):
    """Build a one-row feature frame laid out like the model's training matrix.

    The raw values are one-hot encoded with ``pd.get_dummies`` and the result
    is aligned to ``columns``: columns the input did not produce are added
    with value 0, columns not listed are dropped, and the order follows
    ``columns``.

    Parameters
    ----------
    crop, season, state : str
        Categorical labels. They are matched literally against the one-hot
        column names (``Crop_<crop>``, ``Season_<season>``, ``State_<state>``).
    area, production, annual_rainfall, fertilizer, pesticide : float
        Numeric inputs, passed through unchanged.
    columns : sequence of str, optional
        Feature names to align to. Defaults to the notebook-level
        ``encoded_columns``.

    Returns
    -------
    pandas.DataFrame
        A single-row frame whose columns are exactly ``columns``.

    Warns
    -----
    UserWarning
        If a categorical label has no matching column in ``columns``. Its
        whole one-hot group is then all zeros, so the prediction would not
        reflect that label.
    """
    import warnings

    if columns is None:
        # Fall back to the feature list taken from the loaded model
        columns = encoded_columns
    columns = list(columns)

    # Single-row frame holding the raw inputs
    input_df = pd.DataFrame({
        'Crop': [crop],
        'Season': [season],
        'State': [state],
        'Area': [area],
        'Production': [production],
        'Annual_Rainfall': [annual_rainfall],
        'Fertilizer': [fertilizer],
        'Pesticide': [pesticide],
    })

    # Apply the same pd.get_dummies() transformation as in the original dataset
    input_df_encoded = pd.get_dummies(input_df, columns=['Crop', 'Season', 'State'])

    # A label with no matching column would otherwise vanish silently below
    known = set(columns)
    unseen = [
        name
        for name in (f'Crop_{crop}', f'Season_{season}', f'State_{state}')
        if name not in known
    ]
    if unseen:
        warnings.warn(
            f"No matching feature column for {unseen}; "
            "these inputs will be encoded as all zeros.",
            UserWarning,
            stacklevel=2,
        )

    # One step: add missing columns as 0, drop extras, and match the column order
    return input_df_encoded.reindex(columns=columns, fill_value=0)

# Input features
crop = input("Enter the crop type: ")
season = input("Enter the season: ")
state = input("Enter the state: ")
area = float(input("Enter the area (in hectares): "))
production = float(input("Enter the amount of production (in metric tonnes):"))
annual_rainfall = float(input("Enter the annual rainfall (in mm): "))
fertilizer = float(input("Enter the amount of fertilizer (in kg): "))
pesticide = float(input("Enter the amount of pesticide (in kg): "))

# Preprocess input data
input_data_encoded = preprocess_input(crop, season, state, area, production, annual_rainfall, fertilizer, pesticide)

# Predict the yield using the pre-trained model
predicted_yield = model.predict(input_data_encoded)

print(f"Predicted yield: {predicted_yield}")