Run Code

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder
import datetime

# Read the crop price records from the project folder
data = pd.read_csv('/content/drive/MyDrive/Andhra_Crop_Project/crops_data.csv')

# Parse the 'Mon-YY' strings (format '%b-%y') into timestamps
data['Date'] = pd.to_datetime(data['Date'], format='%b-%y')

# Integer-encode the two categorical columns; the fitted encoders are kept
# at module level because predict_future_prices() reuses them
le_location, le_crop = LabelEncoder(), LabelEncoder()
data['Location'] = le_location.fit_transform(data['Location'])
data['Crop'] = le_crop.fit_transform(data['Crop'])

# Model inputs and the price target
feature_columns = ['Location', 'Crop', 'Rainfall', 'Temperature', 'Humidity']
X, y = data[feature_columns], data['Crop_Price']

# 80/20 split, stratified on the (Location, Crop) pair so every pair is
# represented proportionally on both sides of the split
strata = data[['Location', 'Crop']]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=strata
)

# Untuned base estimators handed to the grid searches below
random_forest = RandomForestRegressor(random_state=42)
adaboost = AdaBoostRegressor(random_state=42)

# Candidate hyperparameters: 3*3*3*3 = 81 combinations for the forest ...
param_grid_rf = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# ... and 3*3 = 9 combinations for AdaBoost
param_grid_ab = dict(
    n_estimators=[50, 100, 150],
    learning_rate=[0.01, 0.1, 1.0],
)

# Both searches share the same 5-fold CV settings and optimise negative MSE
search_options = dict(cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')

grid_search_rf = GridSearchCV(estimator=random_forest, param_grid=param_grid_rf, **search_options)
grid_search_rf.fit(X_train, y_train)

grid_search_ab = GridSearchCV(estimator=adaboost, param_grid=param_grid_ab, **search_options)
grid_search_ab.fit(X_train, y_train)

# Evaluate models
def evaluate_model(model, X_test, y_test):
    """Score a fitted regressor on held-out data.

    Parameters
    ----------
    model : object exposing ``predict(X)``.
    X_test : features passed straight to ``model.predict``.
    y_test : true target values aligned with ``X_test``.

    Returns
    -------
    tuple
        ``(r2, mse, mae, rmse)`` where ``rmse`` is the square root of ``mse``.
    """
    predictions = model.predict(X_test)
    squared_error = mean_squared_error(y_test, predictions)
    return (
        r2_score(y_test, predictions),
        squared_error,
        mean_absolute_error(y_test, predictions),
        np.sqrt(squared_error),
    )

# Take the best estimator found by each grid search
best_rf, best_ab = grid_search_rf.best_estimator_, grid_search_ab.best_estimator_

# Score both tuned models on the held-out split
r2_rf, mse_rf, mae_rf, rmse_rf = evaluate_model(best_rf, X_test, y_test)
r2_ab, mse_ab, mae_ab, rmse_ab = evaluate_model(best_ab, X_test, y_test)

# Report the metrics, Random Forest first
print(f"Random Forest Results: R²: {r2_rf}, MSE: {mse_rf}, MAE: {mae_rf}, RMSE: {rmse_rf}")
print(f"AdaBoost Results: R²: {r2_ab}, MSE: {mse_ab}, MAE: {mae_ab}, RMSE: {rmse_ab}")

# Predict prices for the next 12 months starting from December 2024
def predict_future_prices(model, location, crop, rainfall, temperature, humidity, start_date='2024-12', months=12):
    """Predict a crop price and label it for each of the coming months.

    The feature vector holds only location, crop and weather values, so it
    does not depend on the date: the model is queried once and the same
    price is attached to every month label.

    Parameters
    ----------
    model : fitted regressor exposing ``predict``.
    location, crop : labels known to the module-level ``le_location`` and
        ``le_crop`` encoders.
    rainfall, temperature, humidity : numeric feature values.
    start_date : str or datetime-like
        The first label is the calendar month containing this date.
    months : int
        Number of consecutive months to label.

    Returns
    -------
    list of (str, float)
        ``('Mon-YY', predicted_price)`` pairs, one per month.

    Raises
    ------
    ValueError
        If ``location`` or ``crop`` was not seen when the encoders were fitted.
    """
    # Convert inputs into the integer codes the model was trained on
    location_encoded = le_location.transform([location])[0]
    crop_encoded = le_crop.transform([crop])[0]

    # Monthly periods keep the 'M' alias valid; as a date_range offset alias
    # it is deprecated and emits a FutureWarning on recent pandas
    future_periods = pd.period_range(start=start_date, periods=months, freq='M')
    if len(future_periods) == 0:
        return []

    features = [[location_encoded, crop_encoded, rainfall, temperature, humidity]]
    if hasattr(model, 'feature_names_in_'):
        # The model was fitted on a DataFrame: pass named columns so sklearn
        # can validate them instead of warning about missing feature names
        X_future = pd.DataFrame(
            features,
            columns=['Location', 'Crop', 'Rainfall', 'Temperature', 'Humidity'],
        )
    else:
        X_future = np.array(features)

    # Loop-invariant: one prediction serves every month
    price_pred = model.predict(X_future)[0]

    return [(period.strftime('%b-%y'), price_pred) for period in future_periods]

# Example usage: location and crop must be labels the encoders were fitted on.
# The placeholder names 'Location_A' / 'Crop_A' are not in the dataset and make
# LabelEncoder.transform raise "y contains previously unseen labels".
predicted_prices = predict_future_prices(best_rf, location='Nellore', crop='Maize', rainfall=100, temperature=25, humidity=80)
print("Predicted Prices for the next 12 months:", predicted_prices)


Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Random Forest Results: R²: 0.8619784538188393, MSE: 180.968425864049, MAE: 8.71433260421392, RMSE: 13.452450552373312
AdaBoost Results: R²: 0.8061201249380563, MSE: 254.2076709575909, MAE: 12.35662150955358, RMSE: 15.943891336734294


ValueError: y contains previously unseen labels: 'Location_A'

Identifying Best Params


In [None]:
# Show the winning hyperparameter combination from each grid search,
# Random Forest first, then AdaBoost
for heading, finished_search in (
    ("Best parameters for Random Forest:", grid_search_rf),
    ("Best parameters for AdaBoost:", grid_search_ab),
):
    print(heading)
    print(finished_search.best_params_)


Best parameters for Random Forest:
{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Best parameters for AdaBoost:
{'learning_rate': 1.0, 'n_estimators': 50}


Final Code

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder
import datetime
import joblib

# Read the crop price records from the project folder
data = pd.read_csv('/content/drive/MyDrive/Andhra_Crop_Project/crops_data.csv')

# Parse the 'Mon-YY' strings (format '%b-%y') into timestamps
data['Date'] = pd.to_datetime(data['Date'], format='%b-%y')

# Integer-encode the categorical columns, keeping one fitted encoder per column
le_location, le_crop = LabelEncoder(), LabelEncoder()
data['Location'] = le_location.fit_transform(data['Location'])
data['Crop'] = le_crop.fit_transform(data['Crop'])

# Model inputs and the price target
feature_columns = ['Location', 'Crop', 'Rainfall', 'Temperature', 'Humidity']
X, y = data[feature_columns], data['Crop_Price']

# 80/20 split, stratified on the (Location, Crop) pair so every pair is
# represented proportionally on both sides of the split
strata = data[['Location', 'Crop']]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=strata
)

# Random Forest configured with the parameters reported by the grid search
rf_params = dict(
    n_estimators=300,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=4,
    random_state=42,
)
random_forest = RandomForestRegressor(**rf_params)

# AdaBoost configured with the parameters reported by the grid search
ab_params = dict(n_estimators=50, learning_rate=1.0, random_state=42)
adaboost = AdaBoostRegressor(**ab_params)

# Fit both regressors on the training split, forest first
for estimator in (random_forest, adaboost):
    estimator.fit(X_train, y_train)

# Evaluate models
def evaluate_model(model, X_test, y_test):
    """Compute regression metrics for ``model`` on a test split.

    Parameters
    ----------
    model : object exposing ``predict(X)``.
    X_test : features passed straight to ``model.predict``.
    y_test : true target values aligned with ``X_test``.

    Returns
    -------
    tuple
        ``(r2, mse, mae, rmse)``; ``rmse`` is derived as ``sqrt(mse)``.
    """
    y_hat = model.predict(X_test)
    metrics = {
        'r2': r2_score(y_test, y_hat),
        'mse': mean_squared_error(y_test, y_hat),
        'mae': mean_absolute_error(y_test, y_hat),
    }
    metrics['rmse'] = np.sqrt(metrics['mse'])
    return metrics['r2'], metrics['mse'], metrics['mae'], metrics['rmse']

# Score both fitted models on the held-out split
r2_rf, mse_rf, mae_rf, rmse_rf = evaluate_model(random_forest, X_test, y_test)
r2_ab, mse_ab, mae_ab, rmse_ab = evaluate_model(adaboost, X_test, y_test)

# Report the metrics, Random Forest first
print(f"Random Forest Results: R²: {r2_rf}, MSE: {mse_rf}, MAE: {mae_rf}, RMSE: {rmse_rf}")
print(f"AdaBoost Results: R²: {r2_ab}, MSE: {mse_ab}, MAE: {mae_ab}, RMSE: {rmse_ab}")

# Model persistence is currently disabled in this cell; uncomment the lines
# below to write both models to disk with joblib
# joblib.dump(random_forest, 'best_random_forest_model.pkl')
# joblib.dump(adaboost, 'best_adaboost_model.pkl')
# print("Models saved successfully.")

Random Forest Results: R²: 0.8619784538188393, MSE: 180.968425864049, MAE: 8.71433260421392, RMSE: 13.452450552373312
AdaBoost Results: R²: 0.8061201249380563, MSE: 254.2076709575909, MAE: 12.35662150955358, RMSE: 15.943891336734294
Models saved successfully.
Predicted Prices for the next 12 months: [('Dec-24', 66.99830375936558), ('Jan-25', 66.99830375936558), ('Feb-25', 66.99830375936558), ('Mar-25', 66.99830375936558), ('Apr-25', 66.99830375936558), ('May-25', 66.99830375936558), ('Jun-25', 66.99830375936558), ('Jul-25', 66.99830375936558), ('Aug-25', 66.99830375936558), ('Sep-25', 66.99830375936558), ('Oct-25', 66.99830375936558), ('Nov-25', 66.99830375936558)]


  future_dates = pd.date_range(start=start_date, periods=months, freq='M')


Optimization and Fine Tuning


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder
import joblib

# Read the crop price records from the project folder
data = pd.read_csv('/content/drive/MyDrive/Andhra_Crop_Project/crops_data.csv')

# Parse the 'Mon-YY' strings (format '%b-%y') into timestamps
data['Date'] = pd.to_datetime(data['Date'], format='%b-%y')

# Expose the calendar month and year as numeric features
parsed_dates = data['Date'].dt
data['Month'] = parsed_dates.month
data['Year'] = parsed_dates.year

# Integer-encode the categorical columns; the fitted encoders stay at module
# level because predict_future_prices() reuses them
le_location, le_crop = LabelEncoder(), LabelEncoder()
data['Location'] = le_location.fit_transform(data['Location'])
data['Crop'] = le_crop.fit_transform(data['Crop'])

# Model inputs (weather, encoded categories, Month, Year) and the price target
feature_columns = ['Location', 'Crop', 'Rainfall', 'Temperature', 'Humidity', 'Month', 'Year']
X, y = data[feature_columns], data['Crop_Price']

# 80/20 split, stratified on the (Location, Crop) pair so every pair is
# represented proportionally on both sides of the split
strata = data[['Location', 'Crop']]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=strata
)

# Random Forest configured with the parameters reported by the grid search
rf_params = dict(
    n_estimators=300,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=4,
    random_state=42,
)
random_forest = RandomForestRegressor(**rf_params)

# AdaBoost configured with the parameters reported by the grid search
ab_params = dict(n_estimators=50, learning_rate=1.0, random_state=42)
adaboost = AdaBoostRegressor(**ab_params)

# Fit both regressors on the training split, forest first
for estimator in (random_forest, adaboost):
    estimator.fit(X_train, y_train)

# Evaluate models
def evaluate_model(model, X_test, y_test):
    """Return R², MSE, MAE and RMSE of ``model`` on the given test split.

    Parameters
    ----------
    model : object exposing ``predict(X)``.
    X_test : features passed straight to ``model.predict``.
    y_test : true target values aligned with ``X_test``.

    Returns
    -------
    tuple
        ``(r2, mse, mae, rmse)``, with ``rmse`` computed as ``sqrt(mse)``.
    """
    forecast = model.predict(X_test)
    # Apply the three sklearn metrics in the order they are returned
    r2, mse, mae = (
        metric(y_test, forecast)
        for metric in (r2_score, mean_squared_error, mean_absolute_error)
    )
    return r2, mse, mae, np.sqrt(mse)

# Score both fitted models on the held-out split
r2_rf, mse_rf, mae_rf, rmse_rf = evaluate_model(random_forest, X_test, y_test)
r2_ab, mse_ab, mae_ab, rmse_ab = evaluate_model(adaboost, X_test, y_test)

# Report the metrics rounded to two decimals, Random Forest first
print(f"Random Forest Results: R²: {r2_rf:.2f}, MSE: {mse_rf:.2f}, MAE: {mae_rf:.2f}, RMSE: {rmse_rf:.2f}")
print(f"AdaBoost Results: R²: {r2_ab:.2f}, MSE: {mse_ab:.2f}, MAE: {mae_ab:.2f}, RMSE: {rmse_ab:.2f}")

# Persist both fitted models to the working directory
for fitted_model, target_file in (
    (random_forest, 'random_forest_model.joblib'),
    (adaboost, 'adaboost_model.joblib'),
):
    joblib.dump(fitted_model, target_file)

print("Models saved successfully.")

# Read the models back from disk (example of reloading them for inference)
loaded_random_forest = joblib.load('random_forest_model.joblib')
loaded_adaboost = joblib.load('adaboost_model.joblib')

# Predict prices for the next 12 months
def predict_future_prices(model, location, crop, rainfall, temperature, humidity, start_date='2024-12', months=12):
    """Forecast monthly crop prices for one location/crop pair.

    The weather inputs are held constant over the whole horizon; only the
    ``Month`` and ``Year`` features change from one row to the next.

    Parameters
    ----------
    model : fitted regressor exposing ``predict``; it receives a DataFrame
        with columns Location, Crop, Rainfall, Temperature, Humidity, Month
        and Year, in that order.
    location, crop : labels known to the module-level ``le_location`` and
        ``le_crop`` encoders.
    rainfall, temperature, humidity : numeric feature values.
    start_date : str or datetime-like
        The first forecast month is the calendar month containing this date.
    months : int
        Number of consecutive months to forecast.

    Returns
    -------
    list of (str, str)
        ``('Mon-YY', price)`` pairs; the price is a string formatted to two
        decimal places.

    Raises
    ------
    ValueError
        If ``location`` or ``crop`` was not seen when the encoders were fitted.
    """
    # Convert inputs into the integer codes the model was trained on
    location_encoded = le_location.transform([location])[0]
    crop_encoded = le_crop.transform([crop])[0]

    # Monthly periods keep the 'M' alias valid; as a date_range offset alias
    # it is deprecated and emits a FutureWarning on recent pandas
    future_periods = pd.period_range(start=start_date, periods=months, freq='M')
    if len(future_periods) == 0:
        return []

    # One row per forecast month; scalar inputs are broadcast down the rows
    X_future = pd.DataFrame({
        'Location': location_encoded,
        'Crop': crop_encoded,
        'Rainfall': rainfall,
        'Temperature': temperature,
        'Humidity': humidity,
        'Month': future_periods.month,
        'Year': future_periods.year,
    })

    # Single batched call instead of one predict per month
    price_preds = model.predict(X_future)

    return [
        (period.strftime('%b-%y'), f"{price:.2f}")
        for period, price in zip(future_periods, price_preds)
    ]

# Example: 12-month forecast for Maize in Nellore using the reloaded forest
example_inputs = dict(location='Nellore', crop='Maize', rainfall=980, temperature=25, humidity=71)
predicted_prices = predict_future_prices(loaded_random_forest, **example_inputs)
print("Predicted Prices for the next 12 months:", predicted_prices)


Random Forest Results: R²: 0.97, MSE: 36.87, MAE: 4.25, RMSE: 6.07
AdaBoost Results: R²: 0.91, MSE: 120.30, MAE: 8.64, RMSE: 10.97
Models saved successfully.
Predicted Prices for the next 12 months: [('Dec-24', '58.72'), ('Jan-25', '58.79'), ('Feb-25', '57.85'), ('Mar-25', '56.37'), ('Apr-25', '56.29'), ('May-25', '56.05'), ('Jun-25', '56.28'), ('Jul-25', '56.32'), ('Aug-25', '57.36'), ('Sep-25', '59.70'), ('Oct-25', '60.06'), ('Nov-25', '59.11')]


  future_dates = pd.date_range(start=start_date, periods=months, freq='M')


**Best Model Random Forest**

In [None]:
pip install joblib



In [None]:
import joblib

# Persist the tuned estimators (best_rf / best_ab come from the grid-search
# cell) to the working directory, Random Forest first
for tuned_model, target_file in (
    (best_rf, 'best_random_forest_model.pkl'),
    (best_ab, 'best_adaboost_model.pkl'),
):
    joblib.dump(tuned_model, target_file)

print("Models saved successfully.")


Models saved successfully.
