### Importing Dataset

In [6]:
from google.colab import files
files.upload()

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle (1).json to kaggle (1) (1).json
cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory


In [7]:
import kagglehub
import pandas as pd
import os

path = kagglehub.dataset_download("rajeev86/soil-climate-data")
dd = pd.read_csv(os.path.join(path, 'Soil-Climate-data.csv'))

df.head()

Unnamed: 0,Crop_Type,Soil_Type,Farm_Size_Acres,Irrigation_Available,Soil_pH,Soil_Nitrogen,Soil_Organic_Matter,Temperature,Rainfall,Humidity,Compatible
0,summer paddy,Red and Yellow soils,79.838232,0,5.17829,198.870486,1.45855,20.172143,1861.635725,57.924332,0
1,Kulthi,Alluvial soils,33.932796,1,4.862699,27.78168,2.530317,33.646919,1400.435779,53.26191,0
2,Arhar,Laterite soils,59.673206,1,7.691357,62.500094,4.656399,22.535805,574.308028,65.547263,0
3,Gram,Alluvial soils,50.000261,0,6.482151,134.655093,2.268048,25.672081,1900.397115,34.972994,0
4,summer paddy,Red and Yellow soils,94.628058,0,6.054078,69.894889,1.448071,14.366488,1568.615247,25.953544,0


### Data Exploration and Preprocessing

In [8]:
df.columns

Index(['Crop_Type', 'Soil_Type', 'Farm_Size_Acres', 'Irrigation_Available',
       'Soil_pH', 'Soil_Nitrogen', 'Soil_Organic_Matter', 'Temperature',
       'Rainfall', 'Humidity', 'Compatible'],
      dtype='object')

In [9]:
df['Compatible'].value_counts()

Unnamed: 0_level_0,count
Compatible,Unnamed: 1_level_1
0,9292
1,708


In [10]:
df.shape

(10000, 11)

In [11]:
df.drop(['Crop_Type','Soil_Type'], axis=1).corr()

Unnamed: 0,Farm_Size_Acres,Irrigation_Available,Soil_pH,Soil_Nitrogen,Soil_Organic_Matter,Temperature,Rainfall,Humidity,Compatible
Farm_Size_Acres,1.0,-0.017461,-0.006919,-0.005662,-0.008523,0.011524,0.004341,-0.005598,-0.019075
Irrigation_Available,-0.017461,1.0,-0.00335,0.004874,-0.001021,-0.006237,-0.011689,0.002251,0.278195
Soil_pH,-0.006919,-0.00335,1.0,0.007736,0.010532,-0.003583,0.003339,-0.01588,-0.003025
Soil_Nitrogen,-0.005662,0.004874,0.007736,1.0,-0.005385,-0.003351,-0.016172,-0.011887,0.097972
Soil_Organic_Matter,-0.008523,-0.001021,0.010532,-0.005385,1.0,-0.003674,0.009687,0.010878,-3.2e-05
Temperature,0.011524,-0.006237,-0.003583,-0.003351,-0.003674,1.0,-0.015698,0.017433,-0.005748
Rainfall,0.004341,-0.011689,0.003339,-0.016172,0.009687,-0.015698,1.0,-0.014239,-0.058224
Humidity,-0.005598,0.002251,-0.01588,-0.011887,0.010878,0.017433,-0.014239,1.0,-0.001183
Compatible,-0.019075,0.278195,-0.003025,0.097972,-3.2e-05,-0.005748,-0.058224,-0.001183,1.0


In [12]:
from sklearn.model_selection import train_test_split

X = df.drop('Compatible', axis=1)
y = df['Compatible']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pandas as pd

def fit_preprocessor(X_train):
    # Initialize the encoders
    ohe = OneHotEncoder(sparse_output=False, drop='first')
    scaler = StandardScaler()

    # Fit the one-hot encoder on categorical column
    ohe.fit(X_train[['Crop_Type', 'Soil_Type']])

    # Fit the scaler on numerical columns
    num_cols = X_train.select_dtypes(include=['number']).columns
    scaler.fit(X_train[num_cols])

    return ohe, scaler  # Return both fitted transformers

def transform_preprocessor(X, ohe, scaler):
    # One-hot encoding for categorical column
    X_encoded = ohe.transform(X[['Crop_Type', 'Soil_Type']])
    X_encoded = pd.DataFrame(X_encoded, columns=ohe.get_feature_names_out(['Crop_Type', 'Soil_Type']))

    # Standardization for numerical columns
    num_cols = X.select_dtypes(include=['number']).columns
    X_scaled = scaler.transform(X[num_cols])
    X_scaled = pd.DataFrame(X_scaled, columns=num_cols)

    # Combine transformed features
    X_transformed = pd.concat([X_scaled.reset_index(drop=True), X_encoded.reset_index(drop=True)], axis=1)
    return X_transformed

In [14]:
ohe, scaler = fit_preprocessor(X_train)

In [15]:
from sklearn.preprocessing import FunctionTransformer

preprocessor = FunctionTransformer(transform_preprocessor, kw_args={'ohe': ohe, 'scaler':scaler})

In [16]:
X_train_encoded = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [17]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_encoded, y_train)

In [18]:
from imblearn.under_sampling import RandomUnderSampler

# Increased sampling_strategy to a value above 1
undersample = RandomUnderSampler(sampling_strategy=1, random_state=42)  # Majority class reduced
X_train_resampled, y_train_resampled = undersample.fit_resample(X_train_resampled, y_train_resampled)

# Check the new distribution
print("Training set class distribution after sampling:")
print(pd.Series(y_train_resampled).value_counts())

Training set class distribution after sampling:
Compatible
0    7447
1    7447
Name: count, dtype: int64


### Training and Evaluation

In [20]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42, n_estimators=200, min_samples_split=10,
                                         min_samples_leaf=4, max_features='log2', max_depth=30, bootstrap=False, n_jobs=-1,
                                         class_weight='balanced')

model.fit(X_train_resampled, y_train_resampled)

In [21]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print results
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R² Score: {r2:.4f}")

cv_scores = cross_val_score(model, X_train_resampled, y_train_resampled, cv=5, scoring='r2')
print("Cross-Validation R² Scores:", cv_scores)
print(f"Mean CV R² Score: {cv_scores.mean():.2f}")

Mean Absolute Error (MAE): 0.0065
Mean Squared Error (MSE): 0.0065
Root Mean Squared Error (RMSE): 0.0806
R² Score: 0.9091
Cross-Validation R² Scores: [0.99060087 0.9919436  0.9919436  0.99328634 0.99328408]
Mean CV R² Score: 0.99


### Saving the model and testing manually

In [22]:
import cloudpickle
# Save model
with open('preprocessor_SC.pkl', 'wb') as preprocessor_file:
    cloudpickle.dump(preprocessor, preprocessor_file)
with open('model_SC.pkl', 'wb') as model_file:
    cloudpickle.dump(model, model_file)
print("Model Saved!")

Model Saved!


In [24]:
import pandas as pd
import cloudpickle
manual_input = pd.DataFrame([['Soybean', 'Alluvial soils', 3, 1, 6, 190, 5, 20, 1000, 24]],
                           columns=X_train.columns)

In [33]:
def predict_compatibility(new_data):

    with open('preprocessor_SC.pkl', 'rb') as prep_file:
        preprocessor = cloudpickle.load(prep_file)
    with open('model_SC.pkl', 'rb') as model_file:
        model_fit = cloudpickle.load(model_file)

    # Transform input features
    new_data_transformed = preprocessor.transform(new_data)

    # Predict Market Demand (returns scaled values)
    predicted_compatibility = model_fit.predict(new_data_transformed)

    return int(predicted_compatibility[0])

In [34]:
predict_compatibility(manual_input)

1