In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [12]:
# Train a random-forest baseline that predicts `recommended_crop` from the
# remaining soil-health-card columns, then report hold-out accuracy and a few
# sample predictions.

# Tunable settings for this cell, gathered in one place.
DATA_PATH = '/kaggle/input/soil-health-card/SoilHealthCardFinalDataset.csv'
TARGET_COL = 'recommended_crop'
DROPPED_COLS = ['id', 'local_crop_name']
SEED = 48
TEST_FRACTION = 0.1
N_PREVIEW = 5

# Load dataset
data = pd.read_csv(DATA_PATH)

# Drop the columns that are not used as features
data = data.drop(columns=DROPPED_COLS)

# Separate features and target
X = data.drop(columns=TARGET_COL)
y = data[TARGET_COL]

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SEED
)

# The split frames are derived from `X`; take explicit copies so the column
# assignments below modify independent frames (and do not trigger pandas'
# SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

# Encode categorical features. The encoder is fitted on the training split
# only; categories that appear only in the test split are mapped to -1.
categorical_cols = X.select_dtypes(include=['object']).columns
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_train[categorical_cols] = encoder.fit_transform(X_train[categorical_cols])
X_test[categorical_cols] = encoder.transform(X_test[categorical_cols])

# Train Random Forest model
model = RandomForestClassifier(random_state=SEED)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# NOTE(review): the recorded run of this cell printed an accuracy of 0.08;
# treat this model as a weak baseline until the features are revisited.

# Print results
print(f"Overall Accuracy: {accuracy:.2f}\n")
print("Example Predictions:")
# Slicing (rather than indexing 0..4) keeps this safe when the test split has
# fewer than N_PREVIEW rows.
for actual, predicted in zip(y_test.iloc[:N_PREVIEW], y_pred[:N_PREVIEW]):
    print(f"Actual: {actual}, Predicted: {predicted}")

Overall Accuracy: 0.08

Example Predictions:
Actual: Turmeric, Predicted: Ginger
Actual: Fenugreek, Predicted: Coriander
Actual: Ginger, Predicted: Turmeric
Actual: Cashew, Predicted: Betel Nut
Actual: Cluster Bean, Predicted: Mustard
