In [None]:
import os

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


# Output folder for the serialized pipeline. It is created up front, and
# exist_ok=True makes re-running the script a no-op when it already exists.
models_dir = 'models'
os.makedirs(name=models_dir, exist_ok=True)

print("Step 1: Loading data...")

# Single source of truth for the dataset location, so the error message
# below always names the file that was actually requested.
data_path = 'crop_prediction.xlsx'

try:
    # Only the read is guarded; the preview prints live in `else` so an
    # unrelated failure there is not mistaken for a missing file.
    df = pd.read_excel(data_path)
except FileNotFoundError:
    print(f"Error: '{data_path}' not found.")
    print("Please ensure the Excel data file is in the directory this script is run from.")
    # `exit()` is a helper injected by the `site` module for interactive use;
    # raising SystemExit works everywhere, and the non-zero status tells the
    # caller that loading failed.
    raise SystemExit(1)
else:
    print("Data loaded successfully!")
    print("First 5 rows of data:")
    print(df.head())

# Feature matrix: the six input columns, kept in this explicit order.
X = df[[
    'Location',
    'Soil Type',
    'Rainfall (mm)',
    'Temperature (°C)',
    'Humidity (%)',
    'Season',
]]
# Target vector: the crop label to predict.
y = df['Crop']

print("\nStep 2: Defining preprocessing steps...")

# Columns are routed by name: text-valued ones are one-hot encoded and the
# numeric measurements are standardized.
categorical_cols = ['Location', 'Soil Type', 'Season']
numerical_cols = ['Rainfall (mm)', 'Temperature (°C)', 'Humidity (%)']

# handle_unknown='ignore' keeps the encoder from raising when a category
# that was absent during fitting shows up at prediction time.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
])

print("\nStep 3: Creating a machine learning pipeline...")

# Preprocessing and the classifier are chained into one estimator so the
# same transformations are applied at fit time and at predict time.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])
print("Pipeline created successfully.")

print("\nStep 4: Splitting data into training and testing sets...")

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
print(f"Training data shape: {X_train.shape}, Test data shape: {X_test.shape}")

print("\nStep 5: Training the model...")
# Fits the preprocessing steps and the classifier on the training rows only.
model_pipeline.fit(X_train, y=y_train)
print("Model training complete!")

# Evaluate on the held-out rows. `accuracy_score` is imported with the other
# dependencies at the top of the file rather than mid-script.
y_pred = model_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy on Test Set: {accuracy:.4f}")

print("\nStep 6: Saving the trained model pipeline...")

# The whole pipeline (preprocessing + classifier) is written as a single
# artifact, so a consumer can call predict() on raw feature columns.
output_model_path = os.path.join(models_dir, 'crop_prediction_model.pkl')
joblib.dump(model_pipeline, filename=output_model_path)
print(f"Trained crop prediction model saved successfully to '{output_model_path}'")
print("\nYou can now use this 'crop_prediction_model.pkl' in your Flask application.")

Step 1: Loading data...
Data loaded successfully!
First 5 rows of data:
  Location   Soil Type  Rainfall (mm)  Temperature (°C)  Humidity (%)  Season  \
0    Dewas  Black Soil          950.5              28.3          72.5  Kharif   
1   Indore       Loamy          800.2              30.1          65.0  Kharif   
2   Ujjain        Clay         1100.8              27.5          80.3  Kharif   
3   Bhopal       Sandy          700.0              32.4          60.2    Rabi   
4     Pune    Red Soil          650.3              26.7          68.9  Kharif   

        Crop  
0    Soybean  
1      Maize  
2       Rice  
3      Wheat  
4  Sugarcane  

Step 2: Defining preprocessing steps...

Step 3: Creating a machine learning pipeline...
Pipeline created successfully.

Step 4: Splitting data into training and testing sets...
Training data shape: (577, 6), Test data shape: (145, 6)

Step 5: Training the model...
Model training complete!

Model Accuracy on Test Set: 0.8621

Step 6: Saving the tra