In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import joblib
import streamlit as st

# Load the dataset
df = pd.read_csv('merged_dataset.csv')

# Display first few rows
print(df.head())

# Check for missing values
print(df.isnull().sum())

# Column groups used throughout the notebook
numerical_cols = ['n', 'p', 'k', 'temperature', 'humidity', 'ph', 'rainfall', 'moisture', 'windspeed']
categorical_cols = ['soil_type', 'crop']

# Handle missing values (if any).
# The filled column is assigned back to the frame instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form acts on a
# temporary column object, is deprecated in pandas 2.x and has no effect
# once Copy-on-Write is enabled.
# NOTE(review): the fill statistics below are still computed over the full
# dataset (before the train/test split).

# For numerical columns, fill with mean
for col in numerical_cols:
    if df[col].isnull().sum() > 0:
        df[col] = df[col].fillna(df[col].mean())

# For categorical columns, fill with mode
for col in categorical_cols:
    if df[col].isnull().sum() > 0:
        df[col] = df[col].fillna(df[col].mode()[0])

# Encode Soil Type
le = LabelEncoder()
df['soil_type_encoded'] = le.fit_transform(df['soil_type'])

# Features and target (kept in original units; scaling happens after the split)
X = df[numerical_cols + ['soil_type_encoded']]
y = df['crop']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize numerical features.
# The scaler is fitted on the training rows only and then applied to the test
# rows, so no test-set statistics leak into the training features.
# The copies make the split frames independent of `df` before columns are
# overwritten.
scaler = StandardScaler()
X_train = X_train.copy()
X_test = X_test.copy()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Save preprocessor
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(le, 'label_encoder.pkl')

In [None]:
# EDA
# Columns shown in the distribution plots and the correlation matrix
feature_cols = numerical_cols + ['soil_type_encoded']

# Distribution plots: one histogram (with KDE overlay) per feature
for feature in feature_cols:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.histplot(df[feature], kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    plt.show()

# Correlation heatmap over the same feature columns
fig, ax = plt.subplots(figsize=(12, 10))
correlations = df[feature_cols].corr()
sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

# Comparison of features by crop: one boxplot per numerical feature
for feature in numerical_cols:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x='crop', y=feature, data=df, ax=ax)
    ax.set_title(f'{feature} by Crop')
    # Rotate the crop labels on the x axis
    ax.tick_params(axis='x', labelrotation=45)
    plt.show()

In [None]:
# Build and Train Models
# Candidate classifiers, keyed by the display name used in later cells.
# NOTE(review): recent XGBoost releases appear to require class labels encoded
# as integers 0..n_classes-1; if `y_train` holds crop names as strings the
# XGBoost fit may raise — confirm against the installed version.
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'SVM': SVC(random_state=42),
    'XGBoost': XGBClassifier(random_state=42)
}

# Fit every candidate on the training split and keep the fitted estimator
# under the same key.
trained_models = {}
for label in models:
    estimator = models[label]
    estimator.fit(X_train, y_train)
    trained_models[label] = estimator
    print(f'{label} trained.')

In [None]:
# Evaluate Models
# `results` maps each model name to its test-set metrics; precision, recall
# and F1 are weighted averages over the classes.
results = {}
scalar_metrics = ('Accuracy', 'Precision', 'Recall', 'F1-Score')

for model_name, fitted in trained_models.items():
    predictions = fitted.predict(X_test)

    scores = {
        'Accuracy': accuracy_score(y_test, predictions),
        'Precision': precision_score(y_test, predictions, average='weighted'),
        'Recall': recall_score(y_test, predictions, average='weighted'),
        'F1-Score': f1_score(y_test, predictions, average='weighted'),
        'Confusion Matrix': confusion_matrix(y_test, predictions)
    }
    results[model_name] = scores

    # Report: scalar metrics to four decimals, then the raw confusion matrix
    print(f'{model_name} Results:')
    for metric in scalar_metrics:
        print(f'{metric}: {scores[metric]:.4f}')
    print(f'Confusion Matrix:\n{scores["Confusion Matrix"]}')
    print('-' * 50)

In [None]:
# Select Best Model
# Pick the entry with the highest weighted F1-Score; on a tie the first
# entry in `results` wins.
best_model_name, best_metrics = max(
    results.items(),
    key=lambda entry: entry[1]['F1-Score'],
)
best_model = trained_models[best_model_name]
print(f'Best Model: {best_model_name} with F1-Score: {best_metrics["F1-Score"]:.4f}')

# Save best model
joblib.dump(best_model, 'best_crop_model.pkl')

In [None]:
# Prediction Function
def predict_crop(n, p, k, temperature, humidity, ph, rainfall, moisture, soil_type, windspeed):
    """Predict the crop for a single set of soil and weather measurements.

    The fitted scaler, soil-type label encoder and model are loaded from
    'scaler.pkl', 'label_encoder.pkl' and 'best_crop_model.pkl' on every call,
    so the function does not rely on any variable left in the notebook kernel.

    Parameters
    ----------
    n, p, k, temperature, humidity, ph, rainfall, moisture, windspeed
        Numerical measurements, in the units of the training data (they are
        standardized here with the saved scaler).
    soil_type
        Soil category, in the same form as the values the label encoder was
        fitted on.

    Returns
    -------
    The first (and only) element of the model's prediction for this sample.

    Raises
    ------
    ValueError
        If `soil_type` is not one of the categories known to the encoder.
    """
    # Load preprocessors and model
    scaler = joblib.load('scaler.pkl')
    le = joblib.load('label_encoder.pkl')
    model = joblib.load('best_crop_model.pkl')

    # Numerical inputs, listed in the same order as the columns the scaler
    # and the model were fitted on.
    numeric_values = {
        'n': n, 'p': p, 'k': k, 'temperature': temperature,
        'humidity': humidity, 'ph': ph, 'rainfall': rainfall,
        'moisture': moisture, 'windspeed': windspeed
    }
    numeric_columns = list(numeric_values)
    raw_input = pd.DataFrame([numeric_values], columns=numeric_columns)

    # Scale numerical features
    input_data = pd.DataFrame(scaler.transform(raw_input), columns=numeric_columns)

    # Encode soil type; re-raise with the valid categories so a bad value is
    # easy to diagnose.
    try:
        soil_encoded = le.transform([soil_type])[0]
    except ValueError as exc:
        raise ValueError(
            f'Unknown soil_type {soil_type!r}; expected one of {list(le.classes_)}'
        ) from exc
    # Appended last, matching the training feature order
    input_data['soil_type_encoded'] = soil_encoded

    # Predict
    prediction = model.predict(input_data)

    return prediction[0]

# Test the function on one sample observation.
# NOTE(review): soil_type is passed as the integer 2; this only works if the
# label encoder was fitted on integer soil types — confirm against the data.
sample_prediction = predict_crop(
    n=90, p=42, k=43,
    temperature=20.87, humidity=82.00, ph=6.50,
    rainfall=202.93, moisture=29.44,
    soil_type=2, windspeed=10.10,
)
print(sample_prediction)