# Hybrid Model for Crop Prediction
This notebook demonstrates how to explore a weather dataset, preprocess it, and build a hybrid (stacked) model that predicts crop types from weather conditions.

In [None]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

from hybridmodels.classifier import HybridClassifier

In [None]:
# Location of the input CSV -- replace with your own file path
dataset_path = "path_to_your_csv_file.csv"
df = pd.read_csv(dataset_path)

# Quick look at the data: leading rows, column dtypes / non-null counts,
# and descriptive statistics.
print("First 5 rows of the dataset:", df.head(), sep="\n")

print("\nDataset Information:")
df.info()  # writes its report straight to stdout

print("\nSummary statistics:", df.describe(), sep="\n")

In [None]:
# Count the missing entries in every column
null_mask = df.isnull()
missing_values = null_mask.sum()
print("\nMissing Values:\n", missing_values)

# Draw the null mask as a heatmap so gaps are visible per row and column
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(null_mask, cbar=False, cmap="viridis", ax=ax)
ax.set_title("Missing Values Heatmap")
plt.show()

In [None]:
# Data visualization
# Class balance: number of rows per crop type
plt.figure(figsize=(10, 6))
sns.countplot(x='Crop Type', data=df)
plt.title("Distribution of Crop Types")
plt.show()

# Correlation heatmap
# Correlate numeric columns only. Since pandas 2.0, DataFrame.corr() no longer
# drops non-numeric columns silently and raises if any column (e.g. a text
# 'Crop Type' label) cannot be converted to float.
plt.figure(figsize=(10, 8))
corr_matrix = df.select_dtypes(include="number").corr()
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
plt.title("Feature Correlation Heatmap")
plt.show()

# Pairplot
# Pairwise scatter plots of the variables, coloured by crop type,
# with KDE curves on the diagonal
sns.pairplot(df, hue='Crop Type', diag_kind='kde')
plt.show()

In [None]:
# Data preprocessing
feature_columns = ['Temperature (°C)', 'Humidity (%)', 'Rainfall (mm)', 'Wind Speed (m/s)']

# The notebook inspects missing values above but never handles them. Rows that
# lack a feature or the target cannot be used for fitting (e.g. scikit-learn's
# LogisticRegression rejects NaN inputs), so drop them here and report the count.
model_df = df.dropna(subset=feature_columns + ['Crop Type'])
dropped_rows = len(df) - len(model_df)
if dropped_rows:
    print(f"Dropped {dropped_rows} rows with missing feature/target values.")

X = model_df[feature_columns]
y = model_df['Crop Type']

# Encode target variable (class names -> integer codes 0..n_classes-1)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Train-test split
# Split BEFORE scaling: fitting the scaler on the full dataset would leak the
# test rows' mean/variance into training and make the evaluation optimistic.
# The row partition depends only on the sample count and random_state.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# Scale features: learn the statistics on the training rows only, then apply
# the same transform to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so any later cell that still refers to X_scaled keeps working; it uses
# the training-fitted scaler and is not used for model fitting here.
X_scaled = scaler.transform(X)

In [None]:
# Configure the hybrid model: several base models whose predictions are
# combined by a meta model
base_learners = ["RandomForest", "LogisticRegression", "XGBClassifier"]
model = HybridClassifier(models=base_learners, meta_model="LogisticRegression")

# Fit on the training split, then predict the held-out split
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Evaluate the model
# Pin the full set of encoded labels explicitly. Without `labels=`, scikit-learn
# infers the classes from y_test/y_pred, so a class that happens to be missing
# from the test split makes `target_names` (and the heatmap tick labels) have a
# different length than the reported classes.
class_labels = np.arange(len(label_encoder.classes_))
class_names = [str(name) for name in label_encoder.classes_]  # report expects strings

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, labels=class_labels, target_names=class_names)
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", report)

# Confusion matrix visualization (rows = actual class, columns = predicted class)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Feature importance for RandomForest (if applicable)
# NOTE(review): HybridClassifier was constructed with a *list* of model names.
# If `model.models` is still that list, `"RandomForest" in model.models` is True
# but `model.models["RandomForest"]` raises TypeError. Only proceed when
# `model.models` is a mapping that actually holds an estimator exposing
# `feature_importances_` -- confirm against the HybridClassifier API.
fitted_models = getattr(model, "models", None)
rf_model = fitted_models.get("RandomForest") if isinstance(fitted_models, dict) else None

if rf_model is not None and hasattr(rf_model, "feature_importances_"):
    feature_importances = rf_model.feature_importances_
    features = X.columns
    importance_df = (
        pd.DataFrame({"Feature": features, "Importance": feature_importances})
        .sort_values(by="Importance", ascending=False)
    )

    # Plot feature importance, most important feature first
    plt.figure(figsize=(10, 6))
    sns.barplot(x="Importance", y="Feature", data=importance_df, palette="viridis")
    plt.title("Feature Importance")
    plt.show()
else:
    print("RandomForest feature importances are not available from this model; skipping plot.")

In [None]:
# Save results to CSV
# Map the integer class codes back to the original crop names so the file is
# human-readable.
results_df = pd.DataFrame({
    "Actual": label_encoder.inverse_transform(y_test),
    "Predicted": label_encoder.inverse_transform(y_pred),
})
results_df.to_csv("prediction_results.csv", index=False)
print("Prediction results saved to 'prediction_results.csv'.")

# Save preprocessing objects for deployment
# (`joblib` is imported in the notebook's top import cell.)
joblib.dump(scaler, "scaler.pkl")
joblib.dump(label_encoder, "label_encoder.pkl")
print("Scaler and Label Encoder saved for future use.")