In [1]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Make sure the output folders written to by the plotting and model-saving
# steps exist; exist_ok=True makes re-running this cell harmless.
for output_dir in ('visualizations', 'models'):
    os.makedirs(output_dir, exist_ok=True)

def analyze_and_preprocess(df, target_column):
    """Print a short summary of ``df`` and derive the feature lists for modelling.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input data. It is not modified; a cleaned copy is returned.
    target_column : str
        Name of the column to predict. Rows where it is missing are dropped.

    Returns
    -------
    tuple
        ``(df, numeric_features, categorical_features)`` where ``df`` is the
        cleaned copy (boolean columns cast to int64), ``numeric_features`` is
        the list of numeric column names and ``categorical_features`` the list
        of object/category column names. The target never appears in either
        list.

    Raises
    ------
    KeyError
        If ``target_column`` is not a column of ``df``.
    """
    print("\n==== DATA ANALYSIS ====")
    print("Initial data shape:", df.shape)
    print("\nData types:")
    print(df.dtypes)

    # Rows without a target are useless for supervised training.  The explicit
    # copy detaches the result from the caller's frame so the column assignment
    # below is unambiguous and never emits a chained-assignment warning.
    df = df.dropna(subset=[target_column]).copy()

    # Cast booleans to a fixed-width integer.  Plain `int` can resolve to a
    # 32-bit dtype on some platforms, so the width is spelled out here.
    bool_cols = df.select_dtypes(include='bool').columns
    if len(bool_cols) > 0:
        df[bool_cols] = df[bool_cols].astype('int64')

    # 'number' matches every numeric width (int32, float32, ...), not only the
    # 64-bit ones.  The target is filtered out instead of list.remove()'d so a
    # target of any dtype cannot raise ValueError.
    numeric_features = [
        col for col in df.select_dtypes(include='number').columns
        if col != target_column
    ]
    categorical_features = [
        col for col in df.select_dtypes(include=['object', 'category']).columns
        if col != target_column
    ]

    print("\nNumeric features:", numeric_features)
    print("Categorical features:", categorical_features)

    return df, numeric_features, categorical_features

In [3]:
def create_visualizations(df, numeric_features, categorical_features, target_column):
    """Write exploratory plots for ``df`` into the 'visualizations' folder.

    Produces a correlation heatmap and a pairplot over the numeric features
    plus the target, a histogram of the target, and one boxplot of the target
    per categorical feature. Every figure is saved as a PNG and then closed.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    numeric_features : list of str
        Numeric column names (target excluded).
    categorical_features : list of str
        Categorical column names.
    target_column : str
        Name of the target column.
    """
    def _finish(path):
        # Common tail of every plot: tighten layout, write the file, close
        # the current figure.
        plt.tight_layout()
        plt.savefig(path)
        plt.close()

    print("\n==== CREATING VISUALIZATIONS ====")

    numeric_with_target = numeric_features + [target_column]

    # Correlation heatmap over numeric features and the target
    plt.figure(figsize=(12, 8))
    correlations = df[numeric_with_target].corr()
    sns.heatmap(correlations, annot=True, cmap='coolwarm')
    plt.title('Correlation Heatmap')
    _finish('visualizations/correlation_heatmap.png')

    # Histogram (with KDE overlay) of the target
    plt.figure(figsize=(8, 6))
    sns.histplot(df[target_column], kde=True)
    plt.title(f'Distribution of {target_column}')
    _finish('visualizations/target_distribution.png')

    # Pairwise scatter/KDE grid; pairplot creates its own figure
    if len(numeric_features) > 0:
        sns.pairplot(df[numeric_with_target], diag_kind='kde')
        _finish('visualizations/numeric_pairplot.png')

    # One target-vs-category boxplot per categorical column
    for cat_col in categorical_features:
        plt.figure(figsize=(10, 6))
        sns.boxplot(x=cat_col, y=target_column, data=df)
        plt.title(f'{target_column} by {cat_col}')
        plt.xticks(rotation=45)
        _finish(f'visualizations/{cat_col}_boxplot.png')

In [4]:
def build_and_train_models(X, y, numeric_features, categorical_features):
    """Fit four baseline regressors on a 70/30 split and pick the lowest test MSE.

    Each regressor is wrapped in a pipeline that imputes and scales the numeric
    columns and imputes and one-hot encodes the categorical columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns.
    y : array-like
        Target values aligned with ``X``.
    numeric_features : list of str
        Columns routed through median imputation and standard scaling.
    categorical_features : list of str
        Columns routed through most-frequent imputation and one-hot encoding.

    Returns
    -------
    tuple
        ``(models, results, best_model_name)``. ``models`` maps each display
        name to the estimator instance that was placed in its pipeline;
        ``results`` maps each name to ``{'model': pipeline, 'mse': ..., 'r2': ...}``
        measured on the held-out 30%; ``best_model_name`` is the name with the
        smallest test MSE.
    """
    print("\n==== MODEL TRAINING ====")

    def _make_preprocessor():
        # Build a new, unfitted ColumnTransformer on every call.  A Pipeline
        # fits the step objects it is handed, so sharing one transformer across
        # pipelines would leave every stored pipeline pointing at the same
        # instance, re-fitted by whichever model was trained last.
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())])

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))])

        return ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)])

    # Candidate models.  Every stochastic estimator is seeded so the comparison
    # (and therefore the chosen "best" model) is reproducible between runs.
    models = {
        'Linear Regression': LinearRegression(),
        'SGD Regression': SGDRegressor(max_iter=1000, tol=1e-3, random_state=42),
        'Decision Tree': DecisionTreeRegressor(random_state=42),
        'Random Forest': RandomForestRegressor(random_state=42)
    }

    results = {}
    best_score = float('inf')
    best_model_name = None

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    for name, model in models.items():
        # Each model gets its own preprocessing instance (see _make_preprocessor)
        pipeline = Pipeline(steps=[
            ('preprocessor', _make_preprocessor()),
            ('regressor', model)])

        # Train model
        pipeline.fit(X_train, y_train)

        # Evaluate on the held-out portion
        y_pred = pipeline.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        results[name] = {
            'model': pipeline,
            'mse': mse,
            'r2': r2
        }

        print(f"{name} - MSE: {mse:.4f}, R2: {r2:.4f}")

        # Track best model (lower MSE wins)
        if mse < best_score:
            best_score = mse
            best_model_name = name

    print(f"\nBest model: {best_model_name} with MSE: {best_score:.4f}")
    return models, results, best_model_name

In [5]:
def visualize_linear_regression(models, X, y, numeric_features, target_column):
    """Plot a one-variable linear fit of the target against the first numeric feature.

    Nothing happens unless ``'Linear Regression'`` is a key of ``models``. The
    plot is written to 'visualizations/linear_regression_plot.png'.

    Parameters
    ----------
    models : dict
        Only checked for the presence of the key ``'Linear Regression'``.
    X : pandas.DataFrame
        Feature columns; only ``numeric_features[0]`` is read.
    y : array-like
        Target values aligned with ``X``.
    numeric_features : list of str
        Numeric column names; the first one is plotted.
    target_column : str
        Used for the axis label and title.
    """
    if 'Linear Regression' not in models:
        return

    print("\n==== LINEAR REGRESSION VISUALIZATION ====")

    # Nothing to draw without at least one numeric feature
    if len(numeric_features) == 0:
        return

    feature = numeric_features[0]

    # Work on plain float arrays and keep only rows where both values are
    # finite: a bare LinearRegression has no imputer in front of it and would
    # raise on NaN, unlike the full modelling pipelines.
    x_values = np.asarray(X[feature], dtype=float)
    y_values = np.asarray(y, dtype=float).ravel()
    usable = np.isfinite(x_values) & np.isfinite(y_values)
    if not usable.any():
        print(f"No complete rows for '{feature}'; skipping linear regression plot.")
        return
    x_values = x_values[usable]
    y_values = y_values[usable]

    # Train simple model on the single feature (sklearn expects a 2-D X)
    model = LinearRegression()
    model.fit(x_values.reshape(-1, 1), y_values)

    # Evaluate the fitted line on an evenly spaced grid across the feature range
    x_range = np.linspace(x_values.min(), x_values.max(), 100)
    y_pred = model.predict(x_range.reshape(-1, 1))

    # Plot
    plt.figure(figsize=(10, 6))
    plt.scatter(x_values, y_values, alpha=0.5, label='Actual Data')
    plt.plot(x_range, y_pred, color='red', linewidth=2, label='Regression Line')
    plt.xlabel(feature)
    plt.ylabel(target_column)
    plt.title(f'Simple Linear Regression: {target_column} vs {feature}')
    plt.legend()
    plt.tight_layout()
    plt.savefig('visualizations/linear_regression_plot.png')
    plt.close()

In [13]:
def tune_best_model(best_model_name, X, y, numeric_features, categorical_features):
    """Grid-search hyperparameters for the named model with 5-fold cross-validation.

    The search is fitted on the ``X`` / ``y`` passed in, and the returned
    metrics are computed on that same data, i.e. they are in-sample (training)
    scores. Pass only the training portion and score a held-out set separately
    if an unbiased estimate is needed.

    Parameters
    ----------
    best_model_name : str
        One of 'Linear Regression', 'SGD Regression', 'Decision Tree',
        'Random Forest'.
    X : pandas.DataFrame
        Feature columns to fit on.
    y : array-like
        Target values aligned with ``X``.
    numeric_features : list of str
        Columns routed through median imputation and standard scaling.
    categorical_features : list of str
        Columns routed through most-frequent imputation and one-hot encoding.

    Returns
    -------
    tuple
        ``(best_model, mse, r2)``: the refitted best pipeline and its MSE / R2
        on the data it was fitted on.

    Raises
    ------
    KeyError
        If ``best_model_name`` is not one of the supported names.
    """
    print(f"\n==== TUNING BEST MODEL: {best_model_name} ====")

    # Create preprocessing pipeline
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    # Parameter grids keyed by model name.  The 'regressor__' prefix addresses
    # the pipeline step of that name.  Linear Regression has nothing to tune,
    # so its grid is empty and the search degenerates to plain cross-validation.
    param_grids = {
        'Linear Regression': {},
        'SGD Regression': {
            'regressor__alpha': [0.0001, 0.001, 0.01],
            'regressor__penalty': ['l2', 'l1', 'elasticnet'],
            'regressor__learning_rate': ['constant', 'optimal', 'invscaling']
        },
        'Decision Tree': {
            'regressor__max_depth': [None, 5, 10, 20],
            'regressor__min_samples_split': [2, 5, 10]
        },
        'Random Forest': {
            'regressor__n_estimators': [50, 100, 200],
            'regressor__max_depth': [None, 10, 20],
            'regressor__min_samples_split': [2, 5]
        }
    }

    # Base estimators keyed by model name.  All stochastic estimators are
    # seeded so repeated searches select the same parameters.
    model_map = {
        'Linear Regression': LinearRegression(),
        'SGD Regression': SGDRegressor(max_iter=1000, tol=1e-3, random_state=42),
        'Decision Tree': DecisionTreeRegressor(random_state=42),
        'Random Forest': RandomForestRegressor(random_state=42)
    }

    # Fail with an actionable message instead of a bare KeyError on lookup
    if best_model_name not in model_map:
        raise KeyError(
            f"Unknown model name {best_model_name!r}; expected one of {sorted(model_map)}")

    # Create pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model_map[best_model_name])])

    # Perform grid search (sklearn maximises scores, hence the negated MSE)
    grid_search = GridSearchCV(
        pipeline,
        param_grid=param_grids[best_model_name],
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1)

    grid_search.fit(X, y)

    # Best pipeline, refitted on all of X / y by GridSearchCV
    best_model = grid_search.best_estimator_

    # In-sample evaluation: predictions are made on the data used for fitting
    y_pred = best_model.predict(X)
    mse = mean_squared_error(y, y_pred)
    r2 = r2_score(y, y_pred)

    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Training MSE: {mse:.4f}")
    print(f"Training R2: {r2:.4f}")

    return best_model, mse, r2

In [14]:
def feature_importance(model, numeric_features, categorical_features):
    """Save a horizontal bar chart of per-feature coefficients or importances.

    ``model`` is expected to expose ``named_steps`` with a 'preprocessor' and a
    'regressor' entry; objects without ``named_steps`` are ignored. The chart
    is written to 'visualizations/feature_importance.png'.

    Parameters
    ----------
    model : object
        Fitted pipeline-like object.
    numeric_features : list of str
        Numeric column names, used as the first bar labels.
    categorical_features : list of str
        Categorical column names; expanded to their one-hot output names.
    """
    print("\n==== FEATURE IMPORTANCE ====")

    # Only pipeline-like objects are handled
    if not hasattr(model, 'named_steps'):
        return

    column_transformer = model.named_steps['preprocessor']

    # Expand categorical columns to the names produced by the one-hot encoder
    if len(categorical_features) > 0:
        encoder = column_transformer.named_transformers_['cat'].named_steps['onehot']
        encoded_names = encoder.get_feature_names_out(categorical_features)
    else:
        encoded_names = []

    labels = numeric_features + list(encoded_names)

    # Linear models expose coef_, tree-based models feature_importances_
    regressor = model.named_steps['regressor']
    if hasattr(regressor, 'coef_'):
        values = regressor.coef_
        chart_title = 'Feature Coefficients'
    elif hasattr(regressor, 'feature_importances_'):
        values = regressor.feature_importances_
        chart_title = 'Feature Importances'
    else:
        print("Feature importance not available for this model type.")
        return

    plt.barh(labels, values)
    plt.title(chart_title)
    plt.xlabel('Importance')
    plt.tight_layout()
    plt.savefig('visualizations/feature_importance.png')
    plt.close()

In [15]:
def save_best_model(model, numeric_features, categorical_features, target_column,
                    path='models/best_model.pkl'):
    """Persist the model together with the column metadata needed to reuse it.

    Parameters
    ----------
    model : object
        Fitted estimator or pipeline to store.
    numeric_features : list of str
        Numeric column names the model was trained with.
    categorical_features : list of str
        Categorical column names the model was trained with.
    target_column : str
        Name of the predicted column.
    path : str, optional
        Destination file. Defaults to 'models/best_model.pkl'.
    """
    print("\n==== SAVING BEST MODEL ====")

    # Bundle the model with its feature lists so a loader knows which columns
    # to supply at prediction time.
    model_data = {
        'model': model,
        'numeric_features': numeric_features,
        'categorical_features': categorical_features,
        'target_column': target_column
    }

    # Create the destination folder here instead of relying on an earlier
    # notebook cell having been run.
    target_dir = os.path.dirname(path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    joblib.dump(model_data, path)
    print(f"Model saved to '{path}'")

In [16]:
if __name__ == "__main__":
    print("==== CROP YIELD PREDICTION MODEL ====")

    # Load data directly from CSV
    print("Loading data from dataset file")
    df = pd.read_csv('african_crop_yield.csv')

    # Set target column
    target_column = 'Yield_tons_per_hectare'

    # Analyze and preprocess data
    df, numeric_features, categorical_features = analyze_and_preprocess(df, target_column)

    # Create visualizations
    create_visualizations(df, numeric_features, categorical_features, target_column)

    # Prepare features and target
    X = df.drop(target_column, axis=1)
    y = df[target_column]

    # Build and train models
    models, results, best_model_name = build_and_train_models(X, y, numeric_features, categorical_features)

    # Single hold-out split, made once.  It uses the same test_size and
    # random_state as build_and_train_models, so the partition is identical.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Visualize linear regression results
    visualize_linear_regression(models, X, y, numeric_features, target_column)

    # Tune the best model on the training portion only, so the test rows stay
    # unseen.  The metrics returned here are computed on the data passed in,
    # i.e. they are training scores.
    best_model, train_mse, train_r2 = tune_best_model(
        best_model_name, X_train, y_train, numeric_features, categorical_features)

    # Score the tuned model on the held-out rows; these are the test metrics
    y_test_pred = best_model.predict(X_test)
    test_mse = mean_squared_error(y_test, y_test_pred)
    r2 = r2_score(y_test, y_test_pred)
    print(f"Held-out test MSE: {test_mse:.4f}")
    print(f"Held-out test R2: {r2:.4f}")

    # Make prediction on a single test data point
    if len(X_test) > 0:
        # Select a single test example (slice keeps it a one-row DataFrame)
        single_example = X_test.iloc[0:1]
        actual_value = y_test.iloc[0]

        # Make prediction using the best model
        predicted_value = best_model.predict(single_example)[0]

        print(f"\n==== PREDICTION ON SINGLE TEST EXAMPLE ====")
        print(f"Example features:")
        for col in single_example.columns:
            print(f"  {col}: {single_example[col].iloc[0]}")

        print(f"\nActual {target_column}: {actual_value}")
        print(f"Predicted {target_column}: {predicted_value:.2f}")
        print(f"Absolute error: {abs(actual_value - predicted_value):.2f}")

    # Analyze feature importance for the best model
    feature_importance(best_model, numeric_features, categorical_features)

    # Save the best model
    save_best_model(best_model, numeric_features, categorical_features, target_column)

    print("\n==== MODEL TRAINING COMPLETE ====")
    print(f"Best model: Tuned {best_model_name}")
    print(f"Test MSE: {test_mse:.2f}")
    print(f"R² Score: {r2:.2f}")
    print("\nAll visualizations saved in 'visualizations' folder")
    print("Model saved in 'models' folder")

==== CROP YIELD PREDICTION MODEL ====
Loading data from dataset file

==== DATA ANALYSIS ====
Initial data shape: (1000000, 10)

Data types:
Region                     object
Soil_Type                  object
Crop                       object
Rainfall_mm               float64
Temperature_Celsius       float64
Fertilizer_Used              bool
Irrigation_Used              bool
Weather_Condition          object
Days_to_Harvest             int64
Yield_tons_per_hectare    float64
dtype: object

Numeric features: ['Rainfall_mm', 'Temperature_Celsius', 'Fertilizer_Used', 'Irrigation_Used', 'Days_to_Harvest']
Categorical features: ['Region', 'Soil_Type', 'Crop', 'Weather_Condition']

==== CREATING VISUALIZATIONS ====

==== MODEL TRAINING ====
Linear Regression - MSE: 0.2508, R2: 0.9131
SGD Regression - MSE: 0.2510, R2: 0.9130
Decision Tree - MSE: 0.5320, R2: 0.8156
Random Forest - MSE: 0.2663, R2: 0.9077

Best model: Linear Regression with MSE: 0.2508

==== LINEAR REGRESSION VISUALIZATION ===