<a href="https://colab.research.google.com/github/apoorva14-unique/IML-PA/blob/main/Bagging_Boosting_stacking_(for_all_6_%26_4).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Bagging wih all features

In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# Loading and preprocessing data
data = pd.read_csv('complete soil data.csv')

# Cleaning data
data = data[data['SOIL TYPE'].isin([1, 2, 3, 4, 5, 7, 8, 9, 11])]
# data = data.dropna() # Remove or move this line

# Outlier removal using IQR
def remove_outliers(df, column):
    # Ensure the column is numeric before calculating quantiles
    # Non-numeric values will be coerced to NaN
    df[column] = pd.to_numeric(df[column], errors='coerce')
    # Drop rows where the value in this column became NaN after coercion
    df = df.dropna(subset=[column])

    # Now calculate quantiles on numeric data
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df

numerical_cols = ['PH', 'EC', 'OC', 'N', 'P2O5', 'K20', 'S', 'CU', 'FC', 'MN', 'ZN', 'BA', 'Temparature', 'Humidity', 'Rainfall']

# Apply the outlier removal function, which now includes numeric conversion and dropna per column
# Note: This processes each column sequentially and might drop rows multiple times.
# An alternative is to convert all numerical columns first, then drop NaNs once, then remove outliers.
# Keeping this loop structure but improving the function's robustness within the loop.
for col in numerical_cols:
    data = remove_outliers(data, col)

# After processing numerical columns for outliers, drop any remaining rows with NaNs
# that might exist in other columns (like categorical ones, though unlikely here)
# or were not removed by the per-column dropna in remove_outliers (e.g., if a row had NaNs in multiple numerical columns)
# It's safer to drop NaNs from the entire set of features and the target before further processing.
features = ['MANDAL NAME', 'VILLAGE NAME', 'SOIL TYPE'] + numerical_cols # Re-define features to include categorical
# Ensure 'CROP' column is included in the subset check if it might contain NaNs before encoding
data = data.dropna(subset=features + ['CROP'])


# Encoding categorical variables
le_mandal = LabelEncoder()
le_village = LabelEncoder()
le_soil = LabelEncoder()
le_crop = LabelEncoder()
data['MANDAL NAME'] = le_mandal.fit_transform(data['MANDAL NAME'])
data['VILLAGE NAME'] = le_village.fit_transform(data['VILLAGE NAME'])
data['SOIL TYPE'] = le_soil.fit_transform(data['SOIL TYPE'])
# Encode 'CROP' after dropping NaNs
data['CROP'] = le_crop.fit_transform(data['CROP'])


# Defining features and target
X = data[features]
y = data['CROP']

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing numerical features
scaler = StandardScaler()
# Standardize only the numerical columns within X_train and X_test
# Use pandas column indexing to select these columns and assign the scaled NumPy array back.

# Note: scaler.fit_transform and scaler.transform return NumPy arrays.
# When assigning a NumPy array to a slice of a DataFrame, pandas aligns by index.
# This is safe here because the rows should correspond after the split.
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# --- Added code to handle minority classes before SMOTE ---
# Check class distribution in y_train
class_counts = y_train.value_counts()
# Identify classes with fewer than SMOTE's default k_neighbors + 1 samples (5+1=6)
# You might need to adjust this threshold depending on your data and SMOTE parameters
min_samples_for_smote = 6 # Default k_neighbors (5) + 1

# Get the labels of classes with insufficient samples
classes_to_remove = class_counts[class_counts < min_samples_for_smote].index

if not classes_to_remove.empty:
    print(f"Warning: Removing classes with fewer than {min_samples_for_smote} samples before SMOTE: {list(classes_to_remove)}")
    # Filter X_train and y_train to exclude samples from these classes
    train_indices_to_keep = y_train[~y_train.isin(classes_to_remove)].index
    X_train = X_train.loc[train_indices_to_keep]
    y_train = y_train.loc[train_indices_to_keep]
# --- End of added code ---

# Handling class imbalance with SMOTE
smote = SMOTE(random_state=42)
# Now apply SMOTE to the filtered training data
X_train, y_train = smote.fit_resample(X_train, y_train)


# Hyperparameter tuning for Random Forest
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5]
}
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model
best_rf = grid_search.best_estimator_

# Evaluating model
y_pred = best_rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
cv_scores = cross_val_score(best_rf, X_train, y_train, cv=5)

print(f"Bagging (Random Forest) with All Features")
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Test Accuracy: {accuracy * 100:.2f}%")
print(f"Cross-Validation Accuracy: {np.mean(cv_scores * 100):.2f}% ± {np.std(cv_scores * 100):.2f}%")

Bagging (Random Forest) with All Features
Best Parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100}
Test Accuracy: 38.71%
Cross-Validation Accuracy: 87.80% ± 3.66%


Bagging with 6 features

In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# Loading and preprocessing data
data = pd.read_csv('complete soil data.csv')

# Cleaning data
data = data[data['SOIL TYPE'].isin([1, 2, 3, 4, 5, 7, 8, 9, 11])]

# Outlier removal using IQR
def remove_outliers(df, column):
    # Assume the column is already numeric and contains no NaNs at this point
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Apply the filter directly to the DataFrame passed in
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)].copy() # Use .copy() to avoid SettingWithCopyWarning
    return df

numerical_cols = ['PH', 'EC', 'OC', 'N', 'P2O5', 'K20'] # Features used in this cell

# --- Added steps for explicit numeric conversion and initial NaN drop ---
# Explicitly convert the numerical columns to numeric, coercing errors to NaN
for col in numerical_cols:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Drop rows where any of the relevant numerical columns (or the target 'CROP') have NaN
# This step should happen *before* outlier removal
data = data.dropna(subset=numerical_cols + ['CROP'])
# --- End of added steps ---

# Apply the outlier removal function *after* cleaning NaNs
for col in numerical_cols:
    # The remove_outliers function now assumes input data is clean (no NaNs in the column)
    data = remove_outliers(data, col)


# Encoding categorical variables
le_crop = LabelEncoder()
# The 'CROP' column is already cleaned for NaNs by the dropna call above
data['CROP'] = le_crop.fit_transform(data['CROP'])

# Defining features and target
features = ['PH', 'EC', 'OC', 'N', 'P2O5', 'K20']
X = data[features]
y = data['CROP']

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing numerical features
scaler = StandardScaler()
# X_train and X_test should now be clean and contain only numeric values
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Handling class imbalance with SMOTE
smote = SMOTE(random_state=42)
# Check class distribution *before* SMOTE to identify minority classes with too few samples
class_counts = y_train.value_counts()
min_samples_for_smote = smote.k_neighbors + 1 # Default k_neighbors is 5, so minimum 6 samples

# Identify classes with fewer than the minimum required samples
classes_to_remove = class_counts[class_counts < min_samples_for_smote].index

if not classes_to_remove.empty:
    print(f"Warning: Removing classes with fewer than {min_samples_for_smote} samples before SMOTE: {list(classes_to_remove)}")
    # Filter X_train and y_train to exclude samples from these classes
    train_indices_to_keep = y_train[~y_train.isin(classes_to_remove)].index
    X_train = pd.DataFrame(X_train, index=y_train.index, columns=features).loc[train_indices_to_keep].values # Reconstruct DataFrame to use .loc
    y_train = y_train.loc[train_indices_to_keep]


# Now apply SMOTE to the filtered training data
# X_train is now a NumPy array after scaling. SMOTE expects array-like or sparse matrix.
# y_train is a Series.
# SMOTE's fit_resample returns NumPy arrays
X_train, y_train = smote.fit_resample(X_train, y_train)

# Hyperparameter tuning for Random Forest
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5]
}
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model
best_rf = grid_search.best_estimator_

# Evaluating model
y_pred = best_rf.predict(X_test) # X_test is NumPy array
accuracy = accuracy_score(y_test, y_pred)
cv_scores = cross_val_score(best_rf, X_train, y_train, cv=5) # X_train, y_train are NumPy array and Series

print(f"Bagging (Random Forest) with 6 Features")
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Test Accuracy: {accuracy * 100:.2f}%")
print(f"Cross-Validation Accuracy: {np.mean(cv_scores * 100):.2f}% ± {np.std(cv_scores * 100):.2f}%")

Bagging (Random Forest) with 6 Features
Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Test Accuracy: 15.15%
Cross-Validation Accuracy: 84.46% ± 2.15%


Bagging with 4 features

In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# Loading and preprocessing data
data = pd.read_csv('complete soil data.csv')

# Cleaning data
data = data[data['SOIL TYPE'].isin([1, 2, 3, 4, 5, 7, 8, 9, 11])]
# data = data.dropna() # <--- Remove this line or move it

# Outlier removal
def remove_outliers(df, column):
    # Assume the column is already numeric and contains no NaNs at this point
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Apply the filter directly to the DataFrame passed in
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)].copy() # Use .copy() to avoid SettingWithCopyWarning
    return df

numerical_cols = ['PH', 'EC', 'OC', 'N']

# --- Added steps for explicit numeric conversion and initial NaN drop ---
# Explicitly convert the numerical columns to numeric, coercing errors to NaN
for col in numerical_cols:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Drop rows where any of the relevant numerical columns (or the target 'CROP') have NaN
# This step should happen *before* outlier removal or before calling the outlier function
data = data.dropna(subset=numerical_cols + ['CROP'])
# --- End of added steps ---


# Apply the outlier removal function *after* cleaning NaNs
for col in numerical_cols:
    # The remove_outliers function now assumes input data is clean (no NaNs in the column)
    # The data variable is updated in each iteration.
    data = remove_outliers(data, col)


# Encoding categorical variables
le_crop = LabelEncoder()
# The 'CROP' column is already cleaned for NaNs by the dropna call above
data['CROP'] = le_crop.fit_transform(data['CROP'])


# Defining features and target
features = ['PH', 'EC', 'OC', 'N']
X = data[features]
y = data['CROP']

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing numerical features
scaler = StandardScaler()
# X_train and X_test should now be clean and contain only numeric values
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Handling class imbalance with SMOTE
smote = SMOTE(random_state=42)
# Check class distribution *before* SMOTE to identify minority classes with too few samples
class_counts = y_train.value_counts()
min_samples_for_smote = smote.k_neighbors + 1 # Default k_neighbors is 5, so minimum 6 samples

# Identify classes with fewer than the minimum required samples
classes_to_remove = class_counts[class_counts < min_samples_for_smote].index

if not classes_to_remove.empty:
    print(f"Warning: Removing classes with fewer than {min_samples_for_smote} samples before SMOTE: {list(classes_to_remove)}")
    # Filter X_train and y_train to exclude samples from these classes
    train_indices_to_keep = y_train[~y_train.isin(classes_to_remove)].index
    # When filtering X_train (which is a NumPy array after scaling), need to be careful with indexing
    # A robust way is to convert X_train back to a DataFrame temporarily or use boolean indexing if indices align
    # Since X_train is a NumPy array here, let's filter based on the original DataFrame index mapping
    # This assumes the order of samples was preserved from the split to scaling.
    # A more robust approach is to keep X_train as a DataFrame until after SMOTE.
    # Let's stick to the current flow but use boolean indexing if possible or reconstruct DataFrame.
    # Reconstructing DataFrame to use .loc indexing is safer:
    X_train_df = pd.DataFrame(X_train, index=y_train.index, columns=features)
    X_train_filtered_df = X_train_df.loc[train_indices_to_keep]
    y_train_filtered = y_train.loc[train_indices_to_keep]
    X_train = X_train_filtered_df.values # Convert back to NumPy array for SMOTE
    y_train = y_train_filtered # Keep y_train as Series or convert to NumPy array if SMOTE needs it


# Now apply SMOTE to the filtered training data
# X_train is now a NumPy array after filtering. SMOTE expects array-like or sparse matrix.
# y_train is a Series (or can be array).
# SMOTE's fit_resample returns NumPy arrays
X_train, y_train = smote.fit_resample(X_train, y_train)


# Hyperparameter tuning for Random Forest
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5]
}
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model
best_rf = grid_search.best_estimator_

# Evaluating model
y_pred = best_rf.predict(X_test) # X_test is NumPy array
accuracy = accuracy_score(y_test, y_pred)
cv_scores = cross_val_score(best_rf, X_train, y_train, cv=5) # X_train, y_train are NumPy array and Series

print(f"Bagging (Random Forest) with 4 Features")
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Test Accuracy: {accuracy * 100:.2f}%")
print(f"Cross-Validation Accuracy: {np.mean(cv_scores * 100):.2f}% ± {np.std(cv_scores * 100):.2f}%")

Bagging (Random Forest) with 4 Features
Best Parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}
Test Accuracy: 19.27%
Cross-Validation Accuracy: 78.30% ± 2.96%


boosting with all features

In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# Loading and preprocessing data
data = pd.read_csv('complete soil data.csv')

# Cleaning data
data = data[data['SOIL TYPE'].isin([1, 2, 3, 4, 5, 7, 8, 9, 11])]
# data = data.dropna() # Moved this line

# Outlier removal using IQR
def remove_outliers(df, column):
    # This function now assumes the input column is already numeric and doesn't contain NaNs
    # The numeric conversion and NaN handling are done BEFORE calling this function.
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Apply the filter directly to the DataFrame passed in
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)].copy() # Use .copy() to avoid SettingWithCopyWarning
    return df

numerical_cols = ['PH', 'EC', 'OC', 'N', 'P2O5', 'K20', 'S', 'CU', 'FC', 'MN', 'ZN', 'BA', 'Temparature', 'Humidity', 'Rainfall']

# --- Added explicit numeric conversion and initial NaN drop steps ---
# Explicitly convert the numerical columns to numeric, coercing errors to NaN
for col in numerical_cols:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Now drop rows with any missing values *after* coercing non-numeric to NaN.
# Also ensure the target 'CROP' column is free of NaNs before encoding.
data = data.dropna(subset=numerical_cols + ['CROP'])
# --- End of added steps ---


# Apply the outlier removal function *after* cleaning NaNs
# The data variable is updated in each iteration.
for col in numerical_cols:
    # The remove_outliers function now assumes input data is clean (no NaNs in the column)
    data = remove_outliers(data, col)


# Encoding categorical variables
le_mandal = LabelEncoder()
le_village = LabelEncoder()
le_soil = LabelEncoder()
le_crop = LabelEncoder()
# Ensure categorical columns are also free of NaNs before encoding if they were included in the initial dropna subset
# In this case, MANDAL NAME, VILLAGE NAME, SOIL TYPE are likely clean by this point, but good practice to ensure.
# data = data.dropna(subset=['MANDAL NAME', 'VILLAGE NAME', 'SOIL TYPE']) # Optional check if necessary
data['MANDAL NAME'] = le_mandal.fit_transform(data['MANDAL NAME'])
data['VILLAGE NAME'] = le_village.fit_transform(data['VILLAGE NAME'])
data['SOIL TYPE'] = le_soil.fit_transform(data['SOIL TYPE'])
# The 'CROP' column is already cleaned for NaNs by the dropna call above
data['CROP'] = le_crop.fit_transform(data['CROP'])

# Defining features and target
# Ensure the features list only contains columns that are now numeric (encoded or converted)
features = ['MANDAL NAME', 'VILLAGE NAME', 'SOIL TYPE'] + numerical_cols
X = data[features]
y = data['CROP']

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing numerical features
scaler = StandardScaler()
# Apply scaler to the feature columns. Ensure all columns in X_train/X_test are numeric.
# Categorical features are now integer encoded, which is fine for StandardScaler if treated as numerical (though scaling categorical might not always be ideal).
# If you only want to scale the original numerical columns, you would need to select them here.
# Based on the 'all features' description and previous cells, it seems the intention is to scale everything in X.
# Let's assume scaling all features (encoded categorical + numerical) is the intent.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Handling class imbalance with SMOTE
smote = SMOTE(random_state=42)
# SMOTE works on NumPy arrays (X_train, y_train are now arrays)
# Check class distribution *before* SMOTE to identify minority classes with too few samples
# This check should use the y_train *before* SMOTE.
# The previous dropna on 'CROP' should have removed rows with NaN targets.
# However, SMOTE can still fail if some classes have < k_neighbors + 1 samples.
# Need to re-evaluate y_train value counts after splitting but before SMOTE.

# --- Added code to handle minority classes before SMOTE ---
# Convert y_train back to Series temporarily to use value_counts efficiently
y_train_series = pd.Series(y_train)
class_counts = y_train_series.value_counts()
min_samples_for_smote = smote.k_neighbors + 1 # Default k_neighbors is 5, so minimum 6 samples

# Identify classes with fewer than the minimum required samples
classes_to_remove = class_counts[class_counts < min_samples_for_smote].index

if not classes_to_remove.empty:
    print(f"Warning: Removing classes with fewer than {min_samples_for_smote} samples before SMOTE: {list(classes_to_remove)}")
    # Filter X_train and y_train to exclude samples from these classes
    # Need the original indices from the split to filter X_train (which is now a NumPy array)
    # A safer approach is to keep X_train/y_train as DataFrames/Series until after potential filtering.
    # Let's reconstruct DataFrame/Series from the scaled arrays and filter.
    # This assumes the order is preserved from the split and scaling.
    # Alternative: Filter before scaling if minority class removal is significant.
    # Let's filter the scaled arrays by mapping back to original indices.
    # This is tricky. A better way: Filter y_train (Series), get indices, apply to X_train (array).
    # Assuming original indices are preserved in y_train Series after split:
    train_indices_to_keep = y_train_series[~y_train_series.isin(classes_to_remove)].index
    # Filter the NumPy array X_train using boolean indexing based on the indices from the Series
    # Need to map Series index to array row index. This requires the original index.
    # If X_train is a numpy array, its index is just 0...n-1. We cannot use .loc directly.
    # Let's filter y_train (Series) first, then get the boolean mask, and apply it to X_train (ndarray).
    boolean_mask = ~y_train_series.isin(classes_to_remove)
    X_train = X_train[boolean_mask]
    y_train = y_train[boolean_mask] # Keep y_train as NumPy array after filtering
# --- End of added code ---

# Now apply SMOTE to the filtered training data
# X_train and y_train are now NumPy arrays after filtering and scaling.
X_train, y_train = smote.fit_resample(X_train, y_train)

# Hyperparameter tuning for Gradient Boosting
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5, 7]
}
gb = GradientBoostingClassifier(random_state=42)
grid_search = GridSearchCV(gb, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
# grid_search.fit expects array-like or sparse matrix, which X_train is after SMOTE
grid_search.fit(X_train, y_train)

# Best model
best_gb = grid_search.best_estimator_

# Evaluating model
# X_test is a NumPy array after scaling
y_pred = best_gb.predict(X_test)
# accuracy_score expects array-like or Series
accuracy = accuracy_score(y_test, y_pred)
# cross_val_score expects array-like or sparse matrix for X, and array-like or Series for y
cv_scores = cross_val_score(best_gb, X_train, y_train, cv=5)

print(f"Boosting (Gradient Boosting) with All Features")
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Test Accuracy: {accuracy * 100:.2f}%")
print(f"Cross-Validation Accuracy: {np.mean(cv_scores * 100):.2f}% ± {np.std(cv_scores * 100):.2f}%")

Boosting (Gradient Boosting) with All Features
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Test Accuracy: 32.20%
Cross-Validation Accuracy: 87.51% ± 6.01%


Boosting with 6 parameters

In [34]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# Loading and preprocessing data
data = pd.read_csv('complete soil data.csv')

# Cleaning data
data = data[data['SOIL TYPE'].isin([1, 2, 3, 4, 5, 7, 8, 9, 11])]
# data = data.dropna() # Moved this line

# Outlier removal
def remove_outliers(df, column):
    # This function now assumes the input column is already numeric and doesn't contain NaNs
    # The numeric conversion and NaN handling are done BEFORE calling this function.
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Apply the filter directly to the DataFrame passed in
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)].copy() # Use .copy() to avoid SettingWithCopyWarning
    return df

numerical_cols = ['PH', 'EC', 'OC', 'N', 'P2O5', 'K20']

# --- Added explicit numeric conversion and initial NaN drop steps ---
# Explicitly convert the numerical columns to numeric, coercing errors to NaN
for col in numerical_cols:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Now drop rows with any missing values *after* coercing non-numeric to NaN.
# Also ensure the target 'CROP' column is free of NaNs before encoding.
# Focus dropna on the columns relevant to this cell
data = data.dropna(subset=numerical_cols + ['CROP'])
# --- End of added steps ---


# Apply the outlier removal function *after* cleaning NaNs
# The data variable is updated in each iteration.
for col in numerical_cols:
    # The remove_outliers function now assumes input data is clean (no NaNs in the column)
    data = remove_outliers(data, col)


# Encoding categorical variables
le_crop = LabelEncoder()
# The 'CROP' column is already cleaned for NaNs by the dropna call above
data['CROP'] = le_crop.fit_transform(data['CROP'])

# Defining features and target
features = ['PH', 'EC', 'OC', 'N', 'P2O5', 'K20']
X = data[features]
y = data['CROP']

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing numerical features
scaler = StandardScaler()
# X_train and X_test should now be clean and contain only numeric values (from the numerical_cols list)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Handling class imbalance with SMOTE
smote = SMOTE(random_state=42)
# Check class distribution *before* SMOTE to identify minority classes with too few samples
# SMOTE expects array-like or sparse matrix for X and array-like for y.
# X_train is a NumPy array after scaling. y_train is a Series. SMOTE handles this mixed input.
# However, the filtering logic needs to be careful with array indices vs Series indices.
# A robust way is to keep X_train as a DataFrame until after filtering, then convert to NumPy.

# --- Added code to handle minority classes before SMOTE ---
# Convert y_train back to Series temporarily if it became an array, otherwise keep it as Series
# In this cell, y_train starts as Series from the split, which is good.
class_counts = y_train.value_counts()
min_samples_for_smote = smote.k_neighbors + 1 # Default k_neighbors is 5, so minimum 6 samples

# Identify classes with fewer than the minimum required samples
classes_to_remove = class_counts[class_counts < min_samples_for_smote].index

if not classes_to_remove.empty:
    print(f"Warning: Removing classes with fewer than {min_samples_for_smote} samples before SMOTE: {list(classes_to_remove)}")
    # Filter X_train (NumPy array) and y_train (Series)
    # Filter y_train first using its index
    train_indices_to_keep = y_train[~y_train.isin(classes_to_remove)].index

    # To filter X_train (NumPy array) based on original Series indices,
    # it's safer to reconstruct a DataFrame temporarily using the original index
    # before filtering, then get the values back.
    X_train_df = pd.DataFrame(X_train, index=y_train.index, columns=features)
    X_train_filtered_df = X_train_df.loc[train_indices_to_keep]
    y_train_filtered = y_train.loc[train_indices_to_keep]

    X_train = X_train_filtered_df.values # Convert back to NumPy array for SMOTE
    y_train = y_train_filtered # Keep y_train as Series (SMOTE accepts Series)
# --- End of added code ---


# Now apply SMOTE to the filtered training data
# X_train is now a NumPy array. y_train is a Series.
# SMOTE's fit_resample returns NumPy arrays
X_train, y_train = smote.fit_resample(X_train, y_train)


# Hyperparameter tuning for Gradient Boosting
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5, 7]
}
gb = GradientBoostingClassifier(random_state=42)
grid_search = GridSearchCV(gb, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
# grid_search.fit expects array-like or sparse matrix, which X_train is after SMOTE
grid_search.fit(X_train, y_train)

# Best model
best_gb = grid_search.best_estimator_

# Evaluating model
# X_test is a NumPy array after scaling
y_pred = best_gb.predict(X_test)
# accuracy_score expects array-like or Series
accuracy = accuracy_score(y_test, y_pred)
# cross_val_score expects array-like or sparse matrix for X, and array-like or Series for y
cv_scores = cross_val_score(best_gb, X_train, y_train, cv=5)

print(f"Boosting (Gradient Boosting) with 6 Features")
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Test Accuracy: {accuracy * 100:.2f}%")
print(f"Cross-Validation Accuracy: {np.mean(cv_scores) * 100:.2f}% ± {np.std(cv_scores * 100):.2f}%")

Boosting (Gradient Boosting) with 6 Features
Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
Test Accuracy: 18.18%
Cross-Validation Accuracy: 80.49% ± 2.51%


Boosting with 4 features

In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# Loading and preprocessing data
data = pd.read_csv('complete soil data.csv')

# Cleaning data
data = data[data['SOIL TYPE'].isin([1, 2, 3, 4, 5, 7, 8, 9, 11])]
# data = data.dropna() # <--- Remove this line or move it. NaNs will be handled after coercion.

# Outlier removal
def remove_outliers(df, column):
    # This function now assumes the input column is already numeric and doesn't contain NaNs
    # The numeric conversion and NaN handling are done BEFORE calling this function.
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Apply the filter directly to the DataFrame passed in
    # Use .copy() to avoid SettingWithCopyWarning when modifying subsets
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)].copy()
    return df

numerical_cols = ['PH', 'EC', 'OC', 'N']

# --- Added explicit numeric conversion and initial NaN drop steps ---
# Explicitly convert the numerical columns to numeric, coercing errors to NaN
# This handles cases like "0..07" and turns them into NaN
for col in numerical_cols:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Now drop rows with any missing values *after* coercing non-numeric to NaN.
# Also ensure the target 'CROP' column is free of NaNs before encoding.
# Focus dropna on the columns relevant to this cell (features + target)
data = data.dropna(subset=numerical_cols + ['CROP'])
# --- End of added steps ---


# Apply the outlier removal function *after* cleaning NaNs
# The data variable is updated in each iteration.
for col in numerical_cols:
    # The remove_outliers function now assumes input data is clean (no NaNs in the column being processed)
    data = remove_outliers(data, col)


# Encoding categorical variables
le_crop = LabelEncoder()
# The 'CROP' column is already cleaned for NaNs by the dropna call above
data['CROP'] = le_crop.fit_transform(data['CROP'])


# Defining features and target
features = ['PH', 'EC', 'OC', 'N']
X = data[features]
y = data['CROP']

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing numerical features
scaler = StandardScaler()
# X_train and X_test should now be clean and contain only numeric values
# Ensure X_train columns are numeric before scaling (dropna above should handle this for the specified features)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Handling class imbalance with SMOTE
smote = SMOTE(random_state=42)
# Check class distribution *before* SMOTE to identify minority classes with too few samples
# SMOTE expects array-like or sparse matrix for X and array-like for y.
# X_train is a NumPy array after scaling. y_train is a Series. SMOTE handles this mixed input.
# However, the filtering logic needs to be careful with array indices vs Series indices.
# A robust way is to keep X_train as a DataFrame until after filtering, then convert to NumPy.

# --- Added code to handle minority classes before SMOTE ---
# Convert y_train back to Series temporarily if it became an array, otherwise keep it as Series
# In this cell, y_train starts as Series from the split, which is good.
class_counts = y_train.value_counts()
min_samples_for_smote = smote.k_neighbors + 1 # Default k_neighbors is 5, so minimum 6 samples

# Identify classes with fewer than the minimum required samples
classes_to_remove = class_counts[class_counts < min_samples_for_smote].index

if not classes_to_remove.empty:
    print(f"Warning: Removing classes with fewer than {min_samples_for_smote} samples before SMOTE: {list(classes_to_remove)}")
    # Filter X_train (NumPy array) and y_train (Series)
    # Filter y_train first using its index
    train_indices_to_keep = y_train[~y_train.isin(classes_to_remove)].index

    # To filter X_train (NumPy array) based on original Series indices,
    # it's safer to reconstruct a DataFrame temporarily using the original index
    # before filtering, then get the values back.
    # This assumes the order is preserved from the split to scaling/filtering.
    # It's best practice to filter the original DataFrames/Series first.
    # Let's try a direct boolean indexing on the NumPy array using the mask derived from y_train's indices.
    # This is safer if the indices of y_train Series match the row order of X_train NumPy array.
    boolean_mask = y_train.index.isin(train_indices_to_keep) # Create boolean mask from indices
    X_train = X_train[boolean_mask] # Apply boolean mask to NumPy array
    y_train = y_train.loc[train_indices_to_keep] # Filter y_train Series using .loc
# --- End of added code ---


# Now apply SMOTE to the filtered training data
# X_train is now a NumPy array. y_train is a Series.
# SMOTE's fit_resample returns NumPy arrays
X_train, y_train = smote.fit_resample(X_train, y_train)


# Hyperparameter tuning for Gradient Boosting
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5, 7]
}
gb = GradientBoostingClassifier(random_state=42)
grid_search = GridSearchCV(gb, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
# grid_search.fit expects array-like or sparse matrix, which X_train is after SMOTE
grid_search.fit(X_train, y_train)

# Best model
best_gb = grid_search.best_estimator_

# Evaluating model
# X_test is a NumPy array after scaling
y_pred = best_gb.predict(X_test)
# accuracy_score expects array-like or Series
accuracy = accuracy_score(y_test, y_pred)
# cross_val_score expects array-like or sparse matrix for X, and array-like or Series for y
cv_scores = cross_val_score(best_gb, X_train, y_train, cv=5)

print(f"Boosting (Gradient Boosting) with 4 Features")
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Test Accuracy: {accuracy * 100:.2f}%")
print(f"Cross-Validation Accuracy: {np.mean(cv_scores * 100):.2f}% ± {np.std(cv_scores) * 100:.2f}%")

Boosting (Gradient Boosting) with 4 Features
Best Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}
Test Accuracy: 20.18%
Cross-Validation Accuracy: 71.05% ± 3.05%


Stacking with all features

In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# Loading and preprocessing data
data = pd.read_csv('complete soil data.csv')

# Cleaning data
data = data[data['SOIL TYPE'].isin([1, 2, 3, 4, 5, 7, 8, 9, 11])]
# data = data.dropna() # <-- REMOVED this line or MOVED it. NaNs will be handled AFTER coercion.

# Outlier removal
def remove_outliers(df, column):
    # This function now assumes the input column is already numeric and doesn't contain NaNs
    # The numeric conversion and NaN handling are done BEFORE calling this function.
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Apply the filter directly to the DataFrame passed in
    # Use .copy() to avoid SettingWithCopyWarning when modifying subsets
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)].copy()
    return df

numerical_cols = ['PH', 'EC', 'OC', 'N', 'P2O5', 'K20', 'S', 'CU', 'FC', 'MN', 'ZN', 'BA', 'Temparature', 'Humidity', 'Rainfall']

# --- ADDED: Explicitly convert numerical columns to numeric BEFORE outlier removal and dropna ---
# Explicitly convert the numerical columns to numeric, coercing errors to NaN
# This handles cases like "0..07" and turns them into NaN
for col in numerical_cols:
    # Added try-except for more robust conversion, although errors='coerce' should handle most
    try:
        data[col] = pd.to_numeric(data[col], errors='coerce')
    except ValueError as e:
        print(f"Error converting column {col} to numeric: {e}")
        # If a specific value is known to cause issues, you might target it here
        # For instance: data[col] = data[col].replace('0..07', np.nan)
        data[col] = pd.to_numeric(data[col], errors='coerce') # Re-attempt with coerce
# --- END ADDED ---

# --- ADDED: Drop rows with any missing values AFTER coercing non-numeric to NaN ---
# Now drop rows with any missing values (including those created by coercion).
# Also ensure the target 'CROP' column is free of NaNs before encoding.
# Focus dropna on the columns relevant to this cell (features + target)
features_in_this_cell = ['MANDAL NAME', 'VILLAGE NAME', 'SOIL TYPE'] + numerical_cols
data = data.dropna(subset=features_in_this_cell + ['CROP'])
# --- END ADDED ---


# Apply the outlier removal function *after* cleaning NaNs
# The data variable is updated in each iteration.
# The remove_outliers function now assumes input data is clean (no NaNs in the column being processed)
# since NaNs were dropped in the step above.
for col in numerical_cols:
    data = remove_outliers(data, col)


# Encoding categorical variables
le_mandal = LabelEncoder()
le_village = LabelEncoder()
le_soil = LabelEncoder()
le_crop = LabelEncoder()
# Categorical columns should be clean of NaNs by the dropna step above, assuming they were included.
# Re-check the subset in dropna if categorical columns might have NaNs and are needed for features.
data['MANDAL NAME'] = le_mandal.fit_transform(data['MANDAL NAME'])
data['VILLAGE NAME'] = le_village.fit_transform(data['VILLAGE NAME'])
data['SOIL TYPE'] = le_soil.fit_transform(data['SOIL TYPE'])
# The 'CROP' column is already cleaned for NaNs by the dropna call above
data['CROP'] = le_crop.fit_transform(data['CROP'])

# Defining features and target
# Ensure the features list only contains columns that are now numeric (encoded or converted)
features = ['MANDAL NAME', 'VILLAGE NAME', 'SOIL TYPE'] + numerical_cols
X = data[features]
y = data['CROP']

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing numerical features
scaler = StandardScaler()
# Apply scaler to the feature columns. Ensure all columns in X_train/X_test are numeric.
# Categorical features are now integer encoded, which is fine for StandardScaler if treated as numerical.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Handling class imbalance with SMOTE
smote = SMOTE(random_state=42)
# SMOTE expects array-like or sparse matrix for X and array-like for y.
# X_train is a NumPy array after scaling. y_train is a Series. SMOTE handles this mixed input.
# Check class distribution *before* SMOTE to identify minority classes with too few samples
# Need to re-evaluate y_train value counts after splitting but before SMOTE.

# --- ADDED code to handle minority classes before SMOTE ---
# Convert y_train to Series if it's not already, to use value_counts efficiently
# It should be a Series from train_test_split here.
class_counts = y_train.value_counts()
min_samples_for_smote = smote.k_neighbors + 1 # Default k_neighbors is 5, so minimum 6 samples

# Identify classes with fewer than the minimum required samples
classes_to_remove = class_counts[class_counts < min_samples_for_smote].index

if not classes_to_remove.empty:
    print(f"Warning: Removing classes with fewer than {min_samples_for_smote} samples before SMOTE: {list(classes_to_remove)}")
    # Filter X_train (NumPy array) and y_train (Series)
    # Filter y_train first using its index
    train_indices_to_keep = y_train[~y_train.isin(classes_to_remove)].index

    # To filter X_train (NumPy array) based on original Series indices,
    # use boolean indexing based on the indices from the Series
    # This is safer if the indices of y_train Series match the row order of X_train NumPy array.
    boolean_mask = y_train.index.isin(train_indices_to_keep) # Create boolean mask from indices
    X_train = X_train[boolean_mask] # Apply boolean mask to NumPy array
    y_train = y_train.loc[train_indices_to_keep] # Filter y_train Series using .loc
# --- END ADDED code ---


# Now apply SMOTE to the filtered training data
# X_train is now a NumPy array. y_train is a Series.
# SMOTE's fit_resample returns NumPy arrays
X_train, y_train = smote.fit_resample(X_train, y_train)


# Defining base models for stacking
estimators = [
    ('rf', RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)),
    ('xgb', XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42, use_label_encoder=False, eval_metric='mlogloss')) # Added use_label_encoder=False for modern XGBoost
]

# Training Stacking Classifier with Random Forest meta-classifier
stacking = StackingClassifier(estimators=estimators, final_estimator=RandomForestClassifier(n_estimators=100, random_state=42), cv=5)
# grid_search.fit expects array-like or sparse matrix, which X_train is after SMOTE
stacking.fit(X_train, y_train) # Fit the stacking classifier

# Evaluating model
# X_test is a NumPy array after scaling
y_pred = stacking.predict(X_test)
# accuracy_score expects array-like or Series
accuracy = accuracy_score(y_test, y_pred)
# cross_val_score expects array-like or sparse matrix for X, and array-like or Series for y
cv_scores = cross_val_score(stacking, X_train, y_train, cv=5) # Evaluate the stacking classifier

print(f"Stacking Classifier with All Features")
print(f"Test Accuracy: {accuracy * 100:.2f}%")
print(f"Cross-Validation Accuracy: {np.mean(cv_scores * 100):.2f}% ± {np.std(cv_scores * 100):.2f}%")

Stacking Classifier with All Features
Test Accuracy: 35.59%
Cross-Validation Accuracy: 90.48% ± 6.12%


Stacking with 6 features

In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# Loading and preprocessing data
data = pd.read_csv('complete soil data.csv')

# Cleaning data
data = data[data['SOIL TYPE'].isin([1, 2, 3, 4, 5, 7, 8, 9, 11])]

# Define the numerical features for this specific cell
numerical_cols = ['PH', 'EC', 'OC', 'N', 'P2O5', 'K20']

# --- ADDED: Explicitly convert numerical columns to numeric, coercing errors ---
# This handles cases like "0..07" and turns them into NaN
for col in numerical_cols:
    # Use errors='coerce' to turn problematic values into NaN
    data[col] = pd.to_numeric(data[col], errors='coerce')
# --- END ADDED ---

# --- MODIFIED: Drop rows with any missing values AFTER coercing non-numeric to NaN ---
# Now drop rows with any missing values (including those created by coercion)
# Focus dropna on the columns relevant to this cell (numerical features + target)
data = data.dropna(subset=numerical_cols + ['CROP'])
# --- END MODIFIED ---

# Outlier removal
def remove_outliers(df, column):
    # This function now assumes the input column is already numeric and doesn't contain NaNs,
    # because the explicit conversion and dropna steps were done BEFORE calling this function.
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Apply the filter directly to the DataFrame passed in
    # Use .copy() to avoid SettingWithCopyWarning when modifying subsets
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)].copy()
    return df

# Apply the outlier removal function *after* cleaning NaNs
for col in numerical_cols:
    # The data variable is updated in each iteration.
    # The remove_outliers function now assumes input data is clean (no NaNs in the column being processed)
    # since NaNs were dropped in the step above.
    data = remove_outliers(data, col)


# Encoding categorical variables
le_crop = LabelEncoder()
# The 'CROP' column is already cleaned for NaNs by the dropna call above
data['CROP'] = le_crop.fit_transform(data['CROP'])

# Defining features and target
features = ['PH', 'EC', 'OC', 'N', 'P2O5', 'K20'] # Features used in this cell
X = data[features]
y = data['CROP']

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing numerical features
scaler = StandardScaler()
# X_train and X_test should now be clean and contain only numeric values (from the numerical_cols list)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Handling class imbalance with SMOTE
smote = SMOTE(random_state=42)
# SMOTE expects array-like or sparse matrix for X and array-like for y.
# X_train is a NumPy array after scaling. y_train is a Series. SMOTE handles this mixed input.
# Check class distribution *before* SMOTE to identify minority classes with too few samples
# Need to re-evaluate y_train value counts after splitting but before SMOTE.

# --- ADDED code to handle minority classes before SMOTE ---
# Convert y_train to Series if it's not already, to use value_counts efficiently
# It should be a Series from train_test_split here.
class_counts = y_train.value_counts()
min_samples_for_smote = smote.k_neighbors + 1 # Default k_neighbors is 5, so minimum 6 samples

# Identify classes with fewer than the minimum required samples
classes_to_remove = class_counts[class_counts < min_samples_for_smote].index

if not classes_to_remove.empty:
    print(f"Warning: Removing classes with fewer than {min_samples_for_smote} samples before SMOTE: {list(classes_to_remove)}")
    # Filter X_train (NumPy array) and y_train (Series)
    # Filter y_train first using its index
    train_indices_to_keep = y_train[~y_train.isin(classes_to_remove)].index

    # To filter X_train (NumPy array) based on original Series indices,
    # use boolean indexing based on the indices from the Series
    # This is safer if the indices of y_train Series match the row order of X_train NumPy array.
    # Create boolean mask based on original Series indices
    boolean_mask = y_train.index.isin(train_indices_to_keep)
    X_train = X_train[boolean_mask] # Apply boolean mask to NumPy array
    y_train = y_train.loc[train_indices_to_keep] # Filter y_train Series using .loc
# --- END ADDED code ---


# Now apply SMOTE to the filtered training data
# X_train is now a NumPy array. y_train is a Series.
# SMOTE's fit_resample returns NumPy arrays
X_train, y_train = smote.fit_resample(X_train, y_train)


# Defining base models for stacking
estimators = [
    ('rf', RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)),
    # Added use_label_encoder=False for modern XGBoost to avoid deprecation warnings
    ('xgb', XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42, use_label_encoder=False, eval_metric='mlogloss'))
]

# Training Stacking Classifier with Random Forest meta-classifier
stacking = StackingClassifier(estimators=estimators, final_estimator=RandomForestClassifier(n_estimators=100, random_state=42), cv=5)
# grid_search.fit expects array-like or sparse matrix, which X_train is after SMOTE
stacking.fit(X_train, y_train) # Fit the stacking classifier

# Evaluating model
# X_test is a NumPy array after scaling
y_pred = stacking.predict(X_test)
# accuracy_score expects array-like or Series
accuracy = accuracy_score(y_test, y_pred)
# cross_val_score expects array-like or sparse matrix for X, and array-like or Series for y
cv_scores = cross_val_score(stacking, X_train, y_train, cv=5) # Evaluate the stacking classifier

print(f"Stacking Classifier with 6 Features")
print(f"Test Accuracy: {accuracy * 100:.2f}%")
print(f"Cross-Validation Accuracy: {np.mean(cv_scores * 100):.2f}% ± {np.std(cv_scores * 100):.2f}%")

Stacking Classifier with 6 Features
Test Accuracy: 19.19%
Cross-Validation Accuracy: 85.13% ± 3.22%


Stacking with 4 features

In [47]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# Loading and preprocessing data
data = pd.read_csv('complete soil data.csv')

# Cleaning data
data = data[data['SOIL TYPE'].isin([1, 2, 3, 4, 5, 7, 8, 9, 11])]

# Define the numerical features for this specific cell
numerical_cols = ['PH', 'EC', 'OC', 'N']

# --- ADDED: Explicitly convert numerical columns to numeric, coercing errors ---
# This handles cases like "0..07" and turns them into NaN
for col in numerical_cols:
    # Use errors='coerce' to turn problematic values into NaN
    data[col] = pd.to_numeric(data[col], errors='coerce')
# --- END ADDED ---

# --- MODIFIED: Drop rows with any missing values AFTER coercing non-numeric to NaN ---
# Now drop rows with any missing values (including those created by coercion)
# Focus dropna on the columns relevant to this cell (numerical features + target)
data = data.dropna(subset=numerical_cols + ['CROP'])
# --- END MODIFIED ---

# Outlier removal
def remove_outliers(df, column):
    # This function now assumes the input column is already numeric and doesn't contain NaNs,
    # because the explicit conversion and dropna steps were done BEFORE calling this function.
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Apply the filter directly to the DataFrame passed in
    # Use .copy() to avoid SettingWithCopyWarning when modifying subsets
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)].copy()
    return df

# Apply the outlier removal function *after* cleaning NaNs
for col in numerical_cols:
    # The data variable is updated in each iteration.
    # The remove_outliers function now assumes input data is clean (no NaNs in the column being processed)
    # since NaNs were dropped in the step above.
    data = remove_outliers(data, col)


# Encoding categorical variables
le_crop = LabelEncoder()
# The 'CROP' column is already cleaned for NaNs by the dropna call above
data['CROP'] = le_crop.fit_transform(data['CROP'])

# Defining features and target
features = ['PH', 'EC', 'OC', 'N'] # Features used in this cell
X = data[features]
y = data['CROP']

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing numerical features
scaler = StandardScaler()
# X_train and X_test should now be clean and contain only numeric values (from the numerical_cols list)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Handling class imbalance with SMOTE
smote = SMOTE(random_state=42)
# SMOTE expects array-like or sparse matrix for X and array-like for y.
# X_train is a NumPy array after scaling. y_train is a Series. SMOTE handles this mixed input.
# Check class distribution *before* SMOTE to identify minority classes with too few samples
# Need to re-evaluate y_train value counts after splitting but before SMOTE.

# --- ADDED code to handle minority classes before SMOTE ---
# Convert y_train to Series if it's not already, to use value_counts efficiently
# It should be a Series from train_test_split here.
class_counts = y_train.value_counts()
min_samples_for_smote = smote.k_neighbors + 1 # Default k_neighbors is 5, so minimum 6 samples

# Identify classes with fewer than the minimum required samples
classes_to_remove = class_counts[class_counts < min_samples_for_smote].index

if not classes_to_remove.empty:
    print(f"Warning: Removing classes with fewer than {min_samples_for_smote} samples before SMOTE: {list(classes_to_remove)}")
    # Filter X_train (NumPy array) and y_train (Series)
    # Filter y_train first using its index
    train_indices_to_keep = y_train[~y_train.isin(classes_to_remove)].index

    # To filter X_train (NumPy array) based on original Series indices,
    # use boolean indexing based on the indices from the Series
    # This is safer if the indices of y_train Series match the row order of X_train NumPy array.
    # Create boolean mask based on original Series indices
    boolean_mask = y_train.index.isin(train_indices_to_keep)
    X_train = X_train[boolean_mask] # Apply boolean mask to NumPy array
    y_train = y_train.loc[train_indices_to_keep] # Filter y_train Series using .loc
# --- END ADDED code ---


# Now apply SMOTE to the filtered training data
# X_train is now a NumPy array. y_train is a Series.
# SMOTE's fit_resample returns NumPy arrays
X_train, y_train = smote.fit_resample(X_train, y_train)


# Defining base models for stacking
estimators = [
    ('rf', RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)),
    # Added use_label_encoder=False for modern XGBoost to avoid deprecation warnings
    ('xgb', XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42, use_label_encoder=False, eval_metric='mlogloss'))
]

# Training Stacking Classifier with Random Forest meta-classifier
stacking = StackingClassifier(estimators=estimators, final_estimator=RandomForestClassifier(n_estimators=100, random_state=42), cv=5)
# grid_search.fit expects array-like or sparse matrix, which X_train is after SMOTE
stacking.fit(X_train, y_train) # Fit the stacking classifier

# Evaluating model
# X_test is a NumPy array after scaling
y_pred = stacking.predict(X_test)
# accuracy_score expects array-like or Series
accuracy = accuracy_score(y_test, y_pred)
# cross_val_score expects array-like or sparse matrix for X, and array-like or Series for y
cv_scores = cross_val_score(stacking, X_train, y_train, cv=5) # Evaluate the stacking classifier

print(f"Stacking Classifier with 4 Features")
print(f"Test Accuracy: {accuracy * 100:.2f}%")
print(f"Cross-Validation Accuracy: {np.mean(cv_scores * 100):.2f}% ± {np.std(cv_scores * 100):.2f}%")

Stacking Classifier with 4 Features
Test Accuracy: 19.27%
Cross-Validation Accuracy: 76.47% ± 2.52%
