## DATA COLLECTION AND PREPROCESSING


In [1]:
# libraries for manipulations
import pandas as pd
import numpy as np



In [2]:
# importing the datasets for operations
w1_data = pd.read_csv('water_dataset.csv')
d1_data = pd.read_csv('crop_dataset.csv')
d2_data = pd.read_csv('rainfall_soil_dataset.csv')

In [3]:
datasets = [w1_data, d1_data, d2_data]

In [4]:
data = w1_data
for column in data.columns:
    # Check if the column has numeric data
    if pd.api.types.is_numeric_dtype(data[column]):
        # Fill empty spaces with the mean of the column
        data[column].fillna(data[column].mean(), inplace=True)
    else:
        # Fill empty spaces with NA for non-numeric columns
        data[column].fillna('NA', inplace=True)
w1_data = data

In [5]:
data = d1_data
for column in data.columns:
    if pd.api.types.is_numeric_dtype(data[column]):
        data[column].fillna(data[column].mean(), inplace=True)
    else:
        data[column].fillna('NA', inplace=True)
d1_data = data

In [6]:
data = d2_data
for column in data.columns:
    if pd.api.types.is_numeric_dtype(data[column]):
        data[column].fillna(data[column].mean(), inplace=True)
    else:
        data[column].fillna('NA', inplace=True)
d2_data = data

In [7]:
# normalize the features in a dataset stored

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# computing the minimum and maximum values for each feature
numeric_columns = w1_data.select_dtypes(include=['number']).columns # only operating on the features who are numeric in nature
w1_data_normalized = scaler.fit_transform(w1_data[numeric_columns]) # transformation of the numeric feature

numeric_columns = d1_data.select_dtypes(include=['number']).columns
d1_data_normalized = scaler.fit_transform(d1_data[numeric_columns])

numeric_columns = d2_data.select_dtypes(include=['number']).columns
d2_data_normalized = scaler.fit_transform(d2_data[numeric_columns])

In [8]:
# categorical Values 

# Perform one-hot encoding for categorical columns
w1_data_encoded = pd.get_dummies(w1_data, columns=["rural_or_urban", "rural_or_urban", "ref_water_body_type_id_name", "water_body_loc_name", "water_body_ownership_name", "water_body_nature_name"])
d1_data_encoded = pd.get_dummies(d1_data, columns=["Crop", "Season"])
d2_data_encoded = pd.get_dummies(d2_data, columns=["label"])


## FEATURE ENGINEERING

In [9]:
# Engineering new features

w1_data['water_utilization_efficiency'] = w1_data['water_spread_area_of_water_body'] * w1_data['storage_capacity_water_body_present']
d1_data['yield_per_area'] = d1_data['Yield'] / d1_data['Area']
d2_data['soil_fertility_index'] = d2_data['N'] + d2_data['P'] + d2_data['K']


In [10]:
#Select relevant features

relevant_features_w1 = w1_data[['ref_water_body_type_id_name', 'water_body_ownership_name', 'water_body_nature_name', 'water_spread_area_of_water_body', 'storage_capacity_water_body_present', 'water_utilization_efficiency']]
relevant_features_d1 = d1_data[['Crop', 'Annual_Rainfall', 'Yield', 'yield_per_area']]
relevant_features_d2 = d2_data[['temperature', 'humidity', 'ph', 'rainfall', 'soil_fertility_index']]


In [11]:
# Feature Scaling

# Initialize Min-Max scaler
scaler = MinMaxScaler()

# Apply Min-Max scaling to selected features in each dataset
# only operating on the features who are numeric in nature
numeric_columns = relevant_features_w1.select_dtypes(include=['number']).columns 
scaled_features_w1 = scaler.fit_transform(relevant_features_w1[numeric_columns])

numeric_columns = relevant_features_d1.select_dtypes(include=['number']).columns
scaled_features_d1 = scaler.fit_transform(relevant_features_d1[numeric_columns])

numeric_columns = relevant_features_d2.select_dtypes(include=['number']).columns
scaled_features_d2 = scaler.fit_transform(relevant_features_d2[numeric_columns])


In [15]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

# Initialize SelectKBest with f_regression scoring function
selector = SelectKBest(score_func=f_regression, k=2)

# Apply feature selection to identify top 2 features in each dataset
selected_features_w1 = selector.fit_transform(scaled_features_w1, w1_data['water_utilization_efficiency'])
selected_features_d1 = selector.fit_transform(scaled_features_d1, d1_data['yield_per_area'])
selected_features_d2 = selector.fit_transform(scaled_features_d2, d2_data['soil_fertility_index'])


## MODEL DEVELOPMENT

In [16]:
# Import necessary libraries
from sklearn.linear_model import LinearRegression done
from sklearn.tree import DecisionTreeRegressor done
from sklearn.ensemble import RandomForestRegressor done
from sklearn.ensemble import GradientBoostingRegressor done
from sklearn.neural_network import MLPRegressor


In [17]:
# Initialize regression algorithms
linear_reg = LinearRegression()
decision_tree_reg = DecisionTreeRegressor()
random_forest_reg = RandomForestRegressor()
gradient_boosting_reg = GradientBoostingRegressor()
mlp_reg = MLPRegressor()


In [27]:
from sklearn.model_selection import train_test_split

# Split the W1 dataset
X_train_w1, X_test_w1, y_train_w1, y_test_w1 = train_test_split(selected_features_w1, w1_data['water_utilization_efficiency'], test_size=0.2, random_state=42)

# Split the D1 dataset
X_train_d1, X_test_d1, y_train_d1, y_test_d1 = train_test_split(selected_features_d1, d1_data['yield_per_area'], test_size=0.2, random_state=42)

# Split the D2 dataset
X_train_d2, X_test_d2, y_train_d2, y_test_d2 = train_test_split(selected_features_d2, d2_data['soil_fertility_index'], test_size=0.2, random_state=42)


In [33]:
# Assuming you have already split your datasets into training and testing sets
# Replace X_train, y_train, X_test, and y_test with your actual data

# Fit the models to the training data
linear_reg.fit(X_train_w1, y_train_w1)
decision_tree_reg.fit(X_train_d1, y_train_d1)
random_forest_reg.fit(X_train_d2, y_train_d2)
gradient_boosting_reg.fit(X_train_w1, y_train_w1)  # Assuming you're using the same dataset for this model
mlp_reg.fit(X_train_d1, y_train_d1)  # Assuming you're using the same dataset for this model

# Evaluate model performance on test data
linear_reg_score = linear_reg.score(X_test_w1, y_test_w1)
decision_tree_score = decision_tree_reg.score(X_test_d1, y_test_d1)
random_forest_score = random_forest_reg.score(X_test_d2, y_test_d2)
gradient_boosting_score = gradient_boosting_reg.score(X_test_w1, y_test_w1)  # Assuming you're using the same dataset for this model
mlp_score = mlp_reg.score(X_test_d1, y_test_d1)  # Assuming you're using the same dataset for this model

# Print the model scores
print("Linear Regression Score:", linear_reg_score)
print("Decision Tree Score:", decision_tree_score)
print("Random Forest Score:", random_forest_score)
print("Gradient Boosting Score:", gradient_boosting_score)
print("MLP Score:", mlp_score)
e

Linear Regression Score: 0.009651601648644648
Decision Tree Score: -0.042156958051176474
Random Forest Score: 0.9999935358461886
Gradient Boosting Score: 0.08559191595866378
MLP Score: 0.05198826992840189


In [35]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameters grid for Random Forest
param_grid_rf = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize GridSearchCV for Random Forest
grid_search_rf = GridSearchCV(estimator=random_forest_reg, param_grid=param_grid_rf, cv=5)

# Fit GridSearchCV to training data
grid_search_rf.fit(X_train_d2, y_train_d2)

# Get best parameters and best score
best_params_rf = grid_search_rf.best_params_
best_score_rf = grid_search_rf.best_score_

print("Best Parameters for Random Forest:", best_params_rf)
print("Best Score for Random Forest:", best_score_rf)

Best Parameters for Random Forest: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}
Best Score for Random Forest: 0.9999893638324494


from sklearn.model_selection import GridSearchCV, train_test_split

# Split the W1 dataset
X_train_w1, X_test_w1, y_train_w1, y_test_w1 = train_test_split(selected_features_w1, w1_data['water_utilization_efficiency'], test_size=0.2, random_state=42)

# Split the D1 dataset
X_train_d1, X_test_d1, y_train_d1, y_test_d1 = train_test_split(selected_features_d1, d1_data['yield_per_area'], test_size=0.2, random_state=42)

# Split the D2 dataset
X_train_d2, X_test_d2, y_train_d2, y_test_d2 = train_test_split(selected_features_d2, d2_data['soil_fertility_index'], test_size=0.2, random_state=42)

# Define hyperparameters grid for Random Forest
param_grid_rf = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize GridSearchCV for Random Forest
grid_search_rf = GridSearchCV(estimator=random_forest_reg, param_grid=param_grid_rf, cv=5)

# Fit GridSearchCV to training data for W1 dataset
grid_search_rf.fit(X_train_w1, y_train_w1)

# Get best parameters and best score for W1 dataset
best_params_rf_w1 = grid_search_rf.best_params_
best_score_rf_w1 = grid_search_rf.best_score_

print("Best Parameters for Random Forest (W1 dataset):", best_params_rf_w1)
print("Best Score for Random Forest (W1 dataset):", best_score_rf_w1)

# Fit GridSearchCV to training data for D1 dataset
grid_search_rf.fit(X_train_d1, y_train_d1)

# Get best parameters and best score for D1 dataset
best_params_rf_d1 = grid_search_rf.best_params_
best_score_rf_d1 = grid_search_rf.best_score_

print("Best Parameters for Random Forest (D1 dataset):", best_params_rf_d1)
print("Best Score for Random Forest (D1 dataset):", best_score_rf_d1)

# Fit GridSearchCV to training data for D2 dataset
grid_search_rf.fit(X_train_d2, y_train_d2)

# Get best parameters and best score for D2 dataset
best_params_rf_d2 = grid_search_rf.best_params_
best_score_rf_d2 = grid_search_rf.best_score_

print("Best Parameters for Random Forest (D2 dataset):", best_params_rf_d2)
print("Best Score for Random Forest (D2 dataset):", best_score_rf_d2)


In [37]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Define preprocessing steps
preprocessing_steps = [('scaler', StandardScaler())]

# Define modeling steps
modeling_steps = [('regressor', RandomForestRegressor())]

# Create pipeline
pipeline = Pipeline(steps=preprocessing_steps + modeling_steps)

# Fit pipeline to training data
pipeline.fit(X_train_d2, y_train_d2)

# Evaluate pipeline on test data
pipeline_score = pipeline.score(X_test_d2, y_test_d2)

print("Pipeline Score:", pipeline_score)


Pipeline Score: 0.9999926578023778
