**Data Splitting**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Load the training table from disk and echo it for a quick visual check.
data = pd.read_csv('New_Training_data.csv')
print(data)

# Separate the target column from the remaining (feature) columns.
y = data['%Prevalence of high BLLs']
X = data.drop(columns=['%Prevalence of high BLLs'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)

# Report the size of each split.
print("Training data:", X_train.shape)
print("Testing data:", X_test.shape)

# Preview the first rows of the training features, then of the test features.
print(X_train.head(), X_test.head(), sep="\n")

             Number of garages  Presence of open sewage  Number of industries  \
Location-Id                                                                     
70                         873                        1                    17   
68                        4349                        1                    18   
27                         375                        1                    19   
27                        1261                        0                    19   
64                         603                        1                    19   
...                        ...                      ...                   ...   
32                        5389                        1                    19   
51                         555                        1                    19   
44                        1151                        0                    19   
74                        1302                        0                    18   
37                        16

In [None]:
# Quick look at the loaded table: a preview of the first rows, descriptive
# statistics, and a per-column count of missing entries. Each heading is
# printed on its own line directly above the corresponding report.
print("First few rows of the dataset:", data.head(), sep="\n")

print("Summary statistics of the dataset:", data.describe(), sep="\n")

print("Missing values in the dataset:", data.isna().sum(), sep="\n")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Put the training features and their target side by side in one frame so
# seaborn can plot both from a single source.
df = pd.concat(objs=[X_train, y_train], axis="columns")

# One scatter panel per feature column, each plotted against the target.
sns.pairplot(
    data=df,
    x_vars=X_train.columns,
    y_vars=['%Prevalence of high BLLs'],
    kind='scatter',
    height=2.5,
)
plt.show()

In [None]:
# Columns whose pairwise correlations are displayed; the last entry is the
# target column, the others are predictors.
selected_features = [
    'Number of garages',
    'Number of industries',
    '%Population that uses borehole of water',
    '%Prevalence of high BLLs',
]

# Restrict the full table to those columns and compute their correlation matrix.
selected_data = data.loc[:, selected_features]
correlation_matrix = selected_data.corr()

# Annotated heatmap of the matrix; heatmap() returns the axes it drew on,
# so the title is set directly on that object.
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
).set_title('Correlation Heatmap')
plt.show()

In [None]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import sklearn.preprocessing as preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import sklearn.datasets as datasets
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [None]:

# Tune and evaluate a random-forest classifier on the train/test split.
#
# All preprocessing (scaling and one-hot encoding) lives inside the pipeline,
# so it is fitted on the training data of each CV fold only and is applied in
# exactly the same way when predicting on X_test.

# Column groups are derived from the training split. X itself is deliberately
# left untouched so that this cell can be re-run without changing its inputs.
numerical_cols = X_train.select_dtypes(include=[np.number]).columns
categorical_cols = X_train.select_dtypes(exclude=[np.number]).columns

# NOTE(review): RandomForestClassifier and 'accuracy' scoring expect discrete
# class labels. The target is named '%Prevalence of high BLLs'; if it holds
# continuous percentages rather than classes, a regressor with a regression
# metric is presumably what is wanted -- confirm against the data.

# Scale numeric columns and one-hot encode non-numeric ones. Columns that are
# not listed in a transformer are dropped by ColumnTransformer's default
# remainder='drop', so the categorical columns must be listed here to reach
# the model at all. An empty column selection is skipped by ColumnTransformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        # handle_unknown='ignore' keeps prediction from failing when a CV fold
        # or the test split contains a category unseen during fitting.
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols),
    ])

# Preprocessing followed by the classifier, tuned as a single estimator.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=123))
])

# Hyperparameter tuning using GridSearchCV; the 'classifier__' prefix routes
# each parameter to the pipeline step of that name.
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# With the default refit=True, GridSearchCV has already refitted the best
# parameter combination on the full training split, so no extra fit is needed.
best_model = grid_search.best_estimator_

# Predict on the held-out test data
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Best Model Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)