In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE

# Step 1: Read the raw blood-test CSV from the Colab filesystem into a DataFrame.
data = pd.read_csv(filepath_or_buffer='/content/blood_data.csv')


In [None]:
# Preview the first five rows as a sanity check on the load. Row 0 holds the
# long-form column descriptions rather than measurements.
data.head(n=5)

Unnamed: 0,S. No.,Age,Sex,RBC,PCV,MCV,MCH,MCHC,RDW,TLC,PLT /mm3,HGB
0,,,,Red Blood Cell count,Packed Cell Volume,Mean Cell Volume,Mean Cell Hemoglobin,,Red Cell Distribution width,"White Blood Cell (WBC count),",Platelet,Hemoglobin
1,1.0,28.0,0.0,5.66,34,60.1,17,28.2,20,11.1,128.3,9.6
2,2.0,41.0,0.0,4.78,44.5,93.1,28.9,31.0,13,7.02,419,13.8
3,3.0,40.0,1.0,4.65,41.6,89.5,28.8,32.2,13,8.09,325,13.4
4,4.0,76.0,0.0,4.24,36.7,86.6,26.7,30.8,14.9,13.41,264,11.3


In [None]:
# Normalise header whitespace, then skip the first row, which carries the
# column descriptions instead of measurements.
data.columns = data.columns.str.strip()
data_cleaned = data.iloc[1:].copy()

# Coerce every measurement column to a numeric dtype in one pass; values that
# cannot be parsed become NaN, and rows containing any NaN are dropped afterwards.
numeric_columns = ['RBC', 'PCV', 'MCV', 'MCH', 'RDW', 'TLC', 'PLT /mm3', 'HGB', 'Age', 'Sex', 'MCHC']
data_cleaned = data_cleaned.assign(
    **{name: pd.to_numeric(data_cleaned[name], errors='coerce') for name in numeric_columns}
)
data_cleaned = data_cleaned.dropna()

# Clustering features: every cleaned column except the serial number.
X = data_cleaned.drop(columns=['S. No.'])

# Standardise the features, then partition the rows with K-Means.
n_clusters = 9  # one cluster per disease label in the mapping
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
kmeans = KMeans(n_clusters=n_clusters, n_init=20, random_state=42)  # n_init is set explicitly
kmeans.fit(X_scaled)
clusters = kmeans.labels_

# Attach each row's cluster id as a 'Cluster' column; assign() returns a new
# frame, so no explicit copy is needed before adding the column.
data_cleaned = data_cleaned.assign(Cluster=clusters)

# Human-readable disease label for each cluster id (0-8).
# NOTE(review): cluster ids come from K-Means, so this index -> disease pairing
# is an assumption — confirm it against the actual cluster contents.
cluster_to_disease = dict(enumerate((
    'Diabetes',
    'Anemia',
    'Infections',
    'Liver Disease',
    'Kidney Disease',
    'Thyroid Disorders',
    'Heart Disease',
    'Autoimmune Diseases',
    'Cancer',
)))

# Supervised-learning inputs: the cluster id is the target, and the features
# are all remaining columns except the serial number.
y_labels = data_cleaned['Cluster']
X_features = data_cleaned.drop(columns=['S. No.', 'Cluster'])

# Hold out 30% of the rows as a test set.
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    y_labels,
    test_size=0.3,
    random_state=42,
)

# Re-fit the shared scaler on the training split only, then apply the same
# transform to both splits.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Balance the training classes with SMOTE, using a reduced neighbourhood of 3.
smote = SMOTE(k_neighbors=3, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Hyperparameter grid deliberately reduced to a single combination.
param_grid = dict(
    n_estimators=[100],
    max_depth=[None],
    min_samples_split=[2],
    min_samples_leaf=[1],
)
model = RandomForestClassifier(random_state=42)

# 3-fold cross-validation keeps the search fast; n_jobs=-1 uses all cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train_resampled, y_train_resampled)

# Report the winning parameter combination.
print("Best Parameters from GridSearchCV:", grid_search.best_params_)

# Evaluate the refit best estimator on the held-out, scaled test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)
print("Test Set Accuracy Score:", accuracy_score(y_test, y_pred))
# zero_division=0 states explicitly how to score a class that is never predicted
# (its precision is reported as 0), so the report no longer emits repeated
# UndefinedMetricWarning messages while the reported numbers stay the same.
print(
    "Test Set Classification Report:\n",
    classification_report(y_test, y_pred, zero_division=0),
)

Best Parameters from GridSearchCV: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Test Set Accuracy Score: 0.8727272727272727
Test Set Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.92      0.96        26
           1       0.86      0.55      0.67        11
           2       0.60      1.00      0.75         3
           3       0.89      0.80      0.84        10
           4       0.70      0.88      0.78         8
           5       0.88      0.92      0.90        25
           6       0.90      1.00      0.95         9
           7       0.00      0.00      0.00         1
           8       0.84      0.94      0.89        17

    accuracy                           0.87       110
   macro avg       0.74      0.78      0.75       110
weighted avg       0.87      0.87      0.87       110



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Predict the cluster for one new set of blood-test values.
new_data = pd.DataFrame([
    {
        'RBC': 3,
        'PCV': 32.0,
        'MCV': 45.0,
        'MCH': 30.5,
        'RDW': 15.0,
        'TLC': 7.5,
        'PLT /mm3': 250.0,
        'HGB': 14.0,
        'Age': 40.0,
        'Sex': 1.0,
        'MCHC': 35,
    }
])

# Put the columns in the order used for training before scaling, then classify.
new_data = new_data.loc[:, X_features.columns]
new_data_scaled = scaler.transform(new_data)
predicted_cluster = best_model.predict(new_data_scaled)

# Translate the predicted cluster id into its disease label and report it.
# NOTE(review): the label comes from the cluster -> disease lookup table; confirm
# the wording of this message is appropriate for what the model actually predicts.
predicted_disease = cluster_to_disease[predicted_cluster[0]]
print("You are suffering from:", predicted_disease)


You are suffering from: Thyroid Disorders


In [None]:
# Re-print the held-out test metrics (accuracy and the per-class report).
print("Test Set Accuracy Score:", accuracy_score(y_test, y_pred))
# zero_division=0 states explicitly how to score a class that is never predicted
# (its precision is reported as 0), so the report no longer emits repeated
# UndefinedMetricWarning messages while the reported numbers stay the same.
print(
    "Test Set Classification Report:\n",
    classification_report(y_test, y_pred, zero_division=0),
)

Test Set Accuracy Score: 0.8727272727272727
Test Set Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.92      0.96        26
           1       0.86      0.55      0.67        11
           2       0.60      1.00      0.75         3
           3       0.89      0.80      0.84        10
           4       0.70      0.88      0.78         8
           5       0.88      0.92      0.90        25
           6       0.90      1.00      0.95         9
           7       0.00      0.00      0.00         1
           8       0.84      0.94      0.89        17

    accuracy                           0.87       110
   macro avg       0.74      0.78      0.75       110
weighted avg       0.87      0.87      0.87       110



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Persist the fitted classifier to the working directory.
joblib.dump(value=best_model, filename='blood_model.pkl')
# Persist the scaler that was fitted on the training split; the classifier was
# trained on its output, so the two files belong together.
joblib.dump(value=scaler, filename='scaler.pkl')


['scaler.pkl']

In [None]:
# Download both saved artifacts from the Colab VM. The classifier was trained on
# standardised features, so the model file cannot be used for inference without
# the matching scaler file.
from google.colab import files

for artifact_path in ('blood_model.pkl', 'scaler.pkl'):
    files.download(artifact_path)



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>