
# Hackathon - Block Hats

In [135]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pickle as pkl


In [136]:
path  = r"general_disease_diagnosis.csv"
data = pd.read_csv(path)
data.head()

Unnamed: 0,Patient_Name,Age,Weight_kg,Height_cm,Blood_Pressure_mmHg,Disease
0,Ramesh Patel,10,29,93,102,Kidney Disease
1,Sunita Pandey,12,21,103,152,Hypertension
2,Santosh Kulkarni,11,19,112,154,Thyroid Disorder
3,Swati Verma,32,80,152,95,Tuberculosis
4,Sudha Pandey,30,57,177,95,Hypertension


# Data Preprocessing

In [137]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Patient_Name         1000 non-null   object
 1   Age                  1000 non-null   int64 
 2   Weight_kg            1000 non-null   int64 
 3   Height_cm            1000 non-null   int64 
 4   Blood_Pressure_mmHg  1000 non-null   int64 
 5   Disease              750 non-null    object
dtypes: int64(4), object(2)
memory usage: 47.0+ KB


In [138]:
num_cols = data.select_dtypes(include=np.number).columns
cat_cols = [i for i in data.columns if i not in num_cols]

In [139]:
# Applying Preprocessing on Categorical columns
train = data.iloc[:750]
test = data.iloc[750:]

y_scaler = LabelEncoder()

y_scaler.fit(train['Disease'])
train['Disease'] = y_scaler.transform(train['Disease'])
train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Disease'] = y_scaler.transform(train['Disease'])


Unnamed: 0,Patient_Name,Age,Weight_kg,Height_cm,Blood_Pressure_mmHg,Disease
0,Ramesh Patel,10,29,93,102,6
1,Sunita Pandey,12,21,103,152,5
2,Santosh Kulkarni,11,19,112,154,10
3,Swati Verma,32,80,152,95,11
4,Sudha Pandey,30,57,177,95,5
...,...,...,...,...,...,...
745,Arun Chatterjee,28,84,162,112,3
746,Naveen Pillai,26,51,172,113,9
747,Anita Das,59,56,153,99,3
748,Sanjay Aggarwal,10,15,117,93,6


# Feature Extraction

In [140]:
# train and Test Split
scaler = StandardScaler()
train_x = scaler.fit_transform(train[num_cols])
X, y = train_x, train['Disease']
X_test = scaler.transform(test[num_cols])
X

array([[-1.39281399, -1.99483674, -3.44046262, -1.12673807],
       [-1.31360482, -2.49475406, -2.8833929 ,  1.29281628],
       [-1.3532094 , -2.61973339, -2.38203015,  1.38959845],
       ...,
       [ 0.5478106 , -0.30761579, -0.09804427, -1.27191133],
       [-1.39281399, -2.86969205, -2.10349528, -1.56225785],
       [ 1.7359481 , -0.74504344, -0.76652794,  1.63155389]])

In [141]:
# validation and  training split
split = int(round(X.shape[0]*0.8,0))
X_val = X[split: ]
y_val = y.iloc[split:]


# Model Selection and Training

In [142]:
model_df = {
    'SVM' : SVC(kernel = 'linear'),
    'Random Forest': RandomForestClassifier()}


In [143]:
df = pd.DataFrame()
df['model'] = model_df.keys()
accuracy = []
for i in model_df:
    model = model_df[i]
    
    model.fit(X, y)
    x_pred = model.predict(X)
    accuracy.append(accuracy_score(x_pred, y))
df['accuracy'] = accuracy
df


Unnamed: 0,model,accuracy
0,SVM,0.113333
1,Random Forest,1.0


In [144]:
# Training the best model
model = model_df[df.sort_values(by='accuracy').model[1]]
model.fit(X, y)
x_pred = model.predict(X)
print(f'Training accuracy: {accuracy_score(x_pred, y)}')

Training accuracy: 1.0


In [145]:
# F1 score
print('F1 Score is weighted: ')
score = f1_score(x_pred, y,average='weighted')
print(score)


F1 Score is weighted: 
1.0


In [146]:
# F1 score
print('F1 Score is micro: ', end = ' ')
score = f1_score(x_pred, y,average='micro')
print(score)

# F1 score
print('F1 Score is macro: ', end = ' ')
score = f1_score(x_pred, y,average='macro')
print(score)

F1 Score is micro:  1.0
F1 Score is macro:  1.0


In [147]:
print('Classification Report: ')
print(classification_report(x_pred, y))

Classification Report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        60
           1       1.00      1.00      1.00        65
           2       1.00      1.00      1.00        65
           3       1.00      1.00      1.00        62
           4       1.00      1.00      1.00        58
           5       1.00      1.00      1.00        57
           6       1.00      1.00      1.00        61
           7       1.00      1.00      1.00        72
           8       1.00      1.00      1.00        76
           9       1.00      1.00      1.00        62
          10       1.00      1.00      1.00        61
          11       1.00      1.00      1.00        51

    accuracy                           1.00       750
   macro avg       1.00      1.00      1.00       750
weighted avg       1.00      1.00      1.00       750



In [148]:
val_pred = model.predict(X_val)
print("Validation F1 Score: ")
score = f1_score(val_pred, y_val,average='weighted')
score

Validation F1 Score: 


1.0

In [149]:
# testing 
y_pred = model.predict(X_test)
y_label = y_scaler.inverse_transform(y_pred)
y_label

array(['Hypertension', 'Tuberculosis', 'Malaria', 'Tuberculosis',
       'Respiratory Infection', 'Malaria', 'Liver Disease',
       'Thyroid Disorder', 'Kidney Disease', 'Asthma', 'Kidney Disease',
       'Anemia', 'Dengue', 'Malaria', 'Kidney Disease', 'Dengue',
       'Thyroid Disorder', 'Hypertension', 'Liver Disease', 'Anemia',
       'Kidney Disease', 'Cardiovascular Disease', 'Dengue',
       'Respiratory Infection', 'Diabetes', 'Cardiovascular Disease',
       'Asthma', 'Asthma', 'Malaria', 'Kidney Disease',
       'Respiratory Infection', 'Dengue', 'Asthma', 'Kidney Disease',
       'Diabetes', 'Liver Disease', 'Malaria', 'Thyroid Disorder',
       'Diabetes', 'Malaria', 'Anemia', 'Liver Disease', 'Kidney Disease',
       'Liver Disease', 'Dengue', 'Dengue', 'Anemia', 'Diabetes',
       'Anemia', 'Dengue', 'Hypertension', 'Thyroid Disorder',
       'Tuberculosis', 'Dengue', 'Diabetes', 'Thyroid Disorder', 'Dengue',
       'Malaria', 'Malaria', 'Dengue', 'Malaria', 'Dengue', 'A

In [150]:
# F1 score
test['Disease'] = y_label
test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Disease'] = y_label


Unnamed: 0,Patient_Name,Age,Weight_kg,Height_cm,Blood_Pressure_mmHg,Disease
750,Arjun Iyer,66,53,146,97,Hypertension
751,Seema Bose,16,37,131,102,Tuberculosis
752,Neha Mishra,33,55,153,126,Malaria
753,Sudha Kumar,28,69,167,154,Tuberculosis
754,Geeta Singh,76,48,148,154,Respiratory Infection
...,...,...,...,...,...,...
995,Priya Das,80,73,159,110,Tuberculosis
996,Swati Kohli,63,48,164,111,Malaria
997,Abhinav Sharma,20,74,158,104,Respiratory Infection
998,Sudha Reddy,9,23,95,126,Dengue


In [151]:
# Saving Results

train['Disease'] = y_scaler.inverse_transform(train['Disease'])
dataset = pd.concat([train, test]).reset_index()
dataset.to_csv('Final_submission.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Disease'] = y_scaler.inverse_transform(train['Disease'])


In [152]:
# Saving The model
with open('desease_model.pkl','wb') as f:
    pkl.dump(model,f)

In [153]:
# Saving Preprocessing Models
with open('X_scaler.pkl','wb') as f:
    pkl.dump(scaler,f)

with open('y_scaler.pkl','wb') as f:
    pkl.dump(y_scaler,f)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
import pickle as pkl


path = "general_disease_diagnosis.csv"
data = pd.read_csv(path)
print("Dataset info before handling missing values:")
print(data.info())


num_cols = data.select_dtypes(include=np.number).columns
cat_cols = [i for i in data.columns if i not in num_cols]


num_imputer = SimpleImputer(strategy='median')
data[num_cols] = num_imputer.fit_transform(data[num_cols])


cat_imputer = SimpleImputer(strategy='most_frequent')
data[cat_cols] = cat_imputer.fit_transform(data[cat_cols])

print("Dataset info after handling missing values:")
print(data.info())


train = data.iloc[:750]
test = data.iloc[750:]


y_scaler = LabelEncoder()
train['Disease'] = y_scaler.fit_transform(train['Disease'])


scaler = StandardScaler()
train_x = scaler.fit_transform(train[num_cols])
X, y = train_x, train['Disease']
X_test = scaler.transform(test[num_cols])


split = int(round(X.shape[0] * 0.8, 0))
X_train, X_val = X[:split], X[split:]
y_train, y_val = y.iloc[:split], y.iloc[split:]


model_df = {
    'SVM': SVC(kernel='linear'),
    'Random Forest': RandomForestClassifier()
}


results = []
for name, model in model_df.items():
    model.fit(X_train, y_train)
    val_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, val_pred)
    f1 = f1_score(y_val, val_pred, average='weighted')
    results.append({'model': name, 'accuracy': accuracy, 'f1_score': f1})


results_df = pd.DataFrame(results).sort_values(by='f1_score', ascending=False)
best_model_name = results_df.iloc[0]['model']
best_model = model_df[best_model_name]
print(f"Selected best model: {best_model_name}")


best_model.fit(X, y)
train_pred = best_model.predict(X)
print(f'Training accuracy: {accuracy_score(y, train_pred)}')
print('Classification Report on Training Data:')
print(classification_report(y, train_pred))


val_pred = best_model.predict(X_val)
print("Validation F1 Score:", f1_score(y_val, val_pred, average='weighted'))


test_pred = best_model.predict(X_test)
test_labels = y_scaler.inverse_transform(test_pred)
test['Disease'] = test_labels


train['Disease'] = y_scaler.inverse_transform(train['Disease'])
final_dataset = pd.concat([train, test]).reset_index()
final_dataset.to_csv('Final_submission.csv', index=False)


with open('disease_model.pkl', 'wb') as f:
    pkl.dump(best_model, f)
with open('X_scaler.pkl', 'wb') as f:
    pkl.dump(scaler, f)
with open('y_scaler.pkl', 'wb') as f:
    pkl.dump(y_scaler, f)
with open('num_imputer.pkl', 'wb') as f:
    pkl.dump(num_imputer, f)
with open('cat_imputer.pkl', 'wb') as f:
    pkl.dump(cat_imputer, f)

print("Model and preprocessing artifacts saved successfully.")


Dataset info before handling missing values:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Patient_Name         1000 non-null   object
 1   Age                  1000 non-null   int64 
 2   Weight_kg            1000 non-null   int64 
 3   Height_cm            1000 non-null   int64 
 4   Blood_Pressure_mmHg  1000 non-null   int64 
 5   Disease              750 non-null    object
dtypes: int64(4), object(2)
memory usage: 47.0+ KB
None
Dataset info after handling missing values:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Patient_Name         1000 non-null   object 
 1   Age                  1000 non-null   float64
 2   Weight_kg            1000 non-null   float64
 3 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Disease'] = y_scaler.fit_transform(train['Disease'])


Training accuracy: 1.0
Classification Report on Training Data:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        60
           1       1.00      1.00      1.00        65
           2       1.00      1.00      1.00        65
           3       1.00      1.00      1.00        62
           4       1.00      1.00      1.00        58
           5       1.00      1.00      1.00        57
           6       1.00      1.00      1.00        61
           7       1.00      1.00      1.00        72
           8       1.00      1.00      1.00        76
           9       1.00      1.00      1.00        62
          10       1.00      1.00      1.00        61
          11       1.00      1.00      1.00        51

    accuracy                           1.00       750
   macro avg       1.00      1.00      1.00       750
weighted avg       1.00      1.00      1.00       750

Validation F1 Score: 1.0
Model and preprocessing artifacts saved succe

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Disease'] = test_labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Disease'] = y_scaler.inverse_transform(train['Disease'])


In [3]:
pip install xgboost


Note: you may need to restart the kernel to use updated packages.


In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import pickle as pkl

path = "general_disease_diagnosis.csv"
data = pd.read_csv(path)

num_cols = data.select_dtypes(include=np.number).columns
cat_cols = [i for i in data.columns if i not in num_cols]

imputer = IterativeImputer(max_iter=10, random_state=0)
data[num_cols] = imputer.fit_transform(data[num_cols])

for col in cat_cols:
    data[col].fillna(data[col].mode()[0], inplace=True)

train = data.iloc[:750]
test = data.iloc[750:]

y_scaler = LabelEncoder()
train['Disease'] = y_scaler.fit_transform(train['Disease'])

scaler = StandardScaler()
train_x = scaler.fit_transform(train[num_cols])
X, y = train_x, train['Disease']
X_test = scaler.transform(test[num_cols])

split = int(round(X.shape[0] * 0.8, 0))
X_train, X_val = X[:split], X[split:]
y_train, y_val = y.iloc[:split], y.iloc[split:]

model_df = {
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, max_depth=5, random_state=0),
    'SVM': SVC(kernel='rbf', C=1.0)
}

best_models = {}
for name, model in model_df.items():
    if name == "Random Forest":
        params = {'n_estimators': [100, 200], 'max_depth': [10, 15]}
    elif name == "Gradient Boosting":
        params = {'n_estimators': [100, 200], 'max_depth': [5, 10]}
    elif name == "SVM":
        params = {'C': [0.5, 1.0, 1.5], 'kernel': ['rbf', 'linear']}
    
    grid = GridSearchCV(model, params, scoring='f1_weighted', cv=3)
    grid.fit(X_train, y_train)
    best_models[name] = grid.best_estimator_

results = []
for name, model in best_models.items():
    val_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, val_pred)
    f1 = f1_score(y_val, val_pred, average='weighted')
    results.append({'model': name, 'accuracy': accuracy, 'f1_score': f1})

results_df = pd.DataFrame(results).sort_values(by='f1_score', ascending=False)
best_model_name = results_df.iloc[0]['model']
best_model = best_models[best_model_name]
print(f"Selected best model: {best_model_name}")

best_model.fit(X, y)
train_pred = best_model.predict(X)
print(f'Training accuracy: {accuracy_score(y, train_pred)}')
print('Classification Report on Training Data:')
print(classification_report(y, train_pred))

val_pred = best_model.predict(X_val)
print("Validation F1 Score:", f1_score(y_val, val_pred, average='weighted'))

test_pred = best_model.predict(X_test)
test_labels = y_scaler.inverse_transform(test_pred)
test['Disease'] = test_labels

train['Disease'] = y_scaler.inverse_transform(train['Disease'])
final_dataset = pd.concat([train, test]).reset_index()
final_dataset.to_csv('Final_submission.csv', index=False)

with open('disease_model.pkl', 'wb') as f:
    pkl.dump(best_model, f)
with open('X_scaler.pkl', 'wb') as f:
    pkl.dump(scaler, f)
with open('y_scaler.pkl', 'wb') as f:
    pkl.dump(y_scaler, f)
with open('imputer.pkl', 'wb') as f:
    pkl.dump(imputer, f)

print("Model and preprocessing artifacts saved successfully.")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Disease'] = y_scaler.fit_transform(train['Disease'])


Selected best model: Gradient Boosting
Training accuracy: 1.0
Classification Report on Training Data:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        60
           1       1.00      1.00      1.00        65
           2       1.00      1.00      1.00        65
           3       1.00      1.00      1.00        62
           4       1.00      1.00      1.00        58
           5       1.00      1.00      1.00        57
           6       1.00      1.00      1.00        61
           7       1.00      1.00      1.00        72
           8       1.00      1.00      1.00        76
           9       1.00      1.00      1.00        62
          10       1.00      1.00      1.00        61
          11       1.00      1.00      1.00        51

    accuracy                           1.00       750
   macro avg       1.00      1.00      1.00       750
weighted avg       1.00      1.00      1.00       750

Validation F1 Score: 1.0
Model 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Disease'] = test_labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Disease'] = y_scaler.inverse_transform(train['Disease'])
