In [1]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
cdc_diabetes_health_indicators = fetch_ucirepo(id=891) 
  
# data (as pandas dataframes) 
X = cdc_diabetes_health_indicators.data.features 
y = cdc_diabetes_health_indicators.data.targets 
  
# metadata 
print(cdc_diabetes_health_indicators.metadata) 
  
# variable information 
print(cdc_diabetes_health_indicators.variables) 

{'uci_id': 891, 'name': 'CDC Diabetes Health Indicators', 'repository_url': 'https://archive.ics.uci.edu/dataset/891/cdc+diabetes+health+indicators', 'data_url': 'https://archive.ics.uci.edu/static/public/891/data.csv', 'abstract': 'The Diabetes Health Indicators Dataset contains healthcare statistics and lifestyle survey information about people in general along with their diagnosis of diabetes. The 35 features consist of some demographics, lab test results, and answers to survey questions for each patient. The target variable for classification is whether a patient has diabetes, is pre-diabetic, or healthy. ', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Tabular', 'Multivariate'], 'num_instances': 253680, 'num_features': 21, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Sex', 'Age', 'Education Level', 'Income'], 'target_col': ['Diabetes_binary'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_

In [2]:
import pandas as pd
import numpy as np

### Balancing the Data

In [17]:
data = X.copy(deep=True)
data['Diabetes_binary'] = y

In [18]:
data

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
0,1,1,1,40,1,0,0,0,0,1,...,0,5,18,15,1,0,9,4,3,0
1,0,0,0,25,1,0,0,1,0,0,...,1,3,0,0,0,0,7,6,1,0
2,1,1,1,28,0,0,0,0,1,0,...,1,5,30,30,1,0,9,4,8,0
3,1,0,1,27,0,0,0,1,1,1,...,0,2,0,0,0,0,11,3,6,0
4,1,1,1,24,0,0,0,1,1,1,...,0,2,3,0,0,0,11,5,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,1,1,1,45,0,0,0,0,1,1,...,0,3,0,5,0,1,5,6,7,0
253676,1,1,1,18,0,0,0,0,0,0,...,0,4,0,0,1,0,11,2,4,1
253677,0,0,1,28,0,0,0,1,1,0,...,0,1,0,0,0,0,2,5,2,0
253678,1,0,1,23,0,0,0,0,1,1,...,0,3,0,0,0,1,7,5,1,0


In [23]:
positive = data[data['Diabetes_binary'] == 1]
negative = data[data['Diabetes_binary'] == 0]
print('Positive: ' + str(len(positive)))
print('Negative: ' + str(len(negative)))
print('Total: ' + str(len(data)))

Positive: 35346
Negative: 218334
Total: 253680


In [12]:
35346/0.4

88365.0

In [25]:
88365.0 * 0.6

53019.0

In [29]:
negative_sample = negative.sample(53019).reset_index(drop=True)
negative_sample

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
0,0,0,1,23,0,0,0,1,0,0,...,0,1,0,0,0,0,11,5,7,0
1,1,0,1,19,0,0,0,0,1,1,...,0,3,0,0,0,0,6,6,8,0
2,0,0,1,26,1,0,0,1,1,1,...,0,2,0,0,0,0,1,4,4,0
3,0,1,1,22,0,0,0,1,1,1,...,0,1,0,0,0,1,12,4,2,0
4,0,0,1,22,0,0,0,0,1,1,...,0,1,0,0,0,0,4,6,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53014,0,1,0,31,1,0,0,0,0,1,...,0,2,0,4,0,1,9,4,1,0
53015,0,0,1,26,0,0,0,0,1,1,...,0,2,0,0,0,0,6,5,8,0
53016,0,1,1,27,0,0,0,1,0,0,...,0,2,1,1,0,0,10,6,8,0
53017,0,0,0,47,0,0,0,1,0,1,...,0,2,0,0,0,0,7,6,7,0


In [33]:
df = pd.concat([negative_sample, positive])
df

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
0,0,0,1,23,0,0,0,1,0,0,...,0,1,0,0,0,0,11,5,7,0
1,1,0,1,19,0,0,0,0,1,1,...,0,3,0,0,0,0,6,6,8,0
2,0,0,1,26,1,0,0,1,1,1,...,0,2,0,0,0,0,1,4,4,0
3,0,1,1,22,0,0,0,1,1,1,...,0,1,0,0,0,1,12,4,2,0
4,0,0,1,22,0,0,0,0,1,1,...,0,1,0,0,0,0,4,6,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253659,0,1,1,37,0,0,0,0,0,1,...,0,4,0,0,0,0,6,4,1,1
253668,0,1,1,29,1,0,1,0,1,1,...,0,2,0,0,1,1,10,3,6,1
253670,1,1,1,25,0,0,1,0,1,0,...,0,5,15,0,1,0,13,6,4,1
253676,1,1,1,18,0,0,0,0,0,0,...,0,4,0,0,1,0,11,2,4,1


In [34]:
df.to_csv('Preprocessed_data.csv')