# Programming Skills Practice with Python

## Data Preprocessing

In [1]:
pip install ucimlrepo




In [2]:
from ucimlrepo import fetch_ucirepo

# Fetch the CDC Diabetes Health Indicators dataset (UCI repository id 891).
cdc_diabetes_health_indicators = fetch_ucirepo(id=891)

# Features and targets (provided by the library as pandas DataFrames).
X = cdc_diabetes_health_indicators.data.features
y = cdc_diabetes_health_indicators.data.targets

# Only the last expression of a cell is rendered, so a bare `X.head()` placed
# before this line would be silently discarded. Preview the targets here; the
# features are previewed in the following cell.
y.head()

Unnamed: 0,Diabetes_binary
0,0
1,0
2,0
3,0
4,0


In [3]:
# Preview the first five rows of the feature frame.
X.head(n=5)

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,1,1,1,40,1,0,0,0,0,1,...,1,0,5,18,15,1,0,9,4,3
1,0,0,0,25,1,0,0,1,0,0,...,0,1,3,0,0,0,0,7,6,1
2,1,1,1,28,0,0,0,0,1,0,...,1,1,5,30,30,1,0,9,4,8
3,1,0,1,27,0,0,0,1,1,1,...,1,0,2,0,0,0,0,11,3,6
4,1,1,1,24,0,0,0,1,1,1,...,1,0,2,3,0,0,0,11,5,4


In [4]:
# Print the dataset-level metadata attached to the fetched dataset object
# (id, name, URLs, abstract, task, instance/feature counts, ...).
print(cdc_diabetes_health_indicators.metadata)

{'uci_id': 891, 'name': 'CDC Diabetes Health Indicators', 'repository_url': 'https://archive.ics.uci.edu/dataset/891/cdc+diabetes+health+indicators', 'data_url': 'https://archive.ics.uci.edu/static/public/891/data.csv', 'abstract': 'The Diabetes Health Indicators Dataset contains healthcare statistics and lifestyle survey information about people in general along with their diagnosis of diabetes. The 35 features consist of some demographics, lab test results, and answers to survey questions for each patient. The target variable for classification is whether a patient has diabetes, is pre-diabetic, or healthy. ', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Tabular', 'Multivariate'], 'num_instances': 253680, 'num_features': 21, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Sex', 'Age', 'Education Level', 'Income'], 'target_col': ['Diabetes_binary'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_

In [5]:
# Print the per-variable information table of the fetched dataset
# (one row per column: name, role, type, demographic, ...).
print(cdc_diabetes_health_indicators.variables)

                    name     role     type      demographic  \
0                     ID       ID  Integer             None   
1        Diabetes_binary   Target   Binary             None   
2                 HighBP  Feature   Binary             None   
3               HighChol  Feature   Binary             None   
4              CholCheck  Feature   Binary             None   
5                    BMI  Feature  Integer             None   
6                 Smoker  Feature   Binary             None   
7                 Stroke  Feature   Binary             None   
8   HeartDiseaseorAttack  Feature   Binary             None   
9           PhysActivity  Feature   Binary             None   
10                Fruits  Feature   Binary             None   
11               Veggies  Feature   Binary             None   
12     HvyAlcoholConsump  Feature   Binary             None   
13         AnyHealthcare  Feature   Binary             None   
14           NoDocbcCost  Feature   Binary             

In [6]:
# Split the data into training and test partitions (80 % / 20 %).

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,  # fixed seed so the split is reproducible
)

# Show the head of every partition, each followed by its label and separated
# from the next one by an extra pair of blank lines.
partition_previews = [
    (X_train, 'x training'),
    (X_test, 'x test'),
    (y_train, 'y training'),
    (y_test, 'y test'),
]
for position, (partition, label) in enumerate(partition_previews):
    if position > 0:
        print('\n')
    print(partition.head(), label)



        HighBP  HighChol  CholCheck  BMI  Smoker  Stroke  \
160780       0         0          1   26       1       0   
249661       0         0          1   31       0       0   
26769        0         0          1   24       0       0   
182771       1         0          1   24       1       0   
117116       1         1          1   28       0       0   

        HeartDiseaseorAttack  PhysActivity  Fruits  Veggies  ...  \
160780                     0             1       1        1  ...   
249661                     0             0       1        1  ...   
26769                      0             1       0        1  ...   
182771                     0             0       0        1  ...   
117116                     0             1       0        0  ...   

        AnyHealthcare  NoDocbcCost  GenHlth  MentHlth  PhysHlth  DiffWalk  \
160780              1            0        2         1         0         0   
249661              1            1        3        10         0         0   

# Decision Tree Classifier

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
import time

# Baseline decision tree trained on the 80 % training partition.
# random_state is fixed so repeated runs build the same tree.
clf = DecisionTreeClassifier(random_state=0)

# Time only the fit call. perf_counter() is a monotonic, high-resolution clock
# intended for measuring durations; time.time() is wall-clock time and can
# jump if the system clock is adjusted while the model is training.
start_time = time.perf_counter()
clf.fit(X_train, y_train)
end_time = time.perf_counter()

# Elapsed training time in seconds.
training_time = end_time - start_time

print(f"Training Time: {training_time} seconds")

# Predict labels for the held-out test partition.
predictions = clf.predict(X_test)

# Fraction of test rows whose predicted label matches the true label.
sklearn_accuracy = accuracy_score(y_test, predictions)

print(f"scikit-learn Decision Tree Accuracy: {sklearn_accuracy}")

# Precision for the positive class (scikit-learn's default pos_label=1).
sklearn_precision = precision_score(y_test, predictions)

print(f"scikit-learn Decision Tree Precision: {sklearn_precision}")

# Recall for the positive class (scikit-learn's default pos_label=1).
sklearn_recall = recall_score(y_test, predictions)

print(f"scikit-learn Decision Tree Recall: {sklearn_recall}")

Training Time: 1.4348816871643066 seconds
scikit-learn Decision Tree Accuracy: 0.7956480605487228
scikit-learn Decision Tree Precision: 0.29518619436875565
scikit-learn Decision Tree Recall: 0.31549022327000414


## Vary Training and Test datasets

In [10]:
# Hold out 10 percent of the data set for testing (same seed as the first split).
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)

In [11]:
# Decision tree for the 90 % / 10 % split; same settings as the first classifier.
clf2 = DecisionTreeClassifier(random_state=0)

# Time only the fit call. perf_counter() is a monotonic, high-resolution clock
# intended for measuring durations; time.time() is wall-clock time and can
# jump if the system clock is adjusted while the model is training.
start_time2 = time.perf_counter()
clf2.fit(X_train2, y_train2)
end_time2 = time.perf_counter()

# Elapsed training time in seconds.
training_time2 = end_time2 - start_time2

print(f"Training Time: {training_time2} seconds")

# Predict labels for the 10 % held-out partition.
predictions2 = clf2.predict(X_test2)

# Fraction of test rows whose predicted label matches the true label.
sklearn_accuracy2 = accuracy_score(y_test2, predictions2)

print(f"scikit-learn Decision Tree Accuracy: {sklearn_accuracy2}")

# Precision for the positive class (scikit-learn's default pos_label=1).
sklearn_precision2 = precision_score(y_test2, predictions2)

print(f"scikit-learn Decision Tree Precision: {sklearn_precision2}")

# Recall for the positive class (scikit-learn's default pos_label=1).
sklearn_recall2 = recall_score(y_test2, predictions2)

print(f"scikit-learn Decision Tree Recall: {sklearn_recall2}")

Training Time: 1.643310308456421 seconds
scikit-learn Decision Tree Accuracy: 0.7951356039104384
scikit-learn Decision Tree Precision: 0.28877844624640336
scikit-learn Decision Tree Recall: 0.3082077051926298


In [16]:
# Hold out 25 percent of the data set for testing.
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# NOTE: unlike clf and clf2, this tree is built with criterion='entropy', so it
# differs from the earlier runs in the split criterion as well as the split size.
clf3 = DecisionTreeClassifier(random_state=0, criterion='entropy')

# Train on the 75 % partition, then predict the held-out 25 %.
clf3.fit(X_train3, y_train3)

predictions3 = clf3.predict(X_test3)

In [17]:
# Summarise the 25-percent-split predictions as a confusion matrix
# (scikit-learn layout: rows are true labels, columns are predicted labels).

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=y_test3, y_pred=predictions3)

In [18]:
# Display the confusion matrix computed for the entropy-based tree (clf3).
cm

array([[47912,  6506],
       [ 6183,  2819]], dtype=int64)