In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and source edits are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
from ucimlrepo import fetch_ucirepo, list_available_datasets

import pprint

## List Available Datasets
### **Print a list of datasets that can be imported via `fetch_ucirepo`**

In [3]:
# No arguments: list the datasets that can be imported via fetch_ucirepo.
list_available_datasets()
list_available_datasets(filter='aim-ahead')   # only list datasets for AIM-AHEAD project
# NOTE(review): presumably restricts the listing to names matching 'diabe' — confirm against the ucimlrepo docs.
list_available_datasets(search='diabe')

-------------------------------------
The following datasets are available:
-------------------------------------
Dataset Name                                                                            ID    
------------                                                                            --    
Abalone                                                                                 1     
Adult                                                                                   2     
Annealing                                                                               3     
Audiology (Standardized)                                                                8     
Auto MPG                                                                                9     
Automobile                                                                              10    
Balance Scale                                                                           12    
Balloons                       

### **Fetching a dataset that is not in the list should fail**

In [5]:
# Request a dataset that, per this section's heading, is not expected to be
# importable. Any exception raised is caught and only its message is printed,
# so the notebook keeps running.
try:
    fetch_ucirepo(name='defungi')
    # Other invalid inputs that can be tried here (left disabled):
    # fetch_ucirepo(name='heart diseaseeeee')
    # fetch_ucirepo(id=10000)
except Exception as e:
    print(e)

-------------------------------------
The following datasets are available:
-------------------------------------
Dataset Name                                             ID    
------------                                             --    
Heart Disease                                            45    
Parkinsons Telemonitoring                                189   
Thoracic Surgery Data                                    277   
Diabetes 130-US hospitals for years 1999-2008            296   
Diabetic Retinopathy Debrecen                            329   
HCV data                                                 571   
Myocardial infarction complications                      579   
Glioma Grading Clinical and Mutation Features Dataset    759   
Sepsis Survival Minimal Clinical Records                 827   

"iris" dataset (id=53) exists in the repository, but is not available for import.


## **Import dataset by ID**

In [6]:
# ID 296 is "Diabetes 130-US hospitals for years 1999-2008" in the dataset listing.
diabetes = fetch_ucirepo(id=296)

### **Metadata**

In [7]:
# Pretty-print the dataset-level metadata mapping.
pprint.pp(diabetes.metadata)

{'uci_id': 296,
 'name': 'Diabetes 130-US Hospitals for Years 1999-2008',
 'repository_url': 'https://archive.ics.uci.edu/dataset/296/diabetes+130-us+hospitals+for+years+1999-2008',
 'data_url': 'https://archive.ics.uci.edu/static/public/296/data.csv',
 'abstract': 'The dataset represents ten years (1999-2008) of clinical care at '
             '130 US hospitals and integrated delivery networks. Each row '
             'concerns hospital records of patients diagnosed with diabetes, '
             'who underwent laboratory, medications, and stayed up to 14 days. '
             'The goal is to determine the early readmission of the patient '
             'within 30 days of discharge.\n'
             'The problem is important for the following reasons. Despite '
             'high-quality evidence showing improved clinical outcomes for '
             'diabetic patients who receive various preventive and therapeutic '
             'interventions, many patients do not receive them. This can

In [8]:
# Name(s) of the target column(s), shown as a list.
diabetes.metadata.target_col

['readmitted']

In [13]:
# Top-level metadata fields available for this dataset.
diabetes.metadata.keys()

dict_keys(['uci_id', 'name', 'repository_url', 'data_url', 'abstract', 'area', 'tasks', 'characteristics', 'num_instances', 'num_features', 'feature_types', 'demographics', 'target_col', 'index_col', 'has_missing_values', 'missing_values_symbol', 'year_of_dataset_creation', 'last_updated', 'dataset_doi', 'creators', 'intro_paper', 'additional_info'])

In [14]:
# Nested metadata entries are reached with attribute access too, e.g. the introductory paper's title.
diabetes.metadata.intro_paper.title

'Impact of HbA1c Measurement on Hospital Readmission Rates: Analysis of 70,000 Clinical Database Patient Record'

In [15]:
# Free-text summary stored under the metadata's additional_info entry.
diabetes.metadata.additional_info.summary

'The dataset represents ten years (1999-2008) of clinical care at 130 US hospitals and integrated delivery networks. It includes over 50 features representing patient and hospital outcomes. Information was extracted from the database for encounters that satisfied the following criteria.\n(1)\tIt is an inpatient encounter (a hospital admission).\n(2)\tIt is a diabetic encounter, that is, one during which any kind of diabetes was entered into the system as a diagnosis.\n(3)\tThe length of stay was at least 1 day and at most 14 days.\n(4)\tLaboratory tests were performed during the encounter.\n(5)\tMedications were administered during the encounter.\n\nThe data contains such attributes as patient number, race, gender, age, admission type, time in hospital, medical specialty of admitting physician, number of lab tests performed, HbA1c test result, diagnosis, number of medications, diabetic medications, number of outpatient, inpatient, and emergency visits in the year before the hospitaliza

### **Data**
**IDs, features, and targets are included as separate dataframes, along with an "original" that combines all of them.**

In [16]:
# Feature columns only; ID and target columns are exposed as separate dataframes.
diabetes.data.features

Unnamed: 0,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,...,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed
0,Caucasian,Female,[0-10),,6,25,1,1,,Pediatrics-Endocrinology,...,No,No,No,No,No,No,No,No,No,No
1,Caucasian,Female,[10-20),,1,1,7,3,,,...,No,No,Up,No,No,No,No,No,Ch,Yes
2,AfricanAmerican,Female,[20-30),,1,1,7,2,,,...,No,No,No,No,No,No,No,No,No,Yes
3,Caucasian,Male,[30-40),,1,1,7,2,,,...,No,No,Up,No,No,No,No,No,Ch,Yes
4,Caucasian,Male,[40-50),,1,1,7,1,,,...,No,No,Steady,No,No,No,No,No,Ch,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,AfricanAmerican,Male,[70-80),,1,3,7,3,MC,,...,No,No,Down,No,No,No,No,No,Ch,Yes
101762,AfricanAmerican,Female,[80-90),,1,4,5,5,MC,,...,No,No,Steady,No,No,No,No,No,No,Yes
101763,Caucasian,Male,[70-80),,1,1,7,1,MC,,...,No,No,Down,No,No,No,No,No,Ch,Yes
101764,Caucasian,Female,[80-90),,2,3,7,10,MC,Surgery-General,...,No,No,Up,No,No,No,No,No,Ch,Yes


In [None]:
# Target column(s) as their own dataframe.
diabetes.data.targets

Unnamed: 0,readmitted
0,NO
1,>30
2,NO
3,NO
4,NO
...,...
101761,>30
101762,NO
101763,NO
101764,NO


In [18]:
diabetes.data.ids   # ID columns of this dataset (encounter_id, patient_nbr)

Unnamed: 0,encounter_id,patient_nbr
0,2278392,8222157
1,149190,55629189
2,64410,86047875
3,500364,82442376
4,16680,42519267
...,...,...
101761,443847548,100162476
101762,443847782,74694222
101763,443854148,41088789
101764,443857166,31693671


In [19]:
# The combined dataframe: IDs, features, and targets together.
diabetes.data.original

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,443847548,100162476,AfricanAmerican,Male,[70-80),,1,3,7,3,...,No,Down,No,No,No,No,No,Ch,Yes,>30
101762,443847782,74694222,AfricanAmerican,Female,[80-90),,1,4,5,5,...,No,Steady,No,No,No,No,No,No,Yes,NO
101763,443854148,41088789,Caucasian,Male,[70-80),,1,1,7,1,...,No,Down,No,No,No,No,No,Ch,Yes,NO
101764,443857166,31693671,Caucasian,Female,[80-90),,2,3,7,10,...,No,Up,No,No,No,No,No,Ch,Yes,NO


In [20]:
# Column names across IDs, features, and targets.
diabetes.data.headers

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

### **Variable Info**

**Displayed in a dataframe format**

In [21]:
# One row per variable: name, role, type, demographic, description, units, missing_values.
diabetes.variables

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,encounter_id,ID,,,Unique identifier of an encounter,,no
1,patient_nbr,ID,,,Unique identifier of a patient,,no
2,race,Feature,Categorical,Race,"Values: Caucasian, Asian, African American, Hi...",,yes
3,gender,Feature,Categorical,Gender,"Values: male, female, and unknown/invalid",,no
4,age,Feature,Categorical,Age,"Grouped in 10-year intervals: [0, 10), [10, 20...",,no
5,weight,Feature,Categorical,,Weight in pounds.,,yes
6,admission_type_id,Feature,Categorical,,Integer identifier corresponding to 9 distinct...,,no
7,discharge_disposition_id,Feature,Categorical,,Integer identifier corresponding to 29 distinc...,,no
8,admission_source_id,Feature,Categorical,,Integer identifier corresponding to 21 distinc...,,no
9,time_in_hospital,Feature,Integer,,Integer number of days between admission and d...,,no


## **Import dataset by name**

In [27]:
# Look the dataset up by name instead of by ID.
# NOTE(review): 'glioma' looks like a partial match for "Glioma Grading Clinical and
# Mutation Features Dataset" (ID 759 in the listing) — confirm how names are matched.
glioma = fetch_ucirepo(name='glioma')
glioma.data.features

Unnamed: 0,Gender,Age_at_diagnosis,Race,IDH1,TP53,ATRX,PTEN,EGFR,CIC,MUC16,...,FUBP1,RB1,NOTCH1,BCOR,CSMD3,SMARCA4,GRIN2A,IDH2,FAT4,PDGFRA
0,0,51.30,white,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,38.72,white,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,35.17,white,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,32.78,white,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,0,31.51,white,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
834,1,77.89,white,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
835,0,85.18,white,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
836,1,77.49,white,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
837,0,63.33,white,0,1,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0


In [17]:
from ucimlrepo import fetch_ucirepo

# Fetch dataset 189. Its metadata identifies it as "Parkinsons Telemonitoring",
# so the handle is named after that dataset rather than `iris`.
parkinsons = fetch_ucirepo(id=189)
iris = parkinsons  # backward-compatible alias in case other cells still use the old name

# data (as pandas dataframes)
X = parkinsons.data.features
y = parkinsons.data.targets

# metadata
print(parkinsons.metadata)

# variable information
print(parkinsons.variables)


{'uci_id': 189, 'name': 'Parkinsons Telemonitoring', 'repository_url': 'https://archive.ics.uci.edu/dataset/189/parkinsons+telemonitoring', 'data_url': 'https://archive.ics.uci.edu/static/public/189/data.csv', 'abstract': "Oxford Parkinson's Disease Telemonitoring Dataset", 'area': 'Life', 'tasks': ['Regression'], 'characteristics': ['Multivariate'], 'num_instances': 5875, 'num_features': 26, 'attribute_types': ['Integer', 'Real'], 'target_col': ['motor_UPDRS', 'total_UPDRS'], 'index_col': ['subject#'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2009, 'last_updated': 'Thu Aug 03 2023', 'dataset_doi': '10.24432/C5ZS3N', 'creators': ['Athanasios Tsanas', 'Max Little'], 'intro_paper': {'title': "Accurate Telemonitoring of Parkinson's Disease Progression by Noninvasive Speech Tests", 'authors': 'A. Tsanas, Max A. Little, P. McSharry, L. Ramig', 'published_in': 'IEEE Transactions on Biomedical Engineering', 'year': 2010, 'url': 'https://www.semant

In [18]:
# Feature dataframe of the dataset fetched with id=189 in the previous cell.
X

Unnamed: 0,age,test_time,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,Jitter:DDP,Shimmer,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE,sex
0,72,5.6431,0.00662,0.000034,0.00401,0.00317,0.01204,0.02565,0.230,0.01438,0.01309,0.01662,0.04314,0.014290,21.640,0.41888,0.54842,0.16006,0
1,72,12.6660,0.00300,0.000017,0.00132,0.00150,0.00395,0.02024,0.179,0.00994,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.10810,0
2,72,19.6810,0.00481,0.000025,0.00205,0.00208,0.00616,0.01675,0.181,0.00734,0.00844,0.01458,0.02202,0.020220,23.047,0.46222,0.54405,0.21014,0
3,72,25.6470,0.00528,0.000027,0.00191,0.00264,0.00573,0.02309,0.327,0.01106,0.01265,0.01963,0.03317,0.027837,24.445,0.48730,0.57794,0.33277,0
4,72,33.6420,0.00335,0.000020,0.00093,0.00130,0.00278,0.01703,0.176,0.00679,0.00929,0.01819,0.02036,0.011625,26.126,0.47188,0.56122,0.19361,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5870,61,142.7900,0.00406,0.000031,0.00167,0.00168,0.00500,0.01896,0.160,0.00973,0.01133,0.01549,0.02920,0.025137,22.369,0.64215,0.55314,0.21367,0
5871,61,149.8400,0.00297,0.000025,0.00119,0.00147,0.00358,0.02315,0.215,0.01052,0.01277,0.01904,0.03157,0.011927,22.886,0.52598,0.56518,0.12621,0
5872,61,156.8200,0.00349,0.000025,0.00152,0.00187,0.00456,0.02499,0.244,0.01371,0.01456,0.01877,0.04112,0.017701,25.065,0.47792,0.57888,0.14157,0
5873,61,163.7300,0.00281,0.000020,0.00128,0.00151,0.00383,0.01484,0.131,0.00693,0.00870,0.01307,0.02078,0.007984,24.422,0.56865,0.56327,0.14204,0
