In [2]:
import math
import statistics
import numpy as np
import scipy.stats
import matplotlib.pyplot as plt
import pandas as pd
from ucimlrepo import fetch_ucirepo 


In [3]:
# Fetch dataset 579 via ucimlrepo's fetch_ucirepo (imported at the top of the notebook).
# NOTE: despite its name, `df` is not a DataFrame: the keys printed below are
# 'data', 'metadata' and 'variables'.
df = fetch_ucirepo(id=579) 
print(f"df.keys(): {df.keys()}")


df.keys(): dict_keys(['data', 'metadata', 'variables'])


In [4]:
# Show what the fetched object and each of its three parts expose.
# A single print call emits one report line per container.
print(
    f"Keys of df: {df.keys()}",
    f"Keys of df.data: {df.data.keys()}",
    f"Keys of df.metadata: {df.metadata.keys()}",
    f"Keys of df.variables: {df.variables.keys()}",
    sep="\n",
)

Keys of df: dict_keys(['data', 'metadata', 'variables'])
Keys of df.data: dict_keys(['ids', 'features', 'targets', 'original', 'headers'])
Keys of df.metadata: dict_keys(['uci_id', 'name', 'repository_url', 'data_url', 'abstract', 'area', 'tasks', 'characteristics', 'num_instances', 'num_features', 'feature_types', 'demographics', 'target_col', 'index_col', 'has_missing_values', 'missing_values_symbol', 'year_of_dataset_creation', 'last_updated', 'dataset_doi', 'creators', 'intro_paper', 'additional_info'])
Keys of df.variables: Index(['name', 'role', 'type', 'demographic', 'description', 'units',
       'missing_values'],
      dtype='object')


In [5]:
# Pull the pieces of the fetched result into module-level names:
#   data_features / data_targets  -> feature matrix (X) and target variable(s) (y)
#   data_metadata / data_variables -> dataset metadata and the per-variable description table
data_features, data_targets = df.data.features, df.data.targets
data_metadata, data_variables = df.metadata, df.variables

# Place features and targets side by side in one table.
data = pd.concat((data_features, data_targets), axis="columns")

In [6]:
def _names_with_type(type_label):
    """Return the 'name' values of data_variables rows whose 'type' equals type_label."""
    type_matches = data_variables['type'] == type_label
    return data_variables.loc[type_matches, 'name'].tolist()


# Variable names grouped by the type declared in the variables table.
categorical_cols = _names_with_type('Categorical')
integer_cols = _names_with_type('Integer')
continuous_cols = _names_with_type('Continuous')

In [7]:
# Preview the first 20 rows of the variable-description table.
data_variables.iloc[:20]

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,ID,ID,Integer,,Record ID (ID): Unique identifier. Cannot be r...,,no
1,AGE,Feature,Integer,Age,Age of patient.,,no
2,SEX,Feature,Binary,Sex,"0: female, 1: male",,no
3,INF_ANAM,Feature,Categorical,,Quantity of myocardial infarctions in the anam...,,yes
4,STENOK_AN,Feature,Categorical,,Exertional angina pectoris in the anamnesis. \...,,yes
5,FK_STENOK,Feature,Categorical,,Functional class (FC) of angina pectoris in th...,,yes
6,IBS_POST,Feature,Categorical,,"Coronary heart disease (CHD) in recent weeks, ...",,yes
7,IBS_NASL,Feature,Binary,,Heredity on CHD\n\n0: isn't burdened\n\n1: bur...,,yes
8,GB,Feature,Categorical,,Presence of an essential hypertension \n\n0: t...,,yes
9,SIM_GIPERT,Feature,Binary,,Symptomatic hypertension,,yes


In [9]:
# Display the combined feature + target table built with pd.concat earlier in the notebook;
# the rendered output elides the middle rows and columns.
data

Unnamed: 0,AGE,SEX,INF_ANAM,STENOK_AN,FK_STENOK,IBS_POST,IBS_NASL,GB,SIM_GIPERT,DLIT_AG,...,JELUD_TAH,FIBR_JELUD,A_V_BLOK,OTEK_LANC,RAZRIV,DRESSLER,ZSN,REC_IM,P_IM_STEN,LET_IS
0,77.0,1,2.0,1.0,1.0,2.0,,3.0,0.0,7.0,...,0,0,0,0,0,0,0,0,0,0
1,55.0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,52.0,1,0.0,0.0,0.0,2.0,,2.0,0.0,2.0,...,0,0,0,0,0,0,0,0,0,0
3,68.0,0,0.0,0.0,0.0,2.0,,2.0,0.0,3.0,...,0,0,0,0,0,0,1,0,0,0
4,60.0,1,0.0,0.0,0.0,2.0,,3.0,0.0,7.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1695,77.0,0,0.0,4.0,2.0,1.0,,2.0,0.0,7.0,...,0,0,1,0,1,0,0,0,0,3
1696,70.0,0,0.0,6.0,2.0,1.0,,2.0,0.0,7.0,...,0,0,0,0,0,0,0,0,0,1
1697,55.0,1,3.0,6.0,2.0,2.0,,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,6
1698,79.0,0,2.0,2.0,2.0,1.0,,2.0,0.0,7.0,...,0,0,0,1,0,0,0,0,0,1


In [None]:
# Print the dtype of every column in the combined table
# (the printed Series is elided in the middle; it has 123 entries).
print(data.dtypes)

AGE          float64
SEX            int64
INF_ANAM     float64
STENOK_AN    float64
FK_STENOK    float64
              ...   
DRESSLER       int64
ZSN            int64
REC_IM         int64
P_IM_STEN      int64
LET_IS         int64
Length: 123, dtype: object


## Descriptive Stats

In [None]:
# Summary statistics for the combined table, using DataFrame.describe() with its defaults.
data.describe()


In [None]:
# Per-column dtypes of the combined table, shown as the cell's value rather than via print().
data.dtypes

In [None]:
# Data types and missing values.
# NOTE: `df` is the object returned by fetch_ucirepo (its keys are 'data', 'metadata'
# and 'variables'), not a DataFrame, so these DataFrame methods are called on `data`,
# the combined feature + target table.
# info() writes its report to stdout and returns None, so it is not wrapped in print().
# verbose=True requests the per-column listing, which pandas otherwise collapses to a
# summary for frames wider than display.max_info_columns (this table has 123 columns).
data.info(verbose=True)  # Check column types
print(data.isnull().sum())  # Count missing values per column
print(data.dtypes)  # Check data types of each column

In [None]:
## Distribution of numeric values
# Operates on `data` (the combined DataFrame); `df` is the fetch_ucirepo result object,
# not a DataFrame. np.number is the selector the pandas documentation recommends for
# "all numeric columns". The frame is left as the cell's last expression so the
# notebook renders it as a table instead of plain text.
data.describe(include=[np.number])

In [None]:
# Summary (count / unique / top / freq) of the categorical features.
# `df` is the fetch_ucirepo result object, not a DataFrame, so the summary is taken from `data`.
# The dtypes shown earlier are all float64/int64, so selecting object-dtype columns would
# likely match nothing; instead, use the columns that the variables table declares as
# 'Categorical' (categorical_cols) and view them as pandas categoricals.
# Only names actually present in `data` are kept, since the variables table also lists
# columns that are not part of the feature/target frame (e.g. the ID row).
categorical_in_data = [col for col in categorical_cols if col in data.columns]
data[categorical_in_data].astype("category").describe()


## asdf()
