### Import Libraries

In [4]:
import pandas as pd
import numpy as np

### Import Dataset

In [5]:
# Read the CSV at ../data/heart.csv (path is relative to the working
# directory) into `df`, the frame later cells pass to the summary helpers.
df = pd.read_csv(
    filepath_or_buffer='../data/heart.csv',
)

### Summary of Dataset

In [6]:
def create_summary_df(dataframe, num_samples=5, random_state=None):
    """Build a one-row-per-column summary table for ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to summarise.
    num_samples : int, default 5
        How many randomly drawn example values to list per column. Capped at
        the number of rows so short frames do not make ``Series.sample`` raise.
    random_state : optional
        Forwarded to ``Series.sample``. Leave as ``None`` for a fresh random
        draw on every call; pass a seed for reproducible sample values.

    Returns
    -------
    pandas.DataFrame
        Columns: 'Column', 'Missing Values', 'Unique Count', 'Data Type',
        'Min Value', '%25', 'Median', '%75', 'Max Value', 'Mean', 'Std Dev',
        'Top Value', 'Top Value Frequency', 'Sample Values'.
        The min/quantile/max/mean/std entries are filled only for numeric
        dtypes, the two 'Top Value' entries only for object dtypes; every
        other entry is ``None``.
    """
    # Series.sample raises ValueError when asked for more rows than exist.
    sample_size = min(num_samples, len(dataframe))
    summary_data = []

    for column in dataframe.columns:
        series = dataframe[column]
        data_type = series.dtype
        null_count = series.isnull().sum()
        unique_count = series.nunique()

        # Evaluate the dtype checks once per column rather than once per statistic.
        is_numeric = pd.api.types.is_numeric_dtype(data_type)
        is_object = pd.api.types.is_object_dtype(data_type)

        if is_numeric:
            min_value = series.min()
            q25 = series.quantile(0.25)
            q50 = series.quantile(0.50)
            q75 = series.quantile(0.75)
            max_value = series.max()
            mean = series.mean()
            std_dev = series.std()
        else:
            min_value = q25 = q50 = q75 = max_value = mean = std_dev = None

        top_value = top_value_freq = None
        if is_object:
            value_counts = series.value_counts()
            # An object column with no non-null values has no mode; leave both
            # fields as None instead of failing on mode()[0].
            if not value_counts.empty:
                top_value = series.mode()[0]
                top_value_freq = value_counts.max()

        # Sample from the frame that was passed in, not from a notebook-level global.
        sample_values = series.sample(sample_size, random_state=random_state).tolist()

        summary_data.append([column, null_count, unique_count, data_type, min_value, q25, q50, q75,
                             max_value, mean, std_dev, top_value, top_value_freq, sample_values])

    summary_df = pd.DataFrame(summary_data, columns=['Column', 'Missing Values', 'Unique Count', 'Data Type', 'Min Value', '%25', 'Median', '%75',
                                                     'Max Value', 'Mean', 'Std Dev', 'Top Value', 'Top Value Frequency', 'Sample Values'])

    return summary_df

def shape_of_df(dataframe):
    """Print the ``.shape`` of ``dataframe`` on one labelled line; returns None."""
    dimensions = dataframe.shape
    print(f'Shape of the dataset: {dimensions}')

def sum_of_duplicate(dataframe):
    """Print how many rows ``DataFrame.duplicated()`` flags; returns None."""
    duplicate_mask = dataframe.duplicated()
    duplicate_total = duplicate_mask.sum()
    print(f'Sum of duplicated rows: {duplicate_total}')

In [11]:
# Print the two one-line facts about the frame (shape, duplicate-row count).
shape_of_df(df)
sum_of_duplicate(df)

# Build the per-column summary; the bare name on the last line makes the
# notebook render the table as the cell's output.
summary_df = create_summary_df(df)
summary_df

Shape of the dataset: (918, 12)
Sum of duplicated rows: 0


Unnamed: 0,Column,Missing Values,Unique Count,Data Type,Min Value,%25,Median,%75,Max Value,Mean,Std Dev,Top Value,Top Value Frequency,Sample Values
0,Age,0,50,int64,28.0,47.0,54.0,60.0,77.0,53.510893,9.432617,,,"[44, 43, 56, 56, 38]"
1,Sex,0,2,object,,,,,,,,M,725.0,"[F, M, M, M, M]"
2,ChestPainType,0,4,object,,,,,,,,ASY,496.0,"[ASY, ASY, ATA, NAP, ASY]"
3,RestingBP,0,67,int64,0.0,120.0,130.0,140.0,200.0,132.396514,18.514154,,,"[116, 160, 150, 110, 130]"
4,Cholesterol,0,222,int64,0.0,173.25,223.0,267.0,603.0,198.799564,109.384145,,,"[259, 193, 187, 210, 254]"
5,FastingBS,0,2,int64,0.0,0.0,0.0,0.0,1.0,0.233115,0.423046,,,"[0, 0, 0, 0, 0]"
6,RestingECG,0,3,object,,,,,,,,Normal,552.0,"[Normal, LVH, Normal, Normal, Normal]"
7,MaxHR,0,119,int64,60.0,120.0,138.0,156.0,202.0,136.809368,25.460334,,,"[190, 99, 175, 118, 140]"
8,ExerciseAngina,0,2,object,,,,,,,,N,547.0,"[N, N, N, Y, N]"
9,Oldpeak,0,53,float64,-2.6,0.0,0.6,1.5,6.2,0.887364,1.06657,,,"[3.0, 1.4, 1.0, 0.0, 0.0]"
