In [1]:
# Import necessary libraries
import os  # For reading the optional dataset-path override from the environment

import numpy as np  # For numerical computations
import pandas as pd  # For handling data in tabular format
from scipy.stats import norm  # For working with normal distribution

# PDF (Probability Density Function) and CDF (Cumulative Distribution Function)
# These functions describe the probability distribution of continuous data
# - PDF: Gives the relative likelihood (density) of each value; for continuous data the
#        probability of a range is the area under the PDF, not the PDF value itself
# - CDF: Gives the cumulative probability of observing a value less than or equal to a given value

In [2]:
# Resolve the dataset location.
# The default is the original absolute path, which only exists on the machine this
# notebook was written on. To run the notebook elsewhere, set the WEIGHT_HEIGHT_CSV
# environment variable to the location of your copy of weight-height.csv instead of
# editing this cell.
DEFAULT_DATA_PATH = "C:/Users/dbda.STUDENTSDC/Music/LabPractice/Notebooks/Datasets/weight-height.csv"
data_path = os.environ.get("WEIGHT_HEIGHT_CSV", DEFAULT_DATA_PATH)

# Load the weight-height data into a DataFrame.
# pd.read_csv raises FileNotFoundError if nothing exists at 'data_path'.
df = pd.read_csv(data_path)

# Preview the first five rows to get an overview of the data structure
print(df.head())

   Unnamed: 0.1  Unnamed: 0 Gender     Height      Weight  Weight_zscore  \
0             0           0   Male  73.847017  241.893563       2.505797   
1             1           1   Male  68.781904  162.310473       0.027101   
2             2           2   Male  74.110105  212.740856       1.597806   
3             3           3   Male  71.730978  220.042470       1.825222   
4             4           4   Male  69.881796  206.349801       1.398750   

   Height_zscore  
0       1.944061  
1       0.627537  
2       2.012443  
3       1.394060  
4       0.913421  


In [3]:
# Work with the 'Height' column on its own
height_column = df['Height']

# Mean: the average height across all rows of the dataset
height_mean = height_column.mean()

# Standard deviation: how widely the heights are spread around that mean
height_std = height_column.std()

# Report both summary statistics, one per line
for label, value in (
    ("Mean Height:", height_mean),
    ("Standard Deviation of Height:", height_std),
):
    print(label, value)

Mean Height: 66.36755975482124
Standard Deviation of Height: 3.8475281207732324


In [4]:
# Height interval of interest: (lower bound, upper bound).
# Later cells evaluate the normal PDF at these bounds and use the CDF to get the
# probability that a height falls inside the interval.
start_height, end_height = 68, 70

In [5]:
# How many height values to generate across the interval
num_points = 2

# np.linspace returns 'num' evenly spaced values from 'start' to 'stop', with both
# endpoints included, so two points yield exactly the lower and upper bounds.
height_values_in_range = np.linspace(
    start=start_height,
    stop=end_height,
    num=num_points,
)

# Show the generated heights
print("Generated height values:", height_values_in_range)

Generated height values: [68. 70.]


In [6]:
# Evaluate the normal Probability Density Function (PDF) at each generated height.
# Arguments to norm.pdf:
# - height_values_in_range: the heights at which the density is evaluated
# - loc:   the mean of the height distribution (center of the normal curve)
# - scale: the standard deviation of heights (spread of the normal curve)
pdf_values_in_range = norm.pdf(
    height_values_in_range,
    loc=height_mean,
    scale=height_std,
)

# Show the density computed for each height in the range
print(pdf_values_in_range)

[0.09476289 0.06640193]


In [7]:
# Print each generated height next to its PDF value.
# zip pairs the two arrays element by element, so no manual index is needed.
for height, density in zip(height_values_in_range, pdf_values_in_range):
    print(height, density)

# Interpretation:
# - 'height' is one of the generated height values
# - 'density' is the normal probability density evaluated at that height
# - A density is a relative likelihood, not a probability: the probability of a
#   height range is the area under the PDF over that range

68.0 0.09476288995625082
70.0 0.06640193233904933


In [8]:
# Probability that a height lies between 'start_height' and 'end_height' under the
# normal model with the dataset's mean and standard deviation.
# The CDF gives the cumulative probability up to a value, so the probability of the
# interval is CDF(end_height) - CDF(start_height).
cdf_at_upper = norm.cdf(end_height, loc=height_mean, scale=height_std)
cdf_at_lower = norm.cdf(start_height, loc=height_mean, scale=height_std)
prob_range_exact = cdf_at_upper - cdf_at_lower

# Show the probability of heights falling in the given range
print(prob_range_exact)

0.1631197592684437
