## Hands-on - Basic Statistics in Python

In [1]:
# Import necessary libraries
import pandas as pd  # for handling tabular datasets
import numpy as np  # for numerical computations
from scipy import stats  # for statistical operations
import matplotlib.pyplot as plt  # for plotting
import seaborn as sns  # for advanced statistical plots

# Load dataset from GitHub URL
file_path = "https://raw.githubusercontent.com/Hamed-Ahmadinia/DASP-2025/refs/heads/main/medical_examination.csv"  # URL link to the dataset stored on GitHub

# Read the dataset into a pandas dataframe
df = pd.read_csv(file_path)  # Load the dataset as a pandas DataFrame

# Display the first few rows of the dataframe to confirm the data has been loaded correctly
print("Dataset Preview:")  # Print a label for context
print(df.head(5))  # Display the first 5 rows of the dataset

Dataset Preview:
   id    age  sex  height  weight  ap_hi  ap_lo  cholesterol  gluc  smoke  \
0   0  18393    2     168    62.0    110     80            1     1      0   
1   1  20228    1     156    85.0    140     90            3     1      0   
2   2  18857    1     165    64.0    130     70            3     1      0   
3   3  17623    2     169    82.0    150    100            1     1      0   
4   4  17474    1     156    56.0    100     60            1     1      0   

   alco  active  cardio  
0     0       1       0  
1     0       1       1  
2     0       0       1  
3     0       1       1  
4     0       0       0  


### **Exercise 1: Display Dataset Information**
**Question:** Use df.info() to display basic information about the dataset.

In [2]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   sex          70000 non-null  int64  
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.9 MB
None


### **Exercise 2: Calculate Minimum and Maximum Values**  
**Question:** Calculate the **minimum** and **maximum** values for the following columns:  

- **"height"**  
- **"weight"**  
- **"ap_hi"** (systolic blood pressure)  
- **"ap_lo"** (diasblood pressure)re)  


In [3]:

# Print min and max of weight
print(f"Minimum weight: {df['weight'].min()}")
print(f"Maximum weight: {df['weight'].max()}")

# Print min and max of height
print(f"Minimum height: {df['height'].min()}")
print(f"Maximum height: {df['height'].max()}")

# Print min and max of systolic blood pressure
print(f"Minimum systolic blood pressure: {df['ap_hi'].min()}")
print(f"Maximum systolic blood pressure: {df['ap_hi'].max()}")

# Print min and max of diastolic blood pressure
print(f"Minimum diastolic blood pressure: {df['ap_lo'].min()}")
print(f"Maximum diastolic blood pressure: {df['ap_lo'].max()}")



Minimum weight: 10.0
Maximum weight: 200.0
Minimum height: 55
Maximum height: 250
Minimum systolic blood pressure: -150
Maximum systolic blood pressure: 16020
Minimum diastolic blood pressure: -70
Maximum diastolic blood pressure: 11000


### **Exercise 3: Compute Measures of Central Tendency**  
**Question:** Calculate the **mean**, **median**, and **mode** for the specified columns:  

- **Mean:** Compute the mean of the **"weight"** column.  
- **Median:** Compute the median of the **"ap_hi"** (systolic blood pressure) column.  
- **Mode:** Compute the mode of the **"gender"** column.  

In [7]:
#Arithemetic mean
#weight
mean_weight = df["weight"].mean()
print(f"Arithmetic Mean of Weight: {mean_weight:.2f} kg")
#systolic blood pressure
mean_ap_hi = (df["ap_hi"][df["ap_hi"] > 0]).mean()
print(f"Arithmetic Mean of Systolic Blood Pressure: {mean_ap_hi:.2f}")
#Gender
mean_gender = df["sex"].mean()
print(f"Arithmetic Mean of Gender: {mean_gender:.2f}")

#Geometric mean
#weight
mean_weight_geometric = stats.gmean(df["weight"].dropna())
print(f"Geometric Mean of Weight: {mean_weight_geometric:.2f} kg")
#systolic blood pressure
mean_ap_hi_geometric = stats.gmean(df["ap_hi"][df["ap_hi"] > 0])
print(f"Geometric Mean of Systolic Blood Pressure: {mean_ap_hi_geometric:.2f}")
#gender
mean_gender_geometric =stats.gmean( df["sex"])
print(f"Geometric Mean of Gender: {mean_gender_geometric:.2f}")

#Harmonic mean
#weight
mean_weight_harmonic = stats.hmean(df["weight"].dropna())
print(f"Harmonic Mean of Weight: {mean_weight_harmonic:.2f} kg")
#systolic blood pressure
mean_ap_hi_harmonic = stats.hmean(df["ap_hi"][df["ap_hi"] > 0])
print(f"Harmonic Mean of Systolic Blood Pressure: {mean_ap_hi_harmonic:.2f}")
#gender
mean_gender_harmonic =stats.hmean( df["sex"])
print(f"Harmonic Mean of Gender: {mean_gender_harmonic:.2f}")

Arithmetic Mean of Weight: 74.21 kg
Arithmetic Mean of Systolic Blood Pressure: 128.84
Arithmetic Mean of Gender: 1.35
Geometric Mean of Weight: 72.90 kg
Geometric Mean of Systolic Blood Pressure: 125.34
Geometric Mean of Gender: 1.27
Harmonic Mean of Weight: 71.64 kg
Harmonic Mean of Systolic Blood Pressure: 121.69
Harmonic Mean of Gender: 1.21


### **Exercise 4: Compute Quantiles and Interquartile Range (IQR)**  
**Question:** Compute the **Q1**, **Q2 (median)**, and **Q3** for the **"cholesterol"** column and calculate the **Interquartile Range (IQR)**. 


In [12]:
q1_cholesterol = df["cholesterol"].quantile(0.25)
q2_cholesterol = np.median(df["cholesterol"])
q3_cholesterol = df["cholesterol"].quantile(0.75)
iqr_cholesterol = q3_cholesterol - q1_cholesterol

print(f"Q1 (25th percentile of cholesterol): {q1_cholesterol:.2f}")
print(f"Q1 (Median of cholesterol): {q2_cholesterol:.2f}")
print(f"Q3 (75th percentile of cholesterol): {q3_cholesterol:.2f}")
print(f"Interquartile Range (IQR) of cholesterol: {iqr_cholesterol:.2f}")

Q1 (25th percentile of cholesterol): 1.00
Q1 (Median of cholesterol): 1.00
Q3 (75th percentile of cholesterol): 2.00
Interquartile Range (IQR) of cholesterol: 1.00


### **Exercise 5: Grouped Analysis by Gender**  
**Question:** Calculate the **mean** of the **"weight"** column grouped by **"gender"**. 


In [14]:
weights = df["sex"]
weighted_mean_weight = np.average(df["weight"], weights=weights)
print(f"Weighted Mean of Weight (by gender): {weighted_mean_weight:.2f} kg")

Weighted Mean of Weight (by gender): 75.00 kg
