# Assignment 1: Height Classification using Histograms and Gaussian Models

In [None]:
# Colab-only step: `files.upload()` asks the user to pick local files to
# upload into the runtime; its return value is kept in `uploaded`.
# NOTE(review): the workbook opened later with pd.ExcelFile is presumably one
# of the uploaded files — confirm.
from google.colab import files
uploaded = files.upload()

### Step 2: Load Data

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt

# Open the workbook with a context manager so the underlying file handle is
# released as soon as the sheet has been read (the original left it open).
with pd.ExcelFile('Assignment_1_Data_and_Template.xlsx') as xls:
    print("Available sheets:", xls.sheet_names)

    # Assuming the data is in sheet named 'Data'
    df = pd.read_excel(xls, sheet_name='Data')

# Keep only the two columns used by the classifiers and drop incomplete rows.
df = df[['Gender', 'Height']].dropna()
df.head()

### Step 3: Split by Gender

In [None]:
# Select the Height column for each gender with a single label-based lookup.
is_male = df['Gender'] == 'Male'
is_female = df['Gender'] == 'Female'
male_heights = df.loc[is_male, 'Height']
female_heights = df.loc[is_female, 'Height']

### Step 4: Create Histograms with 32 Bins

In [None]:
# Derive the 32 bin edges from the heights of BOTH genders. Taking the edges
# from the male heights alone (as before) silently drops every female height
# that lies outside the male range, which biases the female likelihoods.
all_heights = np.concatenate([male_heights, female_heights])
bins = np.histogram_bin_edges(all_heights, bins=32)

# Count both genders on the same edges so bin i means the same height
# interval in each histogram.
male_hist, _ = np.histogram(male_heights, bins=bins)
female_hist, _ = np.histogram(female_heights, bins=bins)

print("Male histogram counts:", male_hist.tolist())
print("Female histogram counts:", female_hist.tolist())

### Step 5: Classify Heights using Histogram Counts

In [None]:
def classify_using_histogram(height):
    """Classify a height as "Male" or "Female" from the per-gender histograms.

    The bin of ``bins`` containing ``height`` is looked up, the relative bin
    counts in ``male_hist`` and ``female_hist`` serve as class likelihoods,
    and these are weighted by the class priors (each gender's share of the
    rows in ``df``) to obtain posteriors via Bayes' rule.

    Args:
        height: Scalar height to classify (presumably in the same units as
            the ``Height`` column).

    Returns:
        A ``(label, probability)`` tuple. ``label`` is ``"Male"`` or
        ``"Female"`` (a tie goes to ``"Female"``) and ``probability`` is the
        posterior of the predicted class. ``("Unknown", 0.0)`` is returned
        when neither gender has any weight in the selected bin.
    """
    # np.digitize is 1-based for in-range values and returns len(bins) for a
    # value on (or beyond) the right-most edge, so shift by one and clamp.
    # The upper bound is taken from the histogram itself rather than a
    # hard-coded 31, so the function works for any number of bins.
    last_bin = len(male_hist) - 1
    bin_index = min(max(int(np.digitize(height, bins)) - 1, 0), last_bin)

    # Likelihoods P(bin | gender). An empty histogram would otherwise give
    # 0/0 = NaN and a meaningless "Female"/NaN result, so treat it as 0.
    male_total = male_hist.sum()
    female_total = female_hist.sum()
    p_male = male_hist[bin_index] / male_total if male_total else 0.0
    p_female = female_hist[bin_index] / female_total if female_total else 0.0

    prior_male = len(male_heights) / len(df)
    prior_female = len(female_heights) / len(df)

    numerator_male = p_male * prior_male
    numerator_female = p_female * prior_female
    evidence = numerator_male + numerator_female

    # No sample of either gender landed in this bin: no basis for a decision.
    if evidence == 0:
        return "Unknown", 0.0

    posterior_male = numerator_male / evidence
    posterior_female = 1 - posterior_male

    label = "Male" if posterior_male > posterior_female else "Female"
    return label, max(posterior_male, posterior_female)

# Spot-check the histogram classifier at 55, 60, ..., 80.
print("\nHistogram Classifier:")
for h in range(55, 81, 5):
    result = classify_using_histogram(h)
    label, prob = result
    print(f"Height {h}: Predicted = {label}, Posterior Probability = {prob:.4f}")

### Step 6: Fit Gaussian Models

In [None]:
# Fit one Gaussian per gender: its mean and standard deviation are estimated
# directly from that gender's heights.
male_mu = male_heights.mean()
male_sigma = male_heights.std()
female_mu = female_heights.mean()
female_sigma = female_heights.std()

### Step 7: Classify Heights using Gaussian Model

In [None]:
# Class priors: each gender's share of the rows in the data set.
n_rows = len(df)
prior_male = len(male_heights) / n_rows
prior_female = len(female_heights) / n_rows

print("\nBayesian Gaussian Classifier:")
for h in range(55, 81, 5):
    # Class-conditional densities under the fitted Gaussians.
    p_h_male = norm.pdf(h, loc=male_mu, scale=male_sigma)
    p_h_female = norm.pdf(h, loc=female_mu, scale=female_sigma)

    # Bayes' rule: likelihood times prior, normalised over both classes.
    numerator_male = p_h_male * prior_male
    numerator_female = p_h_female * prior_female
    evidence = numerator_male + numerator_female

    posterior_male = numerator_male / evidence
    posterior_female = 1 - posterior_male

    if posterior_male > posterior_female:
        label = "Male"
    else:
        label = "Female"
    print(f"Height {h}: Predicted = {label}, P(Male|h) = {posterior_male:.4f}, P(Female|h) = {posterior_female:.4f}")

### Step 8: Repeat for First 50 Samples

In [None]:
print("\n--- Using Only First 50 Samples ---")
# Restrict the data to the first 50 rows and split them by gender.
df_50 = df.iloc[:50]
male_heights_50 = df_50[df_50['Gender'] == 'Male']['Height']
female_heights_50 = df_50[df_50['Gender'] == 'Female']['Height']

# Use bin edges spanning BOTH genders. Edges taken from the male heights
# alone would silently drop every female height outside the male range.
bins_50 = np.histogram_bin_edges(
    np.concatenate([male_heights_50, female_heights_50]), bins=32
)
male_hist_50, _ = np.histogram(male_heights_50, bins=bins_50)
female_hist_50, _ = np.histogram(female_heights_50, bins=bins_50)

# Gaussian parameters estimated from the reduced sample.
male_mu_50, male_sigma_50 = male_heights_50.mean(), male_heights_50.std()
female_mu_50, female_sigma_50 = female_heights_50.mean(), female_heights_50.std()

# Class priors: each gender's share of the 50 rows.
prior_male_50 = len(male_heights_50) / len(df_50)
prior_female_50 = len(female_heights_50) / len(df_50)

print("\nBayesian Classifier on 50 Samples:")
for h in range(55, 81, 5):
    # Densities of h under the Gaussians fitted to the first 50 samples.
    p_h_male = norm.pdf(h, loc=male_mu_50, scale=male_sigma_50)
    p_h_female = norm.pdf(h, loc=female_mu_50, scale=female_sigma_50)

    # Unnormalised posteriors (likelihood times prior) and their sum.
    numerator_male = p_h_male * prior_male_50
    numerator_female = p_h_female * prior_female_50
    evidence = numerator_male + numerator_female

    posterior_male = numerator_male / evidence
    posterior_female = 1 - posterior_male

    if posterior_male > posterior_female:
        label = "Male"
    else:
        label = "Female"
    print(f"Height {h}: Predicted = {label}, P(Male|h) = {posterior_male:.4f}, P(Female|h) = {posterior_female:.4f}")

### Observations
- The histogram classifier is sensitive to the bin size and to the number of samples.
- The Gaussian Bayesian classifier provides smoother results that generalize better.
- With fewer data points (the first 50 entries), the Gaussian model remains more stable.

In [None]:
print("\n--- Using Only First 50 Samples ---")
# Restrict the data to the first 50 rows and split them by gender.
df_50 = df.iloc[:50]
male_heights_50 = df_50[df_50['Gender'] == 'Male']['Height']
female_heights_50 = df_50[df_50['Gender'] == 'Female']['Height']

# Use bin edges spanning BOTH genders. Edges taken from the male heights
# alone would silently drop every female height outside the male range.
bins_50 = np.histogram_bin_edges(
    np.concatenate([male_heights_50, female_heights_50]), bins=32
)
male_hist_50, _ = np.histogram(male_heights_50, bins=bins_50)
female_hist_50, _ = np.histogram(female_heights_50, bins=bins_50)

# Gaussian parameters estimated from the reduced sample.
male_mu_50, male_sigma_50 = male_heights_50.mean(), male_heights_50.std()
female_mu_50, female_sigma_50 = female_heights_50.mean(), female_heights_50.std()

# Class priors: each gender's share of the 50 rows. The female prior is
# counted directly (not taken as 1 - prior_male_50) so it matches how the
# priors are computed everywhere else in this notebook and stays correct if
# a row carries any other Gender value.
prior_male_50 = len(male_heights_50) / len(df_50)
prior_female_50 = len(female_heights_50) / len(df_50)

In [None]:
print("\nBayesian Classifier on 50 Samples:")
for h in range(55, 81, 5):
    # Likelihood of h under each gender's 50-sample Gaussian.
    p_h_male = norm.pdf(h, loc=male_mu_50, scale=male_sigma_50)
    p_h_female = norm.pdf(h, loc=female_mu_50, scale=female_sigma_50)

    # Weight by the priors, then normalise to get the posteriors.
    numerator_male = p_h_male * prior_male_50
    numerator_female = p_h_female * prior_female_50
    evidence = numerator_male + numerator_female
    posterior_male = numerator_male / evidence
    posterior_female = 1 - posterior_male

    if posterior_male > posterior_female:
        label = "Male"
    else:
        label = "Female"
    print(f"Height {h}: Predicted = {label}, P(Male|h) = {posterior_male:.4f}, P(Female|h) = {posterior_female:.4f}")

### Observations
- The histogram classifier is sensitive to the bin size and to the number of samples.
- The Gaussian Bayesian classifier provides smoother results that generalize better.
- With fewer data points (the first 50 entries), the Gaussian model remains more stable.