In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import re

In [None]:
# Setup: download the FG-NET dataset and extract it into a folder named
# FGNET_Dataset that sits next to (one level above) this notebook's folder.
# The label index is then read from Index/Train.csv inside that folder.
train_labels = pd.read_csv("../FGNET_Dataset/Index/Train.csv")

# Preview the first rows as the cell's last expression so the notebook renders
# the DataFrame with its rich table display instead of plain printed text.
train_labels.head()

In [None]:
def extract_person_id(filename):
    """Return the leading run of digits that precedes the first underscore.

    The pattern is anchored at the start of ``filename``: it must begin with
    one or more digits immediately followed by ``_``. When it does, only the
    digits are returned (as a string). Otherwise ``filename`` is returned
    unchanged, so callers always get a usable grouping key.
    """
    leading_digits = re.compile(r"(\d+)_").match(filename)
    if leading_digits is None:
        # No "<digits>_" prefix: fall back to the full name.
        return filename
    return leading_digits.group(1)

# Derive a per-row person identifier from the filename, element by element.
train_labels["person_id"] = train_labels["filename"].map(extract_person_id)

In [None]:
# Show the first rows of train_labels as this cell's output.
train_labels.head()

In [None]:
# Collapse the labels to one row per person_id with two summary columns:
#   age_diff   - largest minus smallest value of 'age' for that person
#   file_count - number of non-null 'filename' entries for that person
# reset_index() turns person_id back into a regular column.
grouped = (
    train_labels
    .groupby('person_id')
    .agg(
        age_diff=pd.NamedAgg(column='age', aggfunc=lambda ages: ages.max() - ages.min()),
        file_count=pd.NamedAgg(column='filename', aggfunc='count'),
    )
    .reset_index()
)

In [None]:
# Keep only people with more than one photo. With a single photo the
# max-minus-min age difference carries no information about age spread.
filtered_grouped = grouped[grouped['file_count'] > 1]

# Map each photo count to the mean age difference of the people who have
# exactly that many photos. Grouping on the counts that actually occur
# (rather than scanning a fixed numeric range) guarantees that no photo count
# is silently left out, however many photos a person has. groupby sorts its
# keys, so the dictionary is ordered by ascending photo count.
age_diff_stats = (
    filtered_grouped
    .groupby('file_count')['age_diff']
    .mean()
    .to_dict()
)

# Report one line per photo count.
for n, avg_age_diff in age_diff_stats.items():
    print(f"Number of photos: {n}, Average age difference: {avg_age_diff:.2f}")

In [None]:
# Tabulate the statistics: one row per photo count with its average age
# difference, in the dictionary's iteration order.
age_diff_df = pd.DataFrame(
    list(age_diff_stats.items()),
    columns=['Number of Photos', 'Average Age Difference'],
)

# Line chart (with point markers) of average age difference against the
# number of photos, drawn through the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    age_diff_df['Number of Photos'],
    age_diff_df['Average Age Difference'],
    marker='o',
)
ax.set_xlabel('Number of Photos')
ax.set_ylabel('Average Age Difference')
ax.set_title('Average Age Difference vs. Number of Photos')
ax.grid(True)
plt.show()

In [None]:
# Restrict to rows whose file_count is 2 or more.
at_least_2_photos = filtered_grouped.loc[filtered_grouped['file_count'].ge(2)]

# Mean age difference across all of those people, regardless of photo count.
avg_age_diff_at_least_2_photos = at_least_2_photos['age_diff'].mean()

print(
    f"The average age difference for anyone with at least 2 photos is: {avg_age_diff_at_least_2_photos:.2f}"
)

In [None]:
# Count how many people share each age_diff value, ordered by the age
# difference itself (sort_index) rather than by frequency. This reads from
# `grouped`, so every person_id is counted.
age_diff_counts = grouped['age_diff'].value_counts().sort_index()

# Bar chart: one bar per distinct age difference, height = number of people.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(age_diff_counts.index, age_diff_counts.values)
ax.set_xlabel('Age Difference')
ax.set_ylabel('Number of People')
ax.set_title('Number of People with Each Age Difference')
# Rotate the x tick labels vertically on the current axes.
plt.xticks(rotation=90)
ax.grid(True)
plt.show()