In [26]:
import scipy.io
import pandas as pd

def process_mat(mat_file, dataset_key):
    mat = scipy.io.loadmat(mat_file)
    data = mat[dataset_key][0, 0]

    file_paths = [fp[0] for fp in data["full_path"][0]]
    dob = data["dob"][0]
    photo_taken = data["photo_taken"][0]
    gender = data["gender"][0]
    face_score = data["face_score"][0]
    second_face_score = data["second_face_score"][0]

    # Correct age calculation
    age = photo_taken - (dob / 365).astype(int)

    df = pd.DataFrame({
        "file_path": file_paths,
        "dob": dob,
        "photo_taken": photo_taken,
        "age": age,
        "gender": gender,
        "face_score": face_score,
        "second_face_score": second_face_score,
    })

    # Filters
    df = df[
        (df["face_score"] > 0.0) &
        (df["age"].notna()) &
        ((df["second_face_score"].isna()) | (df["second_face_score"] <= 3.0)) &
        (df["age"] >= 0) & (df["age"] <= 100)
    ]

    # Age binning
    age_bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 101]
    age_labels = ["0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80", "80-90", "90-100"]
    df["age_group"] = pd.cut(df["age"], bins=age_bins, labels=age_labels, right=False)

    return df


In [None]:
# Process both datasets
wiki_df = process_mat("wiki_crop/wiki.mat", "wiki")
wiki_df["source"] = "wiki"

imdb_df = process_mat("imdb_crop/imdb.mat", "imdb")
imdb_df["source"] = "imdb"



# Combine and shuffle
combined_df = pd.concat([wiki_df, imdb_df], ignore_index=True)
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)



In [42]:
# print(len(wiki_df))
# print(len(imdb_df))

print(len(combined_df[combined_df['source'] == 'wiki']))
print(len(combined_df[combined_df['source'] == 'imdb']))


print(len(combined_df))

print(combined_df["age_group"].value_counts().sort_index())

43546
332112
375658
age_group
0-10        2589
10-20      26754
20-30     102203
30-40     116531
40-50      70584
50-60      33373
60-70      15699
70-80       6013
80-90       1666
90-100       246
Name: count, dtype: int64


In [43]:
import pandas as pd

# Assume combined_df already exists with 'age_group' column
# If not, load it from your previous cleaning step
# combined_df = pd.read_csv("cleaned_combined_imdb_wiki.csv")

# Define sample limits per age group
sample_limits = {
    "0-10": 2000,
    "10-20": 3000,
    "20-30": 3000,
    "30-40": 3000,
    "40-50": 2000,
    "50-60": 3000,
    "60-70": 2000,
    "70-80": 1500,
    "80-90": 1000,
    "90-100": 250,
}

# Perform sampling
balanced_df = pd.concat([
    group.sample(n=min(sample_limits[str(age_group)], len(group)), random_state=42)
    for age_group, group in combined_df.groupby("age_group")
    if str(age_group) in sample_limits
])

# Reset index
balanced_df.reset_index(drop=True, inplace=True)
print(len(balanced_df))
balanced_df


20746


  for age_group, group in combined_df.groupby("age_group")


Unnamed: 0,file_path,dob,photo_taken,age,gender,face_score,second_face_score,age_group,source
0,84/nm0566084_rm1803328256_1992-9-28_2003.jpg,727835,2003,9,0.0,5.185875,,0-10,imdb
1,53/nm0182853_rm2364267776_1965-6-17_1971.jpg,717870,1971,5,0.0,1.653958,,0-10,imdb
2,76/nm0001576_rm2829682432_1975-12-27_1982.jpg,721715,1982,5,0.0,1.352922,,0-10,imdb
3,88/nm1600688_rm2620758016_1994-9-30_2005.jpg,728567,2005,9,1.0,2.063284,,0-10,imdb
4,43/nm2215143_rm2254935808_1999-11-10_2007.jpg,730434,2007,6,0.0,1.860613,1.501391,0-10,imdb
...,...,...,...,...,...,...,...,...,...
20741,35/nm0926235_rm745521408_1921-10-1_2015.jpg,701905,2015,92,1.0,1.875124,,90-100,imdb
20742,03/317803_1920-03-03_2011.jpg,701328,2011,90,1.0,2.126308,,90-100,wiki
20743,04/11470604_1923-04-28_2014.jpg,702479,2014,90,1.0,2.483511,1.689925,90-100,wiki
20744,91/21901391_1910-03-25_2007.jpg,697697,2007,96,1.0,3.320927,,90-100,wiki


In [44]:
print(len(balanced_df[balanced_df['source'] == 'wiki']))
print(len(balanced_df[balanced_df['source'] == 'imdb']))


print(len(balanced_df))

2932
17814
20746


In [45]:
# Save to CSV
balanced_df.to_csv("balanced_filtered_dataset.csv", index=False)

print("✅ Balanced dataset saved as 'balanced_filtered_dataset.csv'")
print("📊 Final group distribution:")
print(balanced_df["age_group"].value_counts().sort_index())

✅ Balanced dataset saved as 'balanced_filtered_dataset.csv'
📊 Final group distribution:
age_group
0-10      2000
10-20     3000
20-30     3000
30-40     3000
40-50     2000
50-60     3000
60-70     2000
70-80     1500
80-90     1000
90-100     246
Name: count, dtype: int64
