In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.io
import datetime

# Wiki Loading

In [None]:
# Read the Wikipedia-subset metadata from its MATLAB .mat file.
# The path is relative to the notebook's working directory.
wiki_mat = scipy.io.loadmat('Datasets/IMBD-Wiki/wiki/wiki.mat')

# List the top-level entries of the loaded dict to find the variable that holds the metadata
print(wiki_mat.keys())

In [None]:
# Pull the 'wiki' variable out of the loaded file and print its dtype to see
# which named fields it exposes (the next cell indexes it by field name).
wiki = wiki_mat['wiki']
print(wiki.dtype)

In [None]:
# Metadata fields to pull out of the MATLAB struct
fields = ["dob", "photo_taken", "full_path", "gender", "name", "face_score", "second_face_score"]

# Each field sits inside a nested object array; unwrap it and flatten to 1-D
data = {field: wiki[field][0][0].flatten() for field in fields}

# MATLAB serial date numbers count days from year 0000, while Python ordinals
# start at 0001-01-01 (ordinal 1), so the two scales differ by 366 days.
# Without this shift every birth date lands a year too late and every age
# comes out one year too low.
MATLAB_DATENUM_OFFSET = 366

# Values that would map to an ordinal below 1 cannot be represented as a Python
# date and are treated as missing (NaN fails the comparison and is missing too).
data["dob"] = [
    datetime.datetime.fromordinal(int(d) - MATLAB_DATENUM_OFFSET)
    if d >= MATLAB_DATENUM_OFFSET + 1
    else None
    for d in data["dob"]
]

# Age at the time of the photo: photo year minus birth year (missing when dob is missing)
data["age"] = [
    photo - d.year if d is not None else None
    for photo, d in zip(data["photo_taken"], data["dob"])
]

# Names and paths are stored as one-element arrays; unwrap them to plain values
data["name"] = [n[0] if len(n) > 0 else None for n in data["name"]]
data["full_path"] = [p[0] for p in data["full_path"]]

# Assemble the frame; "dob" is deliberately left out once "age" has been derived from it
df_wiki = pd.DataFrame(
    data,
    columns=["name", "age", "photo_taken", "face_score", "second_face_score", "gender", "full_path"],
)

# Keep only rows with a plausible age (strictly between 0 and 100) and a
# positive face score (i.e. a face was detected). Rows with a missing age fail
# the comparisons and are dropped as well.
valid_rows = (
    (df_wiki["age"] > 0)
    & (df_wiki["age"] < 100)
    & (df_wiki["face_score"] > 0)
)

# .copy() makes the result an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning.
df_wiki = df_wiki[valid_rows].copy()


In [None]:
# Preview the first rows of the cleaned wiki frame
df_wiki.head()

In [None]:
# Summarize column dtypes and non-null counts of the cleaned wiki frame
df_wiki.info()

# IMDB Loading

In [None]:
# Read the IMDb-subset metadata from its MATLAB .mat file.
# The path is relative to the notebook's working directory.
imdb_mat = scipy.io.loadmat('Datasets/IMBD-Wiki/imdb/imdb.mat')

# List the top-level entries of the loaded dict to find the variable that holds the metadata
print(imdb_mat.keys())

In [None]:
# Pull the 'imdb' variable out of the loaded file and print its dtype to see
# which named fields it exposes (the next cell indexes it by field name).
imdb = imdb_mat['imdb']
print(imdb.dtype)

In [None]:
# Metadata fields to pull out of the MATLAB struct
fields = ["dob", "photo_taken", "full_path", "gender", "name", "face_score", "second_face_score"]

# Each field sits inside a nested object array; unwrap it and flatten to 1-D
data = {field: imdb[field][0][0].flatten() for field in fields}

# MATLAB serial date numbers count days from year 0000, while Python ordinals
# start at 0001-01-01 (ordinal 1), so the two scales differ by 366 days.
# Without this shift every birth date lands a year too late and every age
# comes out one year too low.
MATLAB_DATENUM_OFFSET = 366

# Values that would map to an ordinal below 1 cannot be represented as a Python
# date and are treated as missing (NaN fails the comparison and is missing too).
data["dob"] = [
    datetime.datetime.fromordinal(int(d) - MATLAB_DATENUM_OFFSET)
    if d >= MATLAB_DATENUM_OFFSET + 1
    else None
    for d in data["dob"]
]

# Age at the time of the photo: photo year minus birth year (missing when dob is missing)
data["age"] = [
    photo - d.year if d is not None else None
    for photo, d in zip(data["photo_taken"], data["dob"])
]

# Names and paths are stored as one-element arrays; unwrap them to plain values
data["name"] = [n[0] if len(n) > 0 else None for n in data["name"]]
data["full_path"] = [p[0] for p in data["full_path"]]

# Assemble the frame; "dob" is deliberately left out once "age" has been derived from it
df_imdb = pd.DataFrame(
    data,
    columns=["name", "age", "photo_taken", "face_score", "second_face_score", "gender", "full_path"],
)

# Keep only rows with a plausible age (strictly between 0 and 100) and a
# positive face score (i.e. a face was detected). Rows with a missing age fail
# the comparisons and are dropped as well.
valid_rows = (
    (df_imdb["age"] > 0)
    & (df_imdb["age"] < 100)
    & (df_imdb["face_score"] > 0)
)

# .copy() makes the result an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning.
df_imdb = df_imdb[valid_rows].copy()


In [None]:
# Preview the first rows of the cleaned IMDb frame
df_imdb.head()

In [None]:
# Summarize column dtypes and non-null counts of the cleaned IMDb frame
df_imdb.info()

# Data Merging

In [None]:
# Sanity check before merging: both frames must expose the same columns in the same order.
# zip() stops at the shorter sequence, so compare the column counts first;
# otherwise extra columns in one frame would go unreported.
if len(df_wiki.columns) != len(df_imdb.columns):
    print(f'Column count mismatch: wiki has {len(df_wiki.columns)}, imdb has {len(df_imdb.columns)}')

# Position-by-position comparison of the column names
for wiki_col, imdb_col in zip(df_wiki.columns, df_imdb.columns):
    if wiki_col != imdb_col:
        print('Column Mismatch')
    else:
        print('Column Match')

In [None]:
# Tag every row with the dataset it came from so it stays traceable after merging.
# assign() returns a new frame instead of writing into df_wiki / df_imdb, which
# are boolean-filtered selections; assigning a column to those in place can
# raise SettingWithCopyWarning on pandas versions without copy-on-write.
df_wiki = df_wiki.assign(source='wiki')
df_imdb = df_imdb.assign(source='imdb')

In [None]:
# Stack both sources row-wise into a single frame with a fresh 0..n-1 index
df = pd.concat(
    objs=[df_wiki, df_imdb],
    axis=0,
    ignore_index=True,
)

In [None]:
# Summarize column dtypes and non-null counts of the merged frame
df.info()

In [None]:
# Histogram (with KDE overlay) of ages across the merged dataset
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    df["age"],
    bins=30,
    kde=True,
    color="royalblue",
    edgecolor="black",
    alpha=0.8,
    ax=ax,
)
ax.set_xlabel("Age", fontsize=14)
ax.set_ylabel("Frequency", fontsize=14)
ax.set_title("Age Distribution", fontsize=16, fontweight="bold")
# Dashed horizontal gridlines make bar heights easier to read
ax.grid(axis='y', linestyle='--', alpha=0.7)

plt.show()


In [None]:
# Number of images per person, keyed by name
image_counts = df["name"].value_counts()

# Names that appear more than once, and the rows that belong to them
repeat_individuals = image_counts.loc[image_counts.gt(1)].index
repeat_df = df.loc[df["name"].isin(repeat_individuals)]

# Youngest and oldest recorded age per repeat individual, plus the span between them
age_diffs = (
    repeat_df
    .groupby("name")["age"]
    .agg(["min", "max"])
    .assign(age_diff=lambda spans: spans["max"] - spans["min"])
)

print("Total repeat individuals:", len(repeat_individuals))
print("Average age difference among repeat faces:", age_diffs["age_diff"].mean())


In [None]:
# Histogram (with KDE overlay) of the per-person age span among repeat individuals
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    age_diffs["age_diff"].dropna(),
    bins=30,
    kde=True,
    color="royalblue",
    edgecolor="black",
    alpha=0.8,
    ax=ax,
)
ax.set_xlabel("Age Difference (Years)", fontsize=14)
ax.set_ylabel("Number of Individuals", fontsize=14)
ax.set_title("Age Difference Among Repeat Faces", fontsize=16, fontweight="bold")
# Dashed horizontal gridlines make bar heights easier to read
ax.grid(axis='y', linestyle='--', alpha=0.7)

plt.show()


In [None]:
# Preview the first rows of the merged frame
df.head()

In [None]:
# Pie chart of the gender split in the merged dataset.
# value_counts() orders categories by frequency (and skips NaN), so the labels
# must be derived from the codes themselves; a fixed ['Male', 'Female'] list is
# only right when males happen to be the majority.
# NOTE(review): mapping assumes the IMDB-WIKI convention 1 = male, 0 = female — confirm.
gender_names = {1: 'Male', 0: 'Female'}
gender_counts = df['gender'].value_counts()
# Unexpected codes fall back to their raw value rather than being mislabeled
gender_labels = [gender_names.get(code, str(code)) for code in gender_counts.index]

plt.figure(figsize=(10, 6))
plt.pie(gender_counts, labels=gender_labels, startangle=90, wedgeprops={"edgecolor": "black"}, autopct='%1.1f%%')
plt.title("Gender Distribution", fontsize=14, fontweight="bold")

plt.show()


In [None]:
# Rows with no missing value in any column; dropna() returns a new frame, so df itself is untouched
df_nonnull = df.dropna()

In [None]:
# Summarize the frame that remains after dropping rows with missing values
df_nonnull.info()

In [None]:
# Histogram (with KDE overlay) of the primary face-detection score in the merged frame
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    df["face_score"],
    bins=30,
    kde=True,
    color="royalblue",
    edgecolor="black",
    alpha=0.8,
    ax=ax,
)
ax.set_xlabel("Face Score", fontsize=14)
ax.set_ylabel("Frequency", fontsize=14)
ax.set_title("Face Score Distribution", fontsize=16, fontweight="bold")
# Dashed horizontal gridlines make bar heights easier to read
ax.grid(axis='y', linestyle='--', alpha=0.7)

plt.show()

In [None]:
# Histogram (with KDE overlay) of the second face-detection score in the merged frame
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    df["second_face_score"],
    bins=30,
    kde=True,
    color="royalblue",
    edgecolor="black",
    alpha=0.8,
    ax=ax,
)
ax.set_xlabel("Second Face Score", fontsize=14)
ax.set_ylabel("Frequency", fontsize=14)
ax.set_title("Second Face Score Distribution", fontsize=16, fontweight="bold")
# Dashed horizontal gridlines make bar heights easier to read
ax.grid(axis='y', linestyle='--', alpha=0.7)

plt.show()