In [None]:
import os
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import ImageDataGenerator


In [None]:
!pip install opencv-python-headless  # Headless for Colab (no GUI)
import os
import zipfile
import cv2
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split



In [None]:
import os
import cv2
import pandas as pd
from google.colab import drive
drive.mount('/content/drive', force_remount=True)  # Remount for fresh access

base_dir = '/content/drive/MyDrive/Smilage_Project_Data'
utkface_img_dir = os.path.join(base_dir, 'UTKFace/UTKFace')
celeba_img_dir = os.path.join(base_dir, 'celeba/img_align_celeba')
celeba_anno_path = os.path.join(base_dir, 'celeba/Anno/list_attr_celeba.txt')

# Verify files
print(f"UTKFace images: {len([f for f in os.listdir(utkface_img_dir) if f.endswith('.jpg')]) if os.path.exists(utkface_img_dir) else 'Path issue'}")
print(f"CelebA images: {len([f for f in os.listdir(celeba_img_dir) if f.endswith('.jpg')]) if os.path.exists(celeba_img_dir) else 'Path issue'}")
print(f"Annotations exist: {os.path.exists(celeba_anno_path)}")
print("Sample CelebA files:", os.listdir(celeba_img_dir)[:5] if os.path.exists(celeba_img_dir) else "No files")

# Check resolutions (sample 100 images each)
def get_image_resolution(img_path):
    img = cv2.imread(img_path)
    if img is None:
        return None
    return (img.shape[1], img.shape[0])  # Width, Height

print("\nChecking UTKFace resolutions...")
utkface_resolutions = []
if os.path.exists(utkface_img_dir):
    utkface_files = [f for f in os.listdir(utkface_img_dir) if f.endswith('.jpg')][:100]
    for f in utkface_files:
        res = get_image_resolution(os.path.join(utkface_img_dir, f))
        if res:
            utkface_resolutions.append(res)
    unique_utkface_res = set(utkface_resolutions)
    print(f"UTKFace sample size: {len(utkface_resolutions)}")
    print(f"Unique resolutions: {unique_utkface_res}")
    print(f"Resolution counts: {pd.Series(utkface_resolutions).value_counts().to_dict()}")

print("\nChecking CelebA resolutions...")
celeba_resolutions = []
if os.path.exists(celeba_img_dir):
    celeba_files = [f for f in os.listdir(celeba_img_dir) if f.endswith('.jpg')][:100]
    for f in celeba_files:
        res = get_image_resolution(os.path.join(celeba_img_dir, f))
        if res:
            celeba_resolutions.append(res)
    unique_celeba_res = set(celeba_resolutions)
    print(f"CelebA sample size: {len(celeba_resolutions)}")
    print(f"Unique resolutions: {unique_celeba_res}")
    print(f"Resolution counts: {pd.Series(celeba_resolutions).value_counts().to_dict()}")

# Check Drive usage
!df -h /content/drive

Mounted at /content/drive
UTKFace images: 23708
CelebA images: 10002
Annotations exist: True
Sample CelebA files: ['009080.jpg', '009008.jpg', '009086.jpg', '009065.jpg', '009018.jpg']

Checking UTKFace resolutions...
UTKFace sample size: 100
Unique resolutions: {(200, 200)}
Resolution counts: {(200, 200): 100}

Checking CelebA resolutions...
CelebA sample size: 100
Unique resolutions: {(178, 218)}
Resolution counts: {(178, 218): 100}
Filesystem      Size  Used Avail Use% Mounted on
drive            15G  9.2G  5.9G  61% /content/drive


In [None]:
import pandas as pd

base_dir = '/content/drive/MyDrive/Smilage_Project_Data'
utkface_img_dir = os.path.join(base_dir, 'UTKFace/UTKFace')
celeba_img_dir = os.path.join(base_dir, 'celeba/img_align_celeba')
celeba_anno_path = os.path.join(base_dir, 'celeba/Anno/list_attr_celeba.txt')

# UTKFace
utkface_data = []
for filename in os.listdir(utkface_img_dir):
    if filename.endswith('.jpg'):
        parts = filename.split('_')
        if len(parts) >= 4:
            try:
                age = int(parts[0])
                img_path = os.path.join(utkface_img_dir, filename)
                utkface_data.append({'path': img_path, 'age': age})
            except ValueError:
                pass
utkface_df = pd.DataFrame(utkface_data)
print(f"UTKFace loaded: {len(utkface_df)} images, age range: {utkface_df['age'].min()} to {utkface_df['age'].max()}")

# CelebA
try:
    celeba_df = pd.read_csv(celeba_anno_path)  # Handles CSV format
    celeba_df['smile'] = celeba_df['Smiling'].replace({-1: 0, 1: 1})
    celeba_df['filename'] = celeba_df.iloc[:, 0]  # First column is image filename
    celeba_df['path'] = [os.path.join(celeba_img_dir, fn) for fn in celeba_df['filename']]
    extracted_files = set(os.listdir(celeba_img_dir))
    celeba_sample = celeba_df[celeba_df['filename'].isin(extracted_files)].copy()
    print(f"CelebA loaded: {len(celeba_sample)} images, smile distribution: {celeba_sample['smile'].value_counts().to_dict()}")
except FileNotFoundError:
    print(f"Annotation file not found at {celeba_anno_path}")
    raise
except pd.errors.ParserError:
    print("Error parsing CelebA annotations. Checking file:")
    !head -n 5 "{celeba_anno_path}"
    raise

UTKFace loaded: 23705 images, age range: 1 to 116
CelebA loaded: 10002 images, smile distribution: {0: 5179, 1: 4823}


In [None]:
import os
import cv2
import pandas as pd
from google.colab import drive
from multiprocessing import Pool # Import Pool for multiprocessing
drive.mount('/content/drive', force_remount=True)

base_dir = '/content/drive/MyDrive/Smilage_Project_Data'
utkface_img_dir = os.path.join(base_dir, 'UTKFace/UTKFace')
celeba_img_dir = os.path.join(base_dir, 'celeba/img_align_celeba')

# Load face cascade (uses GPU-accelerated DNN if available)
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

def is_valid_image(img_path, blur_threshold=50):
    img = cv2.imread(img_path)
    if img is None:
        return False
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(gray, 1.1, 5, minSize=(30, 30))
    if len(faces) != 1:
        return False
    laplacian_var = cv2.Laplacian(gray, cv2.CV_64F).var()
    return laplacian_var >= blur_threshold

# Get all image paths
utkface_paths = [os.path.join(utkface_img_dir, f) for f in os.listdir(utkface_img_dir) if f.endswith('.jpg')]
celeba_paths = [os.path.join(celeba_img_dir, f) for f in os.listdir(celeba_img_dir) if f.endswith('.jpg')]

# Clean UTKFace using multiprocessing
print("Cleaning UTKFace...")
utkface_clean_data = []
with Pool(processes=4) as pool: # Use a pool of processes
    valid_utkface_paths = pool.map(is_valid_image, utkface_paths)

for i, is_valid in enumerate(valid_utkface_paths):
    if is_valid:
        path = utkface_paths[i]
        filename = os.path.basename(path)
        parts = filename.split('_')
        if len(parts) >= 4:
            try:
                age = int(parts[0])
                utkface_clean_data.append({'path': path, 'age': age})
            except ValueError:
                continue
utkface_clean_df = pd.DataFrame(utkface_clean_data)
print(f"UTKFace cleaned: {len(utkface_clean_df)} images")

# Clean CelebA using multiprocessing (load labels first for smile)
print("Cleaning CelebA...")
celeba_anno_path = os.path.join(base_dir, 'celeba/Anno/list_attr_celeba.txt')
celeba_df_labels = pd.read_csv(celeba_anno_path) # Load labels separately
celeba_df_labels['smile'] = celeba_df_labels['Smiling'].replace({-1: 0, 1: 1})
celeba_df_labels['filename'] = celeba_df_labels.iloc[:, 0]

celeba_clean_data = []
with Pool(processes=4) as pool: # Use a pool of processes
    valid_celeba_paths = pool.map(is_valid_image, celeba_paths)

for i, is_valid in enumerate(valid_celeba_paths):
    if is_valid:
        path = celeba_paths[i]
        filename = os.path.basename(path)
        if filename in celeba_df_labels['filename'].values:
            smile = celeba_df_labels.loc[celeba_df_labels['filename'] == filename, 'smile'].iloc[0]
            celeba_clean_data.append({'path': path, 'smile': smile})
celeba_clean_df = pd.DataFrame(celeba_clean_data)
print(f"CelebA cleaned: {len(celeba_clean_df)} images")


# Save cleaned DataFrames
utkface_clean_df.to_csv(os.path.join(base_dir, 'utkface_clean.csv'), index=False)
celeba_clean_df.to_csv(os.path.join(base_dir, 'celeba_clean.csv'), index=False)
print("Cleaning complete. Saved to CSV files.")

Mounted at /content/drive
Cleaning UTKFace...
UTKFace cleaned: 4209 images
Cleaning CelebA...
CelebA cleaned: 9254 images
Cleaning complete. Saved to CSV files.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split  # Added missing import
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

base_dir = '/content/drive/MyDrive/Smilage_Project_Data'
utkface_clean_path = os.path.join(base_dir, 'utkface_clean.csv')
celeba_clean_path = os.path.join(base_dir, 'celeba_clean.csv')

# Load cleaned DataFrames
utkface_clean_df = pd.read_csv(utkface_clean_path)
celeba_clean_df = pd.read_csv(celeba_clean_path)

print(f"UTKFace loaded: {len(utkface_clean_df)} images")
print(f"CelebA loaded: {len(celeba_clean_df)} images")

# Balance CelebA to match UTKFace size (stratified by smile)
target_size = len(utkface_clean_df)  # 4209
if len(celeba_clean_df) > target_size:
    _, celeba_balanced_df = train_test_split(celeba_clean_df, train_size=target_size, stratify=celeba_clean_df['smile'], random_state=42)
    celeba_clean_df = celeba_balanced_df
    print(f"CelebA balanced to match UTKFace: {len(celeba_clean_df)} images")

# Save balanced cleaned DataFrames
utkface_clean_df.to_csv(os.path.join(base_dir, 'utkface_clean.csv'), index=False)
celeba_clean_df.to_csv(os.path.join(base_dir, 'celeba_clean.csv'), index=False)
print("Balancing complete. Saved to CSV files.")

Mounted at /content/drive
UTKFace loaded: 4209 images
CelebA loaded: 9254 images
CelebA balanced to match UTKFace: 5045 images
Balancing complete. Saved to CSV files.


In [None]:
import os
import cv2
import numpy as np
from multiprocessing import Pool
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

base_dir = '/content/drive/MyDrive/Smilage_Project_Data'
utkface_clean_path = os.path.join(base_dir, 'utkface_clean.csv')
celeba_clean_path = os.path.join(base_dir, 'celeba_clean.csv')
utkface_output_dir = os.path.join(base_dir, 'processed_utkface')
celeba_output_dir = os.path.join(base_dir, 'processed_celeba')
os.makedirs(utkface_output_dir, exist_ok=True)
os.makedirs(celeba_output_dir, exist_ok=True)

# Load cleaned DataFrames
utkface_df = pd.read_csv(utkface_clean_path)
celeba_df = pd.read_csv(celeba_clean_path)

# Preprocessing function
def preprocess_image(img_path):
    img = cv2.imread(img_path)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    faces = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml').detectMultiScale(gray, 1.1, 5)
    if len(faces) == 1:
        x, y, w, h = faces[0]
        face = img[max(0, y-20):y+h+20, max(0, x-20):x+w+20]
        resized = cv2.resize(face, (128, 128))
        normalized = resized / 255.0
        filename = os.path.basename(img_path).replace('.jpg', '.npy')
        save_path = os.path.join(utkface_output_dir if 'UTKFace' in img_path else celeba_output_dir, filename)
        np.save(save_path, normalized)
        return save_path
    return None

# Parallel processing
with Pool(processes=4) as pool:
    utkface_results = pool.map(preprocess_image, utkface_df['path'].tolist())
    celeba_results = pool.map(preprocess_image, celeba_df['path'].tolist())

# Update DataFrames with processed paths and align with valid results
utkface_df = utkface_df.copy()  # Avoid SettingWithCopyWarning
celeba_df = celeba_df.copy()
utkface_df['processed_path'] = utkface_results  # Assign all results, including None
celeba_df['processed_path'] = celeba_results
utkface_df = utkface_df.dropna(subset=['processed_path'])  # Drop rows where processing failed
celeba_df = celeba_df.dropna(subset=['processed_path'])

# Save updated DataFrames
utkface_df.to_csv(os.path.join(base_dir, 'utkface_processed.csv'), index=False)
celeba_df.to_csv(os.path.join(base_dir, 'celeba_processed.csv'), index=False)
print(f"Preprocessed: UTKFace {len(utkface_df)} images, CelebA {len(celeba_df)} images")

Mounted at /content/drive
Preprocessed: UTKFace 4208 images, CelebA 5041 images


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

base_dir = '/content/drive/MyDrive/Smilage_Project_Data'
utkface_processed_path = os.path.join(base_dir, 'utkface_processed.csv')
celeba_processed_path = os.path.join(base_dir, 'celeba_processed.csv')

# Load preprocessed DataFrames
utkface_df = pd.read_csv(utkface_processed_path)
celeba_df = pd.read_csv(celeba_processed_path)

print(f"UTKFace loaded: {len(utkface_df)} images")
print(f"CelebA loaded: {len(celeba_df)} images")

# Split UTKFace
utk_train, utk_temp = train_test_split(utkface_df, test_size=0.3, random_state=42)
utk_val, utk_test = train_test_split(utk_temp, test_size=0.5, random_state=42)
print(f"UTKFace split: Train {len(utk_train)}, Val {len(utk_val)}, Test {len(utk_test)}")

# Split CelebA (stratified by smile)
celeba_train, celeba_temp = train_test_split(celeba_df, test_size=0.3, random_state=42, stratify=celeba_df['smile'])
celeba_val, celeba_test = train_test_split(celeba_temp, test_size=0.5, random_state=42, stratify=celeba_temp['smile'])
print(f"CelebA split: Train {len(celeba_train)}, Val {len(celeba_val)}, Test {len(celeba_test)}")

# Save splits
for split, name in [(utk_train, 'utk_train'), (utk_val, 'utk_val'), (utk_test, 'utk_test'),
                    (celeba_train, 'celeba_train'), (celeba_val, 'celeba_val'), (celeba_test, 'celeba_test')]:
    split.to_csv(os.path.join(base_dir, f'{name}.csv'), index=False)
print("Splitting complete. Saved to CSV files.")

Mounted at /content/drive
UTKFace loaded: 4208 images
CelebA loaded: 5041 images
UTKFace split: Train 2945, Val 631, Test 632
CelebA split: Train 3528, Val 756, Test 757
Splitting complete. Saved to CSV files.


In [None]:
!head -n 5 "/content/drive/MyDrive/Smilage_Project_Data/celeba/Anno/list_attr_celeba.txt"

image_id,5_o_Clock_Shadow,Arched_Eyebrows,Attractive,Bags_Under_Eyes,Bald,Bangs,Big_Lips,Big_Nose,Black_Hair,Blond_Hair,Blurry,Brown_Hair,Bushy_Eyebrows,Chubby,Double_Chin,Eyeglasses,Goatee,Gray_Hair,Heavy_Makeup,High_Cheekbones,Male,Mouth_Slightly_Open,Mustache,Narrow_Eyes,No_Beard,Oval_Face,Pale_Skin,Pointy_Nose,Receding_Hairline,Rosy_Cheeks,Sideburns,Smiling,Straight_Hair,Wavy_Hair,Wearing_Earrings,Wearing_Hat,Wearing_Lipstick,Wearing_Necklace,Wearing_Necktie,Young
000001.jpg,-1,1,1,-1,-1,-1,-1,-1,-1,-1,-1,1,-1,-1,-1,-1,-1,-1,1,1,-1,1,-1,-1,1,-1,-1,1,-1,-1,-1,1,1,-1,1,-1,1,-1,-1,1
000002.jpg,-1,-1,-1,1,-1,-1,-1,1,-1,-1,-1,1,-1,-1,-1,-1,-1,-1,-1,1,-1,1,-1,-1,1,-1,-1,-1,-1,-1,-1,1,-1,-1,-1,-1,-1,-1,-1,1
000003.jpg,-1,-1,-1,-1,-1,-1,1,-1,-1,-1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1,-1,-1,1,1,-1,-1,1,-1,-1,-1,-1,-1,1,-1,-1,-1,-1,-1,1
000004.jpg,-1,-1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1,-1,-1,1,-1,-1,-1,-1,1,-1,1,-1,1,1,-1,1


In [None]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

base_dir = '/content/drive/MyDrive/Smilage_Project_Data'
celeba_anno_path = os.path.join(base_dir, 'celeba/Anno/list_attr_celeba.txt')
celeba_processed_path = os.path.join(base_dir, 'celeba_clean.csv')

# Load annotation file with proper separator
anno_df = pd.read_csv(celeba_anno_path, sep=',')
print(f"Number of columns in annotation file: {len(anno_df.columns)}")
print(f"Column names: {anno_df.columns.tolist()}")

# Verify and set the smile column
if 'Smiling' not in anno_df.columns:
    raise ValueError("'Smiling' column not found in the annotation file. Please check the column names.")
anno_df['filename'] = anno_df.iloc[:, 0]  # First column is image filename, assuming it is always the first column
anno_df['smile'] = anno_df['Smiling'].replace({-1: 0, 1: 1})

# Load preprocessed CelebA to get the subset of filenames
processed_df = pd.read_csv(celeba_processed_path)
processed_filenames = set(os.path.basename(p) for p in processed_df['path'])

# Filter annotations to match preprocessed images
subset_anno_df = anno_df[anno_df['filename'].isin(processed_filenames)].copy() # Use .copy() to avoid SettingWithCopyWarning

# Calculate ratio
smile_count = subset_anno_df['smile'].sum()  # Count of 1s (smiling)
non_smile_count = len(subset_anno_df) - smile_count  # Count of 0s (non-smiling)
total_count = len(subset_anno_df)

print(f"Total images in subset: {total_count}")
print(f"Smiling images (1): {smile_count}")
print(f"Non-smiling images (0): {non_smile_count}")

# Avoid division by zero if there are no non-smiling images
if non_smile_count > 0:
    print(f"Ratio of smiling to non-smiling: {smile_count / non_smile_count:.2f}:1")
else:
    print("No non-smiling images found in the subset.")

Mounted at /content/drive
Number of columns in annotation file: 41
Column names: ['image_id', '5_o_Clock_Shadow', 'Arched_Eyebrows', 'Attractive', 'Bags_Under_Eyes', 'Bald', 'Bangs', 'Big_Lips', 'Big_Nose', 'Black_Hair', 'Blond_Hair', 'Blurry', 'Brown_Hair', 'Bushy_Eyebrows', 'Chubby', 'Double_Chin', 'Eyeglasses', 'Goatee', 'Gray_Hair', 'Heavy_Makeup', 'High_Cheekbones', 'Male', 'Mouth_Slightly_Open', 'Mustache', 'Narrow_Eyes', 'No_Beard', 'Oval_Face', 'Pale_Skin', 'Pointy_Nose', 'Receding_Hairline', 'Rosy_Cheeks', 'Sideburns', 'Smiling', 'Straight_Hair', 'Wavy_Hair', 'Wearing_Earrings', 'Wearing_Hat', 'Wearing_Lipstick', 'Wearing_Necklace', 'Wearing_Necktie', 'Young']
Total images in subset: 5045
Smiling images (1): 2500
Non-smiling images (0): 2545
Ratio of smiling to non-smiling: 0.98:1


In [None]:
### Using SDG Classifier Model for smile and age prediction on the datasets
# Import necessary libraries for smile prediction (added SGDClassifier for epochs)
import numpy as np
import pandas as pd
from skimage.feature import hog
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier  # For epoch-based training
from sklearn.model_selection import GridSearchCV  # For tuning to improve accuracy
from sklearn.metrics import accuracy_score, classification_report
from google.colab import drive, files  # Added files for upload
import cv2  # For image processing in inference
import joblib
!pip install scikit-image numpy tqdm
# Mount Google Drive to access data
drive.mount('/content/drive', force_remount=True)

# Define base directory for data
base_dir = '/content/drive/MyDrive/Smilage_Project_Data'

print("Libraries imported and Drive mounted.")


Mounted at /content/drive
Libraries imported and Drive mounted.


In [None]:
# Load single celeba_clean.csv and create train, val, test splits
from sklearn.model_selection import train_test_split

celeba_clean_path = f'{base_dir}/celeba_clean.csv'

# Read CSV into DataFrame
celeba_df = pd.read_csv(celeba_clean_path)

# Verify columns
print("Columns in celeba_clean.csv:", celeba_df.columns.tolist())
print("First few rows:")
print(celeba_df.head())

# Split into train (70%), val (15%), test (15%)
train_df, temp_df = train_test_split(celeba_df, test_size=0.3, random_state=42, stratify=celeba_df['smile'])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['smile'])

celeba_train_df = train_df
celeba_val_df = val_df
celeba_test_df = test_df

# Print dataset sizes and smile distribution
print(f"Loaded CelebA: Total {len(celeba_df)}")
print(f"Train: {len(celeba_train_df)}, Val: {len(celeba_val_df)}, Test: {len(celeba_test_df)}")
print(f"Smile distribution in train: {celeba_train_df['smile'].value_counts().to_dict()}")

Columns in celeba_clean.csv: ['path', 'smile']
First few rows:
                                                path  smile
0  /content/drive/MyDrive/Smilage_Project_Data/ce...      0
1  /content/drive/MyDrive/Smilage_Project_Data/ce...      1
2  /content/drive/MyDrive/Smilage_Project_Data/ce...      0
3  /content/drive/MyDrive/Smilage_Project_Data/ce...      0
4  /content/drive/MyDrive/Smilage_Project_Data/ce...      0
Loaded CelebA: Total 5045
Train: 3531, Val: 757, Test: 757
Smile distribution in train: {0: 1781, 1: 1750}


In [None]:
import os
label_file = '/content/drive/MyDrive/Smilage_Project_Data/celeba/Anno/list_attr_celeba.txt'
with open(label_file, 'r') as f:
    lines = f.readlines()
    print("First 3 lines of list_attr_celeba.txt:")
    for i, line in enumerate(lines[:3]):
        print(f"Line {i+1}: {line.strip()}")
    if len(lines) > 1:
        header = lines[1].strip().split()
        print("\nHeader attributes:", header)

First 3 lines of list_attr_celeba.txt:
Line 1: image_id,5_o_Clock_Shadow,Arched_Eyebrows,Attractive,Bags_Under_Eyes,Bald,Bangs,Big_Lips,Big_Nose,Black_Hair,Blond_Hair,Blurry,Brown_Hair,Bushy_Eyebrows,Chubby,Double_Chin,Eyeglasses,Goatee,Gray_Hair,Heavy_Makeup,High_Cheekbones,Male,Mouth_Slightly_Open,Mustache,Narrow_Eyes,No_Beard,Oval_Face,Pale_Skin,Pointy_Nose,Receding_Hairline,Rosy_Cheeks,Sideburns,Smiling,Straight_Hair,Wavy_Hair,Wearing_Earrings,Wearing_Hat,Wearing_Lipstick,Wearing_Necklace,Wearing_Necktie,Young
Line 2: 000001.jpg,-1,1,1,-1,-1,-1,-1,-1,-1,-1,-1,1,-1,-1,-1,-1,-1,-1,1,1,-1,1,-1,-1,1,-1,-1,1,-1,-1,-1,1,1,-1,1,-1,1,-1,-1,1
Line 3: 000002.jpg,-1,-1,-1,1,-1,-1,-1,1,-1,-1,-1,1,-1,-1,-1,-1,-1,-1,-1,1,-1,1,-1,-1,1,-1,-1,-1,-1,-1,-1,1,-1,-1,-1,-1,-1,-1,-1,1

Header attributes: ['000001.jpg,-1,1,1,-1,-1,-1,-1,-1,-1,-1,-1,1,-1,-1,-1,-1,-1,-1,1,1,-1,1,-1,-1,1,-1,-1,1,-1,-1,-1,1,1,-1,1,-1,1,-1,-1,1']


In [None]:
import os
import numpy as np
from skimage import io, color, transform
from skimage.feature import hog
from tqdm import tqdm

# Define paths
data_dir = '/content/drive/MyDrive/Smilage_Project_Data/celeba/img_align_celeba/'
label_file = '/content/drive/MyDrive/Smilage_Project_Data/celeba/Anno/list_attr_celeba.txt'

# Load labels
def load_labels(label_file, attribute_name='Smiling'):
    try:
        with open(label_file, 'r') as f:
            lines = f.readlines()
        # First line is the header, split on commas
        header = lines[0].strip().split(',')  # Attribute names including 'image_id'
        try:
            smile_idx = header.index(attribute_name)  # Index of smile attribute
        except ValueError:
            print(f"Error: '{attribute_name}' not found in header: {header}")
            print("Available attributes:", header)
            raise
        labels = {}
        for line in lines[1:]:  # Start from second line (data)
            parts = line.strip().split(',')
            if len(parts) > smile_idx:  # Ensure enough columns
                img_name = parts[0]  # image_id
                smile_label = int(parts[smile_idx])  # Smile attribute value
                labels[img_name] = 1 if smile_label == 1 else 0  # Convert to binary (0: non-smiling, 1: smiling)
            else:
                print(f"Skipping malformed line: {line.strip()}")
        return labels
    except FileNotFoundError:
        print(f"Error: Label file {label_file} not found.")
        raise
    except Exception as e:
        print(f"Error reading label file: {str(e)}")
        raise

# Load labels
labels = load_labels(label_file, attribute_name='Smiling')

# Function to extract HOG features from an image
def extract_hog(image_path):
    try:
        img = io.imread(image_path)
        if len(img.shape) == 3:
            img = color.rgb2gray(img)
        img = transform.resize(img, (64, 64), anti_aliasing=True)
        hog_features = hog(img, pixels_per_cell=(8, 8), cells_per_block=(2, 2), feature_vector=True)
        return hog_features
    except Exception as e:
        print(f"Error processing {image_path}: {str(e)}")
        return None

# Load data and extract HOG features
X, y = [], []
image_files = [f for f in os.listdir(data_dir) if f.endswith('.jpg')]

for img_name in tqdm(image_files, desc="Extracting HOG features..."):
    if img_name in labels:
        img_path = os.path.join(data_dir, img_name)
        hog_features = extract_hog(img_path)
        if hog_features is not None:
            X.append(hog_features)
            y.append(labels[img_name])
        else:
            print(f"Skipping {img_path} due to processing error")

# Convert to numpy arrays
X = np.array(X)
y = np.array(y)

print(f"Extracted HOG features for {len(X)} images")

Extracting HOG features...: 100%|██████████| 10004/10004 [04:32<00:00, 36.66it/s]


Extracted HOG features for 10002 images


In [None]:
from sklearn.model_selection import train_test_split

# Split data into train (70%), validation (15%), and test (15%)
X_train_smile, X_temp, y_train_smile, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_val_smile, X_test_smile, y_val_smile, y_test_smile = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print(f"Train shape: {X_train_smile.shape}, Val shape: {X_val_smile.shape}, Test shape: {X_test_smile.shape}")

Train shape: (7001, 1764), Val shape: (1500, 1764), Test shape: (1501, 1764)


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardize and reduce dimensionality with PCA
scaler = StandardScaler()
pca = PCA(n_components=200, random_state=42)

# Apply scaling and PCA to smile data
X_train_smile = scaler.fit_transform(X_train_smile)
X_val_smile = scaler.transform(X_val_smile)
X_test_smile = scaler.transform(X_test_smile)
X_train_smile = pca.fit_transform(X_train_smile)
X_val_smile = pca.transform(X_val_smile)
X_test_smile = pca.transform(X_test_smile)

print(f"Preprocessed feature shapes: Train {X_train_smile.shape}, Val {X_val_smile.shape}, Test {X_test_smile.shape}")

Preprocessed feature shapes: Train (7001, 200), Val (1500, 200), Test (1501, 200)


In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Define the SGDClassifier
sgd_clf = SGDClassifier(loss='log_loss', random_state=42, max_iter=20)

# Define hyperparameter grid for tuning
param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1],  # Regularization strength
    'learning_rate': ['constant', 'optimal', 'invscaling'],
    'eta0': [0.001, 0.01, 0.1],  # Initial learning rate (for constant/invscaling)
}

# Perform grid search with cross-validation on validation set
grid_search = GridSearchCV(
    sgd_clf,
    param_grid,
    cv=3,  # 3-fold cross-validation
    scoring='accuracy',
    n_jobs=-1,  # Use all available cores
    verbose=1
)

# Fit grid search on training data
grid_search.fit(X_train_smile, y_train_smile)

# Best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

# Use best model for validation set prediction
best_model = grid_search.best_estimator_
y_val_pred = best_model.predict(X_val_smile)
val_accuracy = accuracy_score(y_val_smile, y_val_pred)
print(f"Validation accuracy with best model: {val_accuracy:.4f}")

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best parameters: {'alpha': 0.001, 'eta0': 0.01, 'learning_rate': 'invscaling'}
Best cross-validation accuracy: 0.8124559476179299
Validation accuracy with best model: 0.8173


In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Define SGDClassifier with fixed best parameters
sgd_clf = SGDClassifier(
    loss='log_loss',
    alpha=0.0001,
    learning_rate='invscaling',
    eta0=0.01,
    random_state=42
)

# Define grid for max_iter
param_grid = {
    'max_iter': [20, 50, 100, 200]
}

# Perform grid search
grid_search_epochs = GridSearchCV(
    sgd_clf,
    param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# Fit on training data
grid_search_epochs.fit(X_train_smile, y_train_smile)

# Print results
print("Best epoch parameters:", grid_search_epochs.best_params_)
print("Best cross-validation accuracy:", grid_search_epochs.best_score_)

# Evaluate on validation set
best_model_epochs = grid_search_epochs.best_estimator_
y_val_pred = best_model_epochs.predict(X_val_smile)
val_accuracy = accuracy_score(y_val_smile, y_val_pred)
print(f"Validation accuracy with best epochs: {val_accuracy:.4f}")

Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best epoch parameters: {'max_iter': 20}
Best cross-validation accuracy: 0.8123130700639937
Validation accuracy with best epochs: 0.8173


In [None]:
from sklearn.linear_model import SGDClassifier
import numpy as np

# Combine training and validation sets for final training
X_train_val_smile = np.vstack((X_train_smile, X_val_smile))
y_train_val_smile = np.hstack((y_train_smile, y_val_smile))

# Initialize SGDClassifier with best parameters and increased epochs
sgd_clf = SGDClassifier(
    loss='log_loss',
    alpha=0.0001,
    learning_rate='invscaling',
    eta0=0.01,
    max_iter=50,  # Increased from 20 to 50 for better convergence
    random_state=42
)

# Train the model
sgd_clf.fit(X_train_val_smile, y_train_val_smile)

print("Model training completed")

Model training completed


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on test set
y_test_pred = sgd_clf.predict(X_test_smile)

# Calculate test accuracy
test_accuracy = accuracy_score(y_test_smile, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Generate classification report
print("Classification Report:")
print(classification_report(y_test_smile, y_test_pred, target_names=['Non-Smiling', 'Smiling']))

Test Accuracy: 0.8168
Classification Report:
              precision    recall  f1-score   support

 Non-Smiling       0.82      0.84      0.83       777
     Smiling       0.82      0.80      0.81       724

    accuracy                           0.82      1501
   macro avg       0.82      0.82      0.82      1501
weighted avg       0.82      0.82      0.82      1501



In [None]:
import joblib

# Save the model
joblib.dump(sgd_clf, '/content/drive/MyDrive/Smilage_Project_Data/sgd_clf_model.pkl')
print("Model saved to /content/drive/MyDrive/Smilage_Project_Data/sgd_clf_model.pkl")

Model saved to /content/drive/MyDrive/Smilage_Project_Data/sgd_clf_model.pkl


In [None]:
from skimage import io, color, transform
from skimage.feature import hog

# Function to preprocess a single image
def preprocess_image(image_path, scaler, pca):
    try:
        img = io.imread(image_path)
        if len(img.shape) == 3:
            img = color.rgb2gray(img)
        img = transform.resize(img, (64, 64), anti_aliasing=True)
        hog_features = hog(img, pixels_per_cell=(8, 8), cells_per_block=(2, 2), feature_vector=True)
        hog_features = scaler.transform([hog_features])  # Standardize
        hog_features = pca.transform(hog_features)  # Apply PCA
        return hog_features
    except Exception as e:
        print(f"Error processing {image_path}: {str(e)}")
        return None

# Predict on a new image
image_path = '/content/drive/MyDrive/Smilage_Project_Data/test_image.jpg'  # Replace with your uploaded image path
hog_features = preprocess_image(image_path, scaler, pca)
if hog_features is not None:
    prediction = sgd_clf.predict(hog_features)
    confidence = sgd_clf.predict_proba(hog_features)[0][prediction[0]]
    label = 'Smiling' if prediction[0] == 1 else 'Non-Smiling'
    print(f"Prediction: {label} (Confidence: {confidence:.4f})")
else:
    print("Failed to process the image")

Prediction: Smiling (Confidence: 0.7844)


In [None]:
from skimage import io, color, transform
from skimage.feature import hog
import numpy as np

# Function to preprocess a single image (matches CelebA training pipeline)
def preprocess_image(image_path, scaler, pca):
    try:
        # Load and preprocess image
        img = io.imread(image_path)
        if len(img.shape) == 3:
            img = color.rgb2gray(img)  # Convert to grayscale
        img = transform.resize(img, (64, 64), anti_aliasing=True)  # Resize to 64x64
        # Extract HOG features (same parameters as Cell 3)
        hog_features = hog(img, pixels_per_cell=(8, 8), cells_per_block=(2, 2), feature_vector=True)
        # Standardize and apply PCA (using scaler and pca from Cell 4)
        hog_features = scaler.transform([hog_features])
        hog_features = pca.transform(hog_features)
        return hog_features
    except Exception as e:
        print(f"Error processing {image_path}: {str(e)}")
        return None

# Specify path to your uploaded .jpg image
image_path = '/content/drive/MyDrive/Smilage_Project_Data/test1_image.jpg'  # Replace with your image path

# Preprocess and predict
hog_features = preprocess_image(image_path, scaler, pca)
if hog_features is not None:
    prediction = sgd_clf.predict(hog_features)
    confidence = sgd_clf.predict_proba(hog_features)[0][prediction[0]]
    label = 'Smiling' if prediction[0] == 1 else 'Non-Smiling'
    print(f"Prediction: {label} (Confidence: {confidence:.4f})")
else:
    print("Failed to process the image. Check file path or image format.")

Prediction: Non-Smiling (Confidence: 0.7278)
