In [None]:
# Colab-only setup: prompt for a file upload (the Kaggle API token, kaggle.json,
# is expected here because the commands below move it into place).
from google.colab import files
files.upload()

# Place the uploaded token at ~/.kaggle/kaggle.json and make it readable and
# writable by the owner only (mode 600).
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Install the Kaggle command-line client quietly.
!pip install -q kaggle

# Download the LFW dataset into /content/lfw and unzip it there.
!kaggle datasets download -d jessicali9530/lfw-dataset -p /content/lfw --unzip

import os
from PIL import Image
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


########## HANDLING LARGE UNLABELED DATASET ############

def resize_images(input_folder, output_folder, size=(48, 48)):
  """Resize every ``.jpg`` image under *input_folder* into *output_folder*.

  The directory tree below *input_folder* is walked recursively and each
  file whose name ends in ``.jpg`` (case-insensitive) is resized with the
  LANCZOS filter and saved under the same relative path and file name
  inside *output_folder*. Output directories are created only for source
  directories that actually contain at least one matching image.

  Args:
    input_folder: Root directory to search for images.
    output_folder: Root directory that receives the resized copies.
    size: Target ``(width, height)`` passed to ``Image.resize``.

  Returns:
    None. The function only writes files.
  """
  for root, _, filenames in os.walk(input_folder):
    jpg_names = [fname for fname in filenames if fname.lower().endswith('.jpg')]
    if not jpg_names:
      continue

    # Compute the relative path once per directory to preserve the
    # subdirectory structure in the output tree.
    rel_dir = os.path.relpath(root, input_folder)
    output_dir = os.path.join(output_folder, rel_dir)
    # exist_ok avoids the separate (race-prone) existence check.
    os.makedirs(output_dir, exist_ok=True)

    for fname in jpg_names:
      # The context manager closes the underlying file promptly instead of
      # leaving it to garbage collection while thousands of images are
      # processed.
      with Image.open(os.path.join(root, fname)) as img:
        img_resized = img.resize(size, Image.Resampling.LANCZOS)
        img_resized.save(os.path.join(output_dir, fname))

# Write resized copies of the downloaded images (default size 48x48) to
# /content/lfw_resized, mirroring the source folder layout.
resize_images('/content/lfw', '/content/lfw_resized')


def vectorizing(resizedfolder):
  """Load every ``.jpg`` under *resizedfolder* as a flat grayscale vector.

  The folder is walked recursively. Each matching image is converted to
  single-channel grayscale (PIL mode ``'L'``) and flattened to a 1-D NumPy
  array of length ``width * height``.

  Args:
    resizedfolder: Root directory containing the images to vectorize.

  Returns:
    A list with one 1-D NumPy array per image, in ``os.walk`` order. The
    list is empty when no ``.jpg`` file is found.
  """
  vectors = []
  for root, _, filenames in os.walk(resizedfolder):
    for fname in filenames:
      # Case-insensitive match, consistent with resize_images, so files
      # named e.g. ``.JPG`` are not silently dropped.
      if not fname.lower().endswith('.jpg'):
        continue
      # Joining root with fname gives the complete path to the file.
      img_path = os.path.join(root, fname)
      # Close each file as soon as its pixels have been copied out.
      with Image.open(img_path) as image:
        gray_array = np.array(image.convert('L'))
      vectors.append(gray_array.flatten())
  return vectors

# One flattened grayscale vector per resized image; this list is the
# unlabeled pool that is later combined with the labeled vectors for PCA.
unlabeleddata_vectors = vectorizing('/content/lfw_resized')

######### HANDLING SMALL LABELED DATASET ##########

df = pd.read_csv('ckextended.csv')
# Drop rows with emotion label 6 (presumably the neutral class - confirm
# against the dataset description). .copy() gives an independent frame so the
# column assignment below does not write into a slice of the original.
df = df[df["emotion"] != 6].copy()
# Each "pixels" cell is a space-separated string; parse it into a 1-D array.
df["pixels"] = df["pixels"].apply(lambda x: np.fromstring(x, sep=' '))

labeleddata_vectors = df["pixels"].tolist()
labels = df["emotion"].tolist()

########## PCA ###########

# Fit the scaler and the PCA on the unlabeled and labeled vectors together.
# NOTE(review): labeleddata_vectors also contains the non-"Training" rows, so
# the scaler/PCA fit sees the test features (not their labels) - confirm this
# is intended.
vectors = unlabeleddata_vectors + labeleddata_vectors
df1 = pd.DataFrame(vectors)
x = df1.values
# Keep the fitted scaler: the PCA components are only meaningful for data
# standardised with these exact statistics.
scaler = StandardScaler()
x = scaler.fit_transform(x)

# A float n_components keeps as many components as needed to explain 99% of
# the variance.
pca = PCA(n_components=0.99)
pca.fit(x)
print(pca.n_components_)


######## SVM #########

from sklearn.svm import SVC

training_df = df[df["Usage"] == "Training"]
new_x = training_df["pixels"].values
new_x_2d = np.array(list(new_x))
# Reuse the scaler fitted above instead of fitting a fresh one on this subset;
# a fresh fit would standardise the features differently from the data the
# PCA was fitted on.
new_x_scaled = scaler.transform(new_x_2d)
new_x_pca = pca.transform(new_x_scaled)
training_labels = training_df["emotion"].tolist()

test_df = df[df["Usage"] != "Training"]
test_features = test_df["pixels"].values
test_features_array = np.stack(test_features)
# Same scaler as for training: test data must never be scaled with its own
# statistics, otherwise train and test features are not comparable.
test_features_scaled = scaler.transform(test_features_array)
test_features_pca = pca.transform(test_features_scaled)
test_labels = test_df["emotion"].tolist()

svm = SVC(kernel='linear')
svm.fit(new_x_pca, training_labels)
pred_labels = svm.predict(test_features_pca)

from sklearn.metrics import accuracy_score
print(accuracy_score(test_labels, pred_labels))

# import matplotlib.pyplot as plt

# x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
# y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
# xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
#                      np.arange(y_min, y_max, 0.02))


# plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
# plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
# plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
#             s=100, facecolors='none', edgecolors='k', linewidths=1.5, label='Support Vectors')
# plt.xlabel('Feature 1')
# plt.ylabel('Feature 2')
# plt.title('SVM Decision Boundary')
# plt.legend()
# plt.show()









Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/jessicali9530/lfw-dataset
License(s): other
Downloading lfw-dataset.zip to /content/lfw
 71% 80.0M/112M [00:00<00:00, 830MB/s]
100% 112M/112M [00:00<00:00, 771MB/s] 
1007
0.7910447761194029
