# Author: Amit Hulandageri

In [None]:
import numpy as np
import cv2
import imageio
import glob
from numpy import linalg as LA
from scipy.io import loadmat
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from PIL import Image
# Print floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)

# Reading the Data

In [None]:
# Read the labelled metadata from the MATLAB file.
metaData = loadmat("wiki_labeled.mat", struct_as_record=True, squeeze_me=True)

# Gather the three record fields into one DataFrame, one column per field.
labeled = metaData['wiki_labeled']
Data = pd.DataFrame(
    {field: labeled[field].item(0) for field in ('ID', 'full_path', 'age')}
)
Data.head()

# Splitting the data into training and test data

In [None]:
# Hold out 20% of the rows for testing. No random_state is passed, so the
# split (and every metric derived from it) differs from run to run.
train, test = train_test_split(Data, test_size=0.2)

In [None]:
# Location of the image files and the per-split relative paths.
folders = 'wiki_labeled/wiki_labeled/'
path_train = train['full_path']
path_test = test['full_path']

# One row per image; each row will hold a flattened 10000-pixel picture.
train_matrix = np.zeros((len(train), 10000))
test_matrix = np.zeros((len(test), 10000))

In [None]:
#creating a matrix for training data
# Every image is read as grayscale and flattened into one row of train_matrix.
for matIndex, rel_path in enumerate(path_train):
    # ':' in a metadata path is replaced by '_' to obtain the file name.
    # str.replace is a no-op when ':' is absent, so no branching is needed.
    # The sanitized path is kept local instead of being written back through
    # Series.values, which may be a read-only array (pandas Copy-on-Write).
    file = folders + rel_path.replace(":", "_")
    im = cv2.imread(file, 0)
    if im is None:
        # cv2.imread reports a missing/unreadable file by returning None;
        # fail with the offending path instead of an AttributeError on None.
        raise FileNotFoundError("Could not read image: " + file)
    train_matrix[matIndex, :] = im.reshape(1, 10000)

In [None]:
#creating a matrix for testing data
# Every image is read as grayscale and flattened into one row of test_matrix.
for matIndex, rel_path in enumerate(path_test):
    # ':' in a metadata path is replaced by '_' to obtain the file name.
    # str.replace is a no-op when ':' is absent, so no branching is needed.
    # The sanitized path is kept local instead of being written back through
    # Series.values, which may be a read-only array (pandas Copy-on-Write).
    file = folders + rel_path.replace(":", "_")
    im = cv2.imread(file, 0)
    if im is None:
        # cv2.imread reports a missing/unreadable file by returning None;
        # fail with the offending path instead of an AttributeError on None.
        raise FileNotFoundError("Could not read image: " + file)
    test_matrix[matIndex, :] = im.reshape(1, 10000)

# Step 3: Computing the Principal Components

In [None]:
# Average face vector: the per-pixel mean over all training images.
afv = train_matrix.mean(axis=0)
print("AFV", afv.shape)

In [None]:
# Centre the training images by removing the average face from every row.
A = np.subtract(train_matrix, afv)
print("A", A.shape)

In [None]:
# Sample covariance of the centred pixels (normalised by n - 1).
dof = len(train) - 1
C = np.dot(A.T, A) / dof
print("C", C.shape)

In [None]:
# C is a real symmetric covariance matrix, so use the symmetric solver: eigh
# always returns real eigenvalues/eigenvectors, whereas the general-purpose
# eig may return complex values caused purely by round-off.
w, v = LA.eigh(C)
# eigh orders eigenvalues ascending; flip both arrays so index 0 is the largest
# eigenvalue and v[:, i] is still the eigenvector that belongs to w[i].
w = w[::-1]
v = v[:, ::-1]
print("Eigenvalues shape: ",w.shape)
print("Eigen Vectors shape: ",v.shape)

In [None]:
# Pair each eigenvalue magnitude with its eigenvector (a column of v) and
# order the pairs from the largest magnitude to the smallest.
eig_pairs = sorted(
    zip(np.abs(w), v.T),
    key=lambda pair: pair[0],
    reverse=True,
)

In [None]:
# Matrix of sorted eigenvectors: column j is the eigenvector with the j-th
# largest eigenvalue. The size is taken from the decomposition itself rather
# than a hard-coded 10000, so it follows the image dimension automatically.
eig_vec = np.zeros((v.shape[0], len(eig_pairs)))
for col, (_, vector) in enumerate(eig_pairs):
    eig_vec[:, col] = vector
eig_vec

# Scree-plot

In [None]:
# Scree plot of the 20 largest eigenvalues. The values come from the sorted
# eig_pairs list; the raw array w carries no ordering guarantee, so slicing
# w[:20] would not necessarily show the leading components.
fig = plt.figure(figsize=(8,5))
top_eigenvalues = [pair[0] for pair in eig_pairs[:20]]
sing_vals = np.arange(len(top_eigenvalues)) + 1
plt.plot(sing_vals, top_eigenvalues, 'ro-', linewidth=2)
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Eigenvalue')

In [None]:
k=15 # number of leading components used when projecting the images

# Top 20 Ghosts

In [None]:
# Display the first 20 eigenvectors, each reshaped back into a 100x100 image.
for ghost in eig_vec[:, :20].T:
    plt.imshow(ghost.reshape(100, 100), cmap='gray')
    plt.show()

# Projecting Training and Test images onto the eigenfaces

In [None]:
# eig_vec holds one eigenvector per column, sorted by decreasing eigenvalue,
# so the projection basis is its first k columns. (Transposing first would
# instead select the first k pixels of every eigenvector.)
# NOTE(review): images are projected without subtracting the mean face afv —
# confirm this is intended.
X_train=np.dot(train_matrix,eig_vec[:,:k])

In [None]:
# eig_vec holds one eigenvector per column, sorted by decreasing eigenvalue,
# so the projection basis is its first k columns. (Transposing first would
# instead select the first k pixels of every eigenvector.)
X_test=np.dot(test_matrix,eig_vec[:,:k])

In [None]:
# Record the global extrema of the projected features before normalization.
X_train_min, X_train_max = X_train.min(), X_train.max()
X_test_min, X_test_max = X_test.min(), X_test.max()

# Normalizing the data

In [None]:
# Min-max scale the training projections into [0, 1] using global extrema.
lo, hi = X_train.min(), X_train.max()
X_train = (X_train - lo) / (hi - lo)

# Prepend a column of ones so the model gets an intercept term.
ones = np.ones([len(X_train), 1])
X_train = np.concatenate((ones, X_train), axis=1)

In [None]:
# Scale the test projections with the TRAINING min/max (recorded before the
# training data was normalized) so both splits share one feature scale. Using
# the test set's own extrema would map the same raw value to a different model
# input than the one seen during training.
X_test = (X_test - X_train_min) / (X_train_max - X_train_min)

# Prepend a column of ones so the model gets an intercept term.
ones = np.ones([X_test.shape[0], 1])
X_test = np.concatenate((ones, X_test), axis=1)

# Performing SGD

In [None]:
# Training ages as an (n, 1) column array, one single-element row per sample.
y_train = np.asarray([[age] for age in train['age'].tolist()])
y_train.shape

In [None]:
alpha = 0.0001  # learning rate (step size of each update)
iters = 5       # number of full passes over the training samples

# Start from an all-zero parameter row, one entry per feature column.
n_features = X_train.shape[1]
theta = np.zeros((1, n_features))

def computeCost(X,y,theta):
    """Return the halved mean squared error of the linear model on (X, y).

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Feature matrix (including the column of ones for the intercept).
    y : ndarray, shape (n_samples, 1)
        Target values.
    theta : ndarray, shape (1, n_features)
        Model parameters.

    Returns
    -------
    float
        sum((X @ theta.T - y) ** 2) / (2 * n_samples)
    """
    h=np.dot(X,np.transpose(theta))
    tobesummed = np.power((h-y),2)
    return np.sum(tobesummed)/(2 * len(X))


def gradientDescent(X,y,theta,iters,alpha):
    """Fit theta with stochastic gradient descent, one sample per update.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Feature matrix.
    y : ndarray, shape (n_samples, 1)
        Target values.
    theta : ndarray, shape (1, n_features)
        Initial parameters; not modified in place.
    iters : int
        Number of passes over all samples.
    alpha : float
        Learning rate.

    Returns
    -------
    theta : ndarray, shape (1, n_features)
        Parameters after the last update.
    cost : ndarray, shape (n_samples,)
        cost[m] is the full-data cost right after the update for sample m
        during the final pass (earlier passes are overwritten).
    """
    # Size everything from X itself so the function works for any data set
    # instead of depending on a module-level DataFrame.
    n_samples = X.shape[0]
    cost = np.zeros(n_samples)
    for i in range(iters):
        for m in range(n_samples):
            # Only sample m's prediction is needed for this update, so avoid
            # recomputing the predictions of the whole data set every step.
            error = np.dot(X[m,:], np.transpose(theta)) - y[m]
            theta = theta - alpha * error * X[m,:]

            cost[m] = computeCost(X, y, theta)
            print(cost[m],m,"(",i,")")
    return theta,cost

# Train on the projected training data and show the learned parameters.
g, cost = gradientDescent(X_train, y_train, theta, iters, alpha)
print(g, g.shape, sep="\n")

# Cost over the whole training set for the learned parameters.
finalCost = computeCost(X_train, y_train, g)
print(finalCost)

In [None]:
# Recompute the training cost for the fitted parameters g and display it.
finalCost = computeCost(X_train,y_train,g)
finalCost

# Predicting for the test dataset and calculating RMSE

In [None]:
# Ground-truth ages of the held-out test split.
y_test=test['age']

In [None]:
# Predicted ages for the test split: one row per sample, a single column.
p=np.dot(X_test,g.T)

In [None]:
# Root mean squared error between the true and predicted test ages.
mse = metrics.mean_squared_error(y_test, p)
RMSE = np.sqrt(mse)
print(RMSE)

# Predicting for wiki_judge

In [None]:
# Read the unlabelled judge metadata from the MATLAB file.
judgeData = loadmat("wiki_judgeX.mat", struct_as_record=True, squeeze_me=True)

# Gather the record fields into one DataFrame, one column per field.
judge_struct = judgeData['wiki_judgeX']
judgeData_df = pd.DataFrame(
    {field: judge_struct[field].item(0) for field in ('ID', 'full_path')}
)
judgeData_df.head()

In [None]:
# One row per judge image; each row will hold a flattened 10000-pixel picture.
judge_matrix=np.zeros((judgeData_df.shape[0],10000))
path_judge=judgeData_df['full_path']
# Use '/' as the separator, like the labelled-image folder: it is accepted on
# Windows as well, whereas a trailing backslash only forms a valid path there.
folders = 'wiki_judge_images/'

In [None]:
# Every judge image is read as grayscale and flattened into one matrix row.
for matIndex, rel_path in enumerate(path_judge):
    file = folders + rel_path
    im = cv2.imread(file, 0)
    if im is None:
        # cv2.imread reports a missing/unreadable file by returning None;
        # fail with the offending path instead of an AttributeError on None.
        raise FileNotFoundError("Could not read image: " + file)
    judge_matrix[matIndex, :] = im.reshape(1, 10000)

In [None]:
# eig_vec holds one eigenvector per column, sorted by decreasing eigenvalue,
# so the projection basis is its first k columns. (Transposing first would
# instead select the first k pixels of every eigenvector.)
X_judge=np.dot(judge_matrix,eig_vec[:,:k])
X_judge_max=X_judge.max()
X_judge_min=X_judge.min()

In [None]:
# Scale the judge projections with the TRAINING min/max (recorded before the
# training data was normalized) so the model sees inputs on the same scale it
# was fitted on, rather than one derived from the judge set's own extrema.
X_judge = (X_judge - X_train_min) / (X_train_max - X_train_min)

# Prepend a column of ones so the model gets an intercept term.
ones = np.ones([X_judge.shape[0], 1])
X_judge = np.concatenate((ones, X_judge), axis=1)

In [None]:
# Predicted ages for the judge images: one row per image, a single column.
ages_judge=np.dot(X_judge,g.T)

In [None]:
# Sequential 1-based IDs, one per judge prediction.
i=np.arange(1,len(ages_judge)+1)

In [None]:
# Start the submission table with the sequential ID column.
sub_df = pd.DataFrame({'ID': i})
sub_df.head()

In [None]:
# Attach the predicted ages as the 'age' column of the submission table.
sub_df['age']=ages_judge
sub_df.head()

In [None]:
# Write the submission file without the DataFrame index column.
sub_df.to_csv('sub.csv',index=False)

# Step 10: Average RMSE and Standard Deviation

In [None]:
# Hand-recorded RMSE values (presumably one per run of the pipeline above).
RMSE_list = np.asarray([18.69676, 18.54782, 18.98321, 19.12901, 18.90021])

RMSE_avg = RMSE_list.mean()
RMSE_STD = RMSE_list.std()  # population standard deviation (ddof=0)

print('Average RMSE: ', RMSE_avg)
print('Standard Deviation', RMSE_STD)

# Step 11: Plot of K vs RMSE (K=2, 10, 20, 40, 50, 60, 80, 100, 200)

In [None]:
# Recorded RMSE for each tested number of principal components K.
k_values = [2, 10, 20, 40, 50, 60, 80, 100, 200]
rmse_values = [19.12901, 18.86376, 18.67452, 18.67543, 18.56843,
               18.46798, 18.43253, 18.36743, 18.32156]

plt.plot(k_values, rmse_values)
plt.title('K Vs RMSE')
plt.xlabel('K')
plt.ylabel('RMSE')
plt.show()