In [1]:
#MSE = sum of the squared difference between the two images

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import scipy
import os
import random
from scipy import stats

In [3]:
imageX = 64
imageY = 64

In [4]:
def forceLandscape(image):
    if image.shape[0] > image.shape[1]:
        return np.rot90(image)
    return image

def resizeImage(image, x,y):
    return scipy.misc.imresize(image,(x,y), interp='nearest')

def grayscale(image):
    return np.dot(image[...,:3], [0.21, 0.72, 0.07])

def reshape(image):
    return image.flatten()

In [5]:
def formatImage(image):
    image = forceLandscape(image)
    image = resizeImage(image, imageX, imageY)
    image = grayscale(image)
    return image

In [6]:
#Load Images
datasetPath = '../../../TERC/WinEarthPhotosByKeyword/'
imageClass = 'Movies3of36PhotosEach'
imageDirectory = os.listdir(datasetPath + imageClass)
if imageDirectory[0] == '.DS_Store':
    imageDirectory.pop(0)
data = imageDirectory

In [7]:
#Load Random Images
datasetPath2 = '../../../BU10000SetA/'
imageDirectory2 = [random.choice(os.listdir(datasetPath2)) for x in range(0,108)]
if imageDirectory2[0] == '.DS_Store':
    imageDirectory2.pop(0)
randomData = imageDirectory2

In [9]:
for i in range(0, len(data)):
    imagePath = datasetPath + imageClass + "/" +  data[i]
    img = mpimg.imread(imagePath)
    img = formatImage(img)
    img = reshape(img)        #flatten the img
    data[i] = img
    
print("loading complete")

loading complete


In [10]:
#random data loading
for i in range(0, len(randomData)):
    imagePath2 = datasetPath2 +  randomData[i]
    img2 = mpimg.imread(imagePath2)
    img2 = formatImage(img2)
    img2 = reshape(img2)        #flatten the img
    randomData[i] = img2
    
print("loading complete")

loading complete


In [11]:
#mean square error
#the lower the MSE, the more "similar" the two images are

def mse(imageA, imageB):
    err = ((imageA - imageB) ** 2).mean(axis=None)
    return err

#images must have the same dimension
#with ax=0 the average is performed along the row, for each column, returning an array
#with ax=1 the average is performed along the column, for each row, returning an array
#with ax=None the average is performed element-wise along the array, returning a single value

In [12]:
def getResult(dataset):
    results = []
    for i in range(0,len(dataset)-1):
        results.append(mse(dataset[i],dataset[i+1]))
    return (results)

In [13]:
#Calculate mean measure within data set
set1 = data[:36]
set2 = data[36:72]
set3 = data[72:]

results = [getResult(set1),getResult(set2),getResult(set3)]
mean = [np.mean(results[0]),np.mean(results[1]),np.mean(results[2])]
std = [np.std(results[0]),np.std(results[1]),np.std(results[2])]

In [14]:
#Calculate mean measure within random data set
rset1 = randomData[:36]
rset2 = randomData[36:72]
rset3 = randomData[72:]

randomResults = [getResult(rset1),getResult(rset2),getResult(rset3)]
randomMean = [np.mean(randomResults[0]),np.mean(randomResults[1]),np.mean(randomResults[2])]
randomStd = [np.std(randomResults[0]),np.std(randomResults[1]),np.std(randomResults[2])]

In [15]:
# print(results)
print("Mean MSE for each Movie set:")
print(mean)
print("")

print("Standard Deviation of MSE for each Movie set:")
print(std)
print("")

# print(randomResults)
print("Mean MSE for some random set:")
print(randomMean)
print("")

print("Standard Deviation of MSE for some random set:")
print(randomStd)

Mean MSE for each Movie set:
[120.45013461077006, 274.28712412458145, 788.22095293805796]

Standard Deviation of MSE for each Movie set:
[17.123181973182938, 22.247068708451604, 347.62305874261222]

Mean MSE for some random set:
[8471.0240386070018, 10090.292642047991, 6235.0035556466237]

Standard MSE of Entropy for some random set:
[6760.1707122884181, 5618.7932498191831, 4902.5995057476575]


In [16]:
#Compare similarity
ttest = stats.ttest_ind(results[0] + results[1] + results[2], randomResults[0] + randomResults[1] + randomResults[2])
print("T-test Results for 2 samples:")
print(ttest)

T-test Results for 2 samples:
Ttest_indResult(statistic=-13.306605468171886, pvalue=1.2349464564406418e-29)


In [17]:
#Null Hypothesis = There is difference between samples
#alpha = 0.05
#p-value = 1.2349464564406418e-29
#Since p-value <<< alpha, reject the null hypothesis. 
#Therefore, the difference between the population means is statistically significant