Copyright 2015, Brown University, Providence, RI.

                        All Rights Reserved

Permission to use, copy, modify, and distribute this software and its
documentation for any purpose other than its incorporation into a
commercial product is hereby granted without fee, provided that the
above copyright notice appear in all copies and that both that
copyright notice and this permission notice appear in supporting
documentation, and that the name of Brown University not be used in
advertising or publicity pertaining to distribution of the software
without specific, written prior permission.

BROWN UNIVERSITY DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ANY
PARTICULAR PURPOSE.  IN NO EVENT SHALL BROWN UNIVERSITY BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.


In [9]:
# perform PCA on all labeled images

import numpy as np
from matplotlib import pyplot as plt
import os
from scipy.misc import imsave
from scipy.misc import imresize
from PIL import Image
from scipy.stats import chisquare
import csv
import pickle

# size of the long side of the image (usually 10% edge is cut out)
# for feature detection, img_size = 400 is recommended
img_size = 200
var_ret = [0.70,0.75,0.80,0.85,0.90,0.95,0.99]
filename = '../PCA_whale_faces/PCA_face_'+str(img_size)+'x'+str(img_size/2)+'_all_var0'

mean = 111.031397463 
std = 18.4898562094

def read_data_csv(name):
   """
   read cvs file and return contents
   """

   file_name = []
   whale_ID = []
   
   with open(name, 'rb') as f:
      reader = csv.reader(f)

      # skip header
      reader.next()

      for row in reader:  
         file_name.append(row[0])
         whale_ID.append(row[1])
           
   file_name = np.array(file_name)
   whale_ID = np.array(whale_ID)
   
   return file_name,whale_ID


def PCA(X, varRetained = [0.95],filename = 'PCA_data.dat',label = [], files = []):
    '''
    Performs the Principal Coponent analysis of the Matrix X
    Matrix must be n * m dimensions
    where n is # features
    m is # examples
    '''
    # Compute Covariance Matrix Sigma
    (n, m) = X.shape

    Sigma = 1.0 / float(m) * np.dot(X, np.transpose(X))
    # Compute eigenvectors and eigenvalues of Sigma
    U, s, V = np.linalg.svd(Sigma)

    # compute the value k: number of minumum features that 
    # retains the given variance
    s_tot = np.sum(s)
        
    var_i = np.array([np.sum(s[: i + 1]) / s_tot * 100.0 for i in range(n)])
    
    k = np.zeros(len(varRetained))
    for i in range(len(k)):
        k[i] = len(var_i[var_i < (varRetained[i] * 100e0)])

        print '%.2f %% variance retained in %d dimensions' % (var_i[k[i]], k[i])

        # compute the reduced dimensional features 
        U_reduced = U[:, : k[i]]
        Z = np.dot(np.transpose(U_reduced),X)

        # pickle dump the results
        f = open(filename+str(int(varRetained[i]*100e0))+'.dat','w')
        pickle.dump([Z, U_reduced, k[i],label, files],f)
        f.close() 

    return 


name,ID = read_data_csv('../train.csv')

faces = np.unique(ID)

# find files in these directories
files = []

for i in range(len(faces)):
    for file in os.listdir("../PCA_whale_faces/"+faces[i]):
        if file.endswith('.jpg'):
            files.append(faces[i]+'/'+file)

imgs = np.zeros([len(files),img_size*0.8,(img_size/2e0 - img_size*0.3)])
IDs = np.zeros([len(files)])


print 'loading images'

for j in range(len(files)):
    img = imresize(np.array(Image.open('../whale_faces/'+files[j][12:]).convert('L')),[img_size,img_size/2])[img_size/10:-img_size/10,img_size*0.15:-img_size*0.15]
    # standardize image
    img = (img - np.mean(img)) / np.std(img)
    # restandardize image
    img = (img*std*3e0)+mean
    img[img < 0e0] = 0e0
    img[img > 255e0] = 255e0
    img = np.array(img, dtype='uint8')    

    #if j%2 == 0:
    #    # flip the image
    #    imgs[j,:] = np.fliplr(img)
    #else:
    imgs[j,:] = img
    
    for i in range(len(faces)):
        if files[j][:11] == faces[i]:
            IDs[j] = faces[i][6:]

            
print 'doing PCA'

# standardize the images
imgs_standard = np.zeros(np.shape(imgs))
mean = np.mean(imgs,axis=(1,2))
std = np.std(imgs,axis=(1,2))
for i in range(len(files)):
    imgs_standard[i,:,:] = (imgs[i,:,:] - mean[i] ) / std[i] 

imgs_l = np.reshape(imgs_standard,[len(files),np.prod(np.shape(imgs[0,:,:]))])
imgs_learn = np.transpose(imgs_l)

print np.shape(imgs_learn)
print np.shape(imgs)

# do PCA and save the results
PCA(imgs_learn,varRetained = var_ret,filename = filename,label = IDs, files = files)

print 'finished'

loading images
doing PCA
(6400, 4544)




(4544, 160, 40)
70.02 % variance retained in 19 dimensions
75.28 % variance retained in 31 dimensions
80.10 % variance retained in 50 dimensions
85.07 % variance retained in 85 dimensions
90.00 % variance retained in 157 dimensions
95.01 % variance retained in 357 dimensions
99.00 % variance retained in 1115 dimensions
finished




In [1]:
# generate a submission file using imgs from whale_faces, PCA with nearest neighbor approach

from sklearn.neighbors import NearestNeighbors
import numpy as np
import pickle
from scipy.misc import imsave
import glob
import csv
from scipy.misc import imresize
from PIL import Image

img_size = 300

print 'loading PCA data and training nearest neighbors'

f = open('../PCA_whale_faces/PCA_face_'+str(img_size)+'x'+str(img_size/2)+'_all_var090.dat','r')
[Z, U_reduced, k, label, files] = pickle.load(f)
f.close()

X_train = np.transpose(Z)
Y_train = label
whale_IDs, ID_counts = np.unique(label,return_counts=True)

nbrs = NearestNeighbors(n_neighbors=10).fit(X_train)

print 'reading images'

files_test = []
with open('../sample_submission.csv', 'rb') as f:
    reader = csv.reader(f)

    # skip header
    header = reader.next()
    
    for row in reader:  
        files_test.append(row[0])
    
# read images
imgs = np.zeros([len(files_test),img_size*0.8,(img_size/2e0 - img_size*0.3)])
mean = np.zeros(len(files_test))
std = np.zeros(len(files_test))
for j in range(len(files_test)):
    img = imresize(np.array(Image.open('../whale_faces/'+files_test[j]).convert('L')),[img_size,img_size/2])[img_size/10:-img_size/10,img_size*0.15:-img_size*0.15]
    # standardize image
    mean[j] = np.mean(img)
    std[j] = np.std(img)
    img = (img - mean[j]) / std[j]
    imgs[j,:] = img
    
# reshape array
imgs_r = np.reshape(imgs,[len(files_test),np.prod(np.shape(imgs[0,:,:]))])
# project imgs_r to U_reduced
X_test = np.transpose(np.dot(np.transpose(U_reduced),np.transpose(imgs_r)))

print 'run kneighbors'
distances, indcs = nbrs.kneighbors(X_test)
print 'finished'

loading PCA data and training nearest neighbors
reading images




run kneighbors
finished


In [1]:

print 'create submission file'

def read_data_csv(name):
   """
   read cvs file and return contents
   """

   file_name = []
   whale_ID = []
   
   with open(name, 'rb') as f:
      reader = csv.reader(f)

      # skip header
      reader.next()

      for row in reader:  
         file_name.append(row[0])
         whale_ID.append(int(row[1][-5:]))
           
   file_name = np.array(file_name)
   whale_ID = np.array(whale_ID)
   
   return file_name,whale_ID


name,ID = read_data_csv('../train.csv')
whale_IDs_all, ID_counts_all = np.unique(ID,return_counts=True)


# scaling constants to try
const1 = np.max(1e0*ID_counts/float(len(whale_IDs))) / np.min(1e0/distances[:,0])
const2 = np.max(1e0*ID_counts/float(len(whale_IDs))) / np.min(1e0/distances[:,0]**2e0)
const3 = np.max(1e0*ID_counts/float(len(whale_IDs))) / np.min(1e0/distances[:,0]**3e0)
const4 = np.max(1e0*ID_counts/float(len(whale_IDs))) / np.min(1e0/distances[:,0]**4e0)
const5 = np.max(1e0*ID_counts/float(len(whale_IDs))) / np.min(1e0/distances[:,0]**5e0)

const32 = np.min(1e0*ID_counts/float(len(whale_IDs))) / np.min(1e0/distances[:,0]**3e0)


with open('../submissions/submission_newcascade_leancut_sorted_300_varret90_c4\d4_nonuni.csv', 'wb') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)
    # loop through the images
    for i in range(len(files_test)):
        # generate scores for the image
        scores = np.zeros(len(whale_IDs_all))

        # loop through the whale IDs
        for j in range(len(whale_IDs_all)):
            scores[j] = float(ID_counts_all[j])/float(len(whale_IDs_all))
        
        for j in range(len(whale_IDs_all)):
            
            if Y_train[indcs[i,0]] == whale_IDs_all[j]:
                scores[j] = 4e0*const4/(distances[i,0]**4e0)
            #if Y_train[indcs[i,1]] == whale_IDs[j]:
            #    scores[j] = 1
            #if Y_train[indcs[i,2]] == whale_IDs[j]:
            #    scores[j] = 1
        writer.writerow([files_test[i]]+list(scores))
print 'done'

create submission file


NameError: global name 'csv' is not defined