# Preprocessing

In [12]:
import numpy as np
from matplotlib import pyplot as plt 
import glob
from skimage.io import imread

daisy,dandelion,rose,sunflower,tulip = [],[],[],[],[]

for filename in glob.glob('flowerscomp2/daisy/*.jpg'): # for daisies
    im = imread(filename,True).flatten()
    daisy.append(im)    

for filename in glob.glob('flowerscomp2/dandelion/*.jpg'): # for dandelion
    im = imread(filename,True).flatten()
    dandelion.append(im)
    
for filename in glob.glob('flowerscomp2/rose/*.jpg'): # for roses
    im = imread(filename,True).flatten()
    rose.append(im)
    
for filename in glob.glob('flowerscomp2/sunflower/*.jpg'): # for sunflowers
    im = imread(filename,True).flatten()
    sunflower.append(im)
    
for filename in glob.glob('flowerscomp2/tulip/*.jpg'): # for tulip
    im = imread(filename,True).flatten()
    tulip.append(im)
    
daisy_arr=np.asarray(daisy)
dandelion_arr=np.asarray(dandelion)
rose_arr=np.asarray(rose)
sunflower_arr=np.asarray(sunflower)
tulip_arr=np.asarray(tulip)

# daisy = 0, dandelion = 1, rose = 2, sunflower = 3, tulip = 4
daisy_labels=np.zeros((len(daisy_arr),1))
dandelion_labels=np.ones((len(dandelion_arr),1))
rose_labels=2*np.ones((len(rose_arr),1))
sunflower_labels=3*np.ones((len(sunflower_arr),1))
tulip_labels=4*np.ones((len(tulip_arr),1))


# Here are the labels and the dataset 
labels=np.vstack((daisy_labels,dandelion_labels,rose_labels,sunflower_labels,tulip_labels))
dataset=np.vstack((daisy_arr,dandelion_arr,rose_arr,sunflower_arr,tulip_arr))

In [6]:
def normalize(A):
    stds = [np.std(A[r,:]) for r in range(A.shape[0])]
    for r in range(A.shape[0]):
        A[r,:] /= stds[r]
    return A

def centralize(A):
    means = [np.mean(A[r,:]) for r in range(A.shape[0])]
    for r in range(A.shape[0]):
        A[r,:] -= means[r]
    return A

def unisonShuffledCopies(a, b):
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return a[p], b[p]

def PCA(A, numComponents):
    # First, center the data
    A = normalize(centralize(A))
    
    # Now, compute the SVD, and reduce the dimensions of A.
    U, Sigma, Vt = np.linalg.svd(A)
    D = np.matmul(A.T, U[:,:numComponents])
    
    return D.T

In [None]:
# means of the photos
fig=plt.figure(figsize=(20, 20))
fig.add_subplot(1,5,1)
plt.imshow(np.reshape(daisy_arr.mean(0),(100,100)),cmap='bone')
plt.title("Daisy, mean")
fig.add_subplot(1,5,2)
plt.imshow(np.reshape(dandelion_arr.mean(0),(100,100)),cmap='bone')
plt.title("Dandelion, mean")
fig.add_subplot(1,5,3)
plt.imshow(np.reshape(rose_arr.mean(0),(100,100)),cmap='bone')
plt.title("Rose, mean")
fig.add_subplot(1,5,4)
plt.imshow(np.reshape(sunflower_arr.mean(0),(100,100)),cmap='bone')
plt.title("Sunflower, mean")
fig.add_subplot(1,5,5)
plt.imshow(np.reshape(tulip_arr.mean(0),(100,100)),cmap='bone')
plt.title("Tulip, mean");

In [None]:
# Printing top principal components 
fig=plt.figure(figsize=(8,8))
centered_daisy = daisy_arr.T-np.reshape(daisy_arr.T.mean(1),(10000,1))
centered_dandelion = dandelion_arr.T-np.reshape(dandelion_arr.T.mean(1),(10000,1))
u1,s1,v1 = np.linalg.svd(centered_daisy)
u2,s2,v2 = np.linalg.svd(centered_dandelion)
component1,component2 = u1[:,[0]],u1[:,[1]]
component3,component4 = u2[:,[0]],u2[:,[1]]
fig.add_subplot(221)
plt.imshow(np.reshape(component1,(100,100)),cmap='bone')
plt.title("Daisy, first component")
fig.add_subplot(222)
plt.imshow(np.reshape(component2,(100,100)),cmap='bone')
plt.title("Daisy, second component");
fig.add_subplot(223)
plt.imshow(np.reshape((component3),(100,100)),cmap='bone')
plt.title("Dandelion, first component")
fig.add_subplot(224)
plt.imshow(np.reshape((component4),(100,100)),cmap='bone')
plt.title("Dandelion, second component");


In [None]:
# Centering the dataset 
mean=np.reshape(dataset.T.mean(1),(len(dataset.T),1))
centered=dataset.T-mean

i = 0
stds = [np.std(row) for row in centered]
normalized_samples=[]
for row in centered:
    normalized_samples.append(row/stds[i])
    i+=1

# Here I used centered to mean centered and normalized
centered = np.asarray(normalized_samples)

U,S,V=np.linalg.svd(centered)

# Determining Variation
total_s = 0
fnorm = np.linalg.norm(centered,'fro')
for k in range(2,12):
    total_s += S[k]**2
    print(k,(total_s/fnorm**2)*100)


In [None]:
# Visualization of the datapoints 


p2c = (U[:,[0,1]].T@centered).T
points = [p2c[0:768],p2c[768:1820],p2c[1820:2603],p2c[2603:3337],p2c[3337:4321]]
colors = ['r','b','g','k','m']
which_flower = ['daisy','dandelion','rose','sunflower','tulip']
fig = plt.figure()

for i in range(5):
    x,y = points[i][:,0], points[i][:,1]
    plt.scatter(x,y,c=colors[i],label=which_flower[i])
plt.title('Two Dimensional PCA Features')
plt.legend(loc='upper right');

# k-Nearest Neighbors (Bill Lee)

In [None]:
from heapq import heappush, heappop
from scipy import stats

def kNN(k,testset,testlabels,trainset,trainlabels):
    # distance stores threeple (euclidan distance, training class, actual test class)
    confusion = np.zeros((5,5))
    for i in range(len(testset)):
        distances = []
        for j in range(len(trainset)):
            heappush(distances,(np.linalg.norm(testset[i]-trainset[j]),trainlabels[j]))
    
        # now take the k nearest
        tally = []
        for near in range(k):
            tally.append(heappop(distances)[1])
        
        winner = int(stats.mode(tally)[0])
        confusion[int(testlabels[i])][winner] += 1
    
    return confusion
        

def fiveFoldCV(data,labels):
    mega_confusion=[]
    for k in range(5,10,2):
        total_confusion = np.zeros((5,5))
        for i in range(5):
            sindex1 = (i*864)
            sindex2 = 4321 if i==4 else (i+1)*864
            test_set = data[sindex1:sindex2]
            test_labels = labels[sindex1:sindex2]
            training_set = np.concatenate((data[sindex2:4321],data[0:sindex1]))
            training_labels = np.concatenate((labels[sindex2:4321],labels[0:sindex1]))
            temp_confusion = kNN(k, test_set, test_labels, training_set, training_labels)
            total_confusion += temp_confusion
        mega_confusion.append(total_confusion)
        
    return mega_confusion
 
mega_confusion = fiveFoldCV(workable_samples.T,workable_labels)


In [None]:
finalTopk=U[:,0:10]
new_dataset=finalTopk.T@centered

def scramble(samples,labels):
    fix_labels=np.reshape(labels,(1,labels.shape[0]))
    mashed = np.vstack((samples,fix_labels)).T
    np.random.shuffle(mashed)
    labels_out = mashed.T[mashed.T.shape[0]-1]
    samples_out = mashed.T[0:mashed.T.shape[0]-1]
    return [samples_out,labels_out]
    
workable_samples, workable_labels = scramble(new_dataset,labels)

In [None]:
for i in range(len(mega_confusion)):
    print("k (nearest neighbors)",((2*(i+1))+1),"with confusion matrix\n",mega_confusion[i])

In [None]:
k = 3
for matrix in mega_confusion:
    correct = 0
    total = 0
    for i in range(5):
        for j in range(5):
            if(i==j):
                correct+=matrix[i][j]
            total += matrix[i][j]
    print(k, "with correct classification percentage", (correct/total)*100)
    k += 2
                            

# Support Vector Machine (Lester Fan)

In [None]:
def makeScatterPlot(title, pointsLists, colors, markers, xlabel = '', ylabel = ''):
    # Create plot
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
     
    for i, pointsList in enumerate(pointsLists):
        currColor = colors[i]
        currMarker = markers[i]
        for point in pointsList:
            x, y = point
            ax.scatter(x, y, c=currColor, marker=currMarker)
        
    # Temporary, add line
    axes = plt.gca()
    x_vals = np.array(axes.get_xlim())
    y_vals = 0 + 1 * x_vals
    plt.plot(x_vals, y_vals, '--')
    
    # plt.xlim(0, 100)
    # plt.ylim(0, 100)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend(loc=2)
    plt.show()

def calcError(y1, y2):
    return 1 if y1 != y2 else 0

def KCVsplit(k, X, y):
    """
    Splits the training data (X) into k folds in which one fold is used for testing
    and all the others are used for training. Does this k times. 
    
    :param int k
    :param np.array X
    :param np.array y
    :return: (folds, Xlist, ylist, table) where folds is a list of all 
             the folds, Xlist is a list of np.arrays
             corresponding to the X used for training in each iteration 
             (all the folds except for index i),
             ylist is a list of np.arrays corresponding to the y used for training
             and table is a hash table mapping rows in X to their corresponding
             label y.
    :rtype: tuple of (list of np.array, np.array, np.array, dict)
    """
    Xlist = []
    ylist = []
    table = {}
    
    # Set table up
    for i, row in enumerate(X):
        table[str(row)] = y[i]
    
    # Split X up into folds
    stepSize = int(X.shape[0] / k) + 1
    steps = [min(X.shape[0], stepSize * n) for n in range(1, k)]
    folds = np.split(X, steps)
    foldLabels = np.split(y, steps)
    
    for i, currFold in enumerate(folds):
        # Get XTrain and yTrain with all the other folds except currFold
        otherFolds = [otherFold for j, otherFold in enumerate(folds) if j != i]
        otherFoldLabels = [label for j, label in enumerate(foldLabels) if j != i]
        XTrain = otherFolds[0]
        yTrain = otherFoldLabels[0]
        for j in range(1, len(otherFolds)):
            XTrain = np.append(XTrain, otherFolds[j], axis=0)
            yTrain = np.append(yTrain, otherFoldLabels[j], axis=0)
        Xlist.append(XTrain)
        ylist.append(yTrain)
    
    return (folds, Xlist, ylist, table)

def performValidation(k, X, y, model, plotDiffs = False):
    """
    Performs cross validation
    """
    folds, Xlist, ylist, table = KCVsplit(k, X, y)
    
    confusionMatrices = []
    foldSizes = []
    
    # For each fold i, use i as testing, and use all the other
    # i - 1 folds for training. Calculate the errors for each and store
    # them in avgErrors
    predTable = {}
    avgErrors = []
    for i in range(len(Xlist)):
        currFold = folds[i]
        XTrain = Xlist[i]
        yTrain = ylist[i]
        
        # Make a confusion matrix
        confusionMatrix = [[0 for c in range(5)] for r in range(5)]
        
        # Train the model
        model.fit(XTrain, yTrain)
        
        # Now, evaluate the trained model against the actual labels,
        # keep track of total error!
        totalError = 0
        for row in currFold:
            
            # Get the current predicted value
            currRow = row.reshape(1, -1)
            yPredicted = model.predict(currRow)
            
            predicted = yPredicted[0]
            actual = table[str(row)][0]
            
            # Update the confusion matrix
            confusionMatrix[int(predicted)][int(actual)] += 1
            
            totalError += calcError(predicted, actual)
            predTable[str(row)] = yPredicted
        # totalError /= len(currFold)
        avgErrors.append(totalError)
        foldSizes.append(len(currFold))
        confusionMatrices.append(np.array(confusionMatrix))
          
    predictions = []
    actuals = []
    plotPoints = []
    for row in X:
        hashVal = str(row)
        predictions.append(predTable[hashVal][0])
        actuals.append(table[hashVal][0])
    plotPoints = [(predictions[i], actuals[i]) for i in range(len(actuals))]

    # Optionally, plot the measured vs predicted values
    if plotDiffs:
        makeScatterPlot("Measured vs Predicted", 
                        [plotPoints], 
                        ['r'], 
                        ['.'],
                        'Predicted',
                        'Actual')
    return avgErrors, foldSizes, confusionMatrices

In [19]:
from sklearn.svm import SVC
model = SVC()
print(dataset.shape)
pcaData = PCA(dataset.T, 100).T
pcaData = dataset
print(pcaData.shape)
k = 5

(4321, 10000)
(4321, 10000)


In [None]:
pcaData, labels = unisonShuffledCopies(pcaData, labels)
errors, foldSizes, confusionMatrices = performValidation(k, pcaData, labels, model)
for i, confusionMatrix in enumerate(confusionMatrices):
    print("Confusion Matrix {}".format(i))
    print(confusionMatrix)
    print("Accuracy rate = {}".format(1 - float(errors[i]) / foldSizes[i]))
    print("")


# Naive Bayes Classifier (Anton Maliev)

In [16]:
import numpy.linalg as la;
def Gaussian(x, mean, cov, dim) :
    return 1/np.sqrt((2*np.pi)**dim*la.norm(x))*np.exp(-np.matmul(np.matmul(np.transpose(x-mean),np.inv(cov)),x-mean)/2);
def NBClassifier(training, trlabels, test, dim) :
    # Split training data and calculate means and standard deviations
    train = []; means = []; stds = [];
    for i in range(5) :
        train.append(training[np.ix_((0,1),np.where(trlabels==0)[1])]);
        means.append(train[i].mean(1));
        stds.append(train[i].std(1));
#     means = np.concatenate((np.concatenate((train0.mean(1),train1.mean(1)),1),train2.mean(1)),1);
#     stds = np.concatenate((np.concatenate((train0.std(1),train1.std(1)),1),train2.std(1)),1);
    # Classify each test datum based on Gaussian output
    result = [];
    for i in range(len(result)) :
        x = [];
        for j in range(dim) :
            x.append(test[j,i]);
#         x1 = test[0,i];
#         x2 = test[1,i];
        prob = [];
        for j in range(5) :
            gauss = 1;
            for k in range(dim) :
                gauss *= Gaussian(x[k],means[k,j],stds[k,j],np.shape(x)[1])
            prob.append();
#         prob[0] = Gaussian(x1,means[0,0],stds[0,0]) * Gaussian(x2,means[1,0],stds[1,0]);
#         prob[1] = Gaussian(x1,means[0,1],stds[0,1]) * Gaussian(x2,means[1,1],stds[1,1]);
#         prob[2] = Gaussian(x1,means[0,2],stds[0,2]) * Gaussian(x2,means[1,2],stds[1,2]);
        result.append(np.argmax(prob));
    return result;

In [23]:
pcad, labels = unisonShuffledCopies(pcaData, labels);
print(np.shape(pcad),np.shape(labels));

N = np.shape(pcad)[0];
M = np.shape(pcad)[1];
folds = [0,N//5,2*N//5,3*N//5,4*N//5,N];

kcross = [];
for i in range(5) :
    print(i);
    trs = np.concatenate((pcad[0:M,0:folds[i+1]],pcad[0:M,folds[i+1]:folds[5]]),1);
    lbs = np.concatenate((pcad[0:folds[i+1]],pcad[folds[i+1],folds[5]]),1);
    tst = pcad[0:M,folds[i]:folds[i+1]];
    kcross.append(NBClassifier(trs,lbs,tst));

# kcross1 = NBClassifier(feat2a[0:2,36:178],feat2a[2,36:178],feat2a[0:2,0:36]);
# train2 = np.concatenate((feat2a[0:2,0:36],feat2a[0:2,72:178]),1);
# labels2 = np.concatenate((feat2a[2,0:36],feat2a[2,72:178]),1);
# kcross2 = NBClassifier(train2,labels2,feat2a[0:2,36:72]);
# train3 = np.concatenate((feat2a[0:2,0:72],feat2a[0:2,108:178]),1);
# labels3 = np.concatenate((feat2a[2,0:72],feat2a[2,108:178]),1);
# kcross3 = NBClassifier(train3,labels3,feat2a[0:2,72:108]);
# train4 = np.concatenate((feat2a[0:2,0:108],feat2a[0:2,143:178]),1);
# labels4 = np.concatenate((feat2a[2,0:108],feat2a[2,143:178]),1);
# kcross4 = NBClassifier(train4,labels4,feat2a[0:2,108:143]);
# kcross5 = NBClassifier(feat2a[0:2,0:143],feat2a[2,0:143],feat2a[0:2,143:178]);

accs = [];
for i in range(5) :
    accs.append(len(np.where(kcross[i]-feat2a[2,folds[i]:folds[i+1]] == 0)[1])/(folds[i+1]-folds[i]));
#     accs.append(len(np.where(kcross[i]-feat2a[2,36:72] == 0)[1])/36);
#     accs.append(len(np.where(kcross[i]-feat2a[2,72:108] == 0)[1])/36);
#     accs.append(len(np.where(kcross[i]-feat2a[2,108:143] == 0)[1])/36);
#     accs.append(len(np.where(kcross[i]-feat2a[2,143:178] == 0)[1])/36);

print("The accuracies for the five folds are: ",accs);
print("Mean: ",np.mean(accs));
print("Standard deviation: ",np.std(accs));

(4321, 10000) (4321, 1)
0


IndexError: too many indices for array