In [1]:
import numpy as np
from PIL import Image
from sklearn.neighbors import KNeighborsClassifier
import os
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from plot_confusion_matrix import plot_confusion_matrix
import matplotlib.pyplot as plt



In [20]:
from os import listdir
from os.path import isfile, join


BIG_CITIES = "big.cities/"
BEACHES = "beaches_n_resorts/"
FOREST = "forest/"

data_path = os.getcwd()[:-6] + 'Data/'

big_cities_path = data_path + BIG_CITIES
forest_path = data_path + FOREST
beaches_path = data_path + BEACHES

big_cities_images = [Image.open(big_cities_path+img) 
                         for img in listdir(big_cities_path)
                             if Image.open(big_cities_path+img).size == (640,640)]

forest_images = [Image.open(forest_path+img) 
                         for img in listdir(forest_path)
                             if Image.open(forest_path+img).size == (640,640)]

beaches_images = [Image.open(beaches_path+img) 
                         for img in listdir(beaches_path)
                             if Image.open(beaches_path+img).size == (640,640)]

In [21]:
beaches_hist = [img.histogram() for img in beaches_images]
forest_hist = [img.histogram() for img in forest_images]
big_cities_hist = [img.histogram() for img in big_cities_images]


In [22]:
def hist_sum_cumulative(hist, set_bin=10):
    return [sum(hist[z:z+256][i:i+set_bin]) for z in xrange(0,768,256) for i in xrange(0,256,set_bin)]

beaches_hist_cumlative = [hist_sum_cumulative(hist) for hist in beaches_hist]
forest_hist_cumlative  =  [hist_sum_cumulative(hist) for hist in forest_hist]
big_cities_hist_cumlative = [hist_sum_cumulative(hist) for hist in big_cities_hist]

## Making some visualizations using the histogram by each class 

In [23]:
beaches_hist_cumlative_mean = [sum(i)/len(i) for i in zip(*beaches_hist_cumlative)]
forest_hist_cumlative_mean = [sum(i)/len(i) for i in zip(*forest_hist_cumlative)]
big_cities_hist_cumlative_mean = [sum(i)/len(i) for i in zip(*big_cities_hist_cumlative)]

In [24]:
# import matplotlib.pyplot as plt
# hist = Image.open(beaches_path+'29095393_381193472357970_6309400991837978624_n.jpg').histogram()
# red, green, blue = hist[0:256],hist[256:512],hist[512:768]

In [25]:
# red_sum_cumulative= [sum(red[i:i+10]) for i in xrange(0,len(red),10)]
# gree_sum_cumulative= [sum(green[i:i+10]) for i in xrange(0,len(red),10)]
# blue_sum_cumulative=[sum(blue[i:i+10]) for i in xrange(0,len(red),10)]

## Beaches

In [27]:
beaches_red = beaches_hist_cumlative_mean[0:26]

for i in range(0, 26):

    plt.bar(i, beaches_red[i], color = "red",  alpha=0.3)

plt.show()

In [28]:
beaches_green = beaches_hist_cumlative_mean[26:52]
for i in range(0, 26):

    plt.bar(i, beaches_green[i], color = "green",  alpha=0.3)

plt.show()

In [29]:
beaches_blue = beaches_hist_cumlative_mean[52:78]
for i in range(0, 26):

    plt.bar(i, beaches_blue[i], color = "blue", alpha=0.3)

plt.show()

## Forest

In [30]:
forest_red = forest_hist_cumlative_mean[0:26]

for i in range(0, len(forest_red)):

    plt.bar(i, forest_red[i], color = "red",  alpha=0.3)

plt.show()

In [95]:
forest_green = forest_hist_cumlative_mean[26:52]

for i in range(0, len(forest_green)):

    plt.bar(i, forest_green[i], color = "green",  alpha=0.3)

plt.show()

In [31]:
forest_blue = forest_hist_cumlative_mean[52:78]

for i in range(0, len(forest_blue)):

    plt.bar(i, forest_blue[i], color = "blue",  alpha=0.3)

plt.show()

## Cities

In [97]:
big_cities_red = big_cities_hist_cumlative_mean[0:26]

for i in range(0, 26):

    plt.bar(i, big_cities_red[i], color = "red",  alpha=0.3)

plt.show()

In [98]:
big_cities_green = big_cities_hist_cumlative_mean[26:52]

for i in range(0, 26):

    plt.bar(i, big_cities_green[i], color = "green",  alpha=0.3)

plt.show()

In [99]:
big_cities_blue = big_cities_hist_cumlative_mean[52:78]

for i in range(0, 26):

    plt.bar(i, big_cities_blue[i], color = "blue",  alpha=0.3)

plt.show()

In [101]:
len(big_cities_hist_cumlative), len(beaches_hist_cumlative), len(forest_hist_cumlative)

(259, 394, 292)

In [4]:
# big_cities_array = [np.array(img).flatten() for img in big_cities_images]
# forest_array = [np.array(img).flatten() for img in forest_images]
# beaches_array= [np.array(img).flatten() for img in beaches_images]

In [32]:
big_cities_label = [0 for i in range(len(big_cities_hist_cumlative))]
forest_label = [1 for i in range(len(beaches_hist_cumlative))]
beaches_label = [2 for i in range(len(forest_hist_cumlative))]

In [33]:
len(big_cities_label), len(forest_label), len(beaches_label)

(259, 394, 292)

In [34]:
data = pd.DataFrame({"Features": big_cities_hist_cumlative + forest_hist_cumlative + beaches_hist_cumlative,
                     "Label"   : big_cities_label+ forest_label + beaches_label
                    })

X_all = list(data["Features"])
y_all = list(data["Label"])

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=11)


In [36]:
len(X_train), len(y_train)

(756, 756)

In [37]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [38]:
len(X_test), len(y_test)

(189, 189)

In [39]:
pred=knn.predict(X_test)
cm=confusion_matrix(y_test, pred)
cm

array([[34,  5, 10],
       [12, 60, 15],
       [15,  7, 31]])

In [40]:
plot_confusion_matrix(cm, classes=["big_cities", "forest", "beaches"],
                      title='Confusion matrix, without normalization')
plt.show()

Confusion matrix, without normalization
[[34  5 10]
 [12 60 15]
 [15  7 31]]


In [41]:
print "Accuracy: ", sum(cm.diagonal())/float(len(X_test))

Accuracy:  0.6613756613756614
