In [2]:
'''
 - Importing Libraries
'''

import numpy as np
from PIL import Image
from sklearn.neighbors import KNeighborsClassifier
import os
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from plot_confusion_matrix import plot_confusion_matrix
import matplotlib.pyplot as plt

In [3]:
'''
 - Importing the Images
'''

from os import listdir
from os.path import isfile, join


BIG_CITIES = "big.cities/"
BEACHES = "beaches_n_resorts/"
FOREST = "forest/"

data_path = os.getcwd()[:-6] + 'Data/'

big_cities_path = data_path + BIG_CITIES
forest_path = data_path + FOREST
beaches_path = data_path + BEACHES

big_cities_images = [Image.open(big_cities_path+img).resize((32,32),Image.ANTIALIAS)
                         for img in listdir(big_cities_path)
                             if Image.open(big_cities_path+img).size == (640,640)]

forest_images = [Image.open(forest_path+img).resize((32,32), Image.ANTIALIAS)
                         for img in listdir(forest_path)
                             if Image.open(forest_path+img).size == (640,640)]

beaches_images = [Image.open(beaches_path+img).resize((32,32), Image.ANTIALIAS)
                         for img in listdir(beaches_path)
                             if Image.open(beaches_path+img).size == (640,640)]

In [7]:
print "Amount of Urban images: ", len(big_cities_images)
print "Amount of Forest images: ", len(forest_images)
print "Amount of Beaches images: ", len(beaches_images)

Amount of Urban images:  259
Amount of Forest images:  292
Amount of Beaches images:  394


In [8]:
'''
- Creating a histogram for each image
'''

beaches_hist = [img.histogram() for img in beaches_images]
forest_hist = [img.histogram() for img in forest_images]
big_cities_hist = [img.histogram() for img in big_cities_images]

In [9]:
'''
- Because we dont have enough computional resources, we compress the histogram by setting a higher bin.
- The function below does that. It gets each histogram calculates a new one based in a bin equals to ten.
'''

def hist_sum_cumulative(hist, set_bin=10):
    return [sum(hist[z:z+256][i:i+set_bin]) for z in xrange(0,768,256) for i in xrange(0,256,set_bin)]

beaches_hist_cumlative = [hist_sum_cumulative(hist) for hist in beaches_hist]
forest_hist_cumlative  =  [hist_sum_cumulative(hist) for hist in forest_hist]
big_cities_hist_cumlative = [hist_sum_cumulative(hist) for hist in big_cities_hist]

## Making some visualizations using the histogram by each class 

In [10]:
beaches_hist_cumlative_mean = [sum(i)/len(i) for i in zip(*beaches_hist_cumlative)]
forest_hist_cumlative_mean = [sum(i)/len(i) for i in zip(*forest_hist_cumlative)]
big_cities_hist_cumlative_mean = [sum(i)/len(i) for i in zip(*big_cities_hist_cumlative)]

## Plotting Histogram for the beaches pictures

In [11]:
for i in range(0, 78):
    if i <= 25:
        plt.bar(i, beaches_hist_cumlative_mean[i], color = "red",  alpha=0.3)
    elif i >= 26 and i <= 51:
        plt.bar(i, beaches_hist_cumlative_mean[i], color = "green",  alpha=0.3)
    else:
         plt.bar(i, beaches_hist_cumlative_mean[i], color = "blue", alpha=0.3)
plt.show()

## Plotting Histogram for the Forest pictures

In [12]:
for i in range(0, 78):
    if i <= 25:
        plt.bar(i, forest_hist_cumlative_mean[i], color = "red",  alpha=0.3)
    elif i >= 26 and i <= 51:
        plt.bar(i, forest_hist_cumlative_mean[i], color = "green",  alpha=0.3)
    else:
         plt.bar(i, forest_hist_cumlative_mean[i], color = "blue", alpha=0.3)
plt.show()

## Cities

In [13]:

for i in range(0, 78):
    if i <= 25:
        plt.bar(i, big_cities_hist_cumlative_mean[i], color = "red",  alpha=0.3)
    elif i >= 26 and i <= 51:
        plt.bar(i, big_cities_hist_cumlative_mean[i], color = "green",  alpha=0.3)
    else:
         plt.bar(i, big_cities_hist_cumlative_mean[i], color = "blue", alpha=0.3)
plt.show()

In [16]:
'''
- Creating labels
    * 0 means Urban
    * 1 means Forests
    * 2 means Beaches
'''
big_cities_label = [0 for i in range(len(big_cities_hist_cumlative))]
forest_label = [1 for i in range(len(beaches_hist_cumlative))]
beaches_label = [2 for i in range(len(forest_hist_cumlative))]

## Going to Pandas DataFrame and Spliting into train and test

In [17]:
data = pd.DataFrame({"Features": big_cities_hist_cumlative + forest_hist_cumlative + beaches_hist_cumlative,
                     "Label"   : big_cities_label+ forest_label + beaches_label
                    })

X_all = list(data["Features"])
y_all = list(data["Label"])

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=11)


In [20]:
print "Amount of train data:", len(X_train)
print "Amount of test data: ", len(X_test)

Amount of train data: 756
Amount of test data:  189


### Training our model

In [21]:
knn = KNeighborsClassifier()

In [22]:
from sklearn.grid_search import GridSearchCV
clf = GridSearchCV(knn, {"n_neighbors": range(1, 16),
                         "p": [1,2,3,4,5]})
clf.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 'p': [1, 2, 3, 4, 5]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [25]:
print "Best model settings: \n", clf.best_estimator_

Best model settings: 
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=12, p=2,
           weights='uniform')


### Making prediction and evaluating results

In [27]:
pred=clf.predict(X_test)
cm=confusion_matrix(y_test, pred)
acc = int(sum(cm.diagonal())/float(len(X_test))*100)
plot_confusion_matrix(cm, classes=["big_cities", "forest", "beaches"],
                      title='Confusion matrix - Accuracy: {}%'.format(acc))
plt.show()

Confusion matrix, without normalization
[[26 12 11]
 [13 52 22]
 [14  8 31]]


In [30]:
'''
- Ploting Recall by label
'''
recall_array = [float(cm[i][z])/sum(cm[i]) for i in xrange(len(cm)) for z in xrange(len(cm)) if i == z]
fig, ax = plt.subplots()   
barlist=plt.bar(range(3), recall_array)


axes = plt.gca()
axes.set_ylim([0.,1])

barlist[0].set_color('grey')
barlist[1].set_color('green')
barlist[2].set_color('blue')
plt.xticks([.5,1.5,2.5], ["Urbano", "Floresta", "Praias"])
plt.title("Recall by label")

temp = 0
for rect in barlist:
    height = rect.get_height()
    plt.text(rect.get_x() + rect.get_width()/2.0, height, round(recall_array[temp],2), ha='center', va='bottom')
    temp += 1

plt.show()

In [29]:
'''
- Ploting Precision by label
'''
precision_array = [float(cm[i][z])/sum(cm)[i] for i in xrange(len(cm)) for z in xrange(len(cm)) if i == z]
barlist=plt.bar(range(3), precision_array)
barlist[0].set_color('grey')
barlist[1].set_color('green')
barlist[2].set_color('blue')
plt.xticks([.5,1.5,2.5], ["Urbano", "Floresta", "Praias"])
axes = plt.gca()
axes.set_ylim([0.,1.])
plt.title("Precision by label")

temp = 0
for rect in barlist:
    height = rect.get_height()
    plt.text(rect.get_x() + rect.get_width()/2.0, height, round(precision_array[temp],2), ha='center', va='bottom')
    temp += 1
plt.show()