In [2]:
'''
 - Importing Libraries
'''

import numpy as np
from PIL import Image
from sklearn.neighbors import KNeighborsClassifier
import os
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from plot_confusion_matrix import plot_confusion_matrix
import matplotlib.pyplot as plt

In [3]:
'''
 - Importing the Images
'''

from os import listdir
from os.path import isfile, join


BIG_CITIES = "big.cities/"
BEACHES = "beaches_n_resorts/"
FOREST = "forest/"

data_path = os.getcwd()[:-6] + 'Data/'

big_cities_path = data_path + BIG_CITIES
forest_path = data_path + FOREST
beaches_path = data_path + BEACHES

big_cities_images = [Image.open(big_cities_path+img).resize((32,32),Image.ANTIALIAS)
                         for img in listdir(big_cities_path)
                             if Image.open(big_cities_path+img).size == (640,640)]

forest_images = [Image.open(forest_path+img).resize((32,32), Image.ANTIALIAS)
                         for img in listdir(forest_path)
                             if Image.open(forest_path+img).size == (640,640)]

beaches_images = [Image.open(beaches_path+img).resize((32,32), Image.ANTIALIAS)
                         for img in listdir(beaches_path)
                             if Image.open(beaches_path+img).size == (640,640)]

In [4]:
len(big_cities_images), len(forest_images), len(beaches_images)

(259, 292, 394)

In [5]:
beaches_hist = [img.histogram() for img in beaches_images]
forest_hist = [img.histogram() for img in forest_images]
big_cities_hist = [img.histogram() for img in big_cities_images]


In [66]:
len(beaches_hist[0])

768

In [6]:
'''
- We
'''

def hist_sum_cumulative(hist, set_bin=10):
    return [sum(hist[z:z+256][i:i+set_bin]) for z in xrange(0,768,256) for i in xrange(0,256,set_bin)]

beaches_hist_cumlative = [hist_sum_cumulative(hist) for hist in beaches_hist]
forest_hist_cumlative  =  [hist_sum_cumulative(hist) for hist in forest_hist]
big_cities_hist_cumlative = [hist_sum_cumulative(hist) for hist in big_cities_hist]

In [67]:
len(beaches_hist_cumlative[0])

78

## Making some visualizations using the histogram by each class 

In [7]:
beaches_hist_cumlative_mean = [sum(i)/len(i) for i in zip(*beaches_hist_cumlative)]
forest_hist_cumlative_mean = [sum(i)/len(i) for i in zip(*forest_hist_cumlative)]
big_cities_hist_cumlative_mean = [sum(i)/len(i) for i in zip(*big_cities_hist_cumlative)]

## Plotting Histogram for the beaches pictures

In [11]:
for i in range(0, 78):
    if i <= 25:
        plt.bar(i, beaches_hist_cumlative_mean[i], color = "red",  alpha=0.3)
    elif i >= 26 and i <= 51:
        plt.bar(i, beaches_hist_cumlative_mean[i], color = "green",  alpha=0.3)
    else:
         plt.bar(i, beaches_hist_cumlative_mean[i], color = "blue", alpha=0.3)
plt.show()

## Plotting Histogram for the Forest pictures

In [12]:
for i in range(0, 78):
    if i <= 25:
        plt.bar(i, forest_hist_cumlative_mean[i], color = "red",  alpha=0.3)
    elif i >= 26 and i <= 51:
        plt.bar(i, forest_hist_cumlative_mean[i], color = "green",  alpha=0.3)
    else:
         plt.bar(i, forest_hist_cumlative_mean[i], color = "blue", alpha=0.3)
plt.show()

## Cities

In [13]:

for i in range(0, 78):
    if i <= 25:
        plt.bar(i, big_cities_hist_cumlative_mean[i], color = "red",  alpha=0.3)
    elif i >= 26 and i <= 51:
        plt.bar(i, big_cities_hist_cumlative_mean[i], color = "green",  alpha=0.3)
    else:
         plt.bar(i, big_cities_hist_cumlative_mean[i], color = "blue", alpha=0.3)
plt.show()

## Creating my labels

In [14]:
big_cities_label = [0 for i in range(len(big_cities_hist_cumlative))]
forest_label = [1 for i in range(len(beaches_hist_cumlative))]
beaches_label = [2 for i in range(len(forest_hist_cumlative))]

## Going to Pandas DataFrame

In [15]:
data = pd.DataFrame({"Features": big_cities_hist_cumlative + forest_hist_cumlative + beaches_hist_cumlative,
                     "Label"   : big_cities_label+ forest_label + beaches_label
                    })

X_all = list(data["Features"])
y_all = list(data["Label"])

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=11)


In [17]:
len(X_train), len(y_train)

(756, 756)

In [45]:
knn = KNeighborsClassifier()
# knn.fit(X_train, y_train)

In [74]:
from sklearn.grid_search import GridSearchCV
clf = GridSearchCV(knn, {"n_neighbors": range(1, 16),
                         "p": [1,2,3,4,5]})
clf.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 'p': [1, 2, 3, 4, 5]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [75]:
clf.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=12, p=2,
           weights='uniform')

In [71]:
len(X_test), len(y_test)

(189, 189)

In [72]:
pred=clf.predict(X_test)
cm=confusion_matrix(y_test, pred)
cm

array([[26, 12, 11],
       [13, 52, 22],
       [14,  8, 31]])

In [86]:
plot_confusion_matrix(cm, classes=["big_cities", "forest", "beaches"],
                      title='Confusion matrix, without normalization')
plt.show()

Confusion matrix, without normalization
[[26 12 11]
 [13 52 22]
 [14  8 31]]


In [76]:
print "Accuracy: ", sum(cm.diagonal())/float(len(X_test))

Accuracy:  0.5767195767195767


In [93]:
print "##### Recall por Classe #####"
print """Urbano: {}\nFloresta: {}\nPraias: {}\n""".format(*[float(cm[i][z])/sum(cm[i]) for i in xrange(len(cm)) for z in xrange(len(cm)) if i == z])

##### Recall por Classe #####
Urbano: 0.530612244898
Floresta: 0.597701149425
Praias: 0.584905660377



In [104]:
print "##### Precision por Classe #####"
print """Urbano: {}\nFloresta: {}\nPraias: {}\n""".format(*[float(cm[i][z])/sum(cm)[i] for i in xrange(len(cm)) for z in xrange(len(cm)) if i == z])

##### Precision por Classe #####
Urbano: 0.490566037736
Floresta: 0.722222222222
Praias: 0.484375



In [121]:
precision_array = [float(cm[i][z])/sum(cm)[i] for i in xrange(len(cm)) for z in xrange(len(cm)) if i == z]
precision_df = pd.DataFrame(precision_array)
plt.bar(range(3), precision_array)
plt.show()

In [111]:
precision_array

array([0.49056604, 0.72222222, 0.484375  ])

In [126]:
barlist=plt.bar(["1",2,3,4], [1,2,3,4])
# barlist[0].set_color('r')
plt.show()

In [122]:

df2 = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd'])
df2

Unnamed: 0,a,b,c,d
0,0.210864,0.28375,0.999268,0.815722
1,0.371582,0.888191,0.388134,0.902058
2,0.90653,0.985154,0.069128,0.073556
3,0.496712,0.125646,0.649369,0.180437
4,0.735241,0.01032,0.62921,0.593488
5,0.548428,0.550477,0.591045,0.395415
6,0.212809,0.540006,0.507136,0.969433
7,0.072424,0.552103,0.350495,0.669906
8,0.105646,0.981972,0.070129,0.925266
9,0.973554,0.069413,0.179034,0.991953
