# Action plan

## Goal

The goal here is to take in a very large point cloud and use mini-batch k-means to cluster the points into texture/land types.

To do this, I must be able to read in the data in batches.

The data I want to read in are points, each with an associated patch around it whose features are composed of its neighbours' information.

e.g. 

for a point at (lon, lat) = (x0, y0), I take it and its closest 8 neighbours { (x1, y1), ..., (x8, y8) } and use them to poll the data to construct the feature vector/array ((z0, r0, g0, b0), (z1, r1, g1, b1), ..., (z8, r8, g8, b8)). 

If I could get this data for 1000 points at a time, then I could do 1000-pixel minibatches of minibatchKMeans. 

# Import packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
from DataLoader import DataLoader

In [None]:
from helper_functions import hist3d

# Read in data

In [None]:
POINT_RESOLUTION = 1e-6 # ~10 cm
dl = DataLoader()

In [None]:
dl.readData()

In [None]:
dl.data[0].head()

# Construct neighbours graph

In [None]:
dl.saveKNeighboursBatch(1000000)

In [None]:
lonlat = dl.getLonLatPairs()

In [None]:
lon_unique, lat_unique = dl.uniqueLonLat()

In [None]:
lonlat_unique = dl.uniqueLonLatPairs()

In [None]:
# A little show of sparsity: recorded unique (lon, lat) pairs as a fraction
# of every combination of a unique lon with a unique lat.
n_grid_cells = lat_unique.size * lon_unique.size
point_sparsity = lonlat_unique.shape[0] / n_grid_cells
density_msg = 'density of recorded unique pairs is only {}.'.format(
    np.round(point_sparsity, 3))
print(density_msg)

In [None]:
hist3d(dl.getScaledRgbArray(), azim=-105)

# Finding lonlat neighbours

In [None]:
from sklearn.neighbors import NearestNeighbors

## Nearest Neighbours

In [None]:
NBR_RADIUS = 1e-5 # ~1 metre?

neigh = NearestNeighbors(n_neighbors=10, radius=NBR_RADIUS, n_jobs=-1)
neigh.fit(lonlat)

In [None]:
# Query the k nearest neighbours of every unique (lon, lat) pair in batches
# and save each batch of neighbour indices to its own .npy file.
BATCH_SIZE = 1000000
n_points = lonlat_unique.shape[0]
batch_number = 0
start_idx = 0

while True:
    end_idx = (batch_number + 1) * BATCH_SIZE
    # The final batch absorbs whatever is left, so it always runs to the end
    # of the array (it may hold one row more than BATCH_SIZE).
    is_last_batch = end_idx >= n_points - 1
    if is_last_batch:
        end_idx = n_points

    kneighBatch = neigh.kneighbors(lonlat_unique[start_idx:end_idx],
                                   return_distance=False)
    fileName = './radNeigh_lonlatUnique_{}_{}.npy'.format(dl.group_number, batch_number)
    np.save(fileName, kneighBatch)
    print('Batch {} complete.'.format(batch_number))

    if is_last_batch:
        break
    start_idx = end_idx
    batch_number += 1

In [None]:
!ls -lh radNeigh_lonlatUnique*

In [None]:
from sklearn.metrics import jaccard_similarity_score

In [None]:
def _jaccard(arr1, arr2, assume_unique=True):
    """Return the Jaccard similarity |A & B| / |A | B| of two 1-D arrays.

    Parameters
    ----------
    arr1, arr2 : array_like
        The two collections to compare, treated as sets.
    assume_unique : bool, optional
        Forwarded to ``np.intersect1d``. Keep it ``True`` only when neither
        input contains repeated values; pass ``False`` otherwise so the
        intersection is de-duplicated before it is counted.

    Returns
    -------
    float
        Size of the intersection divided by size of the union. Two empty
        inputs are reported as identical (1.0) rather than raising
        ``ZeroDivisionError``.
    """
    union_size = np.union1d(arr1, arr2).size
    if union_size == 0:
        # Both inputs are empty: 0/0 is undefined, so treat them as equal sets.
        return 1.0
    shared_size = np.intersect1d(arr1, arr2, assume_unique=assume_unique).size
    return shared_size / union_size

In [None]:
import scipy.sparse as sparse

In [None]:
def groupJaccard(arrList, verbose=False):
    """Compute pairwise Jaccard similarities for a list of arrays.

    Parameters
    ----------
    arrList : sequence of arrays
        The collections to compare with each other.
    verbose : bool, optional
        When true, print progress: a header every 50th row and an
        in-place counter every 1000th column index.

    Returns
    -------
    scipy.sparse.dok_matrix
        Square matrix of side ``len(arrList)``; only entries strictly above
        the diagonal are assigned, entry ``[i, j]`` being the similarity of
        ``arrList[i]`` and ``arrList[j]``.
    """
    n_sets = len(arrList)
    similarity = sparse.dok_matrix((n_sets, n_sets))
    for row in range(n_sets):
        if verbose and not row % 50:
            print('\n{} of {}'.format(row+1, n_sets))
        current = arrList[row]
        # Only pairs with col > row are visited: the measure is symmetric.
        for col in range(row + 1, n_sets):
            if verbose and not col % 1000:
                print('\t{} of {}'.format(col-row, n_sets-row), end='\r')
            similarity[row, col] = _jaccard(current, arrList[col])
    return similarity

In [None]:
shuffling

In [None]:
jaccArr = groupJaccard(radNeigh_lonlatUnique, verbose=True)

In [None]:
# Neighbours of the first point found with radius 5e-6 that are not also
# found with radius 1e-10 (np.setdiff1d keeps the values of its first
# argument that are missing from its second).
first_point = lonlat[:1, :]
within_wide = neigh.radius_neighbors(first_point,
                                     radius=5e-6,
                                     return_distance=False)[0]
within_tight = neigh.radius_neighbors(first_point,
                                      radius=1e-10,
                                      return_distance=False)[0]
np.setdiff1d(within_wide, within_tight)

In [None]:
# Per-axis thresholds from smallestDiff (presumably the smallest gap between
# the unique values on that axis -- confirm against its definition), and the
# Euclidean norm of the two combined.
th_lon, th_lat = (smallestDiff(axis_values, random=False)
                  for axis_values in (lon_unique, lat_unique))
th_rad = np.linalg.norm([th_lon, th_lat])
print((th_lon, th_lat, th_rad))
# Expecting (9.9999999747524271e-07, 9.9999999747524271e-07, 1.4142135588025491e-06) for this

# define neighbourhood radius as a multiple (rho) of th_rad
rho = 3
th_nbr = rho * th_rad
print('neighbourhood radius: {}'.format(th_nbr))

In [None]:
near_equal(lonlat[0,0], lonlat[0,1], lon_unique, lat_unique, 1e-5)

Next up: 
* define a method that gets the colour data for each of these points
* define a method that will do this for the first N points of lonlat
* turn all of that into a generator that will work with mbKMeans partial_fit

In [None]:
def getNeighbours(lonlat, lon_vec, lat_vec, th=None):
    """Map each requested (lon, lat) point to the result of ``near_equal``.

    Parameters
    ----------
    lonlat : tuple, list or numpy.ndarray
        Either a two-element tuple/list ``(lons, lats)``, in which case every
        combination of a lon with a lat is looked up, or an array whose rows
        each unpack to one ``lon, lat`` pair.
    lon_vec, lat_vec, th
        Passed straight through to ``near_equal``.

    Returns
    -------
    dict
        Keys are ``(lon, lat)`` tuples; values are whatever ``near_equal``
        returns for that point.

    Raises
    ------
    ValueError
        If ``lonlat`` is neither a two-element tuple/list nor an ndarray.
    """
    if isinstance(lonlat, (tuple, list)) and len(lonlat) == 2:
        # assume lonlat is a tuple of lists: take their full cross product
        points = ((lon, lat) for lon in lonlat[0] for lat in lonlat[1])
    elif isinstance(lonlat, np.ndarray):
        points = ((lon, lat) for lon, lat in lonlat)
    else:
        raise ValueError('Unsure how to parse unknown type lonlat')

    return {(lon, lat): near_equal(lon, lat, lon_vec, lat_vec, th)
            for lon, lat in points}

In [None]:
viridis_ = plt.cm.viridis(np.arange(300))[::30]

In [None]:
# Plot the neighbourhood returned for each of the first ten points, each in
# its own colour taken from viridis_.
neighbourhoods = getNeighbours(lonlat[:10,:], lon_unique, lat_unique, th_nbr)
for j, v in enumerate(neighbourhoods.values()):
    plt.plot(v[:,0], v[:,1], color=viridis_[j])

In [None]:
asdf = {1: np.array([1,2,3]), 2: np.array([1,2,3]), 4: np.array([1,2,3])}

In [None]:
# np.vstack expects a sequence of arrays; a dict view is not a sequence,
# so materialise it as a list first.
np.vstack(list(asdf.values()))

In [None]:
def getColour(lon, lat):
    """Placeholder for looking up the colour at a (lon, lat) position.

    NOTE(review): neither ``lon`` nor ``lat`` is used yet, and ``colour`` is
    not assigned in this function or in any visible cell, so a call
    presumably raises NameError -- confirm and implement the lookup.
    """
    return colour

# Converting colours

In [None]:
from skimage import color

In [None]:
color.rgb2lab(np.random.rand(2, 1, 3))

MiniBatch KMeans

In [None]:
from sklearn.cluster import MiniBatchKMeans as mbKMeans

In [None]:
mbkm = mbKMeans(n_clusters=10, verbose=True)

In [None]:
# Feed the r, g, b columns of every entry of `data` to the clusterer, one
# partial_fit call per key, visiting keys in sorted order.
# NOTE(review): `data` is not assigned in any visible cell -- presumably this
# should be dl.data; confirm before running.
for k in sorted(data.keys()):
    rgb_values = data[k].loc[:, ['r', 'g', 'b']].values
    mbkm.partial_fit(rgb_values)

Plot resulting centers

In [None]:
# One marker per cluster centre, laid out along a horizontal line. The count
# is read from the fitted model so the plot stays correct if n_clusters is
# changed from 10.
n_centres = len(mbkm.cluster_centers_)
x = range(n_centres)
y = [0] * n_centres

# Centres are divided by 255 before being used as marker colours
# (assumes the r, g, b features are on a 0-255 scale -- TODO confirm).
plt.scatter(x, y, c=mbkm.cluster_centers_/255, s=1000)