In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
def read_data(**kwargs):
    """Read a window of a space-separated, header-less text file in chunks.

    Rows are read in consecutive slices of at most ``nrows`` rows, starting at
    file row ``skip`` and stopping at file row ``min(total_rows, max_rows)``
    (exclusive), where ``total_rows`` is the row count hard-coded below. The
    final slice is shortened as needed so that every row of the window is
    returned, and reading stops early if the file ends before the window does.

    Keyword Args:
        column_names (list of str): labels assigned to each chunk's columns.
            Defaults to ['lon', 'lat', 'z', 'r', 'g', 'b', 'j', 'k', 'l'].
        max_rows (int): file row index at which reading stops. Default 15e6.
        nrows (int): maximum rows per chunk; must be positive. Default 5e5.
        skip (int): number of leading file rows to skip. Default 0.
        verbose (bool): print the running row position after each chunk.
        filepath (str): file to read.
            Defaults to '~/data/4-Vadeboncoeur/davis-bay.txt'.

    Returns:
        dict: chunk number (0, 1, 2, ...) -> pandas.DataFrame, in file order.
        Empty when ``skip`` is at or past the stop row, or the file has no
        rows left to read.

    Raises:
        ValueError: if ``nrows`` is not positive (the loop could never advance).
    """
    from pandas import read_csv as _csv
    from pandas.errors import EmptyDataError

    # Hard-coded row count; presumably that of the default file -- TODO confirm.
    total_rows = 57891462
    column_names = kwargs.get('column_names',
                              ['lon', 'lat', 'z', 'r',
                               'g', 'b', 'j', 'k', 'l'])
    max_rows = kwargs.get('max_rows', int(15 * 1e6))
    nrows = kwargs.get('nrows', int(5 * 1e5))
    skip = kwargs.get('skip', 0)
    verbose = kwargs.get('verbose', False)
    fp = kwargs.get('filepath',
                    '~/data/4-Vadeboncoeur/davis-bay.txt')
    if nrows <= 0:
        raise ValueError('nrows must be positive, got {}'.format(nrows))

    stop_row = int(min(total_rows, max_rows))
    data = {}
    ctr = 0
    while skip < stop_row:
        # Clamp the last slice instead of dropping it, so the rows between the
        # last full chunk and the stop row are not silently lost.
        n_wanted = min(nrows, stop_row - skip)
        try:
            chunk = _csv(fp, sep=" ", header=None,
                         skiprows=skip, nrows=n_wanted)
        except EmptyDataError:
            # Nothing left after `skip` rows: the file is shorter than the window.
            break
        chunk.columns = column_names
        data[ctr] = chunk
        skip += len(chunk)
        ctr += 1
        if verbose:
            print('\rrows read: {}'.format(skip))
        if len(chunk) < n_wanted:
            # A short chunk means the end of the file was reached.
            break
    return data

In [None]:
# Load the point file with read_data's default settings. The result is a dict
# mapping chunk number -> DataFrame, not a single DataFrame.
data = read_data()

In [None]:
def _getLonLatPairs(df):
    return df.loc[:, ['lon', 'lat']].values

def getLonLatPairs(dfDict, concat=False):
    """Extract the (lon, lat) pair array from every DataFrame in ``dfDict``.

    Args:
        dfDict: dict whose values are DataFrames with 'lon' and 'lat' columns.
        concat: when True, stack the per-frame arrays row-wise into one array
            (in the dict's iteration order); when False, keep them separate.

    Returns:
        A single stacked array if ``concat`` is True, otherwise a dict with
        the same keys as ``dfDict`` mapping to each frame's pair array.
    """
    pairs_by_key = {key: _getLonLatPairs(frame)
                    for key, frame in dfDict.items()}
    if not concat:
        return pairs_by_key
    return np.vstack(list(pairs_by_key.values()))

In [None]:
# Stack the (lon, lat) pairs of every chunk into one array:
# column 0 is 'lon', column 1 is 'lat'.
lonlat = getLonLatPairs(data, concat=True)

In [None]:
# Distinct longitude and latitude values; np.unique returns them sorted
# in ascending order.
lon_unique = np.unique(lonlat[:,0])
lat_unique = np.unique(lonlat[:,1])

In [None]:
def smallestDiff(vec, patience=50):
    """Return the smallest absolute difference between any two entries of ``vec``.

    The values are sorted and only adjacent entries are compared: in sorted
    order the closest pair is always adjacent, so the result is exact and
    deterministic without examining every pair.

    Args:
        vec: array-like of numbers (flattened if not 1-D).
        patience: ignored. Retained only so existing calls that pass it keep
            working; the exact computation needs no early-stopping budget.

    Returns:
        The smallest gap as a NumPy scalar, or ``np.inf`` when ``vec`` holds
        fewer than two (non-NaN) values.
    """
    values = np.sort(np.asarray(vec), axis=None)
    if values.dtype.kind == 'f':
        # NaNs never compare as "smaller", so leave them out of the search.
        values = values[~np.isnan(values)]
    if values.size < 2:
        return np.inf
    return np.min(np.diff(values))

In [None]:
# Smallest spacing found between distinct longitudes and between distinct
# latitudes, printed together with the Euclidean norm of the two spacings.
th_lon = smallestDiff(lon_unique)
th_lat = smallestDiff(lat_unique)
print((th_lon, th_lat, np.linalg.norm([th_lon, th_lat])))

Expected output: `(9.9999999747524271e-07, 9.9999999747524271e-07, 1.4142135588025491e-06)`, i.e. the smallest longitude step, the smallest latitude step, and the norm of the two.

In [None]:
# Threshold set to rho times the norm of the smallest (lon, lat) steps.
# Presumably the radius within which two points count as neighbours -- TODO confirm.
# NOTE(review): the choice rho = 3 is not explained anywhere in the notebook.
rho = 3
th_nbr = rho * np.linalg.norm([th_lon, th_lat])

In [None]:
from sklearn.cluster import MiniBatchKMeans as mbKMeans

In [None]:
# Mini-batch k-means model with 10 clusters and verbose logging enabled.
# NOTE(review): no random_state is passed, so the fitted centres may differ
# from run to run -- consider fixing a seed for reproducibility.
mbkm = mbKMeans(n_clusters=10, verbose=True)

In [None]:
# Fit the clusterer incrementally, one chunk at a time in ascending chunk
# order, using only the 'r', 'g', 'b' columns as features.
for k in sorted(data):
    mbkm.partial_fit(data[k][['r', 'g', 'b']].values)

In [None]:
# Ten x positions on a flat y = 0 line, used as plot coordinates for the
# cluster-centre swatches drawn further down.
x = range(10)
y = [0] * 10

In [None]:
# Display the fitted cluster centres (one row per cluster, in the r, g, b
# feature order used for fitting).
mbkm.cluster_centers_

In [None]:
# Draw one large marker per cluster centre, coloured by the centre itself.
# Dividing by 255 assumes the r, g, b columns are 0-255 channels -- TODO confirm.
plt.scatter(x, y, c=mbkm.cluster_centers_/255, s=1000)

In [None]:
def getNeighbourList(arr, th):
    """Placeholder for a neighbour search over the rows of ``arr``.

    This function was left unfinished (a row loop with no body), which is a
    syntax error and stops a top-to-bottom run of the notebook. It is kept as
    a valid stub that fails loudly when called, because the intended
    behaviour is not specified.

    Args:
        arr: presumably the stacked (lon, lat) pair array -- TODO confirm.
        th: presumably the neighbour distance threshold (e.g. ``th_nbr``)
            -- TODO confirm.

    Raises:
        NotImplementedError: always, until the search is implemented.
    """
    # TODO: implement the per-row neighbour lookup.
    raise NotImplementedError('getNeighbourList is not implemented yet')

In [None]:
# Preview the first rows of the chunk stored under key 1 (the second chunk).
data[1].head()

In [None]:
# read_data() returns a dict of per-chunk DataFrames, but this cell and the
# ones after it use `data` as a single DataFrame. Merge the chunks in chunk
# order into one frame with a fresh 0..n-1 index. The isinstance guard keeps
# the cell safe to re-run once `data` has already been merged.
column_names = ['lon', 'lat', 'z', 'r', 'g', 'b', 'j', 'k', 'l']
if isinstance(data, dict):
    data = pd.concat([data[key] for key in sorted(data)], ignore_index=True)
data.columns = column_names
print('data.shape = {}'.format(data.shape))
data.head()

In [None]:
# Minimum and maximum of 'lat' and 'lon' (rows: min, max; columns: lat, lon).
# Computed directly on the two columns rather than through describe(), which
# would also work out counts, means and quantiles for every column.
# Assumes `data` is a single DataFrame at this point.
data[['lat', 'lon']].agg(['min', 'max'])

In [None]:
# Narrow the table to the rows recorded at one exact position: first the
# group with lon == -123.729523, then within it the group with lat == 49.442196.
# NOTE(review): get_group matches these float keys by exact equality and
# raises KeyError when the value is absent -- confirm both coordinates occur
# in the rows that were actually loaded.
data_by_lon = data.groupby('lon')
lon_123_729523 = data_by_lon.get_group(-123.729523)
lon_123_729523_by_lat = lon_123_729523.groupby('lat')
lon_123_729523_lat_49_442196 = lon_123_729523_by_lat.get_group(49.442196)

In [None]:
# Scatter the 'j' and 'k' columns of the rows at the selected position,
# colouring each marker by its own r, g, b values. Dividing by 255 assumes
# 0-255 colour channels -- TODO confirm.
plt.scatter(
    lon_123_729523_lat_49_442196['j'].values,
    lon_123_729523_lat_49_442196['k'].values,
    c=lon_123_729523_lat_49_442196.loc[:, ['r', 'g', 'b']].values / 255,
    s=10000,
)