# flexible environment curve generation

In [None]:
import astropy as ap
from astropy import coordinates
import bisect
import matplotlib.pyplot as plt
import multiprocessing as mp
import numpy as np
import os
import pandas as pd
import pickle as pkl

Number of cores I'm allowed to use:

In [None]:
# Number of worker processes handed to every multiprocessing.Pool created below.
nps = 20

## isolate usable sample from GAMA

In [None]:
# Root directory of the GAMA inputs; the source catalogue is read from beneath it.
toppath = '/media/CRP6/Cosmology/recidivator/GAMA/'

split by field for speed, omit field with nonuniform coverage

sky coordinate limits came from [GAMA website](https://www.astro.ljmu.ac.uk/~ikb/research/gama_fields/) -- must include DEC for this to work!!!

NOTE: this fails on G15 field because it's got too many galaxies

In [None]:
# (lower, upper) bounds per field, compared against the catalogue's RA column.
xlim = {
    'G02': (30.2, 38.8),
    'G09': (129.0, 141.0),
    'G12': (174.0, 186.0),
    'G15': (211.5, 223.5),
}

# (lower, upper) bounds per field, compared against the catalogue's DEC column.
ylim = {
    'G02': (-10.5, -3.72),
    'G09': (-2., 3.),
    'G12': (-3., 2.),
    'G15': (-2., 3.),
}

# Largest angular separation considered (2.5 degrees): used as the search radius
# for the neighbour search and as the upper end of the distance evaluation grid.
maxang = ap.units.Quantity(2.5, 'deg')

filter for data quality

In [None]:
# Directory that receives every derived file, and the redshift window that is kept.
newpath = '/media2/CRP6/Cosmology/envirocurves/GAMA/'
zlim = (0., 0.5)

# Data-quality cut: keep rows with NQ > 2 and give the unnamed CSV index column a name.
all_q = pd.read_csv(toppath+'test/one_slice/SpecObjPhot.csv', index_col=['CATAID'])
high_q = all_q[all_q['NQ'] > 2].rename(columns={"Unnamed: 0": "SpecObjPhot_index"})

# One sub-catalogue per field: rows strictly inside the RA/DEC box and the redshift window.
z_lo, z_hi = zlim
gf = {}
for key, (ra_lo, ra_hi) in xlim.items():
    dec_lo, dec_hi = ylim[key]
    inside_box = ((high_q['RA'] > ra_lo) & (high_q['RA'] < ra_hi) &
                  (high_q['DEC'] > dec_lo) & (high_q['DEC'] < dec_hi))
    inside_z = (high_q['Z'] > z_lo) & (high_q['Z'] < z_hi)
    gf[key] = high_q.loc[inside_box & inside_z]

# Record how many galaxies survive per field and write each sub-catalogue to disk.
n_gal_kept = {key: len(subset) for key, subset in gf.items()}
for key, subset in gf.items():
    subset.to_csv(newpath+'field'+key+'high_q.csv')
# print(n_gal_kept)

# easy_inds = {}
# for key in gf.keys():
#     easy_inds[key] = pd.read_csv(toppath+'flexible_envirocurves/field'+key+'high_q.csv')
#     print((key, len(easy_inds[key])))
#     easy_inds[key].index.rename('field_index', inplace=True)

break down files as needed (fast)

In [None]:
# Upper bound on the number of galaxies written to any single chunk file.
maxfile = 1 << 14
# Within-chunk row positions 0 .. maxfile-1, shared by every chunk.
share_index = range(0, maxfile)
print(maxfile)

In [None]:
def break_down_neighbor_input(keyno):
    """Split the current field's catalogue into chunk files of at most `maxfile` rows.

    The field is taken from the module-level name `field`, not from `keyno`;
    `keyno` is accepted only so the function can be driven by `Pool.map` and is
    otherwise unused.

    Files written under `newpath`:
      * `<field>chunk<k>.csv`   -- rows [k * maxfile, (k + 1) * maxfile) of the input
      * `<field>allchunks.csv`  -- the full input plus one `chunk_<k>` column per
        chunk, holding each row's position inside chunk k and NaN elsewhere

    Returns
    -------
    nfile : int
        Number of chunk files written.
    paircount : list of (int, int)
        Half-open (start, stop) row range of every chunk.
    extrema : tuple
        (min RA, max RA, min DEC, max DEC) over the whole input.
    """
    big_in = pd.read_csv(newpath+'field'+field+'high_q.csv', index_col=['CATAID'])
    big_out = big_in.copy()
    nbig = len(big_in)  # total number of galaxies in the big file
    nfile = 0
    paircount = []
    # Strict `<`: with `<=` a catalogue whose length is an exact multiple of
    # maxfile produced one extra, empty chunk file (and an extra nfile count).
    while nfile * maxfile < nbig:
        glob_ind_min = nfile * maxfile  # first row of this chunk in the big file
        glob_ind_max = min((nfile + 1) * maxfile, nbig)  # one past its last row
        paircount.append((glob_ind_min, glob_ind_max))
        small_piece = big_in[glob_ind_min : glob_ind_max]
        chunk_col = 'chunk_'+str(nfile)
        big_out[chunk_col] = np.nan
        # Single positional assignment instead of big_out[col][a:b] = ...; the
        # chained form is not guaranteed by pandas to write into big_out.
        big_out.iloc[glob_ind_min:glob_ind_max, big_out.columns.get_loc(chunk_col)] = \
            np.arange(glob_ind_max - glob_ind_min, dtype=float)
        small_piece.to_csv(newpath+field+'chunk'+str(nfile)+'.csv')
        print('separated out chunk '+str(nfile)+' of '+field+': '+str(paircount[-1]))
        nfile += 1
    big_out.to_csv(newpath+field+'allchunks.csv')
    # Series.min()/max() skip NaN, whereas the builtin min()/max() give
    # order-dependent results when NaN is present.
    minx, maxx = big_in['RA'].min(), big_in['RA'].max()
    miny, maxy = big_in['DEC'].min(), big_in['DEC'].max()
    extrema = (minx, maxx, miny, maxy)
    return nfile, paircount, extrema

In [None]:
# Chunk every field. break_down_neighbor_input reads the module-level `field`,
# so the pool is created inside the loop, after `field` has been rebound.
# NOTE(review): this relies on worker processes inheriting the current globals
# (fork start method) -- confirm on the target platform.
for field in xlim:
    # The context manager shuts the workers down after each field; previously a
    # new pool of `nps` processes was created per field and never closed.
    with mp.Pool(nps) as pool:
        paircounts = pool.map(break_down_neighbor_input, range(1))
    # The file carries a .csv suffix but holds a pickle; the name is kept
    # because the later cells open exactly this path.
    with open(newpath+field+'chunkinfo.csv', 'wb') as chunkinfo:
        pkl.dump(paircounts, chunkinfo)

In [None]:
# testfile = pd.read_csv(toppath+'flexible_envirocurves/'+'G15'+'allchunks.csv')

In [None]:
# testfile.columns

In [None]:
# testfile[:10]

## find the neighbors within max angular distance set by GAMA footprint

In [None]:
# def get_neighbors(keyno):
#     field = list(gf.keys())[keyno]
#     small_piece = pd.read_csv(toppath+'flexible_envirocurves/field'+field+'high_q.csv')
# #     print(small_piece.iloc[99])
#     coords = ap.coordinates.SkyCoord(small_piece['RA'], small_piece['DEC'], unit='deg')
#     #warning: slow!!! (not actually that slow)
#     output = ap.coordinates.search_around_sky(coords, coords, maxang)
#     with open(toppath+'flexible_envirocurves/neighbors_'+field+'_allz.pkl', 'wb') as outfile:
#         pkl.dump(output, outfile)
#     return(output)

In [None]:
# #still kinda slow
# nps = len(gf.keys()) #mp.cpu_count()
# pool = mp.Pool(nps)
# pool.map(get_neighbors, range(len(gf.keys())))

This is prep for the slow step. Have to do separately (not in a loop) for each field due to memory errors.

In [None]:
# allfields = xlim.keys()
# Field handled by the cells below; it is read as a module-level global when
# building chunk and neighbour file names (e.g. in get_neighbors_breakup).
field = 'G15'#change me!

In [None]:
# Reload the chunk bookkeeping pickled for this field by the chunking step.
with open(newpath+field+'chunkinfo.csv', 'rb') as chunkinfo:
    paircounts = pkl.load(chunkinfo)

# Entry 0 holds (number of chunks, per-chunk row ranges, RA/DEC extrema).
nfile, indends, extrema = paircounts[0][:3]
(minx, maxx, miny, maxy) = extrema

# Number every ordered pair of chunks (i, j), including i == j, in row-major order.
chunk_pairs = dict(enumerate((i, j) for i in range(nfile) for j in range(nfile)))
pairkey = len(chunk_pairs)

In [None]:
def get_neighbors_breakup(keyno):
    """Find all on-sky pairs within `maxang` between the two chunks of pair `keyno`.

    Looks up the chunk numbers (a, b) in the module-level `chunk_pairs`, reads the
    matching chunk CSVs of the module-level `field`, runs astropy's
    `search_around_sky` on their RA/DEC columns (interpreted as degrees) and
    pickles the result to
    `<newpath>neighbors_<field>chunk<a>Xchunk<b>allz.pkl`.

    Parameters
    ----------
    keyno : int
        Key into `chunk_pairs`.

    Returns
    -------
    None; the result is only written to disk and the output path is printed.
    """
    (a, b) = chunk_pairs[keyno]
    small_a = pd.read_csv(newpath+field+'chunk'+str(a)+'.csv', index_col=['CATAID'])
    small_b = pd.read_csv(newpath+field+'chunk'+str(b)+'.csv', index_col=['CATAID'])
    coords_a = ap.coordinates.SkyCoord(small_a['RA'], small_a['DEC'], unit='deg')
    coords_b = ap.coordinates.SkyCoord(small_b['RA'], small_b['DEC'], unit='deg')
    # warning: slow!!! (not actually that slow)
    output = ap.coordinates.search_around_sky(coords_a, coords_b, maxang)
    filepath = newpath+'neighbors_'+field+'chunk'+str(a)+'Xchunk'+str(b)+'allz.pkl'
    # Context manager so the handle is flushed and closed even if pickling fails;
    # the bare open() passed to pkl.dump was never closed explicitly.
    with open(filepath, 'wb') as outfile:
        pkl.dump(output, outfile)
    print(filepath)
    return

This is the slow step!!! Repeat for each field to avoid memory errors.

In [None]:
# for field in xlim.keys():
#     get_neighbors_one_field(field)
# nps = 25#pairkey #mp.cpu_count()
# Run the neighbour search for every chunk pair of the current field. The context
# manager shuts the workers down once map() has returned; the pool was previously
# left open.
with mp.Pool(nps) as pool:
    pool.map(get_neighbors_breakup, range(pairkey))

## parse neighbor info and evaluate at distances

Doing these steps separately causes memory problems, so they are combined here; sadly, this is slow.

In [None]:
# try with many more choices for this!
# Number of angular separations at which neighbour counts are evaluated.
ndist = 100# 128
# distance_evaluation_points = np.exp(np.linspace(np.log(2.5 / 60. / 60.), np.log(2.5), ndist+2)[1:-1])
# ndist evenly spaced values strictly between 0 and maxang.value: linspace builds
# ndist+2 points including both end points and the [1:-1] slice drops those two.
distance_evaluation_points = np.linspace(0., maxang.value, ndist+2)[1:-1]
# pos = bisect.bisect(distance_evaluation_points, maxang.value)
# distance_evaluation_points = distance_evaluation_points[:pos]
# n_dists = len(distance_evaluation_points)
# iter_over_dists = enumerate(distance_evaluation_points)
# ndist = range(len(distance_evaluation_points))
# with open(toppath+'flexible_envirocurves/eval'+str(ndist)+'dists_LOS'+field+'.pkl', 'wb') as savedists:
#     pkl.dump(distance_evaluation_points, savedists)
# print(distance_evaluation_points)

In [None]:
def segment(r, d, theta=None):
    """Return the area of a circular segment of a circle of radius `r`.

    Parameters
    ----------
    r : float
        Radius of the circle.
    d : float
        Distance from the centre to the chord; only used when `theta` is omitted.
    theta : float or array, optional
        Central angle in radians subtended by the chord. Defaults to
        2 * arccos(d / r).

    Returns
    -------
    r**2 * (theta - sin(theta)) / 2
    """
    # `is None`, not `== None`: on a numpy array the equality test is evaluated
    # element-wise, which makes the `if` ambiguous for multi-element input.
    if theta is None:
        theta = 2. * np.arccos(d / r)
    return r**2 * (theta - np.sin(theta)) / 2.

def sector(r, d, theta=None):
    """Return the area of a circular sector of a circle of radius `r`.

    Parameters
    ----------
    r : float
        Radius of the circle.
    d : float
        Only used when `theta` is omitted, to derive the angle as arcsin(d / r).
    theta : float or array, optional
        Opening angle of the sector in radians. Defaults to arcsin(d / r).

    Returns
    -------
    r**2 * theta / 2
    """
    # `is None`, not `== None`: on a numpy array the equality test is evaluated
    # element-wise, which makes the `if` ambiguous for multi-element input.
    if theta is None:
        theta = np.arcsin(d / r)
    return r**2 * theta / 2.

# this throws an error at the points used to define minx, maxx, miny, maxy
def area(r, x, y, minx, maxx, miny, maxy, vb=True):
    """Area of the disc of radius `r` about (x, y) that lies inside a rectangle.

    The rectangle spans [minx, maxx] x [miny, maxy]. The result depends on how
    many of the four edges are at least `r` away from the point:

      * 4 edges clear -- the full disc, pi * r**2
      * 3 edges clear -- the disc minus the segment cut off by the nearest edge
      * 2 edges clear -- treated as one x edge and one y edge cutting the disc
        (a corner); the two edges are assumed not to be parallel
      * otherwise     -- not handled, returns None

    `vb` switches on the diagnostic messages printed in the last two cases.
    """
    disc = np.pi * r**2
    # gaps to the lower-x, upper-x, lower-y and upper-y edges, in that order
    distances = np.array([x - minx, maxx - x, y - miny, maxy - y])
    ntrue = sum(distances >= r)

    if ntrue == 4:
        return disc
    if ntrue == 3:
        return disc - segment(r, min(distances))
    if ntrue != 2:
        if vb: print('this case should not happen because we did not consider radii \
                beyond half the shortest side of the footprint,\
                but will at some point deal with this case')
        return None

    if vb: print('radii should be chosen so that these cannot be parallel, \
                but will at some point add in a check for this')
    distx = min(distances[:2])
    disty = min(distances[-2:])
    corner_inside = np.sqrt(distx**2 + disty**2) < r
    if not corner_inside:
        # the two cut-off segments do not overlap, so subtract them independently
        return disc - segment(r, distx) - segment(r, disty)

    # the rectangle's corner lies inside the disc: assemble the kept region from
    # two sectors, a quarter disc, the corner rectangle and two triangles
    thetax = np.arcsin(distx / r)
    thetay = np.arcsin(disty / r)
    areax = distx * r * np.cos(thetax) / 2.
    areay = disty * r * np.cos(thetay) / 2.
    quarter = sector(r, r, theta=np.pi / 2.)
    return sector(r, distx, theta=thetax) + sector(r, disty, theta=thetay) + \
        quarter + distx * disty + areax + areay


# ## Calculates volume normalized environment
# def calc_env(ind):
#     """
#         Runs galenv to calculate galaxy environment.
#         This is set up to run in the multiprocessing so a lot of inputs are
#         not set when you call the function, but are supposed to be defined
#         when running this code.

#         Output: nearest neighbors at a given angular separation.
#     """
#     if opts.run_environment:
#         # Generates environments for GAMA RA/Dec data
#         res = [subsamples[f][s]['CATAID'].values[ind]]
#     if opts.run_particle_environment:
#         # Generates environments for particle RA/Dec data
#         res = [ind]

#     friends = data
#     for dist in try_distances:
#         friends = galenv.nn_finder(friends, data[ind], dist)
#         #print('r/dist', dist, 'x ', data[ind][0], 'y ', data[ind][1], 'minx', minx, 'maxx', maxx, 'miny', miny, 'maxy', maxy)
#         vol = area(dist, data[ind][0], data[ind][1], minx, maxx, miny, maxy, vb=False)
#         #print('vol', vol)
#         res.append(float(len(friends)) / vol)
#     return res
# ###

again, have to run separately per field

In [None]:
# allfields = xlim.keys()
# field = allfields[0]#change me!
# Label used in the chunk / neighbour file names for this pass.
loskey = field

# Reload the chunk bookkeeping pickled for this field by the chunking step.
with open(newpath+field+'chunkinfo.csv', 'rb') as chunkinfo:
    paircounts = pkl.load(chunkinfo)

# Entry 0 holds (number of chunks, per-chunk row ranges, RA/DEC extrema).
nfile, indends, extrema = paircounts[0][:3]
(minx, maxx, miny, maxy) = extrema

# Number every ordered pair of chunks (i, j), including i == j, in row-major order.
chunk_pairs = dict(enumerate((i, j) for i in range(nfile) for j in range(nfile)))
pairkey = len(chunk_pairs)

In [None]:
def _count_neighbors_within(first_index, separations, thresholds, n_rows):
    """Count, per row and per threshold, the pairs with separation <= threshold.

    Parameters
    ----------
    first_index : integer array
        Row position (in the first catalogue) of every pair.
    separations : float array
        Separation of every pair, in the same unit as `thresholds`.
    thresholds : sequence of float
        Separations at which the cumulative count is evaluated.
    n_rows : int
        Number of rows in the first catalogue.

    Returns
    -------
    Array of shape (len(thresholds), n_rows); rows that appear in no pair get 0.
    """
    first_index = np.asarray(first_index, dtype=np.intp)
    separations = np.asarray(separations, dtype=float)
    order = np.argsort(separations)
    index_by_sep = first_index[order]
    # side='right' counts separations equal to a threshold, like bisect.bisect
    cuts = np.searchsorted(separations[order], thresholds, side='right')
    counts = np.zeros((len(thresholds), n_rows))
    for k, cut in enumerate(cuts):
        counts[k] = np.bincount(index_by_sep[:cut], minlength=n_rows)
    return counts


# For every chunk: accumulate, over all partner chunks, the number of neighbours
# within each evaluation distance, normalise by the footprint area enclosed at
# that distance, and write the resulting curves to disk.
for i in range(nfile):
    small_piece = pd.read_csv(newpath+loskey+'chunk'+str(i)+'.csv', index_col='CATAID')
    which_dists = distance_evaluation_points
    for dist in which_dists:
        small_piece[str(dist)] = 0.
    for j in range(nfile):
        with open(newpath+'neighbors_'+loskey+'chunk'+str(i)+'Xchunk'+str(j)+'allz.pkl', 'rb') as tomerge:
            all_pairs = pkl.load(tomerge)
        # Pairs are grouped by the raw row index all_pairs[0]. The earlier code
        # compared the np.unique *inverse* (a position in the unique array) with
        # the row index, which credits counts to the wrong galaxy as soon as any
        # row of chunk i has no neighbour in chunk j. The vectorised count also
        # removes the per-(i, j) process pool that was never closed.
        # NOTE(review): as before, .value is taken as-is; this assumes the stored
        # separations use the same unit as distance_evaluation_points -- confirm.
        distcounts = _count_neighbors_within(all_pairs[0], all_pairs[2].value,
                                             which_dists, len(small_piece))
        print(str((i, j, np.shape(distcounts))))
        for k, dist in enumerate(which_dists):
            small_piece[str(dist)] += distcounts[k]
    for k, dist in enumerate(which_dists):
        # area() returns None for geometries it does not handle; that still
        # raises here, as it did before.
        small_piece[str(dist)] = small_piece.apply(lambda row: float(row[str(dist)]) /
                                                        area(dist, row['RA'], row['DEC'],
                                                             minx, maxx, miny, maxy, vb=False), axis=1)
    print(small_piece)
    small_piece.to_csv(newpath+loskey+'chunk'+str(i)+'dists'+str(ndist)+'.csv')

# no longer used

## OLD: parse the neighbor info to flexibly make curves

In [None]:
# # add in redshift filtering here or earlier?

# def help_find_neighbors(ind):
#     where_my_neighbors_at = np.where(all_pairs[0] == ind)
#     my_neighbor_inds = all_pairs[1][where_my_neighbors_at]
#     save_df = df[['CATAID', 'RA', 'DEC', 'Z']].iloc[my_neighbor_inds]
# #     my_neighbor_zs = df.iloc[my_neighbor_inds]['Z']
#     neighbor_dists = all_pairs[2][where_my_neighbors_at]
#     save_df['dist'+str(ind)] = neighbor_dists
# #     nn = len(neighbor_distances)
# #     res[ind] = sorted(neighbor_distances)
#     output = (ind, df.iloc[ind]['CATAID'], save_df)
# #     res[ind] = output
#     return(output)

In [None]:
# def parse_one_field(test_key):
#     df = pd.read_csv(toppath+'flexible_envirocurves/field'+test_key+'high_q.csv')

#     with open(toppath+'flexible_envirocurves/neighbors_'+test_key+'_allz.pkl', 'rb') as pairfile:
#         all_pairs = pkl.load(pairfile)

#         # res = [[]] * len(df.index)
#     nps = 35#mp.cpu_count() - 1
#     pool = mp.Pool(nps)
#     try_neighbor_dists = pool.map(help_find_neighbors, df.index)
#     pkl.dump(try_neighbor_dists, open(toppath+'flexible_envirocurves/parsed_neighbor_dists_'+test_key+'.pkl', 'wb'))

In [None]:
# for test_key in gf.keys():
#     parse_one_field(test_key)
#     print(test_key)

## OLD: now evaluate at distances

In [None]:
# # try with many more choices for this!
# ndist = 100# 128
# # distance_evaluation_points = np.exp(np.linspace(np.log(2.5 / 60. / 60.), np.log(2.5), ndist+2)[1:-1])
# distance_evaluation_points = np.linspace(0., maxang.value, ndist+2)[1:-1]
# # n_dists = len(distance_evaluation_points)
# # iter_over_dists = enumerate(distance_evaluation_points)
# # ndist = range(len(distance_evaluation_points))

In [None]:
# ndist = 10#100
# # distance_evaluation_points = np.exp(np.linspace(np.log(2.5 / 60. / 60.), np.log(2.5), ndist+2)[1:-1])
# distance_evaluation_points = np.linspace(0., 1., ndist+2)[1:-1]
# pos = bisect.bisect(distance_evaluation_points, maxang.value)
# distance_evaluation_points = distance_evaluation_points[:pos]

normalize the curves by neighbors-per-area

In [None]:
# def segment(r, d, theta=None):
#     if theta == None:
#         theta = 2. * np.arccos(d / r)
#     return r**2 * (theta - np.sin(theta)) / 2.

# def sector(r, d, theta=None):
#     if theta == None:
#         theta = np.arcsin(d / r)
#     return r**2 * theta / 2.

# # this throws an error at the points used to define minx, maxx, miny, maxy
# def area(r, x, y, minx, maxx, miny, maxy, vb=True):
#     lx = x - minx
#     ux = maxx - x
#     ly = y - miny
#     uy = maxy - y
#     distances = np.array([lx, ux, ly, uy])
#     #print(distances)
#     condition = (distances >= r)
#     ntrue = sum(condition)
#     if ntrue == 4:
#         return np.pi * r**2
#     elif ntrue == 3:
#         return np.pi * r**2 - segment(r, min(distances))
#     elif ntrue == 2:
#         if vb: print('radii should be chosen so that these cannot be parallel, \
#                 but will at some point add in a check for this')
#         distx = min(distances[:2])
#         disty = min(distances[-2:])
#         if np.sqrt(distx**2 + disty**2) < r:
#             thetax = np.arcsin(distx / r)
#             thetay = np.arcsin(disty / r)
#             areax = distx * r * np.cos(thetax) / 2.
#             areay = disty * r * np.cos(thetay) / 2.
#             return sector(r, distx, theta=thetax) + sector(r, disty, theta=thetay) + \
#                             sector(r, r, theta=np.pi / 2.) + distx * disty + areax + areay
#         else:
#             return np.pi * r**2 - segment(r, distx) - segment(r, disty)
#     else:
#         if vb: print('this case should not happen because we did not consider radii \
#                 beyond half the shortest side of the footprint,\
#                 but will at some point deal with this case')
#         return None


# # ## Calculates volume normalized environment
# # def calc_env(ind):
# #     """
# #         Runs galenv to calculate galaxy environment.
# #         This is set up to run in the multiprocessing so a lot of inputs are
# #         not set when you call the function, but are supposed to be defined
# #         when running this code.

# #         Output: nearest neighbors at a given angular separation.
# #     """
# #     if opts.run_environment:
# #         # Generates environments for GAMA RA/Dec data
# #         res = [subsamples[f][s]['CATAID'].values[ind]]
# #     if opts.run_particle_environment:
# #         # Generates environments for particle RA/Dec data
# #         res = [ind]

# #     friends = data
# #     for dist in try_distances:
# #         friends = galenv.nn_finder(friends, data[ind], dist)
# #         #print('r/dist', dist, 'x ', data[ind][0], 'y ', data[ind][1], 'minx', minx, 'maxx', maxx, 'miny', miny, 'maxy', maxy)
# #         vol = area(dist, data[ind][0], data[ind][1], minx, maxx, miny, maxy, vb=False)
# #         #print('vol', vol)
# #         res.append(float(len(friends)) / vol)
# #     return res
# # ###

In [None]:
# vols = np.empty((ndist))
# for i, dist in enumerate(distance_evaluation_points):
#     vols[i] = area(dist, ra, dec, minx, maxx, miny, maxy, vb=False)

In [None]:
# def eval_curve_one_field(test_key):

#     minx = xlim[test_key][0]
#     maxx = xlim[test_key][1]
#     miny = ylim[test_key][0]
#     maxy = ylim[test_key][1]

# #     df = pd.read_csv(toppath+'flexible_envirocurves/field'+test_key+'high_q.csv')

# #     if find_neighbors:
# #         all_pairs = pkl.load(open(toppath+'flexible_envirocurves/neighbors_'+test_key+'_allz.pkl', 'rb'))

# #         # res = [[]] * len(df.index)
# #         nps = 35#mp.cpu_count() - 1
# #         pool = mp.Pool(nps)
# #         try_neighbor_dists = pool.map(help_find_neighbors, df.index)
# #         pkl.dump(try_neighbor_dists, open(toppath+'flexible_envirocurves/parsed_neighbor_dists_'+test_key+'.pkl', 'wb'))
    
#     the_neighborhood = pkl.load(open(toppath+'flexible_envirocurves/parsed_neighbor_dists_'+test_key+'.pkl', 'rb'))
    
    

In [None]:
# the_neighborhood = pkl.load(open(toppath+'flexible_envirocurves/parsed_neighbor_dists_'+test_key+'.pkl', 'rb'))

In [None]:
# test_key = 'G12'

In [None]:
# xpl = np.linspace(0., 2.5, ndist+2)[1:-1]
# plt.scatter(xpl, np.linspace(0., 2.5, ndist+2)[1:-1])
# plt.scatter(xpl, np.log(np.logspace(0., np.log(2.5), ndist+2)[1:-1]))
# plt.scatter(xpl, np.exp(np.linspace(np.log(2.5 / 60. / 60.), np.log(2.5), ndist+2)[1:-1]))
# plt.show()

also, subsample redshifts of neighbors to be near redshift of galaxy in question

next steps
- [X] collect neighbors and distances for rudimentary curve
- [X] variable angular distances for evaluation
- [X] normalize for area enclosed within radius
- [X] redshift/depth cutting
- [ ] physical distance conversion

In [None]:
# # helper to select by redshift:
# # easiest to just go +/- 0.1
# # then get more sophisticated

# def give_redshift_neighbors(ind, delta=0.1):
#     my_neighborhood = the_neighborhood[ind][-1]
#     all_neighbor_zs = my_neighborhood['Z']
#     my_z = all_neighbor_zs[ind]
#     close_neighbor_dists = my_neighborhood['dist'+str(ind)][(all_neighbor_zs > my_z-delta) & (all_neighbor_zs < my_z+delta)]
#     return(close_neighbor_dists)

In [None]:
# # add in volume normalization here

# def help_make_curve(ind):
#     curve = [the_neighborhood[ind][1]]#np.empty((ndist))
#     (ra, dec) = (df.iloc[ind]['RA'], df.iloc[ind]['DEC'])
# #     redshifts = test_data[ind][3]
#     distances = sorted(list(give_redshift_neighbors(ind)))
#     for dist in distance_evaluation_points:
# #         print(i)
#         pos = bisect.bisect(distances, dist)
# #         print(pos)
# #         curve[i] = pos
# #         all_curves.iloc[ind][str(i)] = pos
#         vol = area(dist, ra, dec, minx, maxx, miny, maxy, vb=False)
#         curve.append(float(pos) / vol)
        
#     return(curve)

In [None]:
# nps = 35#mp.cpu_count() - 1
# pool = mp.Pool(nps)
# try_neighbor_curves = pool.map(help_make_curve, df.index)
# newdist = np.hstack((np.ones((1)), distance_evaluation_points))
# save_curves = np.vstack((newdist, try_neighbor_curves))
# pkl.dump(np.array(save_curves), open(toppath+'flexible_envirocurves/zslice_normed_lin100dist_'+test_key+'.pkl', 'wb'))

In [None]:
# thing = pkl.load(open(toppath+'flexible_envirocurves/zslice_normed_lin100dist_'+test_key+'.pkl', 'rb'))
# thing

In [None]:
# test_curves = pkl.load(open(toppath+'flexible_envirocurves/zslice_normed_curves_'+test_key+'.pkl', 'rb'))

In [None]:
# test_curves

# scratch below here

In [None]:
# all_curves = pd.DataFrame(columns = distance_evaluation_points).fillna(0)

In [None]:
# n_gal = len(np.unique(test[0]))
# sparse_arr = np.empty((n_gal, n_gal))
# sparse_arr[:] = np.nan

In [None]:
# def fill_sparse(ind):
#     x, y = test[0][ind], test[1][ind]
#     sparse_arr[x][y] = test[2][ind].value
#     return sparse_arr

In [None]:
# # for ind in range(len(test[0])):
# #     fill_sparse(ind)
# n_pair = len(test[0])
# nps = 15
# pool = mp.Pool(nps)
# pool.map(fill_sparse, range(n_pair))

In [None]:
# sparse_arr

In [None]:
# sparse_df = pd.DataFrame(sparse_arr).astype(pd.SparseDtype("float", np.nan))
# sparse_df.to_csv('sparse_neighbors_G12.csv')
# sparse_df

In [None]:
# sparse_df.isna().sum()

In [None]:
# sparse_test = pd.DataFrame(sparse_arr).astype(pd.SparseDtype("float", np.nan))
# sparse_test

In [None]:
# easy_inds['G15']['neighbor_info'] = 
# easy_inds['G15']['neighbor_ids'] = []
# def fetch_neighbors()

In [None]:
# for key in gf.keys():
#     small_piece = gf[field]
#     just_ids = small_piece['CATAID']


In [None]:
# def collect_neighbors(field):
#     near_dists = pkl.load(open(toppath+'flexible_envirocurves/neighbors_'+field+'_allz.pkl', 'rb'))

## scratch below here

In [None]:
# np.array([output[0], output[1]]).T

In [None]:
# all_q.columns

In [None]:
# print(len(all_q))

In [None]:
# print(len(high_q))

In [None]:
# coords1 = ap.coordinates.SkyCoord(small_piece['RA'], small_piece['DEC'], unit='deg')

In [None]:
# small_piece = df.sample(200)
# coords2 = ap.coordinates.SkyCoord(small_piece['RA'], small_piece['DEC'], unit='deg')

In [None]:
# plt.scatter(coords1.ra, coords1.dec)
# plt.scatter(coords2.ra, coords2.dec)

In [None]:
# # warning: slow!!! (not actually that slow)
# output = ap.coordinates.search_around_sky(coords1, coords1, maxang)

In [None]:
# for o in output:
#     print((len(o), o))

In [None]:
# plt.hist(output[2].value, bins=100)

In [None]:
# plt.hist(output[0], bins=len(np.unique(output[0])))