# flexible environment curve generation

In [None]:
import astropy as ap
from astropy import coordinates
import bisect
import matplotlib.pyplot as plt
import multiprocessing as mp
import numpy as np
import os
import pandas as pd
import pickle as pkl

In [None]:
# Root directory that the input catalogue and the pickled outputs are
# addressed from; later cells build paths by appending to this string, so the
# trailing slash matters.
toppath = '/media/CRP6/Cosmology/recidivator/GAMA/'

## isolate usable sample

filter for data quality

In [None]:
# Full catalogue, indexed by CATAID.
all_q = pd.read_csv(f'{toppath}test/one_slice/SpecObjPhot.csv', index_col=['CATAID'])

In [None]:
# Keep only the rows whose NQ flag is greater than 2.
high_q = all_q.loc[all_q['NQ'].gt(2)]

In [None]:
# Give the unnamed leftover CSV column a descriptive name.
high_q = high_q.rename({"Unnamed: 0": "SpecObjPhot_index"}, axis='columns')

split by field for speed, omit field with nonuniform coverage

Sky coordinate limits come from the [GAMA website](https://www.astro.ljmu.ac.uk/~ikb/research/gama_fields/). The DEC limits must be included in the cut for this to work!

NOTE: this fails on the G15 field because it has too many galaxies.

In [None]:
# Map each field name to the rows of ``high_q`` inside that field's sky limits.
# Only G12 is active; the cuts for the other fields are kept for reference.
# gf['G02'] = high_q.loc[(high_q['RA'] > 30.2) & (high_q['RA'] < 38.8)]
# gf['G09'] = high_q.loc[(high_q['RA'] > 129.0) & (high_q['RA'] < 141.0)]
# gf['G15'] = high_q.loc[(high_q['RA'] > 211.5) & (high_q['RA'] < 223.5)]
gf = {
    'G12': high_q.loc[
        (high_q['RA'] > 174.0) & (high_q['RA'] < 186.0)
        & (high_q['DEC'] > -3.) & (high_q['DEC'] < 2.)
    ],
}

find the neighbors within max angular distance set by GAMA footprint

In [None]:
# Record how many galaxies each field kept and write each field to its own CSV.
n_gal_kept = {}
for key, frame in gf.items():
    n_gal_kept[key] = len(frame)
    frame.to_csv('flexible_envirocurves/field'+key+'high_q.csv')

In [None]:
# or key in gf.keys():
    
#     n_gal_kept[key] = len(gf[key])
#     gf[key].to_csv('flexible_envirocurves/field'+key+'high_q.csv')

In [None]:
# Re-read each field's CSV so the rows carry a fresh positional index, and
# label that index 'field_index'.
easy_inds = {}
for key in gf:
    reloaded = pd.read_csv('flexible_envirocurves/field'+key+'high_q.csv')
    reloaded.index.rename('field_index', inplace=True)
    easy_inds[key] = reloaded

In [None]:
# Largest angular separation considered when searching for neighbours.
maxang = 2.5 * ap.units.deg

In [None]:
def get_neighbors(keyno):
    """Find all pairs of galaxies in one field separated by at most ``maxang``.

    Parameters
    ----------
    keyno : int
        Position of the field name within ``list(gf.keys())``.

    Returns
    -------
    tuple
        The result of ``ap.coordinates.search_around_sky`` when the field's
        coordinates are matched against themselves.  The same object is also
        pickled to
        ``toppath + 'flexible_envirocurves/neighbors_<field>_allz.pkl'``.
    """
    field = list(gf.keys())[keyno]
    # NOTE(review): the CSV is read from a path relative to the working
    # directory while the pickle below is written under ``toppath`` -- confirm
    # that both are meant to point at the same directory.
    small_piece = pd.read_csv('flexible_envirocurves/field'+field+'high_q.csv')
    coords = ap.coordinates.SkyCoord(small_piece['RA'], small_piece['DEC'], unit='deg')
    # The catalogue is matched against itself, so presumably every galaxy is
    # also reported as its own neighbour at zero separation -- TODO confirm.
    output = ap.coordinates.search_around_sky(coords, coords, maxang)
    # Context manager guarantees the pickle is flushed and the handle closed.
    with open(toppath+'flexible_envirocurves/neighbors_'+field+'_allz.pkl', 'wb') as outfile:
        pkl.dump(output, outfile)
    return output

In [None]:
#still kinda slow
# One worker per field; each worker runs the pair search for its field and
# pickles the result as a side effect.
nps = len(gf)  # mp.cpu_count()
# The context manager shuts the worker processes down once map() has returned.
with mp.Pool(nps) as pool:
    pool.map(get_neighbors, range(len(gf)))

In [None]:
# help(ap.coordinates.search_around_sky)

next steps
- [X] collect neighbors and distances for rudimentary curve
- [X] variable angular distances for evaluation
- [X] normalize for area enclosed within radius
- [ ] redshift/depth cutting
- [ ] physical distance conversion

In [None]:
# Field whose neighbour files the following cells load and write.
test_key = 'G12'

In [None]:
# df = pd.read_csv(toppath+'flexible_envirocurves/field'+test_key+'high_q.csv')
# NOTE(review): later cells use ``df``, but the line above is its only visible
# definition and it is commented out -- confirm where ``df`` comes from.
# Load the pickled neighbour pairs for the test field, closing the file afterwards.
with open(toppath+'flexible_envirocurves/neighbors_'+test_key+'_allz.pkl', 'rb') as infile:
    all_pairs = pkl.load(infile)

In [None]:
# add in redshift filtering here or earlier?

def help_find_neighbors(ind):
    """Collect the neighbours of one galaxy from the precomputed pair list.

    Parameters
    ----------
    ind : int
        Index of the galaxy; it is compared against ``all_pairs[0]`` and used
        as an ``iloc`` position in ``df``.

    Returns
    -------
    tuple
        ``(ind, cataid, neighbors)`` where ``cataid`` is the galaxy's own
        ``CATAID`` and ``neighbors`` is a DataFrame holding the ``CATAID`` and
        ``Z`` of every paired galaxy plus a ``'dist<ind>'`` column taken from
        ``all_pairs[2]``.
    """
    where_my_neighbors_at = np.where(all_pairs[0] == ind)
    my_neighbor_inds = all_pairs[1][where_my_neighbors_at]
    # Explicit copy: the new column is added to an independent frame rather
    # than to a selection of ``df`` (avoids pandas' SettingWithCopyWarning).
    save_df = df[['CATAID', 'Z']].iloc[my_neighbor_inds].copy()
    save_df['dist'+str(ind)] = all_pairs[2][where_my_neighbors_at]
    return (ind, df.iloc[ind]['CATAID'], save_df)

In [None]:
#remember to do for all fields

# Collect the neighbours of every row of ``df`` in parallel, then pickle the
# resulting list.  Both the pool and the output file are released on exit
# from their ``with`` blocks.
nps = 35#mp.cpu_count() - 1
with mp.Pool(nps) as pool:
    try_neighbor_dists = pool.map(help_find_neighbors, df.index)
with open(toppath+'flexible_envirocurves/parsed_neighbor_dists_'+test_key+'.pkl', 'wb') as outfile:
    pkl.dump(try_neighbor_dists, outfile)

In [None]:
# Reload the per-galaxy neighbour list from disk, closing the file afterwards.
with open(toppath+'flexible_envirocurves/parsed_neighbor_dists_'+test_key+'.pkl', 'rb') as infile:
    the_neighborhood = pkl.load(infile)

In [None]:
# try with many more choices for this!

# Radii at which each curve is evaluated: 0.25 to 2.5 in steps of 0.25
# (presumably degrees, matching ``maxang`` -- TODO confirm).
distance_evaluation_points = np.arange(0.25, 2.51, 0.25)
# Positional indices of the evaluation radii.
ndist = range(distance_evaluation_points.size)

In [None]:
def segment(r, d, theta=None):
    """Return the area of a circular segment.

    Parameters
    ----------
    r : float
        Radius of the circle.
    d : float
        Distance from the circle's centre to the chord; only used to derive
        ``theta`` when ``theta`` is not given.
    theta : float or array, optional
        Central angle subtended by the chord, in radians.  Defaults to
        ``2 * arccos(d / r)``.

    Returns
    -------
    float or array
        ``r**2 * (theta - sin(theta)) / 2``.
    """
    # Identity test: ``theta == None`` compares element-wise for array input
    # and then fails in the ``if``.
    if theta is None:
        theta = 2. * np.arccos(d / r)
    return r**2 * (theta - np.sin(theta)) / 2.

def sector(r, d, theta=None):
    """Return the area of a circular sector.

    Parameters
    ----------
    r : float
        Radius of the circle.
    d : float
        Length used to derive the angle when ``theta`` is not given.
    theta : float or array, optional
        Central angle of the sector, in radians.  Defaults to
        ``arcsin(d / r)``.

    Returns
    -------
    float or array
        ``r**2 * theta / 2``.
    """
    # Identity test: ``theta == None`` compares element-wise for array input
    # and then fails in the ``if``.
    if theta is None:
        theta = np.arcsin(d / r)
    return r**2 * theta / 2.

# this throws an error at the points used to define minx, maxx, miny, maxy
def area(r, x, y, minx, maxx, miny, maxy, vb=True):
    """Return the area of a circle clipped by an axis-aligned rectangle.

    The circle has radius ``r`` and centre ``(x, y)``; the rectangle spans
    ``[minx, maxx]`` by ``[miny, maxy]``.  The distance from the centre to each
    of the four edges is compared with ``r`` and the result is assembled from
    circular segments and sectors.

    Parameters
    ----------
    r : float
        Radius of the circle.
    x, y : float
        Centre of the circle.  NOTE(review): the formulas appear to assume the
        centre lies inside the rectangle; this is not checked.
    minx, maxx, miny, maxy : float
        Edges of the rectangle.
    vb : bool, optional
        If true, print a message when the geometry is not supported.

    Returns
    -------
    float or None
        The clipped area, or ``None`` when three or four edges are closer
        than ``r`` to the centre.
    """
    lx = x - minx
    ux = maxx - x
    ly = y - miny
    uy = maxy - y
    distances = np.array([lx, ux, ly, uy])
    # An edge at distance >= r does not cut the circle.
    clear = distances >= r
    ntrue = np.count_nonzero(clear)
    full_circle = np.pi * r**2
    if ntrue == 4:
        return full_circle
    if ntrue == 3:
        # Exactly one edge cuts the circle: remove that single segment.
        return full_circle - segment(r, min(distances))
    if ntrue == 2:
        distx = min(distances[:2])
        disty = min(distances[-2:])
        if distx >= r or disty >= r:
            # The two cutting edges are parallel (both x edges or both y
            # edges), so the removed segments cannot overlap.  Feeding the
            # far edge into segment() would take arccos of a value > 1.
            near = distances[~clear]
            return full_circle - segment(r, near[0]) - segment(r, near[1])
        if np.sqrt(distx**2 + disty**2) < r:
            # The corner of the rectangle lies inside the circle: sum the
            # untouched quarter circle, the corner rectangle, and a sector
            # plus a triangle for each of the two clipped quadrants.
            thetax = np.arcsin(distx / r)
            thetay = np.arcsin(disty / r)
            areax = distx * r * np.cos(thetax) / 2.
            areay = disty * r * np.cos(thetay) / 2.
            return sector(r, distx, theta=thetax) + sector(r, disty, theta=thetay) + \
                            sector(r, r, theta=np.pi / 2.) + distx * disty + areax + areay
        # Corner outside the circle: the two segments are disjoint.
        return full_circle - segment(r, distx) - segment(r, disty)
    if vb: print('this case should not happen because we did not consider radii \
                beyond half the shortest side of the footprint,\
                but will at some point deal with this case')
    return None


# ## Calculates volume normalized environment
# def calc_env(ind):
#     """
#         Runs galenv to calculate galaxy environment.
#         This is set up to run in the multiprocessing so a lot of inputs are
#         not set when you call the function, but are supposed to be defined
#         when running this code.

#         Output: nearest neighbors at a given angular separation.
#     """
#     if opts.run_environment:
#         # Generates environments for GAMA RA/Dec data
#         res = [subsamples[f][s]['CATAID'].values[ind]]
#     if opts.run_particle_environment:
#         # Generates environments for particle RA/Dec data
#         res = [ind]

#     friends = data
#     for dist in try_distances:
#         friends = galenv.nn_finder(friends, data[ind], dist)
#         #print('r/dist', dist, 'x ', data[ind][0], 'y ', data[ind][1], 'minx', minx, 'maxx', maxx, 'miny', miny, 'maxy', maxy)
#         vol = area(dist, data[ind][0], data[ind][1], minx, maxx, miny, maxy, vb=False)
#         #print('vol', vol)
#         res.append(float(len(friends)) / vol)
#     return res
# ###

In [None]:
# do something smarter for getting borders

# Rectangle passed to area() as the footprint; the numbers repeat the RA/DEC
# limits of the G12 cut.
minx, maxx = 174., 186.
miny, maxy = -3., 2.

In [None]:
# helper to select by redshift:
# easiest to just go +/- 0.1
# then get more sophisticated

def give_redshift_neighbors(ind, delta=0.1):
    """Return the neighbour distances of one galaxy within a redshift window.

    Parameters
    ----------
    ind : int
        Index of the galaxy in ``the_neighborhood``; also the label used to
        look up the galaxy's own redshift among its neighbours.
    delta : float, optional
        Half-width of the open redshift window ``(z - delta, z + delta)``.

    Returns
    -------
    pandas.Series
        Entries of the ``'dist<ind>'`` column whose ``Z`` lies strictly inside
        the window.
    """
    # The neighbour table is the last element of the stored record.
    *_, neighborhood = the_neighborhood[ind]
    redshifts = neighborhood['Z']
    z_ref = redshifts[ind]
    in_window = (redshifts > z_ref-delta) & (redshifts < z_ref+delta)
    return neighborhood['dist'+str(ind)][in_window]

In [None]:
# add in volume normalization here

def help_make_curve(ind):
    """Build the area-normalised neighbour-count curve for one galaxy.

    Parameters
    ----------
    ind : int
        Index of the galaxy in ``the_neighborhood`` and ``iloc`` position in
        ``df``.

    Returns
    -------
    list
        The second element of the galaxy's ``the_neighborhood`` record
        followed by one value per entry of ``distance_evaluation_points``: the
        number of redshift-selected neighbours at or within that distance,
        divided by the value ``area`` returns for that radius.  The value is
        NaN where ``area`` returns ``None``.
    """
    curve = [the_neighborhood[ind][1]]
    row = df.iloc[ind]
    ra, dec = row['RA'], row['DEC']
    # Sorted once so each radius is a binary search rather than a scan.
    distances = sorted(give_redshift_neighbors(ind))
    for dist in distance_evaluation_points:
        pos = bisect.bisect(distances, dist)
        vol = area(dist, ra, dec, minx, maxx, miny, maxy, vb=False)
        # area() returns None for geometries it does not handle; record NaN
        # instead of failing the whole map with a TypeError on the division.
        curve.append(float(pos) / vol if vol is not None else np.nan)
    return curve

In [None]:
# Build one curve per row of ``df`` in parallel, then pickle the stacked array.
# Both the pool and the output file are released on exit from their ``with``
# blocks.
nps = 35#mp.cpu_count() - 1
with mp.Pool(nps) as pool:
    try_neighbor_curves = pool.map(help_make_curve, df.index)
with open(toppath+'flexible_envirocurves/zslice_normed_curves_'+test_key+'.pkl', 'wb') as outfile:
    pkl.dump(np.array(try_neighbor_curves), outfile)

In [None]:
# test_curves = pkl.load(open(toppath+'flexible_envirocurves/zslice_normed_curves_'+test_key+'.pkl', 'rb'))

In [None]:
# test_curves

# scratch below here

In [None]:
# Empty table with one column per evaluation radius.
all_curves = pd.DataFrame(columns=distance_evaluation_points)
all_curves = all_curves.fillna(0)

In [None]:
# Square matrix with one row and column per distinct first index in ``test``,
# initialised to NaN.
n_gal = len(np.unique(test[0]))
sparse_arr = np.full((n_gal, n_gal), np.nan)

In [None]:
def fill_sparse(ind):
    """Store the separation of pair number ``ind`` in ``sparse_arr``.

    Parameters
    ----------
    ind : int
        Position of the pair within the three parallel sequences in ``test``.

    Returns
    -------
    numpy.ndarray
        The module-level ``sparse_arr`` after the write.
    """
    row = test[0][ind]
    col = test[1][ind]
    sparse_arr[row, col] = test[2][ind].value
    return sparse_arr

In [None]:
# Fill the matrix in this process with one vectorised assignment.  Mapping
# fill_sparse over a process pool only mutated each worker's private copy of
# ``sparse_arr`` (and shipped a full matrix back per pair), leaving this
# process's array all NaN.
# Assumes test[0] and test[1] are integer index arrays and test[2] exposes
# ``.value``, as fill_sparse uses them -- TODO confirm.
n_pair = len(test[0])
sparse_arr[test[0], test[1]] = test[2].value

In [None]:
# Show the current contents of the pair-separation matrix.
sparse_arr

In [None]:
# Wrap the matrix in a DataFrame, convert it to a sparse float dtype whose
# fill value is NaN, save it as CSV, then display it.
sparse_df = pd.DataFrame(sparse_arr)
sparse_df = sparse_df.astype(pd.SparseDtype("float", np.nan))
sparse_df.to_csv('sparse_neighbors_G12.csv')
sparse_df

In [None]:
# Count the NaN entries in each column.
sparse_df.isna().sum()

In [None]:
# Same conversion as above, kept under a separate name, then displayed.
sparse_test = pd.DataFrame(sparse_arr)
sparse_test = sparse_test.astype(pd.SparseDtype("float", np.nan))
sparse_test

In [None]:
# NOTE(review): unfinished scratch cell.  The first assignment has no
# right-hand side and the function header below has neither a colon nor a
# body, so this cell does not parse as written.  'G15' is also not a key of
# ``easy_inds`` while the G15 cut in ``gf`` stays commented out.
easy_inds['G15']['neighbor_info'] = 
easy_inds['G15']['neighbor_ids'] = []
def fetch_neighbors()

In [None]:
# Pull the galaxy identifiers for each field.
for key in gf:
    # The loop variable is ``key``; ``field`` is not defined at this level.
    small_piece = gf[key]
    # CATAID was made the index when the catalogue was read, so move it back
    # into a column before selecting it.
    just_ids = small_piece.reset_index()['CATAID']


In [None]:
def collect_neighbors(field):
    """Load the pickled neighbour pairs for one field.

    Parameters
    ----------
    field : str
        Field name used in the pickle's file name.

    Returns
    -------
    object
        Whatever was pickled to
        ``toppath + 'flexible_envirocurves/neighbors_<field>_allz.pkl'``.
    """
    # Context manager closes the file; the loaded data is returned instead of
    # being dropped when the function ends.
    with open(toppath+'flexible_envirocurves/neighbors_'+field+'_allz.pkl', 'rb') as infile:
        near_dists = pkl.load(infile)
    return near_dists

## scratch below here

In [None]:
# np.array([output[0], output[1]]).T

In [None]:
# all_q.columns

In [None]:
# print(len(all_q))

In [None]:
# print(len(high_q))

In [None]:
# coords1 = ap.coordinates.SkyCoord(small_piece['RA'], small_piece['DEC'], unit='deg')

In [None]:
# small_piece = df.sample(200)
# coords2 = ap.coordinates.SkyCoord(small_piece['RA'], small_piece['DEC'], unit='deg')

In [None]:
# plt.scatter(coords1.ra, coords1.dec)
# plt.scatter(coords2.ra, coords2.dec)

In [None]:
# # warning: slow!!! (not actually that slow)
# output = ap.coordinates.search_around_sky(coords1, coords1, maxang)

In [None]:
# for o in output:
#     print((len(o), o))

In [None]:
# plt.hist(output[2].value, bins=100)

In [None]:
# plt.hist(output[0], bins=len(np.unique(output[0])))