# Quantifying galaxy environment

notebook by _Alex Malz (GCCL@RUB)_, _Kara Ponder (UC Berkeley)_, _Ben Moews (Edinburgh)_, add your name here

In [None]:
from astropy.io import fits
import corner
import environment as galenv
import matplotlib.pyplot as plt
import multiprocessing as mp
import numpy as np
import os
import pandas as pd

%matplotlib inline
# Seed NumPy's global random number generator so that random draws made with
# np.random.* are reproducible.  (Assigning `np.seed = 42` only creates an
# unused attribute on the numpy module and seeds nothing.)
np.random.seed(42)

We want to get the spectra of galaxies matching conditions found [here](http://www.gama-survey.org/dr3/schema/table.php?id=31).

In [None]:
# Read the table stored in HDU 1 of SpecObj.fits into the DataFrame `zdf`.
with fits.open('SpecObj.fits') as hdul:
    hdul.info()
#     print(hdul[1].header)
    spec_table = np.array(hdul[1].data)
    # Swap the bytes and flip the byte-order label of the dtype to match, so the
    # values are unchanged (presumably FITS big-endian -> native; pandas needs
    # native byte order).  ndarray.newbyteorder() was removed in NumPy 2.0;
    # viewing with dtype.newbyteorder() is the equivalent that works on both
    # old and new NumPy.
    zdf = pd.DataFrame(spec_table.byteswap().view(spec_table.dtype.newbyteorder()))
    print(zdf.columns)
#     df.index = df['CATAID']

GAMA did calculate some environment measures for us, but only on a small subset of galaxies.

In [None]:
# with fits.open('EnvironmentMeasures.fits') as hdul:
#     hdul.info()
# #     print(hdul[1].header)
#     envdf = pd.DataFrame(np.array(hdul[1].data).byteswap().newbyteorder())
#     print(envdf.columns)
# #     envdf.index = envdf['CATAID']

In [None]:
# df = pd.merge(envdf, zdf, on='CATAID')
# The merge with the environment-measures table is disabled, so the working
# catalogue is the spectroscopic table itself (df and zdf name the same object).
df = zdf

## Select spectra by redshift and field

Each galaxy in the GAMA catalog has a spectroscopically confirmed redshift.  
We're going to match these redshifts to the snapshots of the particle data.

In [None]:
# Snapshot redshifts that the galaxy redshifts are binned around.
z_SLICS = np.array([0.042, 0.080, 0.130, 0.221, 0.317, 0.418, 0.525, 0.640, 0.764, 0.897,
                    1.041, 1.199, 1.372, 1.562, 1.772, 2.007, 2.269, 2.565, 2.899])
# Interior bin edges lie halfway between consecutive snapshot redshifts.
z_mids = (z_SLICS[1:] + z_SLICS[:-1]) / 2.
# Close the first and last bins with the smallest and largest catalogue redshift.
# Both outer edges are added in a single step: inserting the minimum and then
# appending the maximum to z_mids (instead of to the result of the insert)
# silently drops the lower edge and leaves one bin too few.
z_bins = np.concatenate(([df['Z'].min()], z_mids, [df['Z'].max()]))
print(z_bins)
plt.hist(df['Z'], bins=z_bins)
plt.semilogy()
plt.xlabel('redshift')
plt.ylabel('number of galaxies')

The redshift histogram is skewed because `z=10` is used as a placeholder for galaxies without a secure redshift.  
GAMA has a quality flag we can use to filter for redshifts that were considered of sufficient quality for science use, which they define as `NQ > 2`.

In [None]:
# Uniform bins spanning the same redshift range as z_bins; the bin width is
# taken to be the value of z_bins[1].
bin_step = z_bins[1]
moar_bins = np.arange(z_bins[0], z_bins[-1] + bin_step, bin_step)
# Overlay one translucent redshift histogram per quality flag value NQ = 1..5.
for flag in range(1, 6):
    quality = df.loc[df['NQ'] == flag, 'Z']
    plt.hist(quality, bins=moar_bins, alpha=0.5, label=str(flag))
plt.semilogy()
plt.legend(loc='upper right')
plt.xlim(moar_bins[0], moar_bins[-1])
plt.ylabel('number of galaxies')
plt.xlabel('Z')
plt.title('redshift distributions by quality flag "NQ"')

GAMA observed galaxies in four disjoint regions of the sky.
Since environment is about the immediate vicinity of each galaxy, we'll have to divide the galaxies by region, effectively building our redshift-environment-color distribution separately for each region before combining those findings.

In [None]:
# Corner plot of the sky positions: one row per galaxy, columns (RA, DEC).
sky_positions = np.column_stack((df['RA'], df['DEC']))
corner.corner(sky_positions, labels=['RA', 'DEC'], show_titles=True)

In [None]:
# Right-ascension edges used to split the catalogue into separate sky regions.
RA_bin_ends = [0., 80., 160., 200., 360.]
subsamples, lens = [], []
# Parts of the selection that are the same for every bin are built once.
kept_columns = ['CATAID', 'RA', 'DEC', 'Z', 'NQ']
good_quality = df['NQ'] > 2
# Redshift bins form the outer loop and RA regions the inner loop.  Every
# half-open [low, high) cell that contains at least one galaxy is kept, and
# its size is recorded at the same position in `lens`.
for z_low, z_high in zip(z_bins[:-1], z_bins[1:]):
    in_z_bin = (df['Z'] >= z_low) & (df['Z'] < z_high)
    for ra_low, ra_high in zip(RA_bin_ends[:-1], RA_bin_ends[1:]):
        in_ra_bin = (df['RA'] >= ra_low) & (df['RA'] < ra_high)
        subsample = df.loc[in_ra_bin & good_quality & in_z_bin, kept_columns]
        if len(subsample) > 0:
            lens.append(len(subsample))
            subsamples.append(subsample)

In [None]:

# Number of galaxies in each retained (redshift bin, RA region) subsample.
print(lens)

In [None]:
# Position (in `lens`, and therefore in `subsamples`) of the largest subsample.
chosen_field = np.asarray(lens).argmax()
print(chosen_field)

In [None]:
# Positions of the chosen subsample as an (N, 2) array with columns (DEC, RA).
chosen_subsample = subsamples[chosen_field]
datum = np.column_stack((chosen_subsample['DEC'], chosen_subsample['RA']))

## Number of neighbors within a distance

Within each field, we can quantify the density of the local region around each galaxy, which is really what the notion of "galaxy environment" is getting at.
We're going to use the number of neighboring galaxies at each of several given distances in angular coordinates, so as not to incur the computational cost of calculating the distances between all galaxies.

In [None]:
# Show the built-in help for the `environment` module (imported as galenv).
help(galenv)

### Choose some reasonable radii

Our angular positions are in degrees.
The distances will be in bogus units because the code normalizes to the radius of the Earth, but we only need the number of neighbors within an angular distance, so it should be fine.

In [None]:
# Pick one galaxy of the chosen subsample at random to try the radii on.
chosen_ind = np.random.randint(0, len(datum), 1)[0]
print(chosen_ind)
# Ten log-spaced radii between 0.01 and 0.1, ordered from largest to smallest.
try_distances = np.geomspace(0.01, 0.1, 10)[::-1]
target = datum[chosen_ind]
res = []
friends = datum
for radius in try_distances:
    # Each search runs only on the matches of the previous, larger radius.
    friends = galenv.nn_finder(friends, target, radius)
    # One is subtracted from the match count -- presumably to leave out the
    # chosen galaxy itself; confirm against galenv.nn_finder.
    res.append(len(friends) - 1)

In [None]:
# Neighbor count of the randomly chosen galaxy as a function of search radius
# (`res` holds one count per entry of `try_distances`).
plt.plot(try_distances, res)
plt.xlabel('distance in angular coordinates')
plt.ylabel('number of neighbors within distance')

### Examine distribution of environment values

In [None]:
def calc_env(ind):
    """Return the CATAID of one galaxy followed by its match count per radius.

    The function takes only a row index so that it can be handed to
    ``pool.map``; everything else is read from module-level names that must
    already be set when it is called: ``subsamples`` and ``s`` (which
    subsample is being processed), ``data`` (the position array of that
    subsample) and ``try_distances`` (the radii to search).

    Parameters
    ----------
    ind : int
        Row of ``data`` (and of ``subsamples[s]``) to evaluate.

    Returns
    -------
    list
        ``[CATAID, count_0, count_1, ...]`` with one count per entry of
        ``try_distances``, in the same order.  Each count is the length of
        what ``galenv.nn_finder`` returns, with nothing subtracted.
    """
    catid = subsamples[s]['CATAID'].values[ind]
    target = data[ind]
    counts = []
    remaining = data
    for radius in try_distances:
        # Search only among the matches of the previous radius.
        # NOTE(review): assumes nn_finder returns the subset of points within
        # `radius` of `target` -- confirm in the environment module.
        remaining = galenv.nn_finder(remaining, target, radius)
        counts.append(len(remaining))
    return [catid] + counts

Only execute this cell once, because it's slow!

In [None]:
# do this for all 4 fields and 19 redshifts separately
# Leave one core free, but always keep at least one worker: mp.Pool(0) raises
# ValueError on a single-core machine.
n_workers = max(1, mp.cpu_count() - 1)
all_envs = []
for s in range(len(subsamples)):
#     print(lens[s])
    if lens[s] == 1:
        # A single-galaxy subsample: record a count of 1 at every radius
        # without running the search.
        envs_in_field = [[subsamples[s]['CATAID'].values[0]] + [1] * len(try_distances)]
    else:
        data = np.vstack((subsamples[s]['DEC'], [subsamples[s]['RA']])).T
        # calc_env reads the globals `s` and `data`, so the pool is created
        # only after they are set for this subsample (this relies on workers
        # inheriting the parent's globals, i.e. the 'fork' start method).
        # The with-block shuts the workers down once the map has finished,
        # so pools no longer pile up across iterations.
        with mp.Pool(n_workers) as pool:
            envs_in_field = pool.map(calc_env, range(len(data)))
    # Extend in place rather than rebuilding the whole list every iteration.
    all_envs.extend(envs_in_field)

In [None]:
# Tabulate the results: the first column is CATAID, followed by one column per
# search radius, each named by the string form of that radius.
envs_arr = np.array(all_envs)
count_columns = [str(radius) for radius in try_distances]
envs_df = pd.DataFrame(
    data=envs_arr,
    index=envs_arr[:, 0],
    columns=['CATAID'] + count_columns,
)

# Join the spectroscopic columns onto the counts and cache the result on disk.
df = pd.merge(envs_df, zdf, on='CATAID')
df.to_csv('enviros.csv')

After you have run the cells above once, comment them out and run the following cell instead to skip the slow step.

In [None]:
# Reload the table that was cached in 'enviros.csv'.
zenvdf = pd.read_csv('enviros.csv')

In [None]:
# List the columns available in the reloaded table.
zenvdf.columns

Now we have the number of neighbors for all galaxies!

In [None]:
# One violin per search radius, ordered from the smallest radius to the largest.
# The counts are read from `zenvdf` (the table reloaded from 'enviros.csv')
# rather than from `envs_df`, which only exists in a session where the slow
# computation was actually run; the count columns are named by str(radius)
# in both tables.
orig_distances = np.flip(try_distances)
for position, radius in enumerate(orig_distances):
    plt.violinplot(zenvdf[str(radius)], positions=[position])
plt.xticks(range(len(orig_distances)), np.around(orig_distances, 3))
plt.semilogy()
plt.xlabel('distance')
plt.ylabel('number of neighbors')
plt.ylim(0.99, 10)

It remains to be seen whether the multimodal distributions in some bins are physical or not.
(They look like a problem with smoothing over a discrete variable.)
We'll try plotting them as a function of field and redshift next.

# Next step:

## Constructing the redshift-environment-SED/color relationship

In [None]:
# phodf = pd.read_csv('SpecObjPhot/SpecObjPhot_0.080.csv')

In [None]:
# phodf.columns

In [None]:
# df = pd.merge(phodf, zenvdf, on='CATAID')

In [None]:
# df.columns

In [None]:
# # whichdata = ['Z']
# # corner.corner(np.array([df['RA'], df['DEC']]).T, labels=['RA', 'DEC'], show_titles=True)