# Finding duplicate stars
Filename: KS2_dubplicates.ipynb

This notebook is dedicated to identifying duplicate detections in the KS2 database. A duplicate detection is defined as one where different names ("NMAST" values) are used to define the same object, either in the same exposure, or in different exposures.

An object is identified by its flux and position. The position can either be in the individual exposure, or in the master frame.

In [2]:
from astropy.io import fits
import pandas as pd

In [3]:
from pathlib import Path

In [4]:
from importlib import reload
from utils import ks2_utils, shared_utils, image_utils

In [5]:
import re
import numpy as np

# INPUT.KS2

In [6]:
ks2_filtermapper = ks2_utils.get_filter_mapper()

In [7]:
ks2_filemapper = ks2_utils.get_file_mapper()

# LOGR.XYVIQ1

LOGR.XYVIQ1 gives the average position for each source on the master frame (cols 1 and 2), the average flux (cols 5 and 11), the flux sigma (cols 6 and 12), and fit quality (cols 7 and 13) in each filter)


In [74]:
reload(ks2_utils)
master_catalog_df = ks2_utils.get_master_catalog(raw=True)

In [9]:
master_catalog_df.head()

Unnamed: 0,umast0,vmast0,mmast1,NMAST,zmast1,szmast1,q1,o1,f1,g1,zmast2,szmast2,q2,o2,f2,g2
0,736.37,91.51,-2.3045,R0000001,8.352,2.586,0.9,0.01,9,9,14.088,1.538,0.963,0.01,9,9
1,737.04,34.36,-3.2771,R0000002,20.458,2.26,0.899,0.0,9,9,9.987,2.508,0.909,0.0,16,16
2,740.65,61.26,-5.9669,R0000003,243.655,3.332,0.999,0.01,9,9,177.85,2.386,0.999,0.02,9,9
3,745.91,93.19,-2.9646,R0000004,15.34,1.092,0.968,0.01,9,9,19.786,2.601,0.982,0.01,9,9
4,749.93,124.75,-4.4803,R0000005,61.959,3.305,0.997,0.0,9,9,84.2,2.85,0.998,0.0,18,18


# LOGR.FIND_NIMFO


In [11]:
reload(ks2_utils)
point_sources_df = ks2_utils.get_point_source_catalog(raw=True)

  if (await self.run_code(code, result,  async_=asy)):


In [12]:
point_sources_df.head(5)

Unnamed: 0,umast,vmast,magu,utile,vtile,z0,sz0,f0,g0,u1,...,o3,f3,g3,NMAST,ps_tile_id,tile_id,master_exp_id,filt_id,unk,chip_id
0,736.424,91.482,-2.0238,0.0,0.0,6.5,0.8,0,1,31.849,...,0.0,1,1,R0000001,L005,N001,G004,F1,C,1
1,736.459,91.452,-1.4531,0.0,0.0,3.7,0.8,0,1,31.918,...,0.0,1,1,R0000001,L005,N002,G006,F1,C,1
2,736.323,91.491,-2.545,0.0,0.0,10.4,0.8,0,1,31.646,...,0.0,1,1,R0000001,L005,N003,G007,F1,C,1
3,736.306,91.445,-2.2584,0.0,0.0,8.3,0.8,0,1,31.611,...,0.0,1,1,R0000001,L005,N004,G009,F1,C,1
4,736.398,91.671,-2.5036,0.0,0.0,9.6,0.8,0,1,31.797,...,0.0,1,1,R0000001,L005,N005,G010,F1,C,1


# Generate distance matrix

In [13]:
exp_gb = point_sources_df.groupby("master_exp_id")
exp_df = exp_gb.get_group(list(exp_gb.groups)[0])

In [14]:
reload(ks2_utils)
dist_mat = ks2_utils.generate_exposure_distance_matrix(exp_df, same_nan=True)

In [15]:
# make a list of all the sources that have dist = 0
np.tril(dist_mat, k=0)[:4,:4]

array([[         nan,   0.        ,   0.        ,   0.        ],
       [ 43.62931268,          nan,   0.        ,   0.        ],
       [ 33.27131873,  10.37003013,          nan,   0.        ],
       [156.18065565, 112.56791428, 122.93750628,          nan]])

In [16]:
tril_ind = np.tril_indices_from(dist_mat, k=-1)
dist_mat.values[tril_ind[0], tril_ind[1]]

array([43.62931268, 33.27131873, 10.37003013, ..., 21.44514222,
       16.92549925, 24.70643641])

In [17]:
# neighbors = dist_mat[(dist_mat[row['NMAST']] < 100) & (dist_mat[row['NMAST']] > 0) ][row['NMAST']]
# neighbors.index

In [18]:
samesies = []
for i, row in dist_mat.iterrows():
    tmp = row[row == 0]
    if len(tmp) > 0:
#         print(tmp.name, tmp.keys().values)
        samesies.append(set([tmp.name] + list(tmp.keys().values)))

In [19]:
# make a set out of each key, pair
samesies[0]

{'R0001452', 'R0001454'}

In [20]:
# fig, ax = plt.subplots(1, 1)

# hist_args = {'histtype':'step'}
# tril_ind = np.tril_indices(dist_mat.shape[0])
# hist = ax.hist(dist_mat.values[tril_ind[0], tril_ind[1]], bins=50, **hist_args, label='no cuts');

# tril_ind = np.tril_indices(dist_mat.loc[cut1_df['NMAST']][cut1_df['NMAST']].shape[0])
# ax.hist(dist_mat.loc[cut1_df['NMAST']][cut1_df['NMAST']].values[tril_ind[0], tril_ind[1]],
#         bins=hist[1], **hist_args, label='q,z > 0');

# tril_ind = np.tril_indices(dist_mat.loc[cut2_df['NMAST']][cut2_df['NMAST']].shape[0])
# ax.hist(dist_mat.loc[cut1_df['NMAST']][cut2_df['NMAST']].values[tril_ind[0], tril_ind[1]],
#         bins=hist[1], **hist_args, label='q >= 0.95');

# ax.legend()
# ax.set_xlabel("Pairwise distance [pixels]")
# ax.set_ylabel("N pairs")

In [21]:
# check R0005193 and R0005195
star_ids = ["R0005193", "R0005195"]
for star_id in star_ids:
    print(i, point_sources_df.query("NMAST == @star_id").shape[0])

R0006552 27
R0006552 27


In [22]:
unique_samesies = []
for i in samesies:
    if i in unique_samesies:
        pass
    else:
        unique_samesies.append(i)

In [52]:
unique_samesies;

In [24]:
for star_ids in unique_samesies:
    #print("New set\n")
    for i in star_ids:
        ndet = point_sources_df.query("NMAST == @i")['NMAST'].size
        q_avg = point_sources_df.query("NMAST == @i")[['q1','q2','q3']].mean().mean()
        q_std = point_sources_df.query("NMAST == @i")[['q1','q2','q3']].std().std()
        z_avg = point_sources_df.query("NMAST == @i")[['z1','z2','z3']].mean().mean()
        z_std = point_sources_df.query("NMAST == @i")[['z1','z2','z3']].std().std()
        print(f"{i}\t{ndet}\t{q_avg:0.2f} +/- {q_std:0.2f}\t{z_avg:0.2f} +/- {z_std:0.2f}")
    print("\n")

R0001452	54	1.00 +/- 0.00	849.53 +/- 1.85
R0001454	54	1.00 +/- 0.00	849.53 +/- 1.85


R0001499	18	1.00 +/- 0.00	201.39 +/- 1.43
R0001498	18	1.00 +/- 0.00	201.39 +/- 1.43


R0001715	45	1.00 +/- 0.00	203.55 +/- 1.00
R0001717	45	1.00 +/- 0.00	203.55 +/- 1.00


R0001949	15	1.00 +/- 0.00	405.82 +/- 1.70
R0001948	15	1.00 +/- 0.00	405.82 +/- 1.70


R0001955	27	0.99 +/- 0.00	40.18 +/- 0.29
R0001956	27	0.99 +/- 0.00	40.18 +/- 0.29


R0001968	27	1.00 +/- 0.00	902.27 +/- 4.53
R0001967	27	1.00 +/- 0.00	902.27 +/- 4.53


R0005690	27	0.76 +/- 0.07	5.56 +/- 0.58
R0002556	54	0.59 +/- 0.06	8.79 +/- 0.93
R0003733	54	0.73 +/- 0.06	25.96 +/- 4.17
R0003271	54	0.42 +/- 0.08	1.36 +/- 0.96
R0003754	54	0.58 +/- 0.06	2.36 +/- 0.20
R0002182	54	0.57 +/- 0.05	4.09 +/- 1.02
R0004799	54	0.43 +/- 0.07	1.33 +/- 0.85
R0003742	54	0.47 +/- 0.07	1.79 +/- 0.46
R0004536	54	0.81 +/- 0.12	10.95 +/- 1.48
R0002535	54	0.52 +/- 0.07	2.32 +/- 0.18
R0002533	54	0.70 +/- 0.06	3.88 +/- 0.48
R0004545	27	0.79 +/- 0.04	5.01 +/- 0.31
R000

In [53]:
point_sources_df.query("master_exp_id == 'G001'").query("NMAST == 'R0006533' or NMAST == 'R0006535'").T;

In [26]:
dist_mat.loc['R0006533', 'R0006535']

0.0

Wait, these are actual duplicates of each other

# Removing identical entries
These entries are identical except for the actual star identifiers

In [27]:
cols = list(point_sources_df.columns)
for i in ['NMAST', 'ps_tile_id', 'tile_id', 'master_exp_id', 'filt_id', 'unk', 'chip_id']:
    cols.pop(cols.index(i))

point_sources_df.drop_duplicates(subset=cols, keep='first')

Unnamed: 0,umast,vmast,magu,utile,vtile,z0,sz0,f0,g0,u1,...,o3,f3,g3,NMAST,ps_tile_id,tile_id,master_exp_id,filt_id,unk,chip_id
0,736.424,91.482,-2.0238,0.0,0.0,6.5,0.8,0,1,31.849,...,0.0,1,1,R0000001,L005,N001,G004,F1,C,1
1,736.459,91.452,-1.4531,0.0,0.0,3.7,0.8,0,1,31.918,...,0.0,1,1,R0000001,L005,N002,G006,F1,C,1
2,736.323,91.491,-2.5450,0.0,0.0,10.4,0.8,0,1,31.646,...,0.0,1,1,R0000001,L005,N003,G007,F1,C,1
3,736.306,91.445,-2.2584,0.0,0.0,8.3,0.8,0,1,31.611,...,0.0,1,1,R0000001,L005,N004,G009,F1,C,1
4,736.398,91.671,-2.5036,0.0,0.0,9.6,0.8,0,1,31.797,...,0.0,1,1,R0000001,L005,N005,G010,F1,C,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204678,1123.238,1594.778,-0.9718,0.0,0.0,-0.4,0.6,0,1,85.476,...,0.0,1,1,R0006552,L013,N021,G043,F2,C,1
204679,1123.163,1594.772,-1.9953,0.0,0.0,5.3,0.6,0,1,85.327,...,0.0,1,1,R0006552,L013,N022,G047,F2,C,1
204680,1123.230,1594.817,-1.4931,0.0,0.0,8.4,0.6,0,1,85.460,...,0.0,1,1,R0006552,L013,N023,G048,F2,C,1
204681,1123.607,1594.519,-2.4661,0.0,0.0,8.8,0.7,0,1,86.214,...,0.0,1,1,R0006552,L013,N024,G049,F2,C,1


In [28]:
point_sources_df.drop_duplicates(subset=cols, keep=False)

Unnamed: 0,umast,vmast,magu,utile,vtile,z0,sz0,f0,g0,u1,...,o3,f3,g3,NMAST,ps_tile_id,tile_id,master_exp_id,filt_id,unk,chip_id
0,736.424,91.482,-2.0238,0.0,0.0,6.5,0.8,0,1,31.849,...,0.0,1,1,R0000001,L005,N001,G004,F1,C,1
1,736.459,91.452,-1.4531,0.0,0.0,3.7,0.8,0,1,31.918,...,0.0,1,1,R0000001,L005,N002,G006,F1,C,1
2,736.323,91.491,-2.5450,0.0,0.0,10.4,0.8,0,1,31.646,...,0.0,1,1,R0000001,L005,N003,G007,F1,C,1
3,736.306,91.445,-2.2584,0.0,0.0,8.3,0.8,0,1,31.611,...,0.0,1,1,R0000001,L005,N004,G009,F1,C,1
4,736.398,91.671,-2.5036,0.0,0.0,9.6,0.8,0,1,31.797,...,0.0,1,1,R0000001,L005,N005,G010,F1,C,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204678,1123.238,1594.778,-0.9718,0.0,0.0,-0.4,0.6,0,1,85.476,...,0.0,1,1,R0006552,L013,N021,G043,F2,C,1
204679,1123.163,1594.772,-1.9953,0.0,0.0,5.3,0.6,0,1,85.327,...,0.0,1,1,R0006552,L013,N022,G047,F2,C,1
204680,1123.230,1594.817,-1.4931,0.0,0.0,8.4,0.6,0,1,85.460,...,0.0,1,1,R0006552,L013,N023,G048,F2,C,1
204681,1123.607,1594.519,-2.4661,0.0,0.0,8.8,0.7,0,1,86.214,...,0.0,1,1,R0006552,L013,N024,G049,F2,C,1


In [29]:
point_sources_df[point_sources_df.duplicated(subset=cols, keep=False)]['NMAST'].unique().size

716

In [30]:
cols = list(master_catalog_df.columns)
cols.pop(cols.index("NMAST"))
unique_ind = master_catalog_df.drop_duplicates(subset=cols).index

In [31]:
dup_ind = set.difference(set(master_catalog_df.index), set(unique_ind))

In [32]:
master_catalog_df.loc[dup_ind]

Unnamed: 0,umast0,vmast0,mmast1,NMAST,zmast1,szmast1,q1,o1,f1,g1,zmast2,szmast2,q2,o2,f2,g2
4100,1410.26,975.32,-8.4633,R0004101,2428.495,58.838,0.999,1.31,9,9,3125.836,34.168,0.999,1.76,18,18
5,749.93,124.75,-4.4803,R0000006,61.959,3.305,0.997,0.00,9,9,84.200,2.850,0.998,0.00,18,18
7,754.79,70.36,-8.5803,R0000008,2704.709,35.819,1.000,0.13,5,5,3227.365,47.837,1.000,0.19,9,9
4113,1435.88,914.27,-7.2660,R0004114,806.117,10.678,1.000,7.20,9,9,963.330,14.263,0.999,7.39,18,18
22,834.77,43.73,-8.3215,R0000023,2131.079,17.199,1.000,0.15,9,9,2200.292,131.793,1.000,0.12,18,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4031,1187.23,937.41,-6.6186,R0004032,444.038,5.796,0.999,0.00,9,9,458.005,12.369,0.999,0.01,18,18
4036,1195.24,975.72,-7.2780,R0004037,815.059,14.309,0.999,0.30,9,9,957.949,47.928,0.999,0.28,18,18
4039,1197.25,969.12,-7.6728,R0004040,1172.524,14.749,0.999,0.18,9,9,1427.434,16.134,0.999,0.28,18,15
4040,1197.74,860.15,-8.1497,R0004041,1819.263,14.158,1.000,0.16,9,9,1993.288,29.039,1.000,0.13,18,18


In [33]:
6552-6194

358

In [34]:
cols = list(master_catalog_df.columns)
cols.pop(cols.index("NMAST"))
dups = master_catalog_df[master_catalog_df.duplicated(subset=cols, keep=False)]

In [35]:
gb = dups.groupby(cols)

In [36]:
len(gb.groups)

358

In [37]:
tmp = [list(gb.get_group(group)['NMAST'].values)
       for group, index in gb.groups.items()]
# pairs = {pair[0]: pair for pair in pairs}
# pairs = [[(pair[0], p) for p in pair] for pair in pairs]

In [38]:
pairs = []
# for pair in tmp:
#     for p in pair:
#         pairs[pair[0]] = p
for group, index in gb.groups.items():
    names = list(gb.get_group(group)['NMAST'].values)
    key = names[0]
    for n in names:
        pairs.append((n, key))

In [54]:
pairs;

In [40]:
# columns: 'name' contains the original name, 'main_id' is the one you'll be keeping
pairs_df = pd.DataFrame(pairs, columns=['name','main_id'])

In [41]:
pairs_df.head()

Unnamed: 0,name,main_id
0,R0002708,R0002708
1,R0002711,R0002708
2,R0001942,R0001942
3,R0001943,R0001942
4,R0002709,R0002709


In [42]:
point_sources_df.columns

Index(['umast', 'vmast', 'magu', 'utile', 'vtile', 'z0', 'sz0', 'f0', 'g0',
       'u1', 'v1', 'x1', 'y1', 'xraw1', 'yraw1', 'z1', 'sz1', 'q1', 'o1', 'f1',
       'g1', 'x0', 'y0', 'z2', 'sz2', 'q2', 'o2', 'f2', 'g2', 'z3', 'sz3',
       'q3', 'o3', 'f3', 'g3', 'NMAST', 'ps_tile_id', 'tile_id',
       'master_exp_id', 'filt_id', 'unk', 'chip_id'],
      dtype='object')

In [43]:
point_sources_df[['NMAST', 'ps_tile_id', 'tile_id', 'master_exp_id', 'filt_id', 'unk', 'chip_id']].head()

Unnamed: 0,NMAST,ps_tile_id,tile_id,master_exp_id,filt_id,unk,chip_id
0,R0000001,L005,N001,G004,F1,C,1
1,R0000001,L005,N002,G006,F1,C,1
2,R0000001,L005,N003,G007,F1,C,1
3,R0000001,L005,N004,G009,F1,C,1
4,R0000001,L005,N005,G010,F1,C,1


In [44]:
master_catalog_df.head()

Unnamed: 0,umast0,vmast0,mmast1,NMAST,zmast1,szmast1,q1,o1,f1,g1,zmast2,szmast2,q2,o2,f2,g2
0,736.37,91.51,-2.3045,R0000001,8.352,2.586,0.9,0.01,9,9,14.088,1.538,0.963,0.01,9,9
1,737.04,34.36,-3.2771,R0000002,20.458,2.26,0.899,0.0,9,9,9.987,2.508,0.909,0.0,16,16
2,740.65,61.26,-5.9669,R0000003,243.655,3.332,0.999,0.01,9,9,177.85,2.386,0.999,0.02,9,9
3,745.91,93.19,-2.9646,R0000004,15.34,1.092,0.968,0.01,9,9,19.786,2.601,0.982,0.01,9,9
4,749.93,124.75,-4.4803,R0000005,61.959,3.305,0.997,0.0,9,9,84.2,2.85,0.998,0.0,18,18


get a list of the remaining point sources

In [146]:
ps_cols = list(point_sources_df.columns)
for i in ['NMAST', 'g0','g1','g2','g3', 'ps_tile_id', 'tile_id', 'master_exp_id', 'filt_id', 'unk', 'chip_id']:
    ps_cols.pop(ps_cols.index(i))

ps_cat_nodup = point_sources_df.drop_duplicates(subset=ps_cols, keep='first')
ps_cat_dups = point_sources_df[point_sources_df.duplicated(subset=ps_cols, keep=False)]

In [147]:
ps_cat_nodup['NMAST'].unique().size

6194

In [148]:
import time

In [149]:
import itertools

In [150]:
t0 = time.time()
ps_gb = point_sources_df.groupby(ps_cols)
# name_sets = []
# for k, v in ps_gb.groups.items():
#     if len(v) > 1:
#         names = point_sources_df.loc[v, 'NMAST'].values
#         name_pairs = [tuple(i) for i in list(itertools.product([names[0]], names))]
#         for n in name_pairs:
#             name_sets.append(n)
# # name_sets = set(name_sets)
# # uniqify
# name_sets = pd.DataFrame(data=sorted(list(set(name_sets))), columns=['primary_id','alias'])
# t1 = time.time()
# print(f"{(t1-t0):0.2f}")

In [151]:
keep_indices = [sorted(ind)[0] for ind in ps_gb.groups.values()]
np.shape(keep_indices)

(194629,)

# OK, I have it! 

Everything that works is wrapped up in ks2_utils.remove_duplicates(). Run it and write the new catalogs to file.

In [152]:
reload(ks2_utils)

<module 'utils.ks2_utils' from '/user/jaguilar/tr14/tr14/utils/ks2_utils.py'>

In [153]:
point_sources_df = ks2_utils.get_point_source_catalog(raw=True)
master_catalog_df = ks2_utils.get_master_catalog(raw=True)
ps_cat, mast_cat = ks2_utils.remove_duplicates(point_sources_df, master_catalog_df)

  if (await self.run_code(code, result,  async_=asy)):


In [154]:
ps_dups = ps_cat[ps_cat.duplicated(subset=ps_cols, keep=False)]

In [155]:
ps_dups.empty

True

In [156]:
dups = mast_cat[mast_cat.duplicated(subset=mast_cols, keep=False)]['NMAST']

In [160]:
dups.empty

True

In [161]:
ks2_utils.ks2_files

[PosixPath('/user/jaguilar/tr14/data/ks2/LOGR.XYVIQ1'),
 PosixPath('/user/jaguilar/tr14/data/ks2/LOGR.FIND_NIMFO'),
 PosixPath('/user/jaguilar/tr14/data/ks2/INPUT.KS2')]

In [162]:
csv_args = {'index':False, "na_rep":None, "header":True}

In [163]:
ps_cat.to_csv(ks2_utils.ks2_files[1].as_posix() + '-nodup.csv', **csv_args)

In [164]:
mast_cat.to_csv(ks2_utils.ks2_files[0].as_posix() + '-nodup.csv', **csv_args)

In [165]:
pd.read_csv(ks2_utils.ks2_files[0].as_posix() + '-nodup.csv', sep=',').head()

Unnamed: 0,umast0,vmast0,mmast1,NMAST,zmast1,szmast1,q1,o1,f1,g1,zmast2,szmast2,q2,o2,f2,g2
0,736.37,91.51,-2.3045,R0000001,8.352,2.586,0.9,0.01,9,9,14.088,1.538,0.963,0.01,9,9
1,737.04,34.36,-3.2771,R0000002,20.458,2.26,0.899,0.0,9,9,9.987,2.508,0.909,0.0,16,16
2,740.65,61.26,-5.9669,R0000003,243.655,3.332,0.999,0.01,9,9,177.85,2.386,0.999,0.02,9,9
3,745.91,93.19,-2.9646,R0000004,15.34,1.092,0.968,0.01,9,9,19.786,2.601,0.982,0.01,9,9
4,749.93,124.75,-4.4803,R0000005,61.959,3.305,0.997,0.0,9,9,84.2,2.85,0.998,0.0,18,18


In [170]:
mast_cat[mast_cat.duplicated(subset=mast_cols, keep=False)].empty

True

In [169]:
ps_cat[ps_cat.duplicated(subset=cols, keep=False)].empty

True