In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from astropy.io import fits
from astropy.table import Table
import glob
import time

In [2]:
# Load the table stored in parallax_ps1_gaia_pm_plx.h5 and record the row
# positions whose pm_over_error is below 9. plx_stars keeps the tuple form
# returned by np.nonzero so it can be passed straight to plx_df.iloc later.
# NOTE(review): the threshold 9 is a bare magic number here — confirm its origin.
plx_df = pd.read_hdf('parallax_ps1_gaia_pm_plx.h5')
below_pm_threshold = (plx_df['pm_over_error'] < 9).values
plx_stars = np.nonzero(below_pm_threshold)

In [70]:
# Collect the objids found in the '../missing*miller.fit' tables that also
# appear among the rows of plx_df selected by plx_stars.
missing_files = glob.glob('../missing*miller.fit')

# The reference objid list is identical for every file, so slice plx_df once
# instead of rebuilding it on each iteration.
plx_star_objid = plx_df.iloc[plx_stars].objid.values

plx_chunks = []   # per-file matches, concatenated once after the loop
n_plx_found = 0   # running total shown in the progress line
for mf in missing_files:
    tstart = time.time()
    tmp_tbl = fits.getdata(mf)
    unique_objid = np.unique(tmp_tbl.objid)

    missing_is_plx_star = np.isin(unique_objid, plx_star_objid)

    # Cast each chunk to native int64 so the final array has the same dtype
    # as an np.int64 accumulator, whatever byte order the FITS column uses.
    matched_objid = unique_objid[missing_is_plx_star].astype(np.int64)
    plx_chunks.append(matched_objid)
    n_plx_found += len(matched_objid)
    tend = time.time()
    print(mf, n_plx_found, tend - tstart)

# A single concatenation avoids re-copying the growing array on every file.
# The leading empty array keeps this valid when no files were found.
plx_objid = np.concatenate([np.empty(0, dtype=np.int64)] + plx_chunks)

../missing_0_adamamiller.fit 227 6.627413034439087
../missing_1_adamamiller.fit 522 2.669977903366089
../missing_33_adamamiller.fit 760 8.749786853790283
../missing_3_adamamiller.fit 817 2.0258970260620117
../missing_20_1_adamamiller.fit 1422 6.498478174209595
../missing_36_adamamiller.fit 1755 7.971898794174194
../missing_5_1_adamamiller.fit 2681 5.6740899085998535
../missing_25_1_adamamiller.fit 3021 7.187233924865723
../missing_15_1_adamamiller.fit 3358 5.732959985733032
../missing_10_1_adamamiller.fit 4339 6.467685699462891
../missing_4_adamamiller.fit 4356 1.990602731704712
../missing_15_0_adamamiller.fit 4839 7.377546072006226
../missing_10_0_adamamiller.fit 5013 4.948205947875977
../missing_30_adamamiller.fit 5363 7.730467796325684
../missing_2_adamamiller.fit 5795 2.476632833480835
../missing_20_0_adamamiller.fit 6128 5.959709882736206
../missing_5_0_adamamiller.fit 6310 5.394402980804443
../missing_25_0_adamamiller.fit 6920 6.782533884048462


In [71]:
# Persist the matched parallax objids as a one-column table.
new_plx_stars = pd.DataFrame({'objid': plx_objid})
# Pass the HDF key by keyword: newer pandas deprecates positional arguments
# to to_hdf other than the path.
new_plx_stars.to_hdf('plx_stars_in_ps1_missing.h5', key='d1')

In [72]:
# Total number of objids accumulated across all files.
plx_objid.shape[0]

6920

In [73]:
# Number of distinct objids; equal to the total above when nothing is repeated.
np.unique(plx_objid).shape[0]

6920

## Find the stars selected via proper motion

In [11]:
# Load the table stored in pm_objid_stars.h5; its objid column is the
# reference list used by the proper-motion matching loop.
pm_df = pd.read_hdf('pm_objid_stars.h5')

In [22]:
# Length of the membership mask.
# NOTE(review): missing_is_pm_star is first assigned inside the proper-motion
# file loop further down, so on a fresh top-to-bottom run this cell raises
# NameError; it only works with state left over from an earlier execution.
len(missing_is_pm_star)

31662640

In [26]:
# Number of mask entries equal to 1 (i.e. flagged as matches).
# NOTE(review): depends on missing_is_pm_star, which is only assigned in the
# proper-motion loop later in the notebook.
np.count_nonzero(missing_is_pm_star == 1)

428410

In [27]:
# Row count of the FITS table currently held in tmp_tbl.
tmp_tbl.shape[0]

12772774

In [29]:
# Number of distinct objids in tmp_tbl (a smaller value than the row count
# means some objids occupy several rows).
np.unique(tmp_tbl['objid']).shape[0]

10942237

In [31]:
# Display the table to inspect its columns and dtypes.
tmp_tbl

FITS_rec([(150333149908634462, 150333149908634462, 3, 0, 0, 0),
          (150343150089180587, 150343150089180587, 3, 0, 0, 1),
          (150343150089180587, 150343150089180587, 3, 1, 0, 1), ...,
          (180933334077476280, 180933334077476280, 7, 0, 0, 0),
          (180933334228669465, 180933334228669465, 3, 0, 0, 0),
          (180943334238613707, 180943334238613707, 3, 0, 0, 0)],
         dtype=(numpy.record, [('strid', '>i8'), ('objid', '>i8'), ('nDetections', '>i2'), ('primaryDetection', 'u1'), ('stackDet', '>i4'), ('forceDet', '>i4')]))

In [32]:
# For each "missing" FITS table, keep the unique objids that are present in
# pm_df and accumulate them in pm_objid. The progress line reports the running
# total, the running number of distinct objids, and the per-file time.
pm_star_objid = pm_df.objid.values
pm_objid = np.empty(0).astype(np.int64)
for mf in missing_files:
    tstart = time.time()
    tmp_tbl = fits.getdata(mf)
    unique_objid = np.unique(tmp_tbl.objid)
    missing_is_pm_star = np.isin(unique_objid, pm_star_objid)

    matched_objid = unique_objid[missing_is_pm_star]
    pm_objid = np.concatenate((pm_objid, matched_objid))
    # Stop the clock before the (diagnostic) uniqueness count below.
    tend = time.time()
    elapsed = tend - tstart
    print(mf, len(pm_objid), len(np.unique(pm_objid)), elapsed)

../missing_0_adamamiller.fit 347853 347853 70.50142478942871
../missing_1_adamamiller.fit 597099 597099 81.0002806186676
../missing_33_adamamiller.fit 2099526 2099526 89.86220407485962
../missing_3_adamamiller.fit 2152447 2152447 79.82836198806763
../missing_20_1_adamamiller.fit 3937950 3937950 89.26474189758301
../missing_36_adamamiller.fit 6095152 6095152 85.94800782203674
../missing_5_1_adamamiller.fit 6507839 6507839 84.03519415855408
../missing_25_1_adamamiller.fit 7698989 7698989 92.32728672027588
../missing_15_1_adamamiller.fit 9791146 9791146 92.18693709373474
../missing_10_1_adamamiller.fit 11883131 11883131 89.28665709495544
../missing_4_adamamiller.fit 11924926 11924926 79.2079348564148
../missing_15_0_adamamiller.fit 13883848 13883848 2410.270602941513
../missing_10_0_adamamiller.fit 14418748 14418748 85.39474391937256
../missing_30_adamamiller.fit 15189833 15189833 87.42225193977356
../missing_2_adamamiller.fit 15340932 15340932 83.33330488204956
../missing_20_0_adamamille

In [33]:
# Number of distinct proper-motion objids collected across all files.
np.unique(pm_objid).shape[0]

18651666

In [34]:
# Persist the matched proper-motion objids as a one-column table.
new_pm_stars = pd.DataFrame({'objid': pm_objid})
# Pass the HDF key by keyword: newer pandas deprecates positional arguments
# to to_hdf other than the path.
new_pm_stars.to_hdf('pm_stars_in_ps1_missing.h5', key='d1')

In [81]:
# Merge the proper-motion and parallax selections. A few objids occur in both
# lists, so take the deduplicated union (np.union1d returns the sorted unique
# values of the two inputs combined).
star_objid = np.union1d(pm_objid, plx_objid)
new_stars = pd.DataFrame(star_objid, columns=['objid'])
# Pass the HDF key by keyword: newer pandas deprecates positional arguments
# to to_hdf other than the path.
new_stars.to_hdf('stars_in_ps1_missing.h5', key='d1')

In [14]:
# Reload the saved star list and confirm it holds no repeated objids:
# the first number printed is the count of duplicates, the second the
# count of distinct objids.
gaia_in_ps1 = pd.read_hdf('stars_in_ps1_missing.h5')
star_objid = gaia_in_ps1['objid'].values

n_unique_star_objid = len(np.unique(star_objid))
print(len(star_objid) - n_unique_star_objid)
print(n_unique_star_objid)

0
18658572


### Read in RF classifications and replace Gaia stars with score = 1

In [17]:
# For every '../update_*.csv' table, set score = 1 on the rows whose objid is
# in star_objid and write the result to the matching 'gaia_update_*' CSV.
# Matched objids are removed from star_objid as the loop proceeds, so after
# the last file star_objid holds only the objids found in none of the tables.
rf_files = glob.glob('../update_*.csv')

# Start from the full list on every execution of this cell. The loop shrinks
# star_objid, so without this reset a second run would match nothing and
# overwrite the gaia_update files with unmodified scores.
star_objid = gaia_in_ps1.objid.values

N_gaia_and_ps1 = 0
for rff in rf_files:
    tstart = time.time()
    rf_df = pd.read_csv(rff)
    already_one = int((rf_df.score == 1).sum())

    # Rows of this table whose objid is still in the star list.
    gaia_star = np.isin(rf_df.objid.values, star_objid)
    gaia_and_ps1 = int(np.count_nonzero(gaia_star))
    N_gaia_and_ps1 += gaia_and_ps1

    # Only rows not already at score 1 actually need to change.
    update_rf_score = (gaia_star & (rf_df.score != 1))
    n_updated = int(np.count_nonzero(update_rf_score))

    rf_df.loc[update_rf_score, "score"] = 1
    now_one = int((rf_df.score == 1).sum())
    rf_df.to_csv(rff.replace('update', 'gaia_update'), index=False)

    # Drop the objids matched in this table from the remaining list.
    star_objid = star_objid[~np.isin(star_objid, rf_df.objid.values[gaia_star])]
    tend = time.time()
    print(rff, len(star_objid), gaia_and_ps1,
          n_updated,
          tend-tstart)

../update_12.csv 18403590 254982 247716 22.005656242370605
../update_13.csv 18071275 332315 320059 20.963937044143677
../update_11.csv 17776859 294416 288940 21.452131748199463
../update_10.csv 17145689 631170 627165 21.512128591537476
../update_14.csv 16649040 496649 490166 27.220184087753296
../update_15.csv 16190751 458289 448386 25.2616069316864
../update_17.csv 15731036 459715 436965 21.85850191116333
../update_16.csv 14878765 852271 843700 22.153571844100952
../update_4.csv 13883655 995110 991752 22.852558135986328
../update_5.csv 13154252 729403 723355 25.13189697265625
../update_7.csv 12294765 859487 856210 24.658520936965942
../update_6.csv 11358650 936115 933271 24.727519989013672
../update_2.csv 10654533 704117 699911 25.42244601249695
../update_3.csv 9692976 961557 957175 21.370465755462646
../update_1.csv 8569539 1123437 1120544 19.574735164642334
../update_0.csv 8085193 484346 473226 19.868874073028564
../update_8.csv 7581002 504191 497556 20.13108468055725
../update_9.cs

In [18]:
# Total number of RF-table rows, summed over all files, whose objid matched
# the star list.
N_gaia_and_ps1

11427503

In [94]:
# Number of objids left in star_objid, i.e. those matched in none of the RF tables.
star_objid.shape[0]

7231069

In [95]:
# One-column table of the objids that remain in star_objid.
gaia_only = pd.DataFrame({'objid': star_objid})

In [96]:
# Give every row a score of 1.0. Assigning the scalar lets pandas broadcast it
# to the frame's own length as float64, instead of sizing an array from
# star_objid, which would fail if star_objid had changed since gaia_only was
# built. (np.ones already yields float64, so the former cast was redundant.)
gaia_only['score'] = 1.0

In [97]:
# Preview the first five rows to check the objid and score columns.
gaia_only.head(5)

Unnamed: 0,objid,score
0,66852148902550138,1.0
1,66852148907643713,1.0
2,66872148781320310,1.0
3,66882149096231177,1.0
4,66892148811184766,1.0


In [98]:
# Write the objid/score table to CSV without the index column, the same
# index=False setting used for the gaia_update_* files.
gaia_only.to_csv('../gaia_only_update.csv', index=False)

In [99]:
# Number of rows written to the CSV.
gaia_only.shape[0]

7231069

In [100]:
# Number of distinct objids; equal to the row count when no objid is repeated.
np.unique(gaia_only['objid'].values).shape[0]

7231069