In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from astropy.io import fits
from astropy.table import Table
import glob
import time

In [2]:
# missing = pd.read_hdf('missing_scores.h5')

In [3]:
# missing.set_index('objid', inplace=True)

In [2]:
plx_df = pd.read_hdf('parallax_ps1_gaia_pm_plx.h5')
plx_stars = np.where(plx_df.pm_over_error < 9)

In [3]:
missing_files = glob.glob('../missing*miller.fit')
plx_objid = np.empty(0).astype(np.int64)
for mf in missing_files:
    tstart = time.time()
    tmp_tbl = fits.getdata(mf)
    missing_is_plx_star = np.isin(tmp_tbl.objid, plx_df.iloc[plx_stars].objid.values)
    
    plx_objid = np.append(plx_objid, tmp_tbl.objid[missing_is_plx_star])
    tend = time.time()
    print(mf, len(plx_objid), tend - tstart)

../missing_0_adamamiller.fit 341 1.9293696880340576
../missing_1_adamamiller.fit 733 1.9723451137542725
../missing_33_adamamiller.fit 1150 7.713251829147339
../missing_3_adamamiller.fit 1231 1.4695489406585693
../missing_20_1_adamamiller.fit 2263 4.929059267044067
../missing_36_adamamiller.fit 2747 5.581412076950073
../missing_5_1_adamamiller.fit 4531 4.190327882766724
../missing_25_1_adamamiller.fit 5170 4.980125904083252
../missing_15_1_adamamiller.fit 5686 4.115974187850952
../missing_10_1_adamamiller.fit 6979 4.12535285949707
../missing_4_adamamiller.fit 6997 1.601639986038208
../missing_15_0_adamamiller.fit 7744 5.090753078460693
../missing_10_0_adamamiller.fit 7982 3.728666067123413
../missing_30_adamamiller.fit 8619 5.88946008682251
../missing_2_adamamiller.fit 9390 1.8842689990997314
../missing_20_0_adamamiller.fit 9948 4.357120990753174
../missing_5_0_adamamiller.fit 10200 3.6611342430114746
../missing_25_0_adamamiller.fit 11319 5.33674430847168


In [4]:
plx_objid

array([151053144075784963, 152703159200180189, 152703159200180189, ...,
        99372771984543993,  99502777773549037,  99502777773549037])

In [5]:
new_plx_stars = pd.DataFrame(plx_objid, columns=['objid'])
new_plx_stars.to_hdf('plx_stars_in_ps1_missing.h5', 'd1')

## Find the stars selected via proper motion

In [6]:
pm_df = pd.read_hdf('pm_objid_stars.h5')

In [7]:
pm_objid = np.empty(0).astype(np.int64)
for mf in missing_files:
    tstart = time.time()
    tmp_tbl = fits.getdata(mf)
    missing_is_pm_star = np.isin(tmp_tbl.objid, pm_df.objid.values)
    
    pm_objid = np.append(pm_objid, tmp_tbl.objid[missing_is_pm_star])
    tend = time.time()
    print(mf, len(pm_objid), tend - tstart)

../missing_0_adamamiller.fit 428410 69.3012490272522
../missing_1_adamamiller.fit 729286 77.38436603546143
../missing_33_adamamiller.fit 2551081 80.14119505882263
../missing_3_adamamiller.fit 2620458 75.39411520957947
../missing_20_1_adamamiller.fit 4741414 87.74567699432373
../missing_36_adamamiller.fit 7338383 87.96328401565552
../missing_5_1_adamamiller.fit 7892732 85.62994003295898
../missing_25_1_adamamiller.fit 9335424 80.76299905776978
../missing_15_1_adamamiller.fit 11766650 80.5249719619751
../missing_10_1_adamamiller.fit 14209488 86.02674198150635
../missing_4_adamamiller.fit 14266583 70.81423997879028
../missing_15_0_adamamiller.fit 16561224 76.35738325119019
../missing_10_0_adamamiller.fit 17220628 79.05603790283203
../missing_30_adamamiller.fit 18202849 85.74463605880737
../missing_2_adamamiller.fit 18403109 65.12621307373047
../missing_20_0_adamamiller.fit 20736670 79.89972186088562
../missing_5_0_adamamiller.fit 21038631 91.3676369190216
../missing_25_0_adamamiller.fit 2

In [8]:
new_pm_stars = pd.DataFrame(pm_objid, columns=['objid'])
new_pm_stars.to_hdf('pm_stars_in_ps1_missing.h5', 'd1')

In [17]:
star_objid = np.append(pm_objid, plx_objid)
new_stars = pd.DataFrame(star_objid, columns=['objid'])
new_stars.to_hdf('stars_in_ps1_missing.h5', 'd1')

In [18]:
len(star_objid)

22383082

### Read in RF classifications and replace Gaia stars with score = 1

In [10]:
rf_files = glob.glob('../update_*.csv')
for rff in rf_files:
    tstart = time.time()
    rf_df = pd.read_csv(rff)
    rf_is_gaia_star = np.isin(rf_df.objid.values, star_objid)
    rf_df.iloc[rf_is_gaia_star].score = 1
    rf_df.to_csv(rff.replace('update', 'gaia_update'), index=False)
    star_objid = star_objid[~np.isin(star_objid, rf_df.iloc[rf_is_gaia_star].objid.values)]
    tend = time.time()
    print(rff, len(star_objid), tend-tstart)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


../update_12.csv 22011414 18.837344884872437
../update_13.csv 21516959 18.880218982696533
../update_11.csv 21088741 18.90144395828247
../update_10.csv 20289499 19.158812046051025
../update_14.csv 19625407 18.971892833709717
../update_15.csv 18984128 18.819652795791626
../update_17.csv 18237431 20.251355171203613
../update_16.csv 17130853 18.304237127304077
../update_4.csv 15907419 18.17227602005005
../update_5.csv 14958417 17.44241213798523
../update_7.csv 13884378 17.148157119750977
../update_6.csv 12724183 17.26037096977234
../update_2.csv 11826881 16.77374005317688
../update_3.csv 10625402 17.084017992019653
../update_1.csv 9226882 16.99544405937195
../update_0.csv 8537651 16.533631801605225
../update_8.csv 7865228 16.42284893989563
../update_9.csv 7397493 16.004412174224854


In [11]:
len(star_objid)

7397493

In [13]:
gaia_only = pd.DataFrame(star_objid, columns=['objid'])

In [14]:
gaia_only['score'] = np.ones(len(star_objid)).astype(float)

In [15]:
gaia_only.head()

Unnamed: 0,objid,score
0,150503148357876729,1.0
1,150433149582865459,1.0
2,150373149437948650,1.0
3,150373149741710201,1.0
4,150373150946136004,1.0


In [16]:
gaia_only.to_csv('../gaia_only_update.csv', index=False)