# Clean the sample in the SMB_VCV file
Here we remove the Seyfert galaxies and classifications without a known source (no bibcode).
We import the Python packages first

In [1]:
%pylab

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


In [2]:
from astropy.table import Table, join

We read the original SMB_VCV file and the table with the Bibcodes for each otype. We rearrange some strings.

In [3]:
# Input catalogues: the otype information table and the VCV/SMB cross-match
bibcodes_path = '../Data/Raw/BibcodesOtypes_Dec_03_2020.csv'
matched_path = '../Data/Raw/SMB_VCV_Dec_03_2020.csv'

bibcodes_types = Table.read(bibcodes_path, format='ascii.csv')
matched = Table.read(matched_path, format='ascii.csv')

## Remove different redshifts
We remove bad matches by assuming that the difference between the redshifts in the two catalogs should not be higher than 0.01.

In [4]:
# Redshift disagreement between the two catalogues, row by row
max_redshift_diff = 0.01
diff_red = matched['rvz_redshift'] - matched['z']
abs_diff_red = abs(diff_red)
# Positional indices (np.where tuple) of rows whose redshifts disagree too much
bad_redshift = np.where(abs_diff_red > max_redshift_diff)

We plot the redshift distribution to justify this selection criterion.

In [5]:
# Two stacked panels sharing the redshift axis: histograms on top, and the
# redshift difference below, with the rejected rows highlighted in 'C3'.
rc_overrides = {'font.size': 12,
                'axes.labelsize': 12,
                'legend.fontsize': 10,
                'ytick.labelsize': 10,
                'xtick.labelsize': 10}
redshift_bins = np.linspace(0, 1, 51)
hist_options = dict(bins=redshift_bins, log=True, histtype='step', lw=2)

with plt.style.context('seaborn-paper'):
    plt.rcParams.update(rc_overrides)
    fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(6.75, 9.44),
                                   sharex=True,
                                   gridspec_kw={'height_ratios': [1, 3]})

    # Top panel: rejected rows first, then the whole matched sample
    ax1.hist(matched['rvz_redshift'][bad_redshift], color='C3', **hist_options)
    ax1.hist(matched['rvz_redshift'], color='C0', **hist_options)
    ax1.set_ylabel('# galaxies')

    # Bottom panel: every row, then the rejected rows drawn on top
    ax2.scatter(matched['rvz_redshift'], diff_red)
    ax2.scatter(matched['rvz_redshift'][bad_redshift],
                diff_red[bad_redshift], c='C3')
    ax2.set_xlim(0, 1)
    ax2.set_ylim(-0.05, 0.05)
    plt.subplots_adjust(wspace=0, hspace=0)
    ax2.set_xlabel('Redshift')
    ax2.set_ylabel('Difference between redshifts')

In [6]:
# Drop, in place, the rows whose redshifts disagree. `bad_redshift` holds positional
# indices, so re-running this cell without re-reading `matched` would remove other rows.
matched.remove_rows(bad_redshift)

## Separate origin of the bibcode
We separate the bibcode to simplify queries.

In [7]:
def separate_origin(table):
    """Add an ``otype_bibcode`` column derived from the ``origin`` column.

    Each row's ``origin`` value is mapped to a short label:

    * missing (masked) or empty string -> ``'UNK'``
    * contains ``'bibcode'``           -> the text after the last ``'bibcode='``
    * contains ``'from basic data'``   -> ``'CDS'``
    * contains ``'from id'``           -> ``'ID'``

    Parameters
    ----------
    table : table-like
        Object supporting ``len``, row iteration with ``row['origin']`` access
        and column assignment (e.g. an astropy ``Table``).

    Returns
    -------
    table
        The same object, modified in place, with the new ``otype_bibcode``
        column stored as fixed-width byte strings (``'S19'``).

    Raises
    ------
    NameError
        If an ``origin`` value matches none of the cases above.
    """
    bibcod = np.empty(len(table), dtype='S19')
    for j, jtex in enumerate(table):
        origin = jtex['origin']
        # Test for a missing value with np.ma.is_masked: plain strings have no
        # ``.mask`` attribute, so accessing it would raise AttributeError
        # before the unrecognised-origin error below could be reached.
        if np.ma.is_masked(origin) or origin == '':
            bibcod[j] = 'UNK'
        elif 'bibcode' in origin:
            bibcod[j] = origin.split('bibcode=')[-1]
        elif 'from basic data' in origin:
            bibcod[j] = 'CDS'
        elif 'from id' in origin:
            bibcod[j] = 'ID'
        else:
            # Report the offending row index before failing
            print(j)
            raise NameError('No origin??')
    table['otype_bibcode'] = bibcod
    return table

In [8]:
# Adds the 'otype_bibcode' column; separate_origin modifies and returns its argument,
# so `bib_ot` and `bibcodes_types` refer to the same table.
bib_ot = separate_origin(bibcodes_types)

The galaxies in the sample have different otypes; here we focus on the `main_id`. Then, we join the files to only have the information of the `main_id` for each galaxy.

In [9]:
# Join the otype/bibcode table with the cross-matched sample on 'main_id'
all_typ = join(bib_ot, matched, keys='main_id')
tot_galax = len(np.unique(all_typ['main_id']))
print('The total number of galaxies is %i' % tot_galax)

# Rows where both otype columns agree on a Seyfert label (Sy1, Sy2 or SyG)
loc_seyfert = np.logical_or.reduce([np.logical_and(all_typ['otype_txt_1'] == 'Sy1',
                                                   all_typ['otype_txt_2'] == 'Sy1'),
                                   np.logical_and(all_typ['otype_txt_1'] == 'Sy2',
                                                  all_typ['otype_txt_2'] == 'Sy2'),
                                   np.logical_and(all_typ['otype_txt_1'] == 'SyG',
                                                  all_typ['otype_txt_2'] == 'SyG')])

# Seyfert rows whose classification origin is 'UNK' or 'CDS', respectively
unk_typ = len(all_typ[np.logical_and(all_typ['otype_bibcode'] == 'UNK',
                                     loc_seyfert)])
cds_typ = len(all_typ[np.logical_and(all_typ['otype_bibcode'] == 'CDS',
                                     loc_seyfert)])
print('The total number unknown origin of Seyfert classifications in SMB is %i' % unk_typ)
# This count is for the 'CDS' origin; the previous label wrongly repeated "unknown origin"
print('The total number of Seyfert classifications in SMB coming from CDS basic data is %i' % cds_typ)
print('The fraction of unknown origin classification %.2f' %
      ((unk_typ+cds_typ)/tot_galax))

The total number of galaxies is 18923
The total number unknown origin of Seyfert classifications in SMB is 1026
The total number unknown origin of Seyfert classifications in SMB is 8468
The fraction of unknown origin classification 0.50


## Dealing with unknown bibcodes and Seyfert types

Some Seyfert sources are not classified and/or the source is unknown. If the source is unknown, but they have a classification in VCV we assume that the classification comes from VCV.

In [10]:
# Unknown-origin SyG rows whose VCV class ('Sp') is the matching 'S'
syg_vcv_terms = [all_typ['otype_bibcode'] == 'UNK',
                 all_typ['otype_txt_1'] == 'SyG',
                 all_typ['otype_txt_2'] == 'SyG',
                 all_typ['Sp'] == 'S']
UKSyGVCV = np.where(np.logical_and.reduce(syg_vcv_terms))

In [11]:
# Unknown-origin Sy1 rows whose VCV class ('Sp') is the matching 'S1'
sy1_vcv_terms = [all_typ['otype_bibcode'] == 'UNK',
                 all_typ['otype_txt_1'] == 'Sy1',
                 all_typ['otype_txt_2'] == 'Sy1',
                 all_typ['Sp'] == 'S1']
UKSy1VCV = np.where(np.logical_and.reduce(sy1_vcv_terms))

In [12]:
# Unknown-origin Sy2 rows whose VCV class ('Sp') is the matching 'S2'
sy2_vcv_terms = [all_typ['otype_bibcode'] == 'UNK',
                 all_typ['otype_txt_1'] == 'Sy2',
                 all_typ['otype_txt_2'] == 'Sy2',
                 all_typ['Sp'] == 'S2']
UKSy2VCV = np.where(np.logical_and.reduce(sy2_vcv_terms))

In [13]:
# Attribute the classification to VCV for every row selected above
for vcv_rows in (UKSyGVCV, UKSy1VCV, UKSy2VCV):
    all_typ['otype_bibcode'][vcv_rows] = 'VCV'

In [14]:
# Seyfert rows still flagged with an unknown origin after the VCV assignment
origin_still_unknown = all_typ['otype_bibcode'] == 'UNK'
seyfert_unk = np.logical_and(loc_seyfert, origin_still_unknown)
n_seyfert_unk = sum(seyfert_unk)
print('The total number unknown classifications in SMB at this point is %i' %
      n_seyfert_unk)

The total number unknown classifications in SMB at this point is 51


We remove the galaxies with a LINER classification in VCV and unknown source of the Seyfert classification in SMB.

In [15]:
# Seyfert rows of unknown origin that VCV lists with Sp == 'S3'
liner_unknown = np.logical_and.reduce([all_typ['otype_bibcode'] == 'UNK',
                                       all_typ['Sp'] == 'S3',
                                       loc_seyfert])
GalaxiesToRemove = all_typ[liner_unknown]['main_id'].data
print('The galaxies to remove are: ', GalaxiesToRemove, len(GalaxiesToRemove))

# Every row of `all_typ` belonging to one of those galaxies
LocGTR = [np.where(all_typ['main_id'] == i)[0] for i in GalaxiesToRemove]
# np.concatenate raises ValueError on an empty sequence, so only remove rows
# when at least one galaxy was selected.
if len(LocGTR) > 0:
    all_typ.remove_rows(np.concatenate(LocGTR))

The galaxies to remove are:  ['2MASX J15320963+5854187' 'Mrk  266NE'] 2


If galaxies have an unknown source but are still classified as a different Seyfert type in VCV, we reclassify them as an unclassified Seyfert (SyG) for further study.

In [16]:
# Row positions of the Sy2 and Sy1 classifications that still have an unknown origin
unk_sy2_rows = np.where(np.logical_and.reduce([all_typ['otype_bibcode'] == 'UNK',
                                               all_typ['otype_txt_1'] == 'Sy2',
                                               all_typ['otype_txt_2'] == 'Sy2']))[0]
unk_sy1_rows = np.where(np.logical_and.reduce([all_typ['otype_bibcode'] == 'UNK',
                                               all_typ['otype_txt_1'] == 'Sy1',
                                               all_typ['otype_txt_2'] == 'Sy1']))[0]

# Main identifiers of those rows, Sy2 first and Sy1 second
GalaxiesToStudy = np.concatenate((all_typ[unk_sy2_rows]['main_id'].data,
                                  all_typ[unk_sy1_rows]['main_id'].data))

In [17]:
# Display the identifiers selected for reclassification (the full array is printed)
GalaxiesToStudy

array(['2MASX J18311470-3336085', 'EGSIRAC J141515.60+520354.2',
       'ESO 323-77', 'MCG+03-60-031', 'NAME SMM J141741.90+522823.6',
       '[VV2006c] J125310.5-091024', '2E  2294', '2E  2628', '2E  3786',
       '2MASS J00423990+3017514', '2MASS J01341936+0146479',
       '2MASS J10102753+4132389', '2MASS J12002696+3317286',
       '2MASX J05064491-1011357', '2MASX J06374318-7538458',
       '2MASX J07185777+7059209', '2MASX J08420557+0759253',
       '2MASX J09443702-2633554', '2MASX J10155660-2002268',
       '2MASX J14555293-3548223', '2MASX J15085291+6814074',
       '2MASX J16383091-2055246', '2MASX J21033788-0455396',
       '2MASX J22024516-1304538', '2dFGRS TGN357Z241', '3C 286',
       '6dFGS gJ043944.9-454043', '6dFGS gJ084628.7-121409',
       '6dFGS gJ101329.7-283126', '7C 151247.00+370154.00', 'CTS   11',
       'HE 0226-4110', 'ICRF J025937.6+423549', 'ICRF J073352.5+502209',
       'ICRF J081100.6+571412', 'ICRF J100646.4-215920',
       'ICRF J110153.4+624150', 'ICRF

Then, we remove galaxies that are not useful.

In [18]:
# First row of `matched` for each galaxy to study; those rows get the generic 'SyG' otype
Reclass = [np.where(matched['main_id'] == gal)[0][0] for gal in GalaxiesToStudy]
matched['otype_txt'][Reclass] = 'SyG'

# Every row of `matched` belonging to a galaxy flagged for removal
LocGTR_Matched = [np.where(matched['main_id'] == i)[0] for i in GalaxiesToRemove]

# np.concatenate raises ValueError on an empty sequence, so only remove rows
# when at least one galaxy was flagged.
if len(LocGTR_Matched) > 0:
    matched.remove_rows(np.concatenate(LocGTR_Matched))

For the rest of this work we are going to assume that S1n are also S1 galaxies. Only three galaxies classified as S1n in VCV are classified as Sy2 in SIMBAD. We are going to include these three galaxies in the unclassified sample.

In [19]:
# Rows labelled 'S1n' in the 'Sp' column but 'Sy2' in 'otype_txt'
Reclass2 = np.where(np.logical_and(matched['Sp'] == 'S1n', matched['otype_txt'] == 'Sy2'))
print(matched[Reclass2]['main_id'])
# Those conflicting rows get the generic 'S' class
matched['Sp'][Reclass2] = 'S'
# Computed after the assignment above, so Reclass3 only holds the rows that are still 'S1n'
Reclass3 = np.where(matched['Sp'] == 'S1n')
matched['Sp'][Reclass3] = 'S1'

        main_id        
-----------------------
2MASX J10194946+3322041
2MASS J09455439+4238399
2MASX J23383708-0028105


We additionally save the information of the galaxies that were reclassified as S1 from S1n for further analysis (check the [A1_NarrowLineS1](A1_NarrowLineS1.ipynb) notebook).

In [20]:
# Save identifier and otype of the rows selected by Reclass3.
# overwrite=True lets the notebook be re-run when the output file already exists.
matched[Reclass3]['main_id', 'otype_txt'].write('../Data/Complementary/NLS1_reclass.txt',
                                                format='ascii', overwrite=True)



## Create the clean VCV SMB sample file
Here we organize and save the data of the sample that will be used later.

In [21]:
# Number of rows left in the cleaned sample
n_sample = len(matched)
print('We get in total %i galaxies in the sample' % n_sample)

We get in total 18921 galaxies in the sample


In [22]:
# Save the cleaned sample; overwrite=True lets the notebook be re-run when the file already exists
matched.write('../Data/Final/VCV_SMB_otype.txt', format='ascii', overwrite=True)



## Final numbers from the otypes
##### Part TABLE 3
We compute the final numbers about where the classifications come from. We take into account the reclassification that we applied before for specific galaxies.

In [23]:
# Boolean masks of the Sy1 / Sy2 rows whose origin is still unknown
unk_sy1_terms = [all_typ['otype_bibcode'] == 'UNK',
                 all_typ['otype_txt_1'] == 'Sy1',
                 all_typ['otype_txt_2'] == 'Sy1']
ReclassS1 = np.logical_and.reduce(unk_sy1_terms)

unk_sy2_terms = [all_typ['otype_bibcode'] == 'UNK',
                 all_typ['otype_txt_1'] == 'Sy2',
                 all_typ['otype_txt_2'] == 'Sy2']
ReclassS2 = np.logical_and.reduce(unk_sy2_terms)

Data from Seyfert 1 galaxies comes predominantly (95%) from 5 works

In [24]:
# Sy1 rows, minus the unknown-origin ones flagged in ReclassS1
sy1_rows = all_typ[np.logical_and(all_typ['otype_txt_1'] == 'Sy1',
                                  all_typ['otype_txt_2'] == 'Sy1')]
n_sy1 = len(sy1_rows)-sum(ReclassS1)
print('The total number of Seyfert 1 is:', n_sy1)

The total number of Seyfert 1 is: 13760


Here we show the bibcode for the first 5 contributions to the Seyfert 1 sample

In [25]:
# Sy1 rows excluding those flagged in ReclassS1, counted per bibcode
is_sy1 = np.logical_and(all_typ['otype_txt_1'] == 'Sy1',
                        all_typ['otype_txt_2'] == 'Sy1')
S1Otyp = np.logical_xor(is_sy1, ReclassS1)
sy1_frame = all_typ[S1Otyp].to_pandas()
GroupedS1 = sy1_frame.groupby('otype_bibcode').size()
GroupedS1.sort_values(ascending=False).head(5)

otype_bibcode
b'CDS'                    5566
b'2014ApJ...788...45T'    2784
b'2015ApJS..219....1O'    2001
b'2006ApJS..166..128Z'    1783
b'2017ApJS..229...39R'     988
dtype: int64

In [26]:
# Share of the Seyfert 1 total contributed by the five largest origins
top5_sy1 = sum(GroupedS1.sort_values(ascending=False).head(5))
total_sy1 = len(all_typ[np.logical_and(
    all_typ['otype_txt_1'] == 'Sy1', all_typ['otype_txt_2'] == 'Sy1')])-sum(ReclassS1)
print(top5_sy1 / total_sy1)

0.9536337209302326


Data from Seyfert 2 galaxies comes predominantly (97%) from 3 works

In [27]:
# Sy2 rows, minus the unknown-origin ones flagged in ReclassS2
sy2_rows = all_typ[np.logical_and(all_typ['otype_txt_1'] == 'Sy2',
                                  all_typ['otype_txt_2'] == 'Sy2')]
n_sy2 = len(sy2_rows)-sum(ReclassS2)
print('The total number of Seyfert 2 is:', n_sy2)

The total number of Seyfert 2 is: 5040


Here we show the bibcode for the first 3 contributions to the Seyfert 2 sample

In [28]:
# Sy2 rows excluding those flagged in ReclassS2, counted per bibcode
is_sy2 = np.logical_and(all_typ['otype_txt_1'] == 'Sy2',
                        all_typ['otype_txt_2'] == 'Sy2')
S2Otyp = np.logical_xor(is_sy2, ReclassS2)
sy2_frame = all_typ[S2Otyp].to_pandas()
GroupedS2 = sy2_frame.groupby('otype_bibcode').size()
GroupedS2.sort_values(ascending=False).head(3)

otype_bibcode
b'CDS'                    2845
b'2014ApJ...788...45T'    1239
b'VCV'                     787
dtype: int64

In [29]:
# Share of the Seyfert 2 total contributed by the three largest origins
top3_sy2 = sum(GroupedS2.sort_values(ascending=False).head(3))
total_sy2 = len(all_typ[np.logical_and(
    all_typ['otype_txt_1'] == 'Sy2', all_typ['otype_txt_2'] == 'Sy2')])-sum(ReclassS2)
print(top3_sy2 / total_sy2)

0.966468253968254


Finally, data from the unclassified Seyfert galaxies comes predominantly (92%) from 3 works

In [30]:
# SyG rows plus the unknown-origin Sy1/Sy2 rows flagged in ReclassS1/ReclassS2.
# The label previously said "Seyfert 2" (copy-paste slip); this is the unclassified total.
print('The total number of unclassified Seyfert (SyG) is:',
      len(all_typ[np.logical_and(all_typ['otype_txt_1'] == 'SyG',
                                  all_typ['otype_txt_2'] == 'SyG')]) +
      sum(np.logical_or(ReclassS1, ReclassS2)))

The total number of Seyfert 2 is: 121


We show the bibcode for the first 3 contributions to the unclassified Seyfert sample

In [31]:
# SyG rows combined (xor) with the rows flagged in ReclassS1/ReclassS2, counted per bibcode
is_syg = np.logical_and(all_typ['otype_txt_1'] == 'SyG',
                        all_typ['otype_txt_2'] == 'SyG')
flagged_unknown = np.logical_or(ReclassS1, ReclassS2)
SGOtyp = np.logical_xor(is_syg, flagged_unknown)
syg_frame = all_typ[SGOtyp].to_pandas()
GroupedSG = syg_frame.groupby('otype_bibcode').size()
GroupedSG.sort_values(ascending=False).head(3)

otype_bibcode
b'CDS'    57
b'UNK'    49
b'VCV'     5
dtype: int64

In [32]:
# Share of the unclassified-Seyfert total contributed by the three largest origins
top3_syg = sum(GroupedSG.sort_values(ascending=False).head(3))
total_syg = (len(all_typ[np.logical_and(all_typ['otype_txt_1'] == 'SyG',
                                        all_typ['otype_txt_2'] == 'SyG')]) +
             sum(np.logical_or(ReclassS1, ReclassS2)))
print(top3_syg / total_syg)

0.9173553719008265


We notice that almost 45% of the classifications come from the CDS basic data

In [33]:
# Number of 'CDS'-origin rows across the three Seyfert selections
n_cds_rows = (sum(all_typ[S1Otyp]['otype_bibcode'] == 'CDS') +
              sum(all_typ[S2Otyp]['otype_bibcode'] == 'CDS') +
              sum(all_typ[SGOtyp]['otype_bibcode'] == 'CDS'))

# Rows where both otype columns agree on any Seyfert label
any_seyfert = np.logical_or.reduce([np.logical_and(all_typ['otype_txt_1'] == 'Sy2',
                                                   all_typ['otype_txt_2'] == 'Sy2'),
                                    np.logical_and(all_typ['otype_txt_1'] == 'Sy1',
                                                   all_typ['otype_txt_2'] == 'Sy1'),
                                    np.logical_and(all_typ['otype_txt_1'] == 'SyG',
                                                   all_typ['otype_txt_2'] == 'SyG')])
print(n_cds_rows / len(all_typ[any_seyfert]))

0.44754505575815234


##### Notebook info

In [34]:
# Record the author and the Python / machine environment used to run the notebook
%load_ext watermark
%watermark -a "Andres Ramos" -d -v -m
print('Specific Python packages')
# Versions of the imported modules, of astropy and pandas, and of watermark itself
%watermark -iv -w --packages astropy,pandas

Author: Andres Ramos

Python implementation: CPython
Python version       : 3.8.3
IPython version      : 7.16.1

Compiler    : GCC 7.3.0
OS          : Linux
Release     : 3.10.0-1160.25.1.el7.x86_64
Machine     : x86_64
Processor   : x86_64
CPU cores   : 8
Architecture: 64bit

Specific Python packages
astropy: 4.2
pandas : 1.2.0

autopep8  : 1.5.7
matplotlib: 3.2.2
numpy     : 1.19.5
logging   : 0.5.1.2
re        : 2.2.1
json      : 2.0.9
sys       : 3.8.3 (default, Jul  2 2020, 16:21:59) 
[GCC 7.3.0]

Watermark: 2.1.0

