# Clean the sample in the SMB_VCV file
Here we remove the Seyfert galaxies and classifications without a known source (No Bibcode)

We import the Python packages first

In [1]:
%pylab

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


In [2]:
from astropy.table import Table, join
from astropy.table import unique as tunique

We read the original SMB_VCV file and the table with the Bibcodes for each otype. We rearrange some strings.

In [4]:
# File containing all the otype information; it carries the 'main_id',
# 'otype_txt' and 'origin' columns that are used in the cells below
BibOT = Table.read('../Data/Raw/BibcodesOtypes.fits', format='fits')
# File with the cross-match between VCV and SMB; it is joined with BibOT
# on 'main_id' later in the notebook
Matched = Table.read('../Data/Raw/SMB_VCV.fits', format='fits')

In [5]:
# Strip trailing whitespace from every string column used for matching or
# comparison below, so that equality tests against literals work as expected.
for _table, _column_names in ((Matched, ('main_id', 'Sp')),
                              (BibOT, ('main_id', 'otype_txt', 'origin'))):
    for _name in _column_names:
        _table[_name] = [_value.rstrip() for _value in _table[_name]]

## Remove different redshifts
We remove galaxies for which the two redshifts in the cross-match differ by more than 0.01 (in absolute value).

In [6]:
# Offset between the two redshift columns of the cross-matched table
Matched['DiffRed'] = Matched['rvz_redshift'] - Matched['z']
# Rows whose two redshifts disagree by more than 0.01 in absolute value
# are dropped from the sample.
Badz = np.where(np.abs(Matched['DiffRed']) > 0.01)
Matched.remove_rows(Badz)

## Separate origin of the bibcode
We separate the bibcode from the origin string to simplify queries.

In [7]:
def SeparateOrigin(Table):
    """Add an ``otype_bibcode`` column derived from each row's ``origin`` text.

    For every row the ``origin`` string is mapped to a short label:

    * contains ``'bibcode'``         -> the text after the last ``'bibcode='``
    * contains ``'from basic data'`` -> ``'CDS'``
    * contains ``'from id'``         -> ``'ID'``
    * empty string                   -> ``'UNK'``

    The checks are applied in that order. The labels are stored as
    fixed-width byte strings (``'S19'``), so longer text is truncated.

    Parameters
    ----------
    Table : table-like
        Object that yields rows indexable by ``'origin'`` and accepts
        assignment of a new ``'otype_bibcode'`` column. It is modified
        in place.

    Returns
    -------
    The same object that was passed in, with the new column attached.

    Raises
    ------
    NameError
        If a row's ``origin`` matches none of the cases above.
    """
    labels = []
    for row in Table:
        origin = row['origin']
        if 'bibcode' in origin:
            label = origin.split('bibcode=')[-1]
        elif 'from basic data' in origin:
            label = 'CDS'
        elif 'from id' in origin:
            label = 'ID'
        elif origin == '':
            label = 'UNK'
        else:
            raise NameError('No origin??')
        labels.append(label)
    # Build the fixed-width byte-string column in one go
    Table['otype_bibcode'] = np.array(labels, dtype='S19')
    return Table

In [8]:
# SeparateOrigin adds the 'otype_bibcode' column to the table it receives
# and returns that same table.
BibOT = SeparateOrigin(BibOT)

The galaxies in the sample have different otypes; here we focus on the `main_id`. We then join the files so that we only keep the information of the `main_id` for each galaxy.

In [10]:
# Join on 'main_id'. Both inputs carry an 'otype_txt' column, so the joined
# table exposes them as 'otype_txt_1' (from BibOT) and 'otype_txt_2' (from Matched).
AllTypes = join(BibOT, Matched, keys='main_id')
_n_galaxies = len(np.unique(AllTypes['main_id']))
_n_unknown = np.sum(AllTypes['otype_bibcode'] == 'UNK')
print('The total number of galaxies is %i' % _n_galaxies)
print('The total number unknown classifications in SMB is %i' % _n_unknown)

The total number of galaxies is 18923
The total number unknown classifications in SMB is 3441


## Dealing with unknown bibcodes and Seyfert types

Some Seyfert sources are not classified and/or the source is unknown. If the source is unknown, but they have a classification in VCV we assume that the classification comes from VCV. 

In [11]:
# Rows with an unknown source whose two otype columns are both 'SyG'
# and whose VCV 'Sp' value is 'S'.
_syg_conditions = (
    AllTypes['otype_bibcode'] == 'UNK',
    AllTypes['otype_txt_1'] == 'SyG',
    AllTypes['otype_txt_2'] == 'SyG',
    AllTypes['Sp'] == 'S',
)
UKSyGVCV = np.where(np.logical_and.reduce(_syg_conditions))

In [12]:
# Rows with an unknown source whose two otype columns are both 'Sy1'
# and whose VCV 'Sp' value is 'S1'.
_sy1_conditions = (
    AllTypes['otype_bibcode'] == 'UNK',
    AllTypes['otype_txt_1'] == 'Sy1',
    AllTypes['otype_txt_2'] == 'Sy1',
    AllTypes['Sp'] == 'S1',
)
UKSy1VCV = np.where(np.logical_and.reduce(_sy1_conditions))

In [13]:
# Rows with an unknown source whose two otype columns are both 'Sy2'
# and whose VCV 'Sp' value is 'S2'.
_sy2_conditions = (
    AllTypes['otype_bibcode'] == 'UNK',
    AllTypes['otype_txt_1'] == 'Sy2',
    AllTypes['otype_txt_2'] == 'Sy2',
    AllTypes['Sp'] == 'S2',
)
UKSy2VCV = np.where(np.logical_and.reduce(_sy2_conditions))

In [14]:
# Where SMB and VCV agree on the Seyfert type but the source is unknown,
# the classification is attributed to VCV.
for _agreeing_rows in (UKSyGVCV, UKSy1VCV, UKSy2VCV):
    AllTypes['otype_bibcode'][_agreeing_rows] = 'VCV'

In [15]:
# Count the rows that are still labelled as having an unknown source
_n_unknown_left = np.sum(AllTypes['otype_bibcode'] == 'UNK')
print('The total number unknown classifications in SMB at this point is %i' % _n_unknown_left)

The total number unknown classifications in SMB at this point is 2466


We remove the galaxies with a LINER classification in VCV and unknown source of the Seyfert classification in SMB.

In [16]:
# Rows whose 'otype_txt_1' is any of the three Seyfert labels
LocSeyfert = np.logical_or.reduce([AllTypes['otype_txt_1'] == 'Sy1',
                                   AllTypes['otype_txt_1'] == 'Sy2',
                                   AllTypes['otype_txt_1'] == 'SyG'])
# Names of Seyfert rows with an unknown source that VCV lists as 'S3'
GalaxiesToRemove = AllTypes[np.logical_and.reduce([AllTypes['otype_bibcode'] == 'UNK',
                                                   AllTypes['Sp'] == 'S3',
                                                   LocSeyfert])]['main_id'].data
print('The galaxies to remove are: ', GalaxiesToRemove)
# Every row of AllTypes that shares a main_id with a flagged galaxy is dropped
LocGTR = [np.where(AllTypes['main_id'] == i)[0] for i in GalaxiesToRemove]
# np.concatenate raises ValueError on an empty sequence, so only remove rows
# when at least one galaxy was flagged.
if len(LocGTR) > 0:
    AllTypes.remove_rows(np.concatenate(LocGTR))

The galaxies to remove are:  ['2MASX J15320963+5854187' 'Mrk  266NE']


If galaxies are classified as Sy1 or Sy2 in SMB but the source of that classification is unknown, we move them from their SMB classification to the unclassified Seyfert type (SyG) for further study.

In [17]:
# Row indices of unknown-source classifications where both otype columns
# agree on Sy2 and on Sy1, respectively.
_unknown_sy2_rows = np.where(np.logical_and.reduce([AllTypes['otype_bibcode'] == 'UNK',
                                                    AllTypes['otype_txt_1'] == 'Sy2',
                                                    AllTypes['otype_txt_2'] == 'Sy2']))[0]
_unknown_sy1_rows = np.where(np.logical_and.reduce([AllTypes['otype_bibcode'] == 'UNK',
                                                    AllTypes['otype_txt_1'] == 'Sy1',
                                                    AllTypes['otype_txt_2'] == 'Sy1']))[0]
# Names of those galaxies: the Sy2 group first, followed by the Sy1 group
GalaxiesToStudy = np.concatenate((AllTypes[_unknown_sy2_rows]['main_id'].data,
                                  AllTypes[_unknown_sy1_rows]['main_id'].data))

In [18]:
# Display the names of the galaxies selected for further study
GalaxiesToStudy

masked_array(data=['2MASX J18311470-3336085',
                   'EGSIRAC J141515.60+520354.2', 'ESO 323-77',
                   'MCG+03-60-031', 'NAME SMM J141741.90+522823.6',
                   '[VV2006c] J125310.5-091024', '2E  2294', '2E  2628',
                   '2E  3786', '2MASS J00423990+3017514',
                   '2MASS J01341936+0146479', '2MASS J10102753+4132389',
                   '2MASS J12002696+3317286', '2MASX J05064491-1011357',
                   '2MASX J06374318-7538458', '2MASX J07185777+7059209',
                   '2MASX J08420557+0759253', '2MASX J09443702-2633554',
                   '2MASX J10155660-2002268', '2MASX J14555293-3548223',
                   '2MASX J15085291+6814074', '2MASX J16383091-2055246',
                   '2MASX J21033788-0455396', '2MASX J22024516-1304538',
                   '2dFGRS TGN357Z241', '3C 286',
                   '6dFGS gJ043944.9-454043', '6dFGS gJ084628.7-121409',
                   '6dFGS gJ101329.7-283126', '7C 151247.

We re-classify some galaxies from Sy1 or Sy2 to SyG, to study them later on. Then, we remove the galaxies that are not useful. Finally, we create a flag for the type of emission.

In [19]:
# Index in Matched of the first row matching each galaxy selected for
# further study; those rows are relabelled as unclassified Seyferts ('SyG').
Reclass = [np.where(Matched['main_id'] == gal)[0][0] for gal in GalaxiesToStudy]
Matched['otype_txt'][Reclass] = 'SyG'

# Rows of Matched belonging to the galaxies that were removed from AllTypes
LocGTR_Matched = [np.where(Matched['main_id'] == i)[0] for i in GalaxiesToRemove]

# np.concatenate raises ValueError on an empty sequence, so only remove rows
# when at least one galaxy was flagged for removal.
if len(LocGTR_Matched) > 0:
    Matched.remove_rows(np.concatenate(LocGTR_Matched))

For the rest of this work we are going to assume that S1n are also S1 galaxies. Only three galaxies classified as S1n in VCV are classified as Sy2 in SIMBAD. We are going to include these three galaxies in the unclassified sample.

In [20]:
# S1n objects (VCV 'Sp') that carry an 'Sy2' otype: show them, then
# relabel their 'Sp' as 'S'.
_vcv_s1n = Matched['Sp'] == 'S1n'
_otype_sy2 = Matched['otype_txt'] == 'Sy2'
Reclass2 = np.where(np.logical_and(_vcv_s1n, _otype_sy2))
print(Matched[Reclass2]['main_id'])
Matched['Sp'][Reclass2] = 'S'
# Recomputed after the relabelling above, so only the remaining S1n
# objects are turned into S1.
Reclass3 = np.where(Matched['Sp'] == 'S1n')
Matched['Sp'][Reclass3] = 'S1'

        main_id        
-----------------------
2MASX J10194946+3322041
2MASS J09455439+4238399
2MASX J23383708-0028105


We additionally save the information of the galaxies that were reclassified from S1n to S1 for further analysis (see the [A1_NarrowLineS1](A1_NarrowLineS1.ipynb) notebook).

In [21]:
# Save the name and otype of the galaxies relabelled from S1n to S1.
# overwrite=True makes the replacement of an existing file explicit, so the
# cell can be re-run without relying on implicit overwriting of ASCII files.
Matched[Reclass3]['main_id', 'otype_txt'].write('../Data/Complementary/NLS1_reclass.txt',
                                                format='ascii', overwrite=True)

## Create the clean VCV SMB sample file
Here we organize and save the data of the sample that will be used later.

In [21]:
# Number of rows left in Matched after the removals above
print('We get in total %i galaxies in the sample'%len(Matched))

We get in total 18921 galaxies in the sample


In [18]:
# Save the cleaned sample. overwrite=True makes the replacement of an
# existing file explicit, so the notebook can be re-run from a fresh kernel.
Matched.write('../Data/Final/VCV_SMB_otype.txt', format='ascii', overwrite=True)



## Final numbers from the otypes
##### Part TABLE 3
We compute the final numbers describing where the classifications come from. We take into account the reclassification that we applied before to specific galaxies.

In [22]:
# Boolean masks of the unknown-source rows whose two otype columns agree on
# Sy1 (ReclassS1) or Sy2 (ReclassS2); these are the rows that were moved to
# the unclassified Seyfert group.
_unknown_source = AllTypes['otype_bibcode'] == 'UNK'
ReclassS1, ReclassS2 = (
    np.logical_and.reduce([_unknown_source,
                           AllTypes['otype_txt_1'] == _otype,
                           AllTypes['otype_txt_2'] == _otype])
    for _otype in ('Sy1', 'Sy2')
)

Data from Seyfert 1 galaxies comes predominantly (95%) from 5 works 

In [23]:
# Rows labelled Sy1 in both otype columns, minus the reclassified ones
_both_sy1 = np.logical_and(AllTypes['otype_txt_1'] == 'Sy1',
                           AllTypes['otype_txt_2'] == 'Sy1')
_n_sy1 = len(AllTypes[_both_sy1]) - np.sum(ReclassS1)
print('The total number of Seyfert 1 is:', _n_sy1)

The total number of Seyfert 1 is: 13760


Here we show the bibcode for the first 5 contributions to the Seyfert 1 sample

In [25]:
# ReclassS1 is a subset of the Sy1 rows, so the XOR keeps the Sy1 rows
# that were not reclassified.
_both_sy1 = np.logical_and(AllTypes['otype_txt_1'] == 'Sy1',
                           AllTypes['otype_txt_2'] == 'Sy1')
S1Otyp = np.logical_xor(_both_sy1, ReclassS1)
# Number of Sy1 classifications per source, largest first
_sy1_frame = AllTypes[S1Otyp].to_pandas()
GroupedS1 = _sy1_frame.groupby('otype_bibcode').size()
GroupedS1.sort_values(ascending=False).head(5)

otype_bibcode
b'CDS'                    5566
b'2014ApJ...788...45T'    2784
b'2015ApJS..219....1O'    2001
b'2006ApJS..166..128Z'    1783
b'2017ApJS..229...39R'     988
dtype: int64

In [26]:
# Fractional contribution from those 5 works
(sum(GroupedS1.sort_values(ascending=False).head(5))) / (len(AllTypes[np.logical_and(
    AllTypes['otype_txt_1'] == 'Sy1', AllTypes['otype_txt_2'] == 'Sy1')])-sum(ReclassS1))

0.9536337209302326

Data from Seyfert 2 galaxies comes predominantly (97%) from 3 works

In [27]:
# Rows labelled Sy2 in both otype columns, minus the reclassified ones
_both_sy2 = np.logical_and(AllTypes['otype_txt_1'] == 'Sy2',
                           AllTypes['otype_txt_2'] == 'Sy2')
_n_sy2 = len(AllTypes[_both_sy2]) - np.sum(ReclassS2)
print('The total number of Seyfert 2 is:', _n_sy2)

The total number of Seyfert 2 is: 5040


Here we show the bibcode for the first 3 contributions to the Seyfert 2 sample

In [28]:
# ReclassS2 is a subset of the Sy2 rows, so the XOR keeps the Sy2 rows
# that were not reclassified.
_both_sy2 = np.logical_and(AllTypes['otype_txt_1'] == 'Sy2',
                           AllTypes['otype_txt_2'] == 'Sy2')
S2Otyp = np.logical_xor(_both_sy2, ReclassS2)
# Number of Sy2 classifications per source, largest first
_sy2_frame = AllTypes[S2Otyp].to_pandas()
GroupedS2 = _sy2_frame.groupby('otype_bibcode').size()
GroupedS2.sort_values(ascending=False).head(3)

otype_bibcode
b'CDS'                    2845
b'2014ApJ...788...45T'    1239
b'VCV'                     787
dtype: int64

In [29]:
# Fractional contribution from those 3 works
(sum(GroupedS2.sort_values(ascending=False).head(3))) / (len(AllTypes[np.logical_and(
    AllTypes['otype_txt_1'] == 'Sy2', AllTypes['otype_txt_2'] == 'Sy2')])-sum(ReclassS2))

0.966468253968254

Finally, data from the unclassified Seyfert galaxies comes predominantly (92%) from 3 works

In [30]:
# Unclassified Seyferts: rows labelled SyG in both otype columns plus the
# Sy1/Sy2 rows that were reclassified because their source is unknown.
# The label previously read "Seyfert 2", which did not match the quantity.
print('The total number of unclassified Seyfert is:',
      len(AllTypes[np.logical_and(AllTypes['otype_txt_1'] == 'SyG',
                                  AllTypes['otype_txt_2'] == 'SyG')])+sum(np.logical_or(ReclassS1, ReclassS2)))

The total number of Seyfert 2 is: 121


We show the bibcode for the first 3 contributions to the unclassified Seyfert sample

In [31]:
# The SyG rows and the reclassified Sy1/Sy2 rows do not overlap, so the XOR
# selects the rows belonging to either group.
_both_syg = np.logical_and(AllTypes['otype_txt_1'] == 'SyG',
                           AllTypes['otype_txt_2'] == 'SyG')
_reclassified = np.logical_or(ReclassS1, ReclassS2)
SGOtyp = np.logical_xor(_both_syg, _reclassified)
# Number of unclassified-Seyfert classifications per source, largest first
_syg_frame = AllTypes[SGOtyp].to_pandas()
GroupedSG = _syg_frame.groupby('otype_bibcode').size()
GroupedSG.sort_values(ascending=False).head(3)

otype_bibcode
b'CDS'    57
b'UNK'    49
b'VCV'     5
dtype: int64

In [32]:
# Fractional contribution from those 3 works
(sum(GroupedSG.sort_values(ascending=False).head(3))) / (len(AllTypes[np.logical_and(
    AllTypes['otype_txt_1'] == 'SyG', AllTypes['otype_txt_2'] == 'SyG')])+sum(np.logical_or(ReclassS1, ReclassS2)))

0.9173553719008265

We notice that almost 45% of the classifications come from the CDS basic data

In [33]:
# Classifications attributed to the CDS basic data in each Seyfert group
_cds_sy1 = np.sum(AllTypes[S1Otyp]['otype_bibcode'] == 'CDS')
_cds_sy2 = np.sum(AllTypes[S2Otyp]['otype_bibcode'] == 'CDS')
_cds_syg = np.sum(AllTypes[SGOtyp]['otype_bibcode'] == 'CDS')

# Rows whose two otype columns agree on one of the three Seyfert labels
_agreeing = [np.logical_and(AllTypes['otype_txt_1'] == _otype,
                            AllTypes['otype_txt_2'] == _otype)
             for _otype in ('Sy2', 'Sy1', 'SyG')]
_n_seyfert = len(AllTypes[np.logical_or.reduce(_agreeing)])

(_cds_sy1 + _cds_sy2 + _cds_syg) / _n_seyfert

0.44754505575815234

##### Notebook info

In [30]:
# Record the author, date, Python/IPython versions and machine details
# (see the output below for what each call reports).
%load_ext watermark
%watermark -a "Andres Ramos" -d -v -m
print('Specific Python packages')
# Versions of the imported modules, of astropy and pandas, and of watermark itself
%watermark -iv -w --packages astropy,pandas

Author: Andres Ramos

Python implementation: CPython
Python version       : 3.8.3
IPython version      : 7.16.1

Compiler    : GCC 7.3.0
OS          : Linux
Release     : 3.10.0-1160.el7.x86_64
Machine     : x86_64
Processor   : x86_64
CPU cores   : 8
Architecture: 64bit

Specific Python packages
astropy: 4.2
pandas : 1.2.0

re        : 2.2.1
matplotlib: 3.2.2
json      : 2.0.9
autopep8  : 1.5.4
numpy     : 1.19.5
sys       : 3.8.3 (default, Jul  2 2020, 16:21:59) 
[GCC 7.3.0]
logging   : 0.5.1.2

Watermark: 2.1.0

