# AGN Type I Catalogue

**A Comprehensive and Uniform Sample of Broad-line Active Galactic Nuclei from the SDSS DR7**

https://iopscience.iop.org/article/10.3847/1538-4365/ab298b

In [1]:
from astropy.io import fits
from astropy.table import Table
from matplotlib import pyplot as plt
from astroML.datasets import fetch_sdss_spectrum
import multiprocessing
import pandas as pd
from functools import partial
import numpy as np
import glob
import re
import wget

# Read lists

In [2]:
# Load every target list from its CSV file. The first row of each file is the
# header, and lines beginning with '#' are skipped as comments.
galaxy, QSO, sy19, sy20 = [
    pd.read_csv(csv_path, header=0, comment='#')
    for csv_path in ('galaxy.csv', 'QSO.csv', 'sy19.csv', 'sy20.csv')
]

# Duplication-filtered versions of the sy20 and galaxy lists.
sy20d, galaxyd = [
    pd.read_csv(csv_path, header=0, comment='#')
    for csv_path in ('sy20d.csv', 'galaxyd.csv')
]


# Define multiprocessing functions

In [3]:
def parallelize_dataframe(df,func,data_home='./',number=8):
    """Apply ``func`` to row-wise partitions of ``df`` in parallel processes.

    The frame is cut into contiguous row blocks and each block is handed to a
    worker process, which calls ``func(data_home, block)``. Return values of
    ``func`` are discarded; it is run for its side effects only.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to process.
    func : callable
        Picklable callable taking ``(data_home, dataframe_block)``.
    data_home : str, optional
        Passed unchanged as the first argument of ``func``.
    number : int, optional
        Requested number of partitions / worker processes. It is capped at the
        number of rows so that no worker is started just to receive an empty
        block (an empty ``df`` still yields one call with an empty block).

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``number`` is smaller than 1.
    """
    num_partitions = int(number)
    if num_partitions < 1:
        raise ValueError("number must be a positive integer, got %r" % (number,))
    # Never start more workers than there are rows to hand out.
    num_partitions = max(1, min(num_partitions, len(df)))

    # Split row *positions* and slice with .iloc instead of passing the frame
    # to np.array_split: the latter goes through DataFrame.swapaxes, which
    # recent pandas releases deprecate. The resulting blocks are the same.
    positions = np.array_split(np.arange(len(df)), num_partitions)
    df_split = [df.iloc[block] for block in positions]

    f = partial(func, data_home)
    # The context manager terminates the workers if pool.map raises, so a
    # failing partition cannot leave orphaned processes behind.
    with multiprocessing.Pool(num_partitions) as pool:
        pool.map(f, df_split)
        pool.close()
        pool.join()
    return

def download(data_home,df):
    """Download the SDSS DR16 spectrum FITS file for every row of ``df``.

    For each row the file name ``spec-PLATE-MJD-FIBERID.fits`` is built from
    the zero-padded ``PLATE`` (4 digits), ``MJD`` (5 digits) and ``FIBERID``
    (4 digits) columns and fetched with ``wget.download`` into ``data_home``.
    Rows that cannot be fetched are reported on stdout and skipped, so one
    failure does not abort the rest of the batch.

    Parameters
    ----------
    data_home : str
        Directory that receives the files; created if it does not exist.
    df : pandas.DataFrame
        Must provide the columns ``PLATE``, ``MJD`` and ``FIBERID``.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    import os  # local import so the function does not depend on the imports cell

    # NOTE(review): wget.download appears to store into `out` only when it is
    # an existing directory and otherwise treats it as a file name - confirm.
    # Creating the directory up front avoids that; exist_ok makes it safe when
    # several worker processes get here at the same time.
    os.makedirs(data_home, exist_ok=True)

    rootURL = 'https://data.sdss.org/sas/dr16/sdss/spectro/redux/26/spectra/'
    for _, row in df.iterrows():
        raw_plate, raw_mjd, raw_fiber = row['PLATE'], row['MJD'], row['FIBERID']
        try:
            # iterrows() upcasts integers to float when the frame also holds
            # float columns; int() keeps '2516.0' out of the file name.
            plate = str(int(raw_plate)).zfill(4)
            mjd = str(int(raw_mjd)).zfill(5)
            fiberID = str(int(raw_fiber)).zfill(4)
            filename = 'spec-'+plate+'-'+mjd+'-'+fiberID+'.fits'
            # Skip spectra that are already on disk so re-running the cell
            # does not fetch them again.
            if os.path.exists(os.path.join(data_home, filename)):
                continue
            wget.download(rootURL+plate+'/'+filename,data_home)
        except Exception as exc:
            # Catch Exception rather than everything so Ctrl-C still stops
            # the worker, and show why the row failed.
            print("!!!!! Failed:",raw_plate, raw_mjd, raw_fiber, repr(exc))

# Download galaxy Type I

In [4]:
# Fetch the spectra listed in `galaxy` into ./DR16/galaxy (16 partitions requested).
parallelize_dataframe(
    df=galaxy,
    func=download,
    data_home='./DR16/galaxy',
    number=16,
)

!!!!! Failed: 2516 54240 32
!!!!! Failed: 1631 54468 199
!!!!! Failed: 1631 54468 384


# Download galaxy Type I (duplication filtered)

In [5]:
# Fetch the spectra listed in `galaxyd` into ./DR16/galaxyd (16 partitions requested).
parallelize_dataframe(
    df=galaxyd,
    func=download,
    data_home='./DR16/galaxyd',
    number=16,
)

!!!!! Failed: 2516 54240 32
!!!!! Failed: 1631 54468 199
!!!!! Failed: 1631 54468 384


# Download sy20 (duplication filtered)

In [6]:
# Fetch the spectra listed in `sy20d` into ./DR16/sy20d (16 partitions requested).
parallelize_dataframe(
    df=sy20d,
    func=download,
    data_home='./DR16/sy20d',
    number=16,
)

# Download QSO Type I

In [7]:
# parallelize_dataframe(QSO,download,data_home='./DR16/QSO',number=16)

# Download sy19

In [8]:
# parallelize_dataframe(sy19,download,data_home='./DR16/sy19',number=16)

# Notes

In [9]:
## Linux: monitor per-process network traffic with `nethogs`
## Count the downloaded files in a directory: ls sy19 | wc -l