In [1]:
#import everything necessary
import requests
import numpy as np
import pandas as pd
import csv
import glob
import os
import sys
sys.path.append("../BerkeleySETI")
import data
from data import loaders
from bs4 import BeautifulSoup

In [2]:
#Mounting the data
data_dir = "/mnt/disks/lcs/"
data.mount_drive(data_dir)
data_dir = "/mnt/disks/lcs/tess-goddard-lcs/"

Disk mounted


**Loading and Featurizing the Data**

This notebook will be dedicated to loading in and featuring the data necessary for this project. The EBs will be taken from the Villanova TESS EB dataset, which consists of (hopefully) all of the EBs that are in the two minute cadence list, while the NonEBs are taken by a stratified random sampling method, in which the two minute cadence list in each sector is a strata. At the time of creation (4-15-22), there is 8574 EBs in the Villanova dataset, and I took a sample of 5200 NonEBs.

Using BeautifulSoup, I wrote a webscraper that would scrape the 47 pages on the Villanova TESS EBs dataset, and put each TICID into array x.

In [3]:
x = np.empty((4584), dtype=int)
index = 0
for pages in range(1,47):
    URL = f'http://tessebs.villanova.edu/?order_by=tic&page={pages}'
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, "html.parser")
    html = soup.find_all('a')
    for t in html:
        text = t.text.strip()
        text = text.strip('0')
        if (text.isnumeric()):
            x[index] = text
            index=index+1

Below, I printed out x as a reference.

In [None]:
x

array([     91961,     101462,     120016, ..., 1992266045, 2003333263,
       2046417955])

I used the list of TICIDs, and checked each of the two minute cadence lists in the 26 lookup tables for the TICID, and saved each row that contained the TICIDs to a csv file. I then combined each files into one file, and added a column with the label of EBs.

In [None]:
for sec in range (1,27):
    ref = pd.read_csv(data_dir+f"sector{sec}lookup.csv")
    tmcl = ["2_min_cadence" in fn for fn in ref.Filename]
    reftmcl = ref[tmcl]
    z = reftmcl.TIC_ID.isin(x)
    print(f"sector {sec} done")
    reftmcl[z].to_csv(f'../BerkeleySETI/inputdata/tmclbysector/sector{sec}tmclebs.csv', index = False)

In [None]:
files = os.path.join(f'../BerkeleySETI/inputdata/tmcl_ebs_by_sector', 'sector*tmclebs.csv')
files = glob.glob(files)
df = pd.concat(map(pd.read_csv, files), ignore_index=True)
df.to_csv(f'../BerkeleySETI/inputdata/allsectorstmclebs.csv', index = False)

In [None]:
df2 = pd.read_csv(f'../BerkeleySETI/inputdata/allsectorstmclebs.csv')
Label = ['EB'] * len(df)
df2['Label'] = Label
df2.to_csv(f'../BerkeleySETI/inputdata/allsectorstmclebs.csv', index = False)

After that, I went through the lookup tables of the 26 sectors we have access to in tess-goddard-lcs, and took a SRS of 200 two minute cadence list light curves in each sector that is NOT a light curve from the Villanova dataset. Each SRS of 200 LCs are saved to the folder named tmcl_nonebs_by_sector.

In [27]:
for sec in range (1,27):
    ref = pd.read_csv(data_dir+f"sector1lookup.csv")
    tmcl = ["2_min_cadence" in fn for fn in ref.Filename]
    reftmcl = ref[tmcl]
    z = ~reftmcl.TIC_ID.isin(x)
    nonebs = reftmcl[z].sample(n = 200)
    nonebs.to_csv(f'../BerkeleySETI/inputdata/tmcl_nonebs_by_sector/sector{sec}tmclnonebs.csv', index = False)

In [28]:
#Combined the individual files into allsectorstmclnonebs.csv
files = os.path.join(f'../BerkeleySETI/inputdata/tmcl_nonebs_by_sector', 'sector*tmclnonebs.csv')
files = glob.glob(files)
df3 = pd.concat(map(pd.read_csv, files), ignore_index=True)
df3.to_csv(f'../BerkeleySETI/inputdata/allsectorstmclnonebs.csv', index = False)

Below, I added a NonEBs label for each row in this dataset.

In [29]:
df4 = pd.read_csv(f'../BerkeleySETI/inputdata/allsectorstmclnonebs.csv')
Label = ['NonEB'] * len(df3)
df4['Label'] = Label
df4.to_csv(f'../BerkeleySETI/inputdata/allsectorstmclnonebs.csv', index = False)

Finally, I combined the EBs and NonEBs list to get my final input data.

In [30]:
files = os.path.join(f'../BerkeleySETI/inputdata', 'allsectorstmcl*.csv')
files = glob.glob(files)
df5 = pd.concat(map(pd.read_csv, files), ignore_index=True)
df5.to_csv(f'../BerkeleySETI/inputdata/fulldataset.csv', index = False)