## Code for Feature Extraction

After downloading the data, we read it in and pre-process it using *feets*, a better-maintained Python package based on FATS (FATS has some issues and does not support Python 3).

In [None]:
#Package imports
import feets
import itertools
import numpy as np 
import pandas as pd

In [None]:
#Read Data
import gzip
#Open the gzip-compressed photometry file in text mode ('rt') and load the
#whole decoded content into memory as a single string
with gzip.open('AllVar.phot.gz', mode='rt') as phot_file:
    temp = phot_file.read()

In [None]:
#Turn the data into floats (numeric)
#splitlines() never yields a phantom empty entry after a final newline, so no
#element has to be deleted by position. (Unconditionally deleting the last
#element dropped a real record whenever the file did not end with a newline.)
#Blank lines are skipped because float('') would raise ValueError.
temp2 = [line for line in temp.splitlines() if line.strip()]
#One list of floats per record; fields within a record are comma-separated
dataa = [[float(field) for field in line.split(',')] for line in temp2]

In [None]:
#Group the long data by stars
#The first field of every row is used as the star key. itertools.groupby only
#merges *consecutive* rows with equal keys, so a star whose rows are not
#contiguous in the file would be split into several groups (and features and
#labels would then contain duplicate stars). Collecting rows in a dict gives
#exactly one group per star regardless of row order.
#Dicts preserve insertion order (Python 3.7+), so stars stay in order of first
#appearance -- the same order groupby yields when rows are already contiguous.
groups = {}
for row in dataa:
    groups.setdefault(row[0], []).append(row)
#One list of rows per star
grouped_data = list(groups.values())

In [None]:
#Numpy array of the magnitude, time, and measurement error by star
#Preprocessed into format for use with FATS/feets
#If stars have different numbers of observations the per-star series are ragged,
#and np.array() on ragged nested sequences raises ValueError on NumPy >= 1.24
#(it only warned on earlier releases). An explicit object array with one row per
#star and one cell per series works for both equal and unequal lengths.
lc = np.empty((len(grouped_data), 3), dtype=object)
for i, a in enumerate(grouped_data):
    #Transpose the star's rows into columns once instead of once per series
    columns = list(zip(*a))
    #Row fields 2, 1, 3 are stored as magnitude, time, error (in that order)
    for j, col in enumerate((2, 1, 3)):
        lc[i, j] = np.array(columns[col], dtype=float)

In [None]:
#Extract features then save them as a numpy array -- takes ~16 hrs, output is uploaded in Github directly
fs = feets.FeatureSpace(data=['magnitude', 'time', 'error'])
#Run the extractor once per star, unpacking the star's three series as positional arguments
#NOTE(review): the series are passed positionally as (magnitude, time, error) --
#confirm this matches the parameter order of extract() in the installed feets version
per_star_results = [fs.extract(*a) for a in lc]
#Regroup the per-star results by position; element 1 of every result is kept
#(presumably extract() returns (feature names, feature values) -- verify)
results_by_position = list(zip(*per_star_results))
feature_vectors = np.array(results_by_position[1])
#Persist the result so the extraction does not have to be repeated
np.save("Features", feature_vectors)

## True label extraction

Get the true labels and reorder them to match the order of the extracted features.

In [None]:
#Read in table with true labels
#The separator is a regex (one or more whitespace characters). It is written as
#a raw string because '\s' in a normal string literal is an invalid escape
#sequence (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12).
tabb = pd.read_csv('CatalinaVars.tbl', sep=r'\s+')
star_pd = pd.DataFrame(tabb)
#Plain numpy array of the table values, one row per table row
star_class = np.array(star_pd)

In [None]:
#Get the starID and class, remove the rest
#Transpose the table rows into per-column tuples
star_class_columns = list(zip(*star_class))
#Keep column 1 (starID) and the second-to-last column (class) as a 2-row array
stard = np.array([star_class_columns[1], star_class_columns[-2]])

In [None]:
#Get the star labels of the features
#Every row in a group carries the same star key in field 0, so the first row's
#first field identifies the star; the order follows grouped_data
star = [star_rows[0][0] for star_rows in grouped_data]

In [None]:
#Create a dictionary to quickly allow for finding the star class by ID
#Save as file for quick access
#Row 0 of stard holds the IDs (keys), row 1 the classes (values)
mapping = dict(zip(stard[0], stard[1]))
#Look up the class of each star in the same order as the features;
#an ID that is absent from the table raises KeyError
classes = [mapping[star_id] for star_id in star]
np.save("true_labels", classes)

In [None]:
#Table the true classes; it is heavily imbalanced
from collections import Counter
#classes is already a list, so it can be counted directly; the Counter is the
#cell's last expression and is therefore displayed as its output
Counter(classes)

Counter({5: 5433,
         1: 30593,
         4: 2420,
         13: 512,
         6: 500,
         2: 4658,
         8: 1514,
         17: 153,
         10: 124,
         3: 279,
         14: 142,
         11: 242,
         16: 85,
         9: 62,
         7: 72,
         12: 7,
         15: 25})