<a href="https://colab.research.google.com/github/arpdm/apcs/blob/main/apcs_main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import glob
import hashlib
import os
import pathlib
import re
import tarfile
import urllib.parse
import urllib.request

import pandas as pd

In [120]:
# Mount Google Drive into the Colab runtime at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [121]:
# Project root on the mounted Drive and, relative to it, the folder that
# holds the downloaded SMASS files
core_path = "/content/drive/MyDrive/Sector_002/APCS"
data_path = "data/smass_data/"

# Make sure the download folder exists (missing parent folders are created as well)
smass_data_dir = pathlib.Path(core_path, data_path)
smass_data_dir.mkdir(parents=True, exist_ok=True)

In [122]:
# Files to download: source URL plus the SHA-256 hex digest recorded for each file
smass_data_files = {
    "file1": dict(
        url="http://smass.mit.edu/data/smass/Bus.Taxonomy.txt",
        sha256="0ce970a6972dd7c49d512848b9736d00b621c9d6395a035bd1b4f3780d4b56c6",
    ),
    "file2": dict(
        url="http://smass.mit.edu/data/smass/smass2data.tar.gz",
        sha256="dacf575eb1403c08bdfbffcd5dbfe12503a588e09b04ed19cc4572584a57fa97",
    ),
}

In [123]:
def downloadDataFiles(file_dictionary, target_dir=None):
    """Download every file listed in ``file_dictionary`` that is not on disk yet.

    Parameters
    ----------
    file_dictionary : dict
        Maps an arbitrary key to a dict with a ``"url"`` entry and an optional
        ``"sha256"`` entry (hex digest the downloaded content must have).
    target_dir : str or os.PathLike, optional
        Folder the files are stored in. When omitted, the notebook-level
        settings are used: ``os.path.join(core_path, data_path)``.

    Raises
    ------
    ValueError
        If a freshly downloaded file does not match its ``"sha256"`` entry.
        The mismatching file is deleted first, so the next call downloads it again.
    """
    if target_dir is None:
        target_dir = os.path.join(core_path, data_path)

    for key in file_dictionary:
        entry = file_dictionary[key]
        url = entry["url"]

        # The local file name is whatever follows the last "/" of the URL path
        url_path = urllib.parse.urlsplit(url).path
        filename = pathlib.Path(target_dir, url_path.split("/")[-1])

        # Files that already exist are left untouched
        if filename.is_file():
            continue

        print(f"Downloading now: {url}")
        urllib.request.urlretrieve(url=url, filename=filename)

        # Verify the download against the recorded digest, if there is one
        expected_digest = entry.get("sha256")
        if expected_digest is None:
            continue

        actual_digest = hashlib.sha256(filename.read_bytes()).hexdigest()
        if actual_digest != expected_digest.lower():
            # Do not keep a corrupt file around: it would be skipped as
            # "already downloaded" on the next run
            filename.unlink()
            raise ValueError(
                f"SHA-256 mismatch for {url}: "
                f"expected {expected_digest}, got {actual_digest}"
            )

In [124]:
# Fetch the files listed in smass_data_files; files already on disk are skipped
downloadDataFiles(smass_data_files)

In [125]:
# Untar the spectra data. The context manager closes the archive even when
# the extraction fails part-way through.
with tarfile.open(os.path.join(core_path, data_path, "smass2data.tar.gz"), "r:gz") as tar:
    if hasattr(tarfile, "data_filter"):
        # The archive was downloaded from the network: the "data" filter refuses
        # members that would be written outside the destination folder
        tar.extractall(os.path.join(core_path, data_path), filter="data")
    else:
        # Older interpreters without extraction filters: plain extraction
        tar.extractall(os.path.join(core_path, data_path))

In [126]:
# Collect every extracted file whose name contains "spfit", in sorted order
spfit_pattern = os.path.join(core_path, data_path, "smass2/*spfit*")
file_paths = glob.glob(spfit_pattern)
file_paths.sort()

In [127]:
# Split the spectra files by their file-name prefix instead of a fixed count:
# names starting with "au" are parsed later as temporary names, all other
# files as "a<designation number>". The sorted order of file_paths is kept.
non_des_file_paths = [fp for fp in file_paths if os.path.basename(fp).startswith("au")]
des_file_paths = [fp for fp in file_paths if not os.path.basename(fp).startswith("au")]

In [128]:
# One-column frames ("FilePath") for both groups of spectra files
des_file_paths_df = pd.DataFrame({"FilePath": des_file_paths})
non_des_file_paths_df = pd.DataFrame({"FilePath": non_des_file_paths})

In [129]:
# Derive the merge key "DesNr" from each file path:
#   - designated asteroids: the text between "smass2/a" and "spfit", as an integer
#   - non-designated asteroids: the text between "smass2/au" and "spfit", kept as a string
des_nr_regex = re.compile(r"smass2/a(.*).spfit")
temp_name_regex = re.compile(r"smass2/au(.*).spfit")

des_file_paths_df.loc[:, "DesNr"] = des_file_paths_df["FilePath"].apply(
    lambda path: int(des_nr_regex.search(path).group(1))
)
non_des_file_paths_df.loc[:, "DesNr"] = non_des_file_paths_df["FilePath"].apply(
    lambda path: temp_name_regex.search(path).group(1)
)


In [130]:
# Map classes to the file paths: load the Bus taxonomy table first.
# The first 21 lines of the file are skipped and the columns are named by hand.
taxonomy_file = os.path.join(core_path, data_path, "Bus.Taxonomy.txt")
taxonomy_columns = ["Name", "Tholen_Class", "Bus_Class", "unknown1", "unknown2"]

asteroid_class_df = pd.read_csv(
    taxonomy_file,
    sep="\t",
    skiprows=21,
    names=taxonomy_columns,
)

In [131]:
# Remove surrounding whitespace from the asteroid names
asteroid_class_df.loc[:, "Name"] = asteroid_class_df["Name"].apply(lambda name: name.strip())

# The first 1403 rows are treated as designated asteroids, the rest as non-designated
n_designated_rows = 1403
des_asteroid_class_df = asteroid_class_df.iloc[:n_designated_rows].copy()
non_des_asteroid_class_df = asteroid_class_df.iloc[n_designated_rows:].copy()

# Designated asteroids: the integer in front of the first blank is the merge key
des_asteroid_class_df.loc[:, "DesNr"] = des_asteroid_class_df["Name"].apply(
    lambda name: int(name.split(" ")[0])
)
des_ast_class_df_full = des_asteroid_class_df.merge(
    des_file_paths_df, how="left", on="DesNr"
)

# Non-designated asteroids: the name without blanks is the merge key;
# rows that found no spectrum file are dropped right away
non_des_asteroid_class_df.loc[:, "DesNr"] = non_des_asteroid_class_df["Name"].apply(
    lambda name: name.replace(" ", "")
)
non_des_ast_class_df_full = non_des_asteroid_class_df.merge(
    non_des_file_paths_df, how="left", on="DesNr"
).dropna(subset=["FilePath"])

# Stack both groups on a fresh 0..n-1 index, drop the unused columns and keep
# only the rows that have a Bus class as well as a spectrum file
asteroid_data_df = (
    pd.concat([des_ast_class_df_full, non_des_ast_class_df_full], axis=0, ignore_index=True)
    .drop(columns=["Tholen_Class", "unknown1", "unknown2"])
    .dropna(subset=["Bus_Class"])
    .dropna(subset=["FilePath"])
)

In [135]:
# Add asteroid spectrum data to the data frame, then save it to storage.
# Every spectrum file is read as a tab-separated table with two named columns.
spectrum_columns = ["Wavelength_in_microm", "Reflectance_norm550nm"]
asteroid_data_df.loc[:, "SpectrumDF"] = asteroid_data_df["FilePath"].apply(
    lambda spectrum_file: pd.read_csv(spectrum_file, sep="\t", names=spectrum_columns)
)

# Convert the designation number to string
asteroid_data_df.loc[:, "DesNr"] = asteroid_data_df["DesNr"].astype(str)

pickle_file = os.path.join(core_path, "data/", "asteroid_spectrum.pkl")
asteroid_data_df.to_pickle(pickle_file, protocol=4)