## 1) Importing Libraries and Loading the Dataset

In [24]:
# Importing Libraries:
import pandas as pd
import numpy as np
from scipy.stats import kurtosis, skew, chi2

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.base import clone
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

In [25]:
# Loading the data. The columns listed below are read with an explicit string
# dtype instead of leaving the type inference to pandas:
text_columns = ["pdes", "name", "prefix"]
df_raw = pd.read_csv("dataset.csv", dtype={column: "str" for column in text_columns})

In [26]:
# Working on a deep copy, so that changes made to df do not affect df_raw:
df = df_raw.copy(deep=True)

## 2) Data Cleaning

In [27]:
# Summary of the dataset: index range, column names, non-null counts and dtypes:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 958524 entries, 0 to 958523
Data columns (total 45 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              958524 non-null  object 
 1   spkid           958524 non-null  int64  
 2   full_name       958524 non-null  object 
 3   pdes            958524 non-null  object 
 4   name            22064 non-null   object 
 5   prefix          18 non-null      object 
 6   neo             958520 non-null  object 
 7   pha             938603 non-null  object 
 8   H               952261 non-null  float64
 9   diameter        136209 non-null  float64
 10  albedo          135103 non-null  float64
 11  diameter_sigma  136081 non-null  float64
 12  orbit_id        958524 non-null  object 
 13  epoch           958524 non-null  float64
 14  epoch_mjd       958524 non-null  int64  
 15  epoch_cal       958524 non-null  float64
 16  equinox         958524 non-null  object 
 17  e         

In [28]:
# Deleting columns that are not used in the rest of the notebook.
# errors="ignore" keeps the cell re-runnable: once the columns are gone, a
# second execution is a no-op instead of raising a KeyError.
# The result is re-assigned rather than mutated in place (inplace=True brings
# no performance benefit and prevents method chaining).
useless_columns = ["prefix", "name", "spkid"]
df = df.drop(columns=useless_columns, errors="ignore")
print(f"Shape: {df.shape}")

Shape: (958524, 42)


In [29]:
# Setting "id" and "full_name" as a MultiIndex.
# The guard keeps the cell re-runnable: once the two columns have been moved
# into the index, calling set_index on them again would raise a KeyError.
index_columns = ["id", "full_name"]
if list(df.index.names) != index_columns:
    df = df.set_index(index_columns)

In [30]:
# Displaying the first five rows of the DataFrame:
df.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,pdes,neo,pha,H,diameter,albedo,diameter_sigma,orbit_id,epoch,epoch_mjd,...,sigma_i,sigma_om,sigma_w,sigma_ma,sigma_ad,sigma_n,sigma_tp,sigma_per,class,rms
id,full_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
a0000001,1 Ceres,1,N,N,3.4,939.4,0.09,0.2,JPL 47,2458600.5,58600,...,4.6089e-09,6.1688e-08,6.6248e-08,7.8207e-09,1.1113e-11,1.1965e-12,3.7829e-08,9.4159e-09,MBA,0.43301
a0000002,2 Pallas,2,N,N,4.2,545.0,0.101,18.0,JPL 37,2459000.5,59000,...,3.4694e-06,6.2724e-06,9.1282e-06,8.8591e-06,4.9613e-09,4.6536e-10,4.0787e-05,3.6807e-06,MBA,0.35936
a0000003,3 Juno,3,N,N,5.33,246.596,0.214,10.594,JPL 112,2459000.5,59000,...,3.2231e-06,1.6646e-05,1.7721e-05,8.1104e-06,4.3639e-09,4.4134e-10,3.5288e-05,3.1072e-06,MBA,0.33848
a0000004,4 Vesta,4,N,N,3.0,525.4,0.4228,0.2,JPL 35,2458600.5,58600,...,2.1706e-07,3.8808e-07,1.7893e-07,1.2068e-06,1.6486e-09,2.6125e-10,4.1037e-06,1.2749e-06,MBA,0.3998
a0000005,5 Astraea,5,N,N,6.9,106.699,0.274,3.14,JPL 114,2459000.5,59000,...,2.7408e-06,2.8949e-05,2.9842e-05,8.3038e-06,4.729e-09,5.5227e-10,3.4743e-05,3.4905e-06,MBA,0.52191


In [31]:
# Counting the missing values of every column, largest count first:
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

albedo            823421
diameter_sigma    822443
diameter          822315
sigma_ad           19926
sigma_per          19926
sigma_ma           19922
sigma_a            19922
sigma_q            19922
sigma_i            19922
sigma_om           19922
sigma_w            19922
sigma_n            19922
sigma_e            19922
sigma_tp           19922
moid               19921
pha                19921
H                   6263
moid_ld              127
per                    4
ad                     4
neo                    4
rms                    2
ma                     1
per_y                  1
class                  0
pdes                   0
tp_cal                 0
tp                     0
w                      0
om                     0
i                      0
q                      0
a                      0
e                      0
equinox                0
epoch_cal              0
epoch_mjd              0
epoch                  0
orbit_id               0
n                      0


In [32]:
# Deleting columns that have more than 50% of missing data.
# `thresh` is the minimum number of non-null values a column needs in order to
# be kept. pandas documents it as an int, so the fractional cut-off is rounded
# up; because non-null counts are integers, this keeps exactly the same columns
# as comparing against the float value.
max_missing_fraction = 0.5
min_non_null = int(np.ceil((1 - max_missing_fraction) * len(df)))
df = df.dropna(axis=1, thresh=min_non_null)
print(f"Shape: {df.shape}")

Shape: (958524, 37)


## 3) Exploratory Data Analysis

## 4) Preprocessing

## 5) Fine Tuning

## 6) Predictions