### Start with some library imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Load the dataset and inspect its structure

In [2]:
# Read the CSV from the local archive folder into a DataFrame.
# low_memory=False turns off pandas' chunked parsing, so each column's dtype is
# inferred from the whole file rather than piece by piece (avoids mixed-type
# inference on a file this large).
df = pd.read_csv(
    filepath_or_buffer='./archive/dataset.csv',
    low_memory=False,
)

# Preview the first five rows as a quick sanity check on the load.
df.head(n=5)

Unnamed: 0,id,spkid,full_name,pdes,name,prefix,neo,pha,H,diameter,...,sigma_i,sigma_om,sigma_w,sigma_ma,sigma_ad,sigma_n,sigma_tp,sigma_per,class,rms
0,a0000001,2000001,1 Ceres,1,Ceres,,N,N,3.4,939.4,...,4.6089e-09,6.1688e-08,6.6248e-08,7.8207e-09,1.1113e-11,1.1965e-12,3.7829e-08,9.4159e-09,MBA,0.43301
1,a0000002,2000002,2 Pallas,2,Pallas,,N,N,4.2,545.0,...,3.4694e-06,6.2724e-06,9.1282e-06,8.8591e-06,4.9613e-09,4.6536e-10,4.0787e-05,3.6807e-06,MBA,0.35936
2,a0000003,2000003,3 Juno,3,Juno,,N,N,5.33,246.596,...,3.2231e-06,1.6646e-05,1.7721e-05,8.1104e-06,4.3639e-09,4.4134e-10,3.5288e-05,3.1072e-06,MBA,0.33848
3,a0000004,2000004,4 Vesta,4,Vesta,,N,N,3.0,525.4,...,2.1706e-07,3.8808e-07,1.7893e-07,1.2068e-06,1.6486e-09,2.6125e-10,4.1037e-06,1.2749e-06,MBA,0.3998
4,a0000005,2000005,5 Astraea,5,Astraea,,N,N,6.9,106.699,...,2.7408e-06,2.8949e-05,2.9842e-05,8.3038e-06,4.729e-09,5.5227e-10,3.4743e-05,3.4905e-06,MBA,0.52191


In [3]:
# Display every column label in the dataset (returned as a pandas Index),
# to see which fields are available before digging into them
df.columns

Index(['id', 'spkid', 'full_name', 'pdes', 'name', 'prefix', 'neo', 'pha', 'H',
       'diameter', 'albedo', 'diameter_sigma', 'orbit_id', 'epoch',
       'epoch_mjd', 'epoch_cal', 'equinox', 'e', 'a', 'q', 'i', 'om', 'w',
       'ma', 'ad', 'n', 'tp', 'tp_cal', 'per', 'per_y', 'moid', 'moid_ld',
       'sigma_e', 'sigma_a', 'sigma_q', 'sigma_i', 'sigma_om', 'sigma_w',
       'sigma_ma', 'sigma_ad', 'sigma_n', 'sigma_tp', 'sigma_per', 'class',
       'rms'],
      dtype='object')

In [4]:
# Display the dtype pandas inferred for each column.
# Columns reported as `object` are the non-numeric ones (typically strings)
# and are the candidates for the categorical exploration further down.
df.dtypes

id                 object
spkid               int64
full_name          object
pdes               object
name               object
prefix             object
neo                object
pha                object
H                 float64
diameter          float64
albedo            float64
diameter_sigma    float64
orbit_id           object
epoch             float64
epoch_mjd           int64
epoch_cal         float64
equinox            object
e                 float64
a                 float64
q                 float64
i                 float64
om                float64
w                 float64
ma                float64
ad                float64
n                 float64
tp                float64
tp_cal            float64
per               float64
per_y             float64
moid              float64
moid_ld           float64
sigma_e           float64
sigma_a           float64
sigma_q           float64
sigma_i           float64
sigma_om          float64
sigma_w           float64
sigma_ma    

In [5]:
# Display the dimensions of the dataset as a (number of rows, number of columns) tuple
df.shape

(958524, 45)

In [6]:
# Flag every missing cell: a boolean frame with the same shape as df,
# True wherever the value is null and False otherwise.
missing_values = df.isna()

# Add up the True flags down each column to get that column's missing-entry count.
num_mv = missing_values.sum(axis=0)

# Display the per-column count of missing values.
num_mv

id                     0
spkid                  0
full_name              0
pdes                   0
name              936460
prefix            958506
neo                    4
pha                19921
H                   6263
diameter          822315
albedo            823421
diameter_sigma    822443
orbit_id               0
epoch                  0
epoch_mjd              0
epoch_cal              0
equinox                0
e                      0
a                      0
q                      0
i                      0
om                     0
w                      0
ma                     1
ad                     4
n                      0
tp                     0
tp_cal                 0
per                    4
per_y                  1
moid               19921
moid_ld              127
sigma_e            19922
sigma_a            19922
sigma_q            19922
sigma_i            19922
sigma_om           19922
sigma_w            19922
sigma_ma           19922
sigma_ad           19926


### Explore the unique values in the non-numeric columns

In [7]:
# Distinct values found in 'prefix', in order of first appearance (NaN counts as a value)
pd.unique(df['prefix'])

array([nan, 'A'], dtype=object)

In [8]:
# Distinct values found in 'class', in order of first appearance
pd.unique(df['class'])

array(['MBA', 'OMB', 'MCA', 'AMO', 'IMB', 'TJN', 'CEN', 'APO', 'ATE',
       'AST', 'TNO', 'IEO', 'HYA'], dtype=object)

In [9]:
# Distinct values found in 'equinox'; a single value means the column is constant
pd.unique(df['equinox'])

array(['J2000'], dtype=object)

In [10]:
# Distinct values found in 'orbit_id', in order of first appearance
pd.unique(df['orbit_id'])

array(['JPL 47', 'JPL 37', 'JPL 112', ..., 'E2020H77', 'E2020K86',
       'E2020J48'], shape=(4690,), dtype=object)

In [11]:
# Distinct values found in 'pha', in order of first appearance (NaN counts as a value)
pd.unique(df['pha'])

array(['N', 'Y', nan], dtype=object)

In [12]:
# Distinct values found in 'neo', in order of first appearance (NaN counts as a value)
pd.unique(df['neo'])

array(['N', 'Y', nan], dtype=object)

### Find duplicate entries

In [13]:
# Select the rows that exactly repeat an earlier row across all columns.
# keep='first' leaves the first occurrence of each repeated row unflagged, so
# only the later copies are returned; an empty frame means no full-row duplicates.
find_dups = df.loc[df.duplicated(keep='first')]

# Display the duplicated rows (if any).
find_dups

Unnamed: 0,id,spkid,full_name,pdes,name,prefix,neo,pha,H,diameter,...,sigma_i,sigma_om,sigma_w,sigma_ma,sigma_ad,sigma_n,sigma_tp,sigma_per,class,rms
