<a href="https://colab.research.google.com/github/angelakorm/nasa-asteroid-classification/blob/main/Asteroid_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NASA Asteroid Classification

## Import libraries

In [165]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import kagglehub
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

## Import dataset

In [166]:
path = kagglehub.dataset_download("lovishbansal123/nasa-asteroids-classification")
print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/nasa-asteroids-classification


In [167]:
df = pd.read_csv(path + '/nasa.csv')
len(df)

4687

In [168]:
df[0:10]

Unnamed: 0,Neo Reference ID,Name,Absolute Magnitude,Est Dia in KM(min),Est Dia in KM(max),Est Dia in M(min),Est Dia in M(max),Est Dia in Miles(min),Est Dia in Miles(max),Est Dia in Feet(min),...,Asc Node Longitude,Orbital Period,Perihelion Distance,Perihelion Arg,Aphelion Dist,Perihelion Time,Mean Anomaly,Mean Motion,Equinox,Hazardous
0,3703080,3703080,21.6,0.12722,0.284472,127.219879,284.472297,0.079051,0.176763,417.388066,...,314.373913,609.599786,0.808259,57.25747,2.005764,2458162.0,264.837533,0.590551,J2000,True
1,3723955,3723955,21.3,0.146068,0.326618,146.067964,326.617897,0.090762,0.202951,479.22562,...,136.717242,425.869294,0.7182,313.091975,1.497352,2457795.0,173.741112,0.84533,J2000,False
2,2446862,2446862,20.3,0.231502,0.517654,231.502122,517.654482,0.143849,0.321655,759.521423,...,259.475979,643.580228,0.950791,248.415038,1.966857,2458120.0,292.893654,0.559371,J2000,True
3,3092506,3092506,27.4,0.008801,0.019681,8.801465,19.680675,0.005469,0.012229,28.876199,...,57.173266,514.08214,0.983902,18.707701,1.527904,2457902.0,68.741007,0.700277,J2000,False
4,3514799,3514799,21.6,0.12722,0.284472,127.219879,284.472297,0.079051,0.176763,417.388066,...,84.629307,495.597821,0.967687,158.263596,1.483543,2457814.0,135.142133,0.726395,J2000,True
5,3671135,3671135,19.6,0.319562,0.714562,319.561887,714.562102,0.198566,0.444008,1048.43142,...,178.971951,556.160556,0.5778,198.145969,2.069265,2458009.0,354.237368,0.647295,J2000,False
6,2495323,2495323,19.6,0.319562,0.714562,319.561887,714.562102,0.198566,0.444008,1048.43142,...,178.971953,556.160544,0.5778,198.14596,2.069265,2458009.0,354.237396,0.647295,J2000,False
7,2153315,2153315,19.2,0.384198,0.859093,384.197891,859.092601,0.238729,0.533815,1260.491809,...,112.562984,502.808758,0.680905,288.374651,1.794045,2458242.0,186.776932,0.715978,J2000,False
8,2162463,2162463,17.8,0.732074,1.636967,732.073989,1636.967205,0.45489,1.017164,2401.817627,...,80.211132,447.837013,0.872705,353.422394,1.418397,2458222.0,182.236432,0.803864,J2000,False
9,2306383,2306383,21.5,0.133216,0.297879,133.215567,297.879063,0.082776,0.185093,437.05896,...,2.613682,299.535161,0.39304,253.765937,1.359211,2457901.0,119.861382,1.201862,J2000,True


In [169]:
df.columns

Index(['Neo Reference ID', 'Name', 'Absolute Magnitude', 'Est Dia in KM(min)',
       'Est Dia in KM(max)', 'Est Dia in M(min)', 'Est Dia in M(max)',
       'Est Dia in Miles(min)', 'Est Dia in Miles(max)',
       'Est Dia in Feet(min)', 'Est Dia in Feet(max)', 'Close Approach Date',
       'Epoch Date Close Approach', 'Relative Velocity km per sec',
       'Relative Velocity km per hr', 'Miles per hour',
       'Miss Dist.(Astronomical)', 'Miss Dist.(lunar)',
       'Miss Dist.(kilometers)', 'Miss Dist.(miles)', 'Orbiting Body',
       'Orbit ID', 'Orbit Determination Date', 'Orbit Uncertainity',
       'Minimum Orbit Intersection', 'Jupiter Tisserand Invariant',
       'Epoch Osculation', 'Eccentricity', 'Semi Major Axis', 'Inclination',
       'Asc Node Longitude', 'Orbital Period', 'Perihelion Distance',
       'Perihelion Arg', 'Aphelion Dist', 'Perihelion Time', 'Mean Anomaly',
       'Mean Motion', 'Equinox', 'Hazardous'],
      dtype='object')

In [170]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4687 entries, 0 to 4686
Data columns (total 40 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Neo Reference ID              4687 non-null   int64  
 1   Name                          4687 non-null   int64  
 2   Absolute Magnitude            4687 non-null   float64
 3   Est Dia in KM(min)            4687 non-null   float64
 4   Est Dia in KM(max)            4687 non-null   float64
 5   Est Dia in M(min)             4687 non-null   float64
 6   Est Dia in M(max)             4687 non-null   float64
 7   Est Dia in Miles(min)         4687 non-null   float64
 8   Est Dia in Miles(max)         4687 non-null   float64
 9   Est Dia in Feet(min)          4687 non-null   float64
 10  Est Dia in Feet(max)          4687 non-null   float64
 11  Close Approach Date           4687 non-null   object 
 12  Epoch Date Close Approach     4687 non-null   int64  
 13  Rel

In [171]:
df.drop(['Name', 'Neo Reference ID', 'Est Dia in M(min)', 'Est Dia in M(max)',
       'Est Dia in Miles(min)', 'Est Dia in Miles(max)',
       'Est Dia in Feet(min)', 'Est Dia in Feet(max)', 'Close Approach Date', 'Miss Dist.(miles)', 'Orbiting Body', 'Orbit Determination Date', 'Equinox'], axis=1, inplace=True)

In [172]:
df.head()

Unnamed: 0,Absolute Magnitude,Est Dia in KM(min),Est Dia in KM(max),Epoch Date Close Approach,Relative Velocity km per sec,Relative Velocity km per hr,Miles per hour,Miss Dist.(Astronomical),Miss Dist.(lunar),Miss Dist.(kilometers),...,Inclination,Asc Node Longitude,Orbital Period,Perihelion Distance,Perihelion Arg,Aphelion Dist,Perihelion Time,Mean Anomaly,Mean Motion,Hazardous
0,21.6,0.12722,0.284472,788947200000,6.115834,22017.003799,13680.509944,0.419483,163.178711,62753692.0,...,6.025981,314.373913,609.599786,0.808259,57.25747,2.005764,2458162.0,264.837533,0.590551,True
1,21.3,0.146068,0.326618,788947200000,18.113985,65210.346095,40519.173105,0.383014,148.99263,57298148.0,...,28.412996,136.717242,425.869294,0.7182,313.091975,1.497352,2457795.0,173.741112,0.84533,False
2,20.3,0.231502,0.517654,789552000000,7.590711,27326.560182,16979.661798,0.050956,19.82189,7622911.5,...,4.237961,259.475979,643.580228,0.950791,248.415038,1.966857,2458120.0,292.893654,0.559371,True
3,27.4,0.008801,0.019681,790156800000,11.173874,40225.948191,24994.839864,0.285322,110.990387,42683616.0,...,7.905894,57.173266,514.08214,0.983902,18.707701,1.527904,2457902.0,68.741007,0.700277,False
4,21.6,0.12722,0.284472,790156800000,9.840831,35426.991794,22012.954985,0.407832,158.646713,61010824.0,...,16.793382,84.629307,495.597821,0.967687,158.263596,1.483543,2457814.0,135.142133,0.726395,True


In [173]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [174]:
print(X[:5])

[[2.16000000e+01 1.27219878e-01 2.84472296e-01 7.88947200e+11
  6.11583439e+00 2.20170038e+04 1.36805099e+04 4.19482530e-01
  1.63178711e+02 6.27536920e+07 1.70000000e+01 5.00000000e+00
  2.52819000e-02 4.63400000e+00 2.45800050e+06 4.25549083e-01
  1.40701130e+00 6.02598129e+00 3.14373913e+02 6.09599786e+02
  8.08258933e-01 5.72574699e+01 2.00576367e+00 2.45816164e+06
  2.64837533e+02 5.90551388e-01]
 [2.13000000e+01 1.46067964e-01 3.26617897e-01 7.88947200e+11
  1.81139850e+01 6.52103461e+04 4.05191731e+04 3.83014463e-01
  1.48992630e+02 5.72981480e+07 2.10000000e+01 3.00000000e+00
  1.86935000e-01 5.45700000e+00 2.45800050e+06 3.51674305e-01
  1.10777595e+00 2.84129964e+01 1.36717242e+02 4.25869294e+02
  7.18199616e-01 3.13091975e+02 1.49735229e+00 2.45779497e+06
  1.73741112e+02 8.45329788e-01]
 [2.03000000e+01 2.31502122e-01 5.17654482e-01 7.89552000e+11
  7.59071116e+00 2.73265602e+04 1.69796618e+04 5.09560159e-02
  1.98218899e+01 7.62291150e+06 2.20000000e+01 0.00000000e+00
  4.

In [175]:
print(y[:5])

[ True False  True False  True]


In [176]:
df.isna().sum()

Unnamed: 0,0
Absolute Magnitude,0
Est Dia in KM(min),0
Est Dia in KM(max),0
Epoch Date Close Approach,0
Relative Velocity km per sec,0
Relative Velocity km per hr,0
Miles per hour,0
Miss Dist.(Astronomical),0
Miss Dist.(lunar),0
Miss Dist.(kilometers),0


## Encoding the dependent variable

In [177]:
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [178]:
print(y[:5])

[1 0 1 0 1]


In [179]:
df['Hazardous'].value_counts()

Unnamed: 0_level_0,count
Hazardous,Unnamed: 1_level_1
False,3932
True,755


## Train and test set splitting

In [180]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Feature scaling

In [181]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [182]:
print(X_train[:5])

[[-1.04797895  0.44908876  0.44908876  0.57661529  1.63534439  1.63534439
   1.63534439 -0.33501393 -0.33501393 -0.33501395  0.28831138 -1.13752148
   0.56060121  0.12341457  0.29797292  2.49739196 -0.58390544  0.96174255
   1.33744571 -0.59059446 -2.62330165 -1.59983758  0.02237317  0.33395634
   1.2594673   0.35100504]
 [-0.46179083 -0.08499171 -0.08499171  0.38013065  0.98645063  0.98645063
   0.98645063 -1.66652453 -1.66652453 -1.66652453 -0.47330034  1.45847184
  -0.75530625  1.51909194 -2.84229716 -0.38622338 -1.12627727  2.65361287
   0.29754499 -1.00413638 -1.05142532 -1.47958392 -0.972141   -2.88700845
  -0.41341416  1.7514008 ]
 [ 1.08988359 -0.47803667 -0.47803667 -1.54655027  0.82421001  0.82421001
   0.82421001  1.05751692  1.05751704  1.05751701 -0.24481683 -0.48852315
  -0.88404196  0.31723613  0.29797292 -0.54701376 -0.49299895 -1.16586244
  -1.37359067 -0.51550796  0.02694339 -1.14837452 -0.54892247  0.17488268
  -0.95986318  0.19781941]
 [-2.04794691  3.20540537  3.20

In [183]:
print(X_test[:5])

[[ 1.40021847 -0.49724412 -0.49724412  1.17734782 -0.17020041 -0.17020041
  -0.17020041 -1.56213168 -1.56213167 -1.56213167 -0.62562268  1.782971
  -0.64124375 -1.08979481 -0.84175589  0.888242    1.03750868 -0.71712603
  -1.34247326  0.96258478  0.30601732  1.10151231  1.0633291  -0.88077431
  -1.54359916 -1.08762393]
 [-0.80660737  0.17800722  0.17800722  0.9411515   0.69854806  0.69854806
   0.69854806  1.07554563  1.07554554  1.07554562  1.10069721 -1.13752148
   0.19644527 -0.59104897  0.29797292 -0.22371012  0.30434485  0.36860345
  -1.4333902   0.20682766  0.88226437 -0.5013187   0.111195    0.06842176
  -0.7672105  -0.67365422]
 [ 0.40025051 -0.39245188 -0.39245188  1.12408359  0.23843287  0.23843287
   0.23843287  1.47385595  1.47385595  1.47385587 -0.19404271 -1.13752148
  -0.77700009  1.84591239  0.29797292 -0.05723371 -1.18429484 -0.767045
  -1.53565131 -1.04465453 -1.33187921  0.17833611 -0.96490318  0.37102867
   0.54949213  1.97761441]
 [-1.32383218  0.88728273  0.887282

## Model training



In [184]:
models = [ LogisticRegression(random_state=42),
          KNeighborsClassifier(n_neighbors = 5, metric='minkowski', p=2, weights='distance'),
          GaussianNB(),
          SVC(kernel='linear', random_state=42),
          SVC(kernel='rbf', random_state=42),
          DecisionTreeClassifier(criterion='gini', random_state=42),
          RandomForestClassifier(n_estimators = 10, criterion='gini', random_state=42)]

for model in models:
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  print(f"{model.__class__.__name__}")
  cm = confusion_matrix(y_test, y_pred)
  print(cm)
  acc = accuracy_score(y_test, y_pred)
  print(acc, "\n")

LogisticRegression
[[780  13]
 [ 20 125]]
0.964818763326226 

KNeighborsClassifier
[[771  22]
 [ 65  80]]
0.9072494669509595 

GaussianNB
[[764  29]
 [ 21 124]]
0.9466950959488273 

SVC
[[780  13]
 [ 16 129]]
0.9690831556503199 

SVC
[[778  15]
 [ 32 113]]
0.9498933901918977 

DecisionTreeClassifier
[[792   1]
 [  5 140]]
0.9936034115138592 

RandomForestClassifier
[[792   1]
 [  8 137]]
0.990405117270789 

