In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv('../dataset/asteroid_training.csv')

In [2]:
df = pd.DataFrame(data)

In [3]:
df.head()

Unnamed: 0,id,name,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,orbiting_body,sentry_object,absolute_magnitude,hazardous
0,26150198,Object_26150198,0.031956,0.071456,75374.759095,1070689.0,Earth,False,24.6,False
1,7025688,Object_7025688,0.133216,0.297879,33274.11479,15982170.0,Earth,False,21.5,False
2,43368461,Object_43368461,0.043507,0.097284,74702.349802,2330585.0,Earth,False,23.93,False
3,41099354,Object_41099354,0.012149,0.027167,33078.313997,45611780.0,Earth,False,26.7,False
4,25572576,Object_25572576,0.058151,0.130029,51956.093518,3613123.0,Earth,False,23.3,False


In [4]:
df.columns

Index(['id', 'name', 'est_diameter_min', 'est_diameter_max',
       'relative_velocity', 'miss_distance', 'orbiting_body', 'sentry_object',
       'absolute_magnitude', 'hazardous'],
      dtype='object')

In [6]:
unknown_objects = pd.read_csv('../dataset/unknown_asteroids.csv')

In [7]:
X = data.drop(columns='hazardous').copy()
y = data.hazardous

In [8]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,train_size = .8)

In [9]:
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn import set_config

set_config(display = "diagram")
num_cols = X_train.select_dtypes(include = np.number).columns
num_cols

Index(['id', 'est_diameter_min', 'est_diameter_max', 'relative_velocity',
       'miss_distance', 'absolute_magnitude'],
      dtype='object')

In [10]:
num_pipe = Pipeline([('imputer',SimpleImputer()),('scaler',MinMaxScaler())])
num_pipe


In [11]:
num_pipe.fit(X_train[num_cols])
X_train[num_cols] = num_pipe.transform(X_train[num_cols])
X_test[num_cols] = num_pipe.transform(X_test[num_cols])

In [12]:
enc = LabelEncoder().fit(y_train)
y_train = enc.transform(y_train)
y_test = enc.transform(y_test)

In [13]:
X_train.corr(method = "pearson")

  X_train.corr(method = "pearson")


Unnamed: 0,id,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,sentry_object,absolute_magnitude
id,1.0,0.009744,0.009744,0.000881,0.009011,,-0.011914
est_diameter_min,0.009744,1.0,1.0,0.17267,0.101659,,-0.487971
est_diameter_max,0.009744,1.0,1.0,0.17267,0.101659,,-0.487971
relative_velocity,0.000881,0.17267,0.17267,1.0,0.329279,,-0.353171
miss_distance,0.009011,0.101659,0.101659,0.329279,1.0,,-0.262424
sentry_object,,,,,,,
absolute_magnitude,-0.011914,-0.487971,-0.487971,-0.353171,-0.262424,,1.0


In [14]:
X_train.drop(columns=['sentry_object','orbiting_body','name'],inplace=True)

In [15]:
X_test.drop(columns=['sentry_object','orbiting_body','name'],inplace=True)

In [16]:
est_daimeter_min_mean = X_train['est_diameter_min'].mean()
est_diameter_max_mean = X_train['est_diameter_max'].mean()
relative_velocity_mean = X_train['relative_velocity'].mean()

In [17]:
#removing outliers

for data in X_train['est_diameter_min']:
    if data>est_daimeter_min_mean:
        data = np.nan
        
for data in X_train['est_diameter_max']:
    if data>est_diameter_max_mean:
        data = np.nan
        
for data in X_train['relative_velocity']:
    if data>relative_velocity_mean:
        data = np.nan      
        
for data in X_test['est_diameter_min']:
    if data>est_daimeter_min_mean:
        data = np.nan
                
for data in X_test['est_diameter_max']:
    if data>est_diameter_max_mean:
        data = np.nan        

for data in X_test['relative_velocity']:
    if data>relative_velocity_mean:
        data = np.nan


In [18]:
X_train.isnull().sum()

id                    0
est_diameter_min      0
est_diameter_max      0
relative_velocity     0
miss_distance         0
absolute_magnitude    0
dtype: int64

In [19]:
X_test.isnull().sum()


id                    0
est_diameter_min      0
est_diameter_max      0
relative_velocity     0
miss_distance         0
absolute_magnitude    0
dtype: int64