# Importing the Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import os
import sklearn

# Importing the Dataset

In [2]:
dataset = pd.read_csv(r'C:\Users\Aditya Deepak\Downloads\Projects\Nasa\neo_v2.csv')

# Understanding the Data

In [3]:
dataset.shape
# Returns the number of Rows and Columns --> (Rows, Columns)

(90836, 10)

In [4]:
dataset.info()
# Returns the whole information regarding the dataset --> Columns, Non-Null Values, DataTypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90836 entries, 0 to 90835
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  90836 non-null  int64  
 1   name                90836 non-null  object 
 2   est_diameter_min    90836 non-null  float64
 3   est_diameter_max    90836 non-null  float64
 4   relative_velocity   90836 non-null  float64
 5   miss_distance       90836 non-null  float64
 6   orbiting_body       90836 non-null  object 
 7   sentry_object       90836 non-null  bool   
 8   absolute_magnitude  90836 non-null  float64
 9   hazardous           90836 non-null  bool   
dtypes: bool(2), float64(5), int64(1), object(2)
memory usage: 5.7+ MB


In [5]:
dataset.isnull().sum()
# Returns the sum of all the null values in the dataset

id                    0
name                  0
est_diameter_min      0
est_diameter_max      0
relative_velocity     0
miss_distance         0
orbiting_body         0
sentry_object         0
absolute_magnitude    0
hazardous             0
dtype: int64

In [6]:
dataset.head()
# Returns top 5 rows and columns from the dataset

Unnamed: 0,id,name,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,orbiting_body,sentry_object,absolute_magnitude,hazardous
0,2162635,162635 (2000 SS164),1.198271,2.679415,13569.249224,54839740.0,Earth,False,16.73,False
1,2277475,277475 (2005 WK4),0.2658,0.594347,73588.726663,61438130.0,Earth,False,20.0,True
2,2512244,512244 (2015 YE18),0.72203,1.614507,114258.692129,49798720.0,Earth,False,17.83,False
3,3596030,(2012 BV13),0.096506,0.215794,24764.303138,25434970.0,Earth,False,22.2,False
4,3667127,(2014 GE35),0.255009,0.570217,42737.733765,46275570.0,Earth,False,20.09,True


In [7]:
dataset.tail()
# Returns bottom 5 rows and columns from the dataset

Unnamed: 0,id,name,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,orbiting_body,sentry_object,absolute_magnitude,hazardous
90831,3763337,(2016 VX1),0.02658,0.059435,52078.886692,12300390.0,Earth,False,25.0,False
90832,3837603,(2019 AD3),0.016771,0.037501,46114.605073,54321210.0,Earth,False,26.0,False
90833,54017201,(2020 JP3),0.031956,0.071456,7566.807732,28400770.0,Earth,False,24.6,False
90834,54115824,(2021 CN5),0.007321,0.01637,69199.154484,68692060.0,Earth,False,27.8,False
90835,54205447,(2021 TW7),0.039862,0.089133,27024.455553,59772130.0,Earth,False,24.12,False


In [8]:
dataset.drop(['id','name'], axis=1)

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,orbiting_body,sentry_object,absolute_magnitude,hazardous
0,1.198271,2.679415,13569.249224,5.483974e+07,Earth,False,16.73,False
1,0.265800,0.594347,73588.726663,6.143813e+07,Earth,False,20.00,True
2,0.722030,1.614507,114258.692129,4.979872e+07,Earth,False,17.83,False
3,0.096506,0.215794,24764.303138,2.543497e+07,Earth,False,22.20,False
4,0.255009,0.570217,42737.733765,4.627557e+07,Earth,False,20.09,True
...,...,...,...,...,...,...,...,...
90831,0.026580,0.059435,52078.886692,1.230039e+07,Earth,False,25.00,False
90832,0.016771,0.037501,46114.605073,5.432121e+07,Earth,False,26.00,False
90833,0.031956,0.071456,7566.807732,2.840077e+07,Earth,False,24.60,False
90834,0.007321,0.016370,69199.154484,6.869206e+07,Earth,False,27.80,False


In [9]:
dataset.describe()
# Returns the statistical summary of the dataset

Unnamed: 0,id,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,absolute_magnitude
count,90836.0,90836.0,90836.0,90836.0,90836.0,90836.0
mean,14382880.0,0.127432,0.284947,48066.918918,37066550.0,23.527103
std,20872020.0,0.298511,0.667491,25293.296961,22352040.0,2.894086
min,2000433.0,0.000609,0.001362,203.346433,6745.533,9.23
25%,3448110.0,0.019256,0.043057,28619.020645,17210820.0,21.34
50%,3748362.0,0.048368,0.108153,44190.11789,37846580.0,23.7
75%,3884023.0,0.143402,0.320656,62923.604633,56549000.0,25.7
max,54275910.0,37.89265,84.730541,236990.128088,74798650.0,33.2


In [10]:
dataset.nunique()
# Returns the number of unique values from each column of the dataset

id                    27423
name                  27423
est_diameter_min       1638
est_diameter_max       1638
relative_velocity     90828
miss_distance         90536
orbiting_body             1
sentry_object             1
absolute_magnitude     1638
hazardous                 2
dtype: int64

In [11]:
dataset.columns
# Returns all the columns names in the dataset

Index(['id', 'name', 'est_diameter_min', 'est_diameter_max',
       'relative_velocity', 'miss_distance', 'orbiting_body', 'sentry_object',
       'absolute_magnitude', 'hazardous'],
      dtype='object')

In [12]:
print(dataset['hazardous'].unique())
print(dataset['sentry_object'].unique())
print(dataset['orbiting_body'].unique())

# .unique() returns the unique values in the column

[False  True]
[False]
['Earth']


In [13]:
for i in dataset.columns:
    print(dataset[i].value_counts())
#Returns the count of values in the data

2277810     43
2469219     43
3743123     40
2138175     39
3893865     38
            ..
2001917      1
3160795      1
54240280     1
54054686     1
3557824      1
Name: id, Length: 27423, dtype: int64
277810 (2006 FV35)               43
469219 Kamo`oalewa (2016 HO3)    43
(2016 CA138)                     40
138175 (2000 EE104)              39
(2019 XS)                        38
                                 ..
(2010 HS20)                       1
(2015 XN128)                      1
(2006 YE)                         1
(2017 XU61)                       1
(2001 RP3)                        1
Name: name, Length: 27423, dtype: int64
0.035039    1130
0.029144    1058
0.023150     995
0.031956     988
0.022108     977
            ... 
0.969516       1
0.013488       1
0.009740       1
0.004592       1
0.012189       1
Name: est_diameter_min, Length: 1638, dtype: int64
0.078350    1130
0.065169    1058
0.051765     995
0.071456     988
0.049436     977
            ... 
0.002582       1
0.18

In [14]:
dataset.columns

Index(['id', 'name', 'est_diameter_min', 'est_diameter_max',
       'relative_velocity', 'miss_distance', 'orbiting_body', 'sentry_object',
       'absolute_magnitude', 'hazardous'],
      dtype='object')

In [15]:
final_dataset = dataset[['est_diameter_min', 'est_diameter_max',
       'relative_velocity', 'miss_distance','absolute_magnitude', 'orbiting_body','sentry_object','hazardous']]

# Splitting dataset into Independent Variable and Dependent Variable

In [16]:
X = final_dataset.iloc[:, :-1].values
y = final_dataset.iloc[:, -1].values

In [17]:
print(X)
print(y)

[[1.1982708007 2.6794149658 13569.2492241812 ... 16.73 'Earth' False]
 [0.2658 0.5943468684 73588.7266634981 ... 20.0 'Earth' False]
 [0.7220295577 1.6145071727 114258.6921290512 ... 17.83 'Earth' False]
 ...
 [0.0319561887 0.0714562102 7566.8077324922 ... 24.6 'Earth' False]
 [0.0073207399 0.016369672 69199.1544835094 ... 27.8 'Earth' False]
 [0.0398616229 0.0891332986 27024.4555527296 ... 24.12 'Earth' False]]
[False  True False ... False False False]


# Encoding/Converting Categorical Variables into Dummy Variables

In [18]:
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()
y = label.fit_transform(y)

In [19]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
mycompdt = ColumnTransformer(transformers=[("encoder", OneHotEncoder(), [-1,-2])], remainder="passthrough")
X = np.array(mycompdt.fit_transform(X))

In [20]:
print(X)
print(y)

[[1.0 1.0 1.1982708007 ... 13569.2492241812 54839744.08284605 16.73]
 [1.0 1.0 0.2658 ... 73588.7266634981 61438126.52395093 20.0]
 [1.0 1.0 0.7220295577 ... 114258.6921290512 49798724.94045679 17.83]
 ...
 [1.0 1.0 0.0319561887 ... 7566.8077324922 28400768.1610167 24.6]
 [1.0 1.0 0.0073207399 ... 69199.1544835094 68692060.5345607 27.8]
 [1.0 1.0 0.0398616229 ... 27024.4555527296 59772130.59268528 24.12]]
[0 1 0 ... 0 0 0]


# Splitting Dataset into Training Set and Test Set

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 1)

In [22]:
print(X_train)
print(X_test)
print(y_train)
print(y_test)

[[1.0 1.0 0.0127219879 ... 19587.4731993585 25028887.94130264 26.6]
 [1.0 1.0 0.0987540639 ... 22304.5957509006 41636139.40650246 22.15]
 [1.0 1.0 0.0715409951 ... 33495.1177159472 10360744.083237816 22.85]
 ...
 [1.0 1.0 0.1058168859 ... 51161.6072003717 16194464.222455025 22.0]
 [1.0 1.0 0.1170994827 ... 58978.6270279034 2326580.676227331 21.78]
 [1.0 1.0 0.160160338 ... 84840.248572133 71948001.52239732 21.1]]
[[1.0 1.0 0.0105816886 ... 67869.9252335635 33796821.675086 27.0]
 [1.0 1.0 0.0253837029 ... 46806.0499065928 2042375.64510706 25.1]
 [1.0 1.0 0.0530340723 ... 73992.5888574013 47007877.89613322 23.5]
 ...
 [1.0 1.0 0.0139493823 ... 28948.1773598464 17249177.077273425 26.4]
 [1.0 1.0 0.0201629919 ... 62030.0431195589 973380.472202742 25.6]
 [1.0 1.0 0.2550086879 ... 60706.7804125159 46886590.58528265 20.09]]
[0 1 0 ... 1 1 1]
[0 0 0 ... 0 0 1]


# Performing SMOTE Analysis

In [23]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 42)
X_res, y_res = sm.fit_resample(X_train, y_train)

# Feature Scaling

In [24]:
#from sklearn.preprocessing import StandardScaler
#mysc = StandardScaler()
#X_res[:,5:] = mysc.fit_transform(X_res[:,5:])
#X_test[:,5:] = mysc.transform(X_test[:,5:])

In [25]:
#from sklearn.preprocessing import MinMaxScaler
#mms = MinMaxScaler()
#X_res[:,5:] = mms.fit_transform(X_res[:,5:])
#X_test[:,5:] = mms.transform(X_test[:,5:])

# Importing the Machine Learning Algorithms

In [26]:
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Training XGBoost Algo on Training set

In [27]:
#classifier = XGBClassifier()
#classifier.fit(X_res, y_res)

# Training Logistic Regression Algo on Training set

In [28]:
#classifier = LogisticRegression(random_state = 0)
#classifier.fit(X_res, y_res)

# Training KNN Algo on Training set

In [29]:
#classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
#classifier.fit(X_res, y_res)

# Training Decision Trees Algo on Training set

In [30]:
#classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
#classifier.fit(X_res, y_res)

# Training SVM Algo on Training set

In [31]:
#classifier = SVC(kernel = 'rbf', random_state = 0)
#classifier.fit(X_res, y_res)

# Training Random Forest Algo on Training set

In [32]:
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_res, y_res)

# Confusion Matrix - Accuracy Score

In [33]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[23256  1359]
 [ 1218  1418]]


0.9054346629481487

# K-Fold Cross Validation

In [34]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_res, y = y_res, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 93.28 %
Standard Deviation: 7.63 %


# Grid Search

In [35]:
from sklearn.model_selection import GridSearchCV
parameters = [{'n_estimators': [10, 20, 40, 60, 80, 100, 120, 140, 160, 240, 360, 400], 'criterion': ['gini', 'entropy', 'log_loss']}]
grid_search = GridSearchCV(estimator = classifier,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: 91.78 %
Best Parameters: {'criterion': 'entropy', 'n_estimators': 360}
