In [87]:
import pandas as pd

data = pd.read_csv("../data/neo_task.csv")
data.drop(columns=["id", "name"], inplace=True)
data = data.dropna()

In [88]:
X = data.drop(columns=["hazardous"])
Y = data["hazardous"].astype(int)

In [50]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [89]:
from imblearn.over_sampling import SMOTE
import numpy as np

oversampled = SMOTE(random_state=0)
X_smote, y_smote = oversampled.fit_resample(np.array(X_scaled), np.array(Y))

In [90]:
result = pd.concat([pd.DataFrame(X_smote, columns=X.columns), pd.DataFrame(y_smote, columns=pd.DataFrame(Y).columns)], axis=1)
result

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,absolute_magnitude,hazardous
0,0.000407,0.000407,0.235700,0.013606,0.703796,0
1,0.000789,0.000789,0.032354,0.436880,0.645390,0
2,0.001450,0.001450,0.232505,0.874154,0.591156,0
3,0.000492,0.000492,0.174537,0.168484,0.687109,0
4,0.003665,0.003665,0.284797,0.953300,0.507718,0
...,...,...,...,...,...,...
160947,0.007207,0.007207,0.184602,0.323308,0.446664,1
160948,0.002985,0.002985,0.228412,0.364040,0.526252,1
160949,0.007155,0.007155,0.144024,0.895314,0.447463,1
160950,0.002873,0.002873,0.311526,0.168684,0.529701,1


In [98]:
result.to_csv("classification_pred.csv", encoding='utf-8', index=False)

In [99]:
data = pd.read_csv("../data/classification_pred.csv")
data = data.dropna()
data
X = data.drop(columns=["hazardous"])
Y = data["hazardous"].astype(int)

In [100]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [101]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, test_size=0.3, stratify=Y, shuffle=True)

In [105]:
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

knn = KNeighborsClassifier(n_neighbors=3, p=2).fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(classification_report(y_pred, y_test))
knn_pipeline = Pipeline([('scaler', scaler), ('knn', knn)])

              precision    recall  f1-score   support

           0       0.85      0.98      0.91     20940
           1       0.99      0.87      0.93     27346

    accuracy                           0.92     48286
   macro avg       0.92      0.93      0.92     48286
weighted avg       0.93      0.92      0.92     48286



In [165]:
import joblib

joblib.dump(knn_pipeline, "knn.joblib")

['knn.joblib']

In [119]:
from sklearn.ensemble import BaggingClassifier

bagging = BaggingClassifier(knn, n_estimators=11).fit(X_train, y_train)
y_pred = bagging.predict(X_test)
print(classification_report(y_test, y_pred))
pipeline_bagging = Pipeline([('scaler', scaler), ('bagging', bagging)])

              precision    recall  f1-score   support

           0       0.99      0.85      0.91     24143
           1       0.87      0.99      0.92     24143

    accuracy                           0.92     48286
   macro avg       0.93      0.92      0.92     48286
weighted avg       0.93      0.92      0.92     48286



In [166]:
joblib.dump(knn_pipeline, "bagging.joblib")

['bagging.joblib']

In [142]:
import tensorflow as tf

model_classification = tf.keras.Sequential(
    [
        tf.keras.layers.Dense(64, activation="relu", input_shape=(X_train.shape[1],)),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dropout(0.05),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(32, activation="relu"),
        tf.keras.layers.Dense(16, activation="relu"),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ]
)
model_classification.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.005), loss="binary_crossentropy")
model_classification.fit(X_train, y_train, epochs=25, verbose=None)

<keras.callbacks.History at 0x1eaa912a9b0>

In [143]:
y_pred = [np.argmax(pred) for pred in model_classification.predict(X_test, verbose=None)]
print(classification_report(y_test, y_pred))
pipeline_tf = Pipeline([('scaler', scaler), ('dence', model_classification)])

              precision    recall  f1-score   support

           0       0.50      1.00      0.67     24143
           1       0.00      0.00      0.00     24143

    accuracy                           0.50     48286
   macro avg       0.25      0.50      0.33     48286
weighted avg       0.25      0.50      0.33     48286



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [167]:
joblib.dump(pipeline_tf, "tf.joblib")

['tf.joblib']