In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [3]:
def visualize_preds(table, y_true, y_pred, title):
    """Plot a confusion-matrix heatmap and append its summary metrics to ``table``.

    The confusion matrix of ``y_true`` against ``y_pred`` is drawn as an
    annotated heatmap on a new 8x8 figure, with the metrics printed under the
    x-axis label.

    Accuracy is computed over the whole matrix. Precision, recall and F1 are
    computed for the class at index 1 of the matrix only (the "positive" class
    of a binary problem). For a matrix with more than two classes these three
    numbers therefore describe class index 1 alone, not an average.

    Parameters
    ----------
    table : pandas.DataFrame
        Running results table. It is not modified in place.
    y_true, y_pred : array-like
        Ground-truth and predicted labels, forwarded to ``confusion_matrix``.
    title : str
        Title placed above the heatmap.

    Returns
    -------
    pandas.DataFrame
        ``table`` with one extra column (labelled ``0``) whose rows are, in
        order: accuracy, precision, recall, F1 score.
    """
    # Build the matrix first so that a failure here does not leave an empty
    # figure behind.
    cf = confusion_matrix(y_true, y_pred)

    fig, ax = plt.subplots(figsize=(8, 8))
    sb.heatmap(cf, fmt='.0f', annot=True, ax=ax)

    accuracy = np.trace(cf) / float(np.sum(cf))

    # Metrics for binary confusion matrices (class index 1 is the positive class).
    # When only one label occurs the matrix is 1x1 and has no index 1; treat
    # the positive-class counts as zero instead of raising IndexError.
    if cf.shape[0] > 1:
        true_pos = cf[1, 1]
        predicted_pos = cf[:, 1].sum()
        actual_pos = cf[1, :].sum()
    else:
        true_pos = predicted_pos = actual_pos = 0

    # Each ratio falls back to 0.0 when its denominator is empty.
    precision = true_pos / predicted_pos if predicted_pos else 0.0
    recall = true_pos / actual_pos if actual_pos else 0.0
    if (precision + recall) == 0:
        f1_score = 0.0
    else:
        f1_score = 2 * precision * recall / (precision + recall)

    stats_text = "\n\nAccuracy={:0.2f}\nPrecision={:0.2f}\nRecall={:0.2f}\nF1 Score={:0.2f}".format(
        accuracy, precision, recall, f1_score)

    # One new column per call; rows follow the order documented above.
    metrics = pd.DataFrame([accuracy, precision, recall, f1_score])
    table = pd.concat([table, metrics], axis=1)

    ax.set_xlabel('Predicted' + stats_text)
    ax.set_ylabel('Actual')
    ax.set_title(title)

    return table

In [4]:
# Two independent, empty DataFrames.
# NOTE(review): presumably the running metric tables handed to visualize_preds — confirm against later cells.
table1, table2 = pd.DataFrame(), pd.DataFrame()

#### Importing our Dataset (Oversampled)

In [5]:
# Read the oversampled training split; the first CSV column becomes the index.
nasaOversample_train = pd.read_csv("Datasets/oversampled_train.csv", index_col=0)

# Remove the columns listed below. HAZARDOUS is dropped here too, whereas the
# SMOTETomek and test loads in this notebook keep it.
nasaOversample_train = nasaOversample_train.drop(
    columns=[
        'NEO_REFERENCE_ID',
        'EST_DIA_IN_M(MIN)',
        'MISS_DIST.(KILOMETERS)',
        'PERIHELION_DISTANCE',
        'APHELION_DIST',
        'PERIHELION_TIME',
        'MEAN_ANOMALY',
        'MEAN_MOTION',
        'HAZARDOUS',
    ]
)

# Display the resulting frame as the cell output.
nasaOversample_train

Unnamed: 0,EST_DIA_IN_M(MAX),RELATIVE_VELOCITY_KM_PER_SEC,RISK_CATEGORY
0,411.187571,9.551351,1
1,113.250461,3.838017,0
2,567.596853,6.477856,1
3,748.238376,18.027267,2
4,156.329154,24.300910,1
...,...,...,...
3211,1132.504611,17.495185,2
3212,1081.533507,12.736781,2
3213,986.370281,19.408931,2
3214,542.050786,4.099881,2


#### Describe risk category 

#### Importing our Dataset (SMOTETomek)

In [6]:
# Import the cleaned training data produced with SMOTETomek sampling;
# the first CSV column becomes the index.
nasaSmotetomek_train = pd.read_csv("Datasets/smotetomek_train.csv", index_col=0)

# Remove the columns listed below (HAZARDOUS is kept in this frame).
nasaSmotetomek_train = nasaSmotetomek_train.drop(
    columns=[
        'NEO_REFERENCE_ID',
        'EST_DIA_IN_M(MIN)',
        'MISS_DIST.(KILOMETERS)',
        'PERIHELION_DISTANCE',
        'APHELION_DIST',
        'PERIHELION_TIME',
        'MEAN_ANOMALY',
        'MEAN_MOTION',
    ]
)

# Display the resulting frame as the cell output.
nasaSmotetomek_train

Unnamed: 0,EST_DIA_IN_M(MAX),RELATIVE_VELOCITY_KM_PER_SEC,HAZARDOUS,RISK_CATEGORY
0,113.250461,3.838017,0,0
1,567.596853,6.477856,0,1
2,748.238376,18.027267,1,2
3,156.329154,24.300910,0,1
4,594.346868,10.896092,0,1
...,...,...,...,...
2411,322.420730,16.066831,1,2
2412,407.979344,28.224700,0,2
2413,880.558727,13.826735,1,2
2414,443.214353,23.729151,0,2


In [7]:
# Import the cleaned test dataset; the first CSV column becomes the index.
nasaTest = pd.read_csv("Datasets/test.csv", index_col=0)

# Remove the columns listed below (HAZARDOUS is kept in this frame).
nasaTest = nasaTest.drop(
    columns=[
        'NEO_REFERENCE_ID',
        'EST_DIA_IN_M(MIN)',
        'MISS_DIST.(KILOMETERS)',
        'PERIHELION_DISTANCE',
        'APHELION_DIST',
        'PERIHELION_TIME',
        'MEAN_ANOMALY',
        'MEAN_MOTION',
    ]
)

# Display the resulting frame as the cell output.
nasaTest

Unnamed: 0,EST_DIA_IN_M(MAX),RELATIVE_VELOCITY_KM_PER_SEC,HAZARDOUS,RISK_CATEGORY
2050,68.240151,13.353029,0,1
1080,311.917670,4.364276,0,1
428,651.688382,16.916139,0,2
942,358.129403,7.806605,0,1
2729,108.153351,8.298877,0,1
...,...,...,...,...
1564,748.238376,23.653371,0,2
1137,225.964377,13.142732,0,1
443,472.106499,12.437367,0,1
251,35.812940,16.351481,0,0


#### Logistic Regression classification on the oversampled training set

In [8]:
# Response (y) and predictors (x) taken from the oversampled training frame.
y_train = nasaOversample_train['RISK_CATEGORY']
x_train = nasaOversample_train.drop(columns=['RISK_CATEGORY'])  # every column except the response

# NOTE(review): the "test" pair is built from the same oversampled training
# frame as the pair above, not from nasaTest — confirm this is intended, since
# any score computed on it measures fit to the training data.
y_test = nasaOversample_train['RISK_CATEGORY']
x_test = nasaOversample_train.drop(columns=['RISK_CATEGORY'])  # every column except the response

# Pairwise correlation between the frame's columns, shown as the cell output.
nasaOversample_train.corr()

Unnamed: 0,EST_DIA_IN_M(MAX),RELATIVE_VELOCITY_KM_PER_SEC,RISK_CATEGORY
EST_DIA_IN_M(MAX),1.0,0.371386,0.77081
RELATIVE_VELOCITY_KM_PER_SEC,0.371386,1.0,0.550304
RISK_CATEGORY,0.77081,0.550304,1.0
