In [9]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression, Lasso, RidgeCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA, QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc, classification_report
import scipy
%matplotlib inline
df=pd.read_csv("neo.csv")
print(df)
encoded_df = pd.get_dummies(df["hazardous"])
X=df.drop('absolute_magnitude',axis=1)
y=df['absolute_magnitude']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
# Replace non-numeric values with NaN
X_train = X_train.apply(pd.to_numeric, errors='coerce').fillna(0)  # Replace with 0 or any other value
X_test = X_test.apply(pd.to_numeric, errors='coerce').fillna(0)

# Initialize StandardScaler and fit-transform on training data
scaler = StandardScaler()
X_train_scaled=scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)  # Use transform only on test set
print("X_train_scaled shape:", X_train_scaled.shape)
print("y_train shape:", y_train.shape)
print("X_test_scaled shape:", X_test_scaled.shape)
#Logistic Regression
log=LogisticRegression()
log.fit(X_train_scaled,y_train)
y_log_predict=log.predict(X_test)
#Linear_Discriminant_Analysis
lda=LDA()
lda.fit(X_train_scaled,y_train)
#Quadratic_Discriminant_Analysis
qda=QuadraticDiscriminantAnalysis()
qda.fit(X_train_scaled,y_train)
#Lasso
las=Lasso(alpha=0.1)
las.fit(X_train_scaled,y_train)
#RidgeCV
ridge=RidgeCV(alpha=0.4)
ridge.fit(X_train,y_train)
# Predictions
y_log_predict = log.predict(X_test_scaled)
y_lda_predict = lda.predict(X_test_scaled)
y_qda_predict = qda.predict(X_test_scaled)
y_las_predict = las.predict(X_test_scaled)
y_ridge_predict = ridge.predict(X_test_scaled)
#Evaluate
# Compute confusion matrix for Logistic Regression
cm_log = confusion_matrix(y_test, y_log_predict)
print("Confusion Matrix for Logistic Regression:")
print(cm_log)

# Compute confusion matrix for Linear Discriminant Analysis
cm_lda = confusion_matrix(y_test, y_lda_predict)
print("\nConfusion Matrix for Linear Discriminant Analysis:")
print(cm_lda)

# Compute confusion matrix for Quadratic Discriminant Analysis
cm_qda = confusion_matrix(y_test, y_qda_predict)
print("\nConfusion Matrix for Quadratic Discriminant Analysis:")
print(cm_qda)
# Display confusion matrix using ConfusionMatrixDisplay (optional)
disp = ConfusionMatrixDisplay(confusion_matrix=cm_log, display_labels=log.classes_)
disp.plot(cmap='Blues', values_format='d')
plt.title('Confusion Matrix - Logistic Regression')
plt.show()

#ROC AND AUC
# Logistic Regression
fpr_log, tpr_log, _ = roc_curve(y_test, log.predict_proba(X_test_scaled)[:, 1])
roc_auc_log = auc(fpr_log, tpr_log)

# LDA
fpr_lda, tpr_lda, _ = roc_curve(y_test, lda.predict_proba(X_test_scaled)[:, 1])
roc_auc_lda = auc(fpr_lda, tpr_lda)

# QDA
fpr_qda, tpr_qda, _ = roc_curve(y_test, qda.predict_proba(X_test_scaled)[:, 1])
roc_auc_qda = auc(fpr_qda, tpr_qda)
#Visualization for evaluation matrix
#Confusion matrix Heatmap Function
def plot_confusion_matrix(cm, title):
    plt.figure(figsize=(8,6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, xticklabels=['Predicted 0', 'Predicted 1'], yticklabels=['Actual 0', 'Actual 1'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix - {title}')
    plt.show()
plot_confusion_matrix(cm_log, 'Logistic Regression')
plot_confusion_matrix(cm_lda, 'Linear Discriminant Analysis')
plot_confusion_matrix(cm_qda, 'Quadratic Discriminant Analysis')

# ROC and AUC Visualization
plt.figure(figsize=(10,8))
plt.plot(fpr_log, tpr_log, color='darkorange', lw=2, label='ROC curve (area = %0.2f) [Logistic Regression]' % roc_auc_log)
plt.plot(fpr_lda, tpr_lda, color='blue', lw=2, label='ROC curve (area = %0.2f) [LDA]' % roc_auc_lda)
plt.plot(fpr_qda, tpr_qda, color='green', lw=2, label='ROC curve (area = %0.2f) [QDA]' % roc_auc_qda)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

             id                 name  est_diameter_min  est_diameter_max  \
0       2162635  162635 (2000 SS164)          1.198271          2.679415   
1       2277475    277475 (2005 WK4)          0.265800          0.594347   
2       2512244   512244 (2015 YE18)          0.722030          1.614507   
3       3596030          (2012 BV13)          0.096506          0.215794   
4       3667127          (2014 GE35)          0.255009          0.570217   
...         ...                  ...               ...               ...   
90831   3763337           (2016 VX1)          0.026580          0.059435   
90832   3837603           (2019 AD3)          0.016771          0.037501   
90833  54017201           (2020 JP3)          0.031956          0.071456   
90834  54115824           (2021 CN5)          0.007321          0.016370   
90835  54205447           (2021 TW7)          0.039862          0.089133   

       relative_velocity  miss_distance orbiting_body  sentry_object  \
0           135

ValueError: Found input variables with inconsistent numbers of samples: [27251, 63585]