In [None]:
# flag = 0

# if flag == 0:
#   !pip install pandas numpy scikit-learn seaborn matplotlib scipy nltk tensorflow keras transformers
#   flag = 1

import tensorflow as tf
from tensorflow.keras import backend as K
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
from tensorflow import keras
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import concatenate, Concatenate
from keras.layers import Input, Embedding, Conv1D, Conv2D, GlobalMaxPooling1D, GlobalAveragePooling1D, Flatten, MaxPooling2D, MaxPooling1D, Dense, Dropout, Reshape
from keras.models import Model
from sklearn.metrics import accuracy_score
import transformers
import requests
import zipfile
import io
import os
import glob
import re
from keras.models import load_model
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.callbacks import EarlyStopping
from keras.optimizers.legacy import Adam
from sklearn.metrics import mean_squared_error
import time
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle
from imblearn.over_sampling import SMOTE
from scipy.stats import ks_2samp
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

#### Read the dataset

In [None]:
# Load 'SDSS_DR18.csv' (path is relative to the notebook's working directory).
df = pd.read_csv('SDSS_DR18.csv')
# Show the first rows as a quick sanity check of the load.
df.head()

#### Check column types

In [None]:
# Print each column's name, dtype and non-null count.
df.info()

#### Check duplicates

In [None]:
# duplicated() flags every row that repeats an earlier row (the first
# occurrence is not flagged), so the sum is the number of redundant rows.
num_duplicate_rows = df.duplicated().sum()

print("Number of duplicate rows:", num_duplicate_rows)

#### Check for missing values

In [None]:
column_names = df.columns.values.tolist()

# Short column names get one extra tab so the counts line up with the rest.
short_names = {'ra', 'dec', 'u', 'g', 'r', 'i', 'z'}
total_rows = len(df)

print("Column name \t Count of missing values \t Percentage of missing value to total rows")
for col in column_names:
    count_nan = df[col].isnull().sum()
    pct_nan = count_nan / total_rows * 100
    padding = "\t\t\t" if col in short_names else "\t\t"
    print(f"{col} - {padding}{count_nan} \t\t\t\t{round(pct_nan, 2)}%")

#### Bar chart of instances per class

In [None]:
# Number of rows per class label, most frequent first.
class_counts = df['class'].value_counts()

# One bar per class, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.bar(class_counts.index, class_counts.values)
ax.set_xlabel('Class')
ax.set_ylabel('Count')
ax.set_title('Class Distribution')
plt.show()

#### Box plots representing the statistical summary of all columns

In [None]:
# Skip the target and the two id-like columns (presumably identifiers —
# TODO confirm); every remaining column gets its own box plot.
columns_to_exclude = ['objid', 'specobjid', 'class']

column_names = [col for col in df.columns if col not in columns_to_exclude]

# Grid layout: five plots per row, as many rows as needed.
num_columns = len(column_names)
num_rows = math.ceil(num_columns / 5)

fig, axs = plt.subplots(num_rows, 5, figsize=(13, 3 * num_rows))

axs = axs.flatten()

# One box plot per feature, grouped by class on the x axis.
for ax, column in zip(axs, column_names):
    sns.boxplot(x=df['class'], y=df[column], ax=ax)
    ax.set_title(column)
    ax.set_ylabel(column)
    # The class names already label the ticks; an x-axis title adds nothing.
    ax.set_xlabel('')

# Remove the unused axes left over at the end of the grid.
for ax in axs[num_columns:]:
    fig.delaxes(ax)

plt.tight_layout()
plt.show()


#### Histograms for representing data distribution across all columns

In [None]:
# Grid layout: three histograms per row (reuses column_names from the box-plot cell).
num_columns = len(column_names)
num_rows = math.ceil(num_columns / 3)

fig = plt.figure(figsize=(19, num_rows * 5))
fig.subplots_adjust(hspace=0.5)

# Subplot positions are 1-based, hence start=1.
for position, column in enumerate(column_names, start=1):
    ax = fig.add_subplot(num_rows, 3, position)
    ax.hist(df[column], bins=30, color='skyblue')
    ax.set_title(column)
    ax.set_xlabel('Values')
    ax.set_ylabel('Frequency')

plt.show()


#### Data split for model training

In [None]:
# Features: every column except the target and the two id-like columns
# (presumably identifiers — TODO confirm).
X = df.drop(columns=['class', 'objid', 'specobjid'])
# Target: the class label column.
y = df['class']

In [None]:
# Sanity check: X and y should report the same number of rows.
print(X.shape)
print(y.shape)

#### Shuffling the dataset

In [None]:
# Shuffle X and y with the same permutation; the fixed seed keeps it repeatable.
X, y = shuffle(X, y, random_state=42)

#### Splitting the dataset into train, test and validation sets

In [None]:
# First split: 70% of the rows for training, 30% held out.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
# Second split of the held-out 30%: test_size=2/3 applies to the second pair
# returned (X_val, y_val), so validation gets ~20% of all rows and test ~10%.
# NOTE(review): neither split passes stratify=..., so class proportions are
# not guaranteed to match across the three subsets.
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=2/3, random_state=42)

In [None]:
# Width of the one-hot targets and of the model's output layer.
# NOTE(review): hard-coded; must equal the number of distinct labels in `y`.
num_classes = 3

#### Scaling all numerical features to a standard scale

In [None]:
# Fit the scaler on the training split only, so validation/test statistics
# never influence the preprocessing.
scaler = StandardScaler()
scaler.fit(X_train)

# Apply the training-set scaling to every split. The names are rebound to the
# scaled data; later cells index these objects NumPy-style.
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

#### Label encoding the target feature

In [None]:
# Learn the label -> integer mapping from the training targets only, then
# reuse that same mapping for validation and test.
# transform() raises if it encounters a label that was not present during fit.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_val = label_encoder.transform(y_val)
y_test = label_encoder.transform(y_test)

In [None]:
# classes_ is ordered by encoded id, so the position of a label is its integer code.
class_mapping = {label: index for index, label in enumerate(label_encoder.classes_)}

galaxy_index = class_mapping['GALAXY']
qso_index = class_mapping['QSO']
star_index = class_mapping['STAR']

print("Galaxy class index:", galaxy_index)
print("QSO class index:", qso_index)
print("Star class index:", star_index)

#### One hot encoding the target

In [None]:
# Turn the integer class ids into one-hot rows of width num_classes, as
# expected by the categorical cross-entropy loss used for training.
y_train_one_hot = to_categorical(y_train, num_classes)
y_val_one_hot = to_categorical(y_val, num_classes)
y_test_one_hot = to_categorical(y_test, num_classes)

#### Checking the number of instances in the train, test and validation sets

In [None]:
# Row counts per split; inputs and targets are reported separately so that a
# mismatch between the two is easy to spot.
X_train_rows, X_test_rows, X_val_rows = (part.shape[0] for part in (X_train, X_test, X_val))
y_train_rows, y_test_rows, y_val_rows = (part.shape[0] for part in (y_train, y_test, y_val))

for caption, row_count in (
    ("Input for train:", X_train_rows),
    ("Input for test:", X_test_rows),
    ("Input for validation:", X_val_rows),
    ("Target for train:", y_train_rows),
    ("Target for test:", y_test_rows),
    ("Target for validation:", y_val_rows),
):
    print(caption, row_count)

#### Checking shape of input features

In [None]:
# All three splits should share the same number of feature columns.
print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)
print("X_test shape:", X_test.shape)

#### Checking shape of target

In [None]:
# Shapes of the one-hot encoded targets (the second dimension is num_classes).
print("y_train shape:", y_train_one_hot.shape)
print("y_val shape:", y_val_one_hot.shape)
print("y_test shape:", y_test_one_hot.shape)

In [None]:
# Integer id the encoder assigned to the 'QSO' label.
qso_class_label = label_encoder.transform(['QSO'])[0]
# Boolean-mask the training split down to its QSO rows (features and labels).
qso_data = X_train[y_train == qso_class_label]
qso_labels = y_train[y_train == qso_class_label]

In [None]:
# Take half as many non-QSO rows as there are QSO rows.
subset_size = int(len(qso_data) * 0.5)  # Example ratio, adjust as needed
# The slice keeps the first `subset_size` non-QSO training rows in their current order.
other_data = X_train[y_train != qso_class_label][:subset_size]
other_labels = y_train[y_train != qso_class_label][:subset_size]

# QSO rows come first, followed by the non-QSO subset; labels use the same order.
# NOTE(review): in this combined set QSO rows outnumber all other rows about 2:1.
combined_data = np.concatenate([qso_data, other_data])
combined_labels = np.concatenate([qso_labels, other_labels])

### Applying SMOTE

In [None]:
# With its default sampling_strategy SMOTE only oversamples classes that are
# NOT the majority of the data it receives. QSO is the largest class in
# `combined_data`, so the default would synthesise no QSO rows at all.
# Request an explicit QSO target instead: the size of the largest class in the
# full training split. The target is never set below the current QSO count,
# because SMOTE rejects a target smaller than the existing class size.
qso_count = int(np.sum(combined_labels == qso_class_label))
qso_target = max(qso_count, int(np.bincount(y_train).max()))

smote = SMOTE(random_state=4232, k_neighbors=3,
              sampling_strategy={qso_class_label: qso_target})
X_resampled, y_resampled = smote.fit_resample(combined_data, combined_labels)

In [None]:
# Keep only the QSO rows of the SMOTE output.
X_qso_resampled = X_resampled[y_resampled == qso_class_label]
y_qso_resampled = y_resampled[y_resampled == qso_class_label]

# New training set: every original non-QSO row plus the resampled QSO rows.
X_train_resampled = np.concatenate((X_train[y_train != qso_class_label], X_qso_resampled))
y_train_resampled = np.concatenate((y_train[y_train != qso_class_label], y_qso_resampled))

# Shuffle so the QSO rows are not all at the end. A seeded generator keeps the
# row order (and therefore training) reproducible across kernel restarts.
shuffled_indices = np.random.default_rng(42).permutation(len(X_train_resampled))
X_train_resampled = X_train_resampled[shuffled_indices]
y_train_resampled = y_train_resampled[shuffled_indices]

# One-hot targets matching the shuffled, resampled training rows.
y_train_resampled_one_hot = to_categorical(y_train_resampled, num_classes)

### Perform Kolmogorov-Smirnov Test

In [None]:
# Two-sample KS test per feature: original training split vs. resampled split.
# Both arrays must still be 2-D here, i.e. this runs before the CNN reshape.
num_features = X_train.shape[1]
ks_test_results = []

for feature_idx in range(num_features):
    ks_result = ks_2samp(X_train[:, feature_idx], X_train_resampled[:, feature_idx])
    # A p-value of at least 0.05 is reported as "similar distribution".
    is_similar = ks_result.pvalue >= 0.05
    ks_test_results.append({
        'Feature Index': feature_idx,
        'KS Statistic': ks_result.statistic,
        'P-Value': ks_result.pvalue,
        'Similar Distribution': 'Yes' if is_similar else 'No',
    })

ks_test_results_df = pd.DataFrame(ks_test_results)
print(ks_test_results_df)

#### Reshaping data for CNN

In [None]:
# Append a trailing axis of size 1 so every sample has shape (n_features, 1),
# matching the input_shape the Conv1D model is built with.
X_train_resampled = X_train_resampled.reshape(X_train_resampled.shape[0], X_train_resampled.shape[1], 1)
X_val = X_val.reshape(X_val.shape[0], X_val.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

### Augmented CNN

In [None]:
# Shapes after resampling and reshaping. y_train_resampled is the integer
# label vector; the one-hot version is y_train_resampled_one_hot.
print(X_train_resampled.shape)
print(y_train_resampled.shape)

#### Defining metrics

In [None]:
def precision(y_true, y_pred):
    """Keras metric: share of predicted positives that are true positives.

    Predictions are clipped to [0, 1] and rounded before counting, and the
    counts are summed over every element of the tensors passed in.
    K.epsilon() in the denominator avoids division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def recall(y_true, y_pred):
    """Keras metric: share of actual positives that were predicted positive.

    Values are clipped to [0, 1] and rounded before counting, and the counts
    are summed over every element of the tensors passed in.
    K.epsilon() in the denominator avoids division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    positives = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(positives) + K.epsilon())

def custom_f1_score(y_true, y_pred):
    """Keras metric: harmonic mean of the `precision` and `recall` metrics.

    K.epsilon() in the denominator keeps the result finite when both
    precision and recall are zero.
    """
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    harmonic_half = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_half

def fnr(y_true, y_pred):
    """Keras metric: false-negative rate, FN / (TP + FN).

    A true positive is a position where the rounded product of target and
    prediction is 1. A false negative is a position where the rounded product
    of the target and (1 - prediction) is 1. K.epsilon() avoids division by zero.
    """
    caught = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    missed = K.sum(K.round(K.clip(y_true * (1 - y_pred), 0, 1)))
    return missed / (caught + missed + K.epsilon())


### CNN Architecture

In [None]:
def cnn_model(X_train, y_train, X_val, y_val, num_classes,
              epochs=10, batch_size=32, learning_rate=0.001, patience=3):
    """Build, compile and train a small 1D convolutional classifier.

    Architecture: Conv1D(64, 3) -> MaxPooling1D(2) -> Conv1D(128, 3) ->
    MaxPooling1D(2) -> GlobalAveragePooling1D -> Dense(128) -> Dropout(0.5) ->
    Dense(num_classes, softmax).

    Args:
        X_train: training inputs; the model is built for samples of shape
            (X_train.shape[1], 1).
        y_train: one-hot training targets (the loss is categorical cross-entropy).
        X_val, y_val: validation inputs/targets, monitored for early stopping.
        num_classes: number of units in the softmax output layer.
        epochs: maximum number of training epochs.
        batch_size: mini-batch size passed to fit().
        learning_rate: learning rate of the Adam optimizer.
        patience: epochs without val_loss improvement before training stops.

    Returns:
        (model, history): the trained model, holding the weights of the best
        val_loss epoch, and the History object returned by fit().
    """
    model = Sequential([
        Conv1D(64, 3, activation='relu', input_shape=(X_train.shape[1], 1)),
        MaxPooling1D(2),
        Conv1D(128, 3, activation='relu'),
        MaxPooling1D(2),
        GlobalAveragePooling1D(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ])

    # The custom metrics are passed as functions, so they are reported under
    # their function names.
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy', precision, recall, custom_f1_score, fnr])

    # Stop once val_loss stalls and roll back to the best weights seen.
    early_stopping = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)

    history = model.fit(X_train, y_train,
                        batch_size=batch_size,
                        epochs=epochs,
                        validation_data=(X_val, y_val),
                        callbacks=[early_stopping])
    return model, history


In [None]:
# Train on the resampled training set; validation data is left untouched
# by the resampling.
model, history = cnn_model(X_train_resampled, y_train_resampled_one_hot, X_val, y_val_one_hot, num_classes)

In [None]:
def calculate_fnr(y_true, y_pred):
    """Return the false-negative rate averaged over classes (macro average).

    For each class the rate is FN / (FN + TP), computed from that class's row
    of the confusion matrix. A class with no true samples contributes 0.

    Args:
        y_true: true class labels.
        y_pred: predicted class labels.
    """
    cm = confusion_matrix(y_true, y_pred)

    # Row sums are the true-sample counts per class, i.e. FN + TP.
    support = cm.sum(axis=1)
    missed = support - np.diag(cm)

    # Where a class has no true samples the rate stays at its initial 0.
    rates = np.divide(missed, support,
                      out=np.zeros(len(cm), dtype=float),
                      where=support != 0)
    return np.mean(rates)

In [None]:
def monte_carlo_evaluation(model, X_val, y_val, num_simulations=30, random_state=42):
    """Estimate mean and spread of validation metrics by bootstrap resampling.

    model.predict() runs in inference mode and is deterministic, so calling it
    repeatedly on the same data gives identical metrics every time (std = 0).
    Predictions are therefore computed once, and each simulation scores a
    bootstrap sample (drawn with replacement) of the validation rows.

    Args:
        model: object exposing predict(X) that returns per-class scores.
        X_val: validation inputs.
        y_val: one-hot validation targets.
        num_simulations: number of bootstrap samples to score.
        random_state: seed for the resampling; pass None for a fresh draw.

    Returns:
        dict mapping each metric name ('accuracy', 'precision', 'recall',
        'f1_score', 'fnr') to {'mean': ..., 'std': ...} over the simulations.
    """
    metric_values = {'accuracy': [], 'precision': [], 'recall': [], 'f1_score': [], 'fnr': []}

    # Loop-invariant work: predict and decode the labels a single time.
    preds_labels = model.predict(X_val).argmax(axis=1)
    y_val_labels = y_val.argmax(axis=1)
    n_samples = len(y_val_labels)
    rng = np.random.default_rng(random_state)

    for _ in range(num_simulations):
        # Bootstrap sample: n_samples row indices drawn with replacement.
        sample_idx = rng.integers(0, n_samples, size=n_samples)
        y_sample = y_val_labels[sample_idx]
        pred_sample = preds_labels[sample_idx]

        metric_values['accuracy'].append(accuracy_score(y_sample, pred_sample))
        metric_values['precision'].append(precision_score(y_sample, pred_sample, average='macro'))
        metric_values['recall'].append(recall_score(y_sample, pred_sample, average='macro'))
        metric_values['f1_score'].append(f1_score(y_sample, pred_sample, average='macro'))
        metric_values['fnr'].append(calculate_fnr(y_sample, pred_sample))

    metric_stats = {metric: {'mean': np.mean(metric_values[metric]), 'std': np.std(metric_values[metric])} for metric in metric_values}

    return metric_stats


In [None]:
# After training the model: collect mean/std of the validation metrics over
# 30 evaluation runs.
metric_stats = monte_carlo_evaluation(model, X_val, y_val_one_hot, num_simulations=30)

In [None]:
# One summary line per metric: mean and standard deviation across simulations.
for metric, stats in metric_stats.items():
    print(f"{metric}: Mean = {stats['mean']}, Std Dev = {stats['std']}")

In [None]:
# Keys of history.history to report. The custom entries are expected to match
# the names of the metric functions passed to model.compile().
metrics = ['accuracy', 'precision', 'recall', 'custom_f1_score', 'fnr']

In [None]:
# Print the per-epoch training values recorded for each metric
# (the validation values live under the 'val_'-prefixed keys).
for metric in metrics:
    print(metric, ":", history.history[metric])

### Visualizing the metrics

In [None]:

# One line per training metric, all on a shared set of axes.
fig, ax = plt.subplots(figsize=(12, 8))
for metric in metrics:
    ax.plot(history.history[metric], label=metric)

ax.set_title('Model Performance Metrics Over Epochs')
ax.set_xlabel('Epochs')
ax.set_ylabel('Metric')
ax.legend()
plt.show()

#### Model evaluation on the test set

In [None]:
# evaluate() returns the loss followed by the compiled metrics, in the order
# they were passed to model.compile().
test_evaluation = model.evaluate(X_test, y_test_one_hot)
print("Test Set Evaluation Metrics:", test_evaluation)

In [None]:

# Unpack loss + the five compiled metrics (same order as in model.compile()).
# NOTE(review): this repeats the evaluate() call made in the previous cell.
test_loss, test_accuracy, test_precision, test_recall, test_f1_score, test_fnr = model.evaluate(X_test, y_test_one_hot)
# Class ids: highest-scoring class for predictions, position of the 1 for targets.
y_pred = model.predict(X_test).argmax(axis=1)
y_true = y_test_one_hot.argmax(axis=1)
# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:\n", cm)

In [None]:
# Report the test-set loss and each metric on its own line.
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")
print(f"Test Precision: {test_precision}")
print(f"Test Recall: {test_recall}")
print(f"Test F1 Score: {test_f1_score}")
print(f"Test FNR: {test_fnr}")

In [None]:
def plot_test_metrics_and_confusion_matrix(test_metrics, confusion_matrix, class_labels):
    """Draw test metrics (bar chart) next to the confusion matrix (heatmap).

    Args:
        test_metrics: dict mapping metric name to its numeric value.
        confusion_matrix: square matrix of integer counts (annotated with
            fmt='d'); rows are labelled as true classes, columns as predicted.
        class_labels: tick labels used on both heatmap axes.
    """
    fig, (bar_ax, heat_ax) = plt.subplots(1, 2, figsize=(12, 6))

    # Left panel: one bar per metric, annotated with its value to two decimals.
    names = list(test_metrics)
    heights = list(test_metrics.values())
    bar_ax.bar(names, heights, color='skyblue')
    bar_ax.set_xlabel('Metrics')
    bar_ax.set_ylabel('Values')
    bar_ax.set_title('Test Metrics')
    for x_pos, height in enumerate(heights):
        bar_ax.text(x_pos, height, f'{height:.2f}', ha='center', va='bottom')

    # Right panel: annotated confusion-matrix heatmap.
    sns.heatmap(confusion_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels, yticklabels=class_labels, ax=heat_ax)
    heat_ax.set_xlabel('Predicted Label')
    heat_ax.set_ylabel('True Label')
    heat_ax.set_title('Confusion Matrix')

    plt.tight_layout()
    plt.show()

# Collect the scalar test metrics for the bar chart (the loss is left out).
test_metrics = {
    'accuracy': test_accuracy,
    'precision': test_precision,
    'recall': test_recall,
    'f1_score': test_f1_score,
    'fnr': test_fnr
}

# Original string labels, ordered by their encoded integer id.
class_labels = label_encoder.classes_

plot_test_metrics_and_confusion_matrix(test_metrics, cm, class_labels)