In [47]:
#Importing libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import recall_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, confusion_matrix, classification_report
pd.options.display.max_colwidth = 100
import os

### Processing Training Data

In [12]:
df_train_moon= pd.read_csv('/content/drive/MyDrive/NASA/data/training/df_train_moon.csv')
df_train=df_train_moon

In [24]:
#df_train= pd.read_csv('/content/drive/MyDrive/NASA/data/training/df_train_moon_mars.csv')
#df_train = df_train[df_train['test'] == 0]

In [13]:
df_train.head()

Unnamed: 0,filename,seismic_detected,mean,median,std,var,max,min,peak_to_peak,abs_mean,...,signal_energy,crest_factor,shape_factor,impulse_factor,margin_factor,sample_entropy,permutation_entropy,spectral_entropy,test,source
0,data/lunar/training/data/S12_GradeA/xa.s12.00....,1,-1.027981e-12,-1.096381e-16,1.015371e-10,1.030978e-20,2.092274e-09,-2.044729e-09,4.137003e-09,4.493212e-11,...,1.031084e-20,20.60495,2.259904,46.56521,94.79659,0.122385,2.09017,15.499982,0,moon
1,data/lunar/training/data/S12_GradeA/xa.s12.00....,1,-1.575229e-13,3.067792e-16,1.790563e-10,3.2061139999999996e-20,3.409087e-09,-3.642109e-09,7.051196e-09,6.153209e-11,...,3.206117e-20,20.340576,2.909967,59.190399,149.944812,0.084896,2.080752,15.414762,0,moon
2,data/lunar/training/data/S12_GradeA/xa.s12.00....,1,-1.125971e-12,-1.625741e-13,2.432037e-10,5.914806e-20,3.335532e-09,-3.051295e-09,6.386827e-09,1.588217e-10,...,5.914933e-20,13.714823,1.531317,21.001744,29.332543,0.349223,2.084922,15.006075,0,moon
3,data/lunar/training/data/S12_GradeA/xa.s12.00....,0,-8.511474e-13,-1.988552e-12,2.741845e-10,7.517714e-20,2.164536e-09,-2.2416e-09,4.406136e-09,2.06862e-10,...,7.517786999999999e-20,8.175476,1.325453,10.83621,13.322545,0.776815,2.062384,14.294525,0,moon
4,data/lunar/training/data/S12_GradeA/xa.s12.00....,0,6.636712e-15,1.823276e-12,3.1717e-10,1.0059679999999999e-19,2.789557e-09,-2.797721e-09,5.587278e-09,2.317481e-10,...,1.0059679999999999e-19,8.820887,1.368598,12.072251,15.244375,0.707411,2.066922,15.349641,0,moon


In [14]:
print(df_train['seismic_detected'].value_counts())
print(df_train['test'].value_counts())
print(df_train['source'].value_counts())

seismic_detected
1    63
0    13
Name: count, dtype: int64
test
0    76
Name: count, dtype: int64
source
moon    76
Name: count, dtype: int64


### Oversampling

In [15]:
# Split into features and target variable
X_train = df_train.drop(columns=["seismic_detected","filename", "source"])
y_train = df_train["seismic_detected"]

# Apply SMOTE
smote = SMOTE(random_state=42)
X__train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Print class distribution after SMOTE
print("Class distribution after SMOTE:")
print(pd.Series(y_train_resampled).value_counts())

Class distribution after SMOTE:
seismic_detected
1    63
0    63
Name: count, dtype: int64


### Classification with XGBoost

In [16]:
# Preprocessing and preparation
X = X__train_resampled
y = y_train_resampled
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# PCA for dimensionality reduction
pca = PCA(n_components=0.9)  # Keep 90% of variance
X_pca = pca.fit_transform(X_scaled)

# Feature selection based on variance threshold
selector = VarianceThreshold(threshold=0.05)  # Remove features with variance below 5%
X_selected = selector.fit_transform(X_pca)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Train XGBoost classifier
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

# Predict on training set and evaluate recall
y_pred_train = xgb.predict(X_train)
recall_train = recall_score(y_train, y_pred_train)
print("Recall on training set:", recall_train)

# Predict on testing set and evaluate recall
y_pred_test = xgb.predict(X_test)
recall_test = recall_score(y_test, y_pred_test)
print("Recall on testing set:", recall_test)

Recall on training set: 1.0
Recall on testing set: 0.8571428571428571


##### Hyperparameter Tuning & Evaluation

In [17]:
# Define the parameter grid for random search
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 500],
    'max_depth': [4, 6, 8],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.5]
}

# Create a RandomSearchCV object with XGBoost classifier
xgb_model = XGBClassifier()
random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_grid, n_iter=50, cv=5, scoring='recall', random_state=42)

# Train the model with random search
random_search.fit(X_train, y_train)

# Print the best parameters found by RandomSearchCV
print("Best parameters:", random_search.best_params_)

# Get the best model from the search
best_xgb = random_search.best_estimator_

# Predict on training set and evaluate recall
y_pred_train = best_xgb.predict(X_train)
recall_train = recall_score(y_train, y_pred_train)
print("Recall on training set:", recall_train)

# Evaluate the best model on testing set
y_pred_test = best_xgb.predict(X_test)
recall_test = recall_score(y_test, y_pred_test)
print("Recall on testing set with best parameters:", recall_test)

Best parameters: {'n_estimators': 200, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 0.5}
Recall on training set: 0.8979591836734694
Recall on testing set with best parameters: 0.9285714285714286


In [18]:
# Print other important metrics
print("Precision on testing set:", precision_score(y_test, y_pred_test))
print("F1-score on testing set:", f1_score(y_test, y_pred_test))
print("Accuracy on testing set:", accuracy_score(y_test, y_pred_test))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))
print("Classification Report:\n", classification_report(y_test, y_pred_test))


Precision on testing set: 0.8666666666666667
F1-score on testing set: 0.896551724137931
Accuracy on testing set: 0.8846153846153846
Confusion Matrix:
 [[10  2]
 [ 1 13]]
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.83      0.87        12
           1       0.87      0.93      0.90        14

    accuracy                           0.88        26
   macro avg       0.89      0.88      0.88        26
weighted avg       0.89      0.88      0.88        26



### Classification with RNN

In [22]:
# Preprocessing and preparation
X = X__train_resampled
y = y_train_resampled

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# PCA for dimensionality reduction
pca = PCA(n_components=0.9)  # Keep 90% of variance
X_pca = pca.fit_transform(X_scaled)

# Feature selection based on variance threshold
selector = VarianceThreshold(threshold=0.05)  # Remove features with variance below 5%
X_selected = selector.fit_transform(X_pca)

# Reshape data for RNN
X_selected = X_selected.reshape(X_selected.shape[0], X_selected.shape[1], 1)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Create RNN model
model = Sequential()
model.add(LSTM(units=64, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(LSTM(units=32))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy','recall'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Predict on training set and evaluate recall
y_pred_train = model.predict(X_train) > 0.5

# Predict on testing set and evaluate recall
y_pred_test = model.predict(X_test) > 0.5

Epoch 1/10


  super().__init__(**kwargs)


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 0.5023 - loss: 0.6982 - recall: 0.3014
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.6677 - loss: 0.6800 - recall: 0.5261
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.6690 - loss: 0.6642 - recall: 0.4849 
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.6614 - loss: 0.6549 - recall: 0.4861
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.7156 - loss: 0.6422 - recall: 0.5840
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.6980 - loss: 0.6289 - recall: 0.5222
Epoch 7/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.6989 - loss: 0.6235 - recall: 0.5515
Epoch 8/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m

##### Evaluation

In [23]:
# Print other important metrics for RNN
print("Precision on testing set:", precision_score(y_test, y_pred_test))
print("F1-score on testing set:", f1_score(y_test, y_pred_test))
print("Accuracy on testing set:", accuracy_score(y_test, y_pred_test))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))
print("Classification Report:\n", classification_report(y_test, y_pred_test))

Precision on testing set: 0.6875
F1-score on testing set: 0.7333333333333333
Accuracy on testing set: 0.6923076923076923
Confusion Matrix:
 [[ 7  5]
 [ 3 11]]
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.58      0.64        12
           1       0.69      0.79      0.73        14

    accuracy                           0.69        26
   macro avg       0.69      0.68      0.68        26
weighted avg       0.69      0.69      0.69        26



### Processing Test Data

In [56]:
df_test1 = pd.read_csv('/content/drive/MyDrive/NASA/data/training/df_train_mars.csv')
df_test2 = pd.read_csv('/content/drive/MyDrive/NASA/data/test/S12_gradeB.csv')
df_test3 = pd.read_csv('/content/drive/MyDrive/NASA/data/test/S15_GradeA.csv')
df_test4 = pd.read_csv('/content/drive/MyDrive/NASA/data/test/S15_GradeB.csv')
df_test5 = pd.read_csv('/content/drive/MyDrive/NASA/data/test/S16_GradeA.csv')
df_test6 = pd.read_csv('/content/drive/MyDrive/NASA/data/test/S16_GradeB.csv')

In [57]:
df_test1 = df_test1[df_test1['test'] == 1] #We choose the test data from this df

In [58]:
df_test1.head()

Unnamed: 0,filename,seismic_detected,mean,median,std,var,max,min,peak_to_peak,abs_mean,...,signal_energy,crest_factor,shape_factor,impulse_factor,margin_factor,sample_entropy,permutation_entropy,spectral_entropy,test,source
0,data/mars/test/data/XB.ELYSE.02.BHV.2021-05-02HR01_evid0017.mseed,0,-1.966025,-28.556683,190.475101,36280.764254,1343.785889,-1951.099243,3294.885132,147.872456,...,36284.629507,10.242784,1.288173,13.194474,15.418465,0.966441,2.552283,7.969273,1,mars
1,data/mars/test/data/XB.ELYSE.02.BHV.2019-07-26HR12_evid0033.mseed,1,-0.106292,-11.556779,163.525782,26740.681313,2894.658691,-2175.483398,5070.14209,71.373738,...,26740.692611,17.701539,2.29112,40.556355,55.533453,0.671523,2.573212,8.206949,1,mars
2,data/mars/test/data/XB.ELYSE.02.BHV.2021-10-11HR23_evid0011.mseed,0,-1.188825,8.703556,260.773395,68002.763792,2583.924561,-3110.947998,5694.872559,178.515836,...,68004.177097,11.929575,1.460801,17.426734,22.284479,1.154344,2.549071,11.929979,1,mars
3,data/mars/test/data/XB.ELYSE.02.BHV.2022-04-09HR22_evid0002.mseed,0,0.328625,-3.705765,156.143322,24380.737067,1246.065552,-852.902222,2098.967773,116.874739,...,24380.845062,7.98025,1.335992,10.661547,12.931218,1.866552,2.550966,11.490576,1,mars
4,data/mars/test/data/XB.ELYSE.02.BHV.2019-07-26HR12_evid0034.mseed,1,-0.106292,-11.556779,163.525782,26740.681313,2894.658691,-2175.483398,5070.14209,71.373738,...,26740.692611,17.701539,2.29112,40.556355,55.533453,0.671523,2.573212,8.206949,1,mars


In [60]:
frames = [df_test1, df_test2, df_test3,df_test4,df_test5,df_test6]
df_test = pd.concat(frames)

df_test['filename'] = df_test['filename'].apply(lambda x: os.path.basename(x))
df_test.head()

Unnamed: 0,filename,seismic_detected,mean,median,std,var,max,min,peak_to_peak,abs_mean,...,signal_energy,crest_factor,shape_factor,impulse_factor,margin_factor,sample_entropy,permutation_entropy,spectral_entropy,test,source
0,XB.ELYSE.02.BHV.2021-05-02HR01_evid0017.mseed,0,-1.966025,-28.556683,190.475101,36280.764254,1343.785889,-1951.099243,3294.885132,147.872456,...,36284.629507,10.242784,1.288173,13.194474,15.418465,0.966441,2.552283,7.969273,1,mars
1,XB.ELYSE.02.BHV.2019-07-26HR12_evid0033.mseed,1,-0.106292,-11.556779,163.525782,26740.681313,2894.658691,-2175.483398,5070.14209,71.373738,...,26740.692611,17.701539,2.29112,40.556355,55.533453,0.671523,2.573212,8.206949,1,mars
2,XB.ELYSE.02.BHV.2021-10-11HR23_evid0011.mseed,0,-1.188825,8.703556,260.773395,68002.763792,2583.924561,-3110.947998,5694.872559,178.515836,...,68004.177097,11.929575,1.460801,17.426734,22.284479,1.154344,2.549071,11.929979,1,mars
3,XB.ELYSE.02.BHV.2022-04-09HR22_evid0002.mseed,0,0.328625,-3.705765,156.143322,24380.737067,1246.065552,-852.902222,2098.967773,116.874739,...,24380.845062,7.98025,1.335992,10.661547,12.931218,1.866552,2.550966,11.490576,1,mars
4,XB.ELYSE.02.BHV.2019-07-26HR12_evid0034.mseed,1,-0.106292,-11.556779,163.525782,26740.681313,2894.658691,-2175.483398,5070.14209,71.373738,...,26740.692611,17.701539,2.29112,40.556355,55.533453,0.671523,2.573212,8.206949,1,mars


In [61]:
print(df_test['seismic_detected'].value_counts())
print(df_test['test'].value_counts())
print(df_test['source'].value_counts())

seismic_detected
1    69
0    36
Name: count, dtype: int64
test
1    105
Name: count, dtype: int64
source
moon    96
mars     9
Name: count, dtype: int64


In [62]:
X_test_real = df_test.drop(columns=["seismic_detected","filename","source"])
X_test_real_scaled = scaler.transform(X_test_real)
X_test_real_pca = pca.transform(X_test_real_scaled)
X_test_real_selected = selector.transform(X_test_real_pca)

### Predictions on Test Dataset

In [63]:
y_pred_real = best_xgb.predict(X_test_real_selected)

In [64]:
y_pred_real

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1])

In [67]:
# Create a new DataFrame with filename and predictions
results_df = pd.DataFrame({'filename': df_test['filename'], 'seismic_detected': y_pred_real})
results_df = results_df[results_df['seismic_detected'] == 1]

# Export the results to a CSV file
results_df.to_csv('/content/drive/MyDrive/NASA/data/predictions.csv', index=False)