# CREMER

## Data Cleaning

In [1]:
import sklearn 
import imblearn
import time
import xgboost
import pandas as pd
import numpy as np
# import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from collections import Counter
from sklearn.metrics import recall_score
import warnings
warnings.filterwarnings("ignore")

  from pandas import MultiIndex, Int64Index


In [2]:
df = pd.read_csv('SEU Occurance Dataset.csv')
print(df.sample(n=10))
print('\n')
print('Number of samples: ', df.size)

        Number   Lon (deg)  Lat (deg)        Alt (m)  Timestamp (ms UTC Unix)  \
72824    72825  -42.088117 -16.628635  599378.852916            1509637385920   
171397  171398   57.456147 -55.080587  608985.955780            1514945766849   
159840  159841  -76.178403   7.098535  588791.475168            1526434315228   
163698  163699 -146.341418 -42.538171  606205.646457            1526528880748   
104458  104459  -92.685061  -9.465018  578391.180429            1503760506593   
92315    92316  -56.322774  -4.473178  602205.721654            1506819921389   
85587    85588  -11.180755 -70.399523  585849.498499            1521035871786   
14062    14063  -69.886821  55.783444  581672.284835            1514006861799   
139804  139805  -90.611845  41.520454  601668.680989            1522563752308   
83325    83326  148.047553  72.184514  603603.421205            1504663466253   

       Time Position Source  SEU  
72824   NaN             NaN    0  
171397  NaN             NaN    0  
159

In [3]:
print(df.head(n=10))

   Number  Lon (deg)  Lat (deg)        Alt (m)  Timestamp (ms UTC Unix)  \
0       1 -53.718376  79.695173  617150.437500            1527486385447   
1       2 -53.718376  79.695173  617150.437500            1527486385447   
2       3 -24.780551  82.160168  617444.375000            1527486303841   
3       4 -38.133264 -19.880011  596665.263226            1527470646745   
4       5 -45.491699 -80.723138  599915.022891            1527442867944   
5       6 -64.639836 -39.309800  602378.198659            1527389777008   
6       7 -63.863840 -36.600821  588964.257895            1527389732796   
7       8 -63.863840 -36.600821  588964.257895            1527389732796   
8       9 -39.735791 -36.724916  589059.315441            1527383935948   
9      10  30.074085  52.083323  607484.076194            1527370896476   

                  Time Position Source  SEU  
0  2018-05-28T05:46:25             TLE    1  
1  2018-05-28T05:46:25             TLE    1  
2  2018-05-28T05:45:03             T

In [4]:
df['SEU'].value_counts()

0    197870
1      2129
Name: SEU, dtype: int64

In [5]:
x = np.asanyarray(df[['Lon (deg)', 'Lat (deg)', 'Alt (m)']])
y = np.asanyarray(df[['SEU']])

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 4)

### Data Sampling

In [7]:
print(y_train.size)

159999


In [8]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from imblearn.combine import SMOTEENN

oversample = SMOTE()
undersample = ClusterCentroids(voting = 'soft') # soft as we ourselves augmented the data
combine = SMOTEENN()

x_train, y_train = undersample.fit_resample(x_train, y_train)
print(y_train.size)
print(Counter(y_train))

3392
Counter({0: 1696, 1: 1696})


## Performance Metrics

In [16]:
def evaluation_classificationReport(y, y_hat):
    print('Balanced Accuracy Score: ', sklearn.metrics.balanced_accuracy_score(y, y_hat))
    print('\n')
    print(classification_report(y, y_hat))
    CM = confusion_matrix(y, y_hat)
    print(CM)
    TN = CM[0][0]
    FN = CM[1][0]
    TP = CM[1][1]
    FP = CM[0][1]
    tn, fp, fn, tp = CM.ravel()
    print('True Negative:' ,tn)
    print('False Negative:', fn)
    print('True Positive:' ,tp)
    print('False Positive:', fp)
    print('Recall: ', recall_score(y, y_hat))
    print('ROC AUC Score: ', sklearn.metrics.roc_auc_score(y, y_hat))
    # Positive is SEU

## CREMER Algorithm

In [17]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [18]:
clf_rf = RandomForestClassifier(class_weight = {0:1, 1:100},
                                min_samples_leaf = 8,
                               )

clf_xgb = XGBClassifier(booster = 'gbtree',
                        gamma = 0.01,
                        min_child_weight = 1, # as the data is highly imbalanced, leaf nodes can have smaller sized groups
                        max_delta_step = 1, 
                        objective = 'reg:squaredlogerror',
                        scale_pos_weight = 100, # to control the balance of pos and neg weight, = #neg/#pos
                        # predictor = 'gpu_predictor',
                        use_label_encoder = False,
                        verbosity = 0
                       )

In [20]:
from sklearn.ensemble import VotingClassifier

estimator = []
estimator.append(('XGBoost', clf_xgb))
estimator.append(('Random Forest', clf_rf))

clf_voting = VotingClassifier(estimators = estimator, voting = 'soft')
clf_voting.fit(x_train, y_train)
y_hat_voting = clf_voting.predict(x_test)

In [21]:
evaluation_classificationReport(y_test, y_hat_voting)

Balanced Accuracy Score:  0.6279925341941995


              precision    recall  f1-score   support

           0       1.00      0.30      0.46     39567
           1       0.01      0.96      0.03       433

    accuracy                           0.30     40000
   macro avg       0.51      0.63      0.24     40000
weighted avg       0.99      0.30      0.45     40000

[[11682 27885]
 [   17   416]]
True Negative: 11682
False Negative: 17
True Positive: 416
False Positive: 27885
Recall:  0.9607390300230947
ROC AUC Score:  0.6279925341941995


## Make A Prediction

### Non-SEU

In [72]:
print('Data point:', x_test[890])

Data point: [-4.80081543e+01  2.85151453e+01  5.93768858e+05]


In [73]:
print('True value:', y_test[809][0])
print('Predicted value:', y_hat_voting[809])

True value: 0
Predicted value: 0


### SEU

In [74]:
print('Data point:', x_test[89])

Data point: [-1.04219427e+02  7.37542178e+01  5.97760826e+05]


In [75]:
print('True value:', y_test[89][0])
print('Predicted value:', y_hat_voting[89])

True value: 1
Predicted value: 1


## Data Collection

In [22]:
recall = []
precision = []
f1 = []
aucroc = []
timer = []

In [None]:
import time
for i in range(1):
    
    start = time.time()


    
    from imblearn.under_sampling import ClusterCentroids
    undersample = ClusterCentroids(voting = 'soft') # soft as we ourselves augmented the data
    x_train, y_train = undersample.fit_resample(x_train, y_train)
    
    from sklearn.ensemble import VotingClassifier
    estimator = []
    estimator.append(('XGBoost', clf_xgb))
    estimator.append(('Random Forest', clf_rf))

    clf_voting = VotingClassifier(estimators = estimator, voting = 'soft')
    clf_voting.fit(x_train, y_train)
    start = time.time()
    y_hat_voting = clf_voting.predict(x_test)
    end = time.time()
    
    recall.append(sklearn.metrics.recall_score(y_test, y_hat_voting))
    precision.append(sklearn.metrics.precision_score(y_test, y_hat_voting))
    f1.append(sklearn.metrics.f1_score(y_test, y_hat_voting))
    aucroc.append(sklearn.metrics.roc_auc_score(y_test, y_hat_voting))
    timer.append(end-start)
    print(i)
    

In [None]:
print('Recall: ', sum(recall)/10)

In [None]:
print('F1: ', sum(f1)/10)

In [None]:
print('Precision: ', sum(precision)/10)

In [None]:
print('AUC: ', sum(aucroc)/10)

In [None]:
print('Time: ', sum(timer)/10)