In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

In [2]:
cols = ["fLength", "fWidth", "fSize", "fConc", "fConc1", "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist", "class"]
df = pd.read_csv("../magic+gamma+telescope/magic04.data", names=cols)

In [15]:
df.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,1
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,1
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,1
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,1
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,1


In [3]:
df["class"] = (df["class"] == "g").astype(int)

In [None]:
for label in cols[:-1]:
    plt.hist(df[df["class"]==1][label], color='blue', label='gamma', alpha=0.7, density=True)
    plt.hist(df[df["class"]==0][label], color='red', label='hadron', alpha=0.7, density=True)
    plt.title(label)
    plt.ylabel("Probability")
    plt.xlabel(label)
    plt.legend()
    plt.show()

# Train, Validation, Test Datasets

In [4]:
train, valid, test = np.split(df.sample(frac=1), [int(0.6*len(df)), int(0.8*len(df))])

  return bound(*args, **kwds)


In [5]:
def scale_dataset(dataframe, oversample=False):
    X = dataframe[dataframe.columns[:-1]].values
    X = dataframe[dataframe.columns[:-1]].values
    y = dataframe[dataframe.columns[-1]].values

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    if oversample:
        ros = RandomOverSampler()
        X, y = ros.fit_resample(X, y) # take more of the less

    data = np.hstack((X, np.reshape(y, (-1, 1))))
    return data, X, y

In [6]:
train, X_train, y_train = scale_dataset(train, oversample=True)
valid, X_valid, y_valid = scale_dataset(valid, oversample=False)
test, X_test, y_test = scale_dataset(test, oversample=False)


In [56]:
sum(y_train == 1)

7370

In [57]:
sum(y_train == 0)

7370

# KNN Implementation

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [61]:
knn_model = KNeighborsClassifier(n_neighbors = 1)
knn_model.fit(X_train, y_train)

In [62]:
y_pred = knn_model.predict(X_test)

In [67]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.68      0.73      1329
           1       0.84      0.89      0.86      2475

    accuracy                           0.82      3804
   macro avg       0.81      0.79      0.80      3804
weighted avg       0.82      0.82      0.82      3804



- Accuracy: Correct / Total
- Precision: Labeled True Positives / Total Labeled Positives (Out of all the ones we labeled as positive, what's actually positive)
- Recall: Labeled True Positives / True Positives (Out of all that are positive, how many did we label correctly)


# Naive Bayes

In [68]:
from sklearn.naive_bayes import GaussianNB

In [70]:
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

In [74]:
y_pred = nb_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.69      0.40      0.51      1329
           1       0.74      0.90      0.81      2475

    accuracy                           0.73      3804
   macro avg       0.72      0.65      0.66      3804
weighted avg       0.72      0.73      0.71      3804



# Log Regression

In [7]:
from sklearn.linear_model import LogisticRegression

In [8]:
lg_model = LogisticRegression()
lg_model = lg_model.fit(X_train, y_train)

In [11]:
y_pred = lg_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.66      0.70      0.68      1307
           1       0.84      0.81      0.83      2497

    accuracy                           0.78      3804
   macro avg       0.75      0.76      0.75      3804
weighted avg       0.78      0.78      0.78      3804



# SVM

In [12]:
from sklearn.svm import SVC

In [18]:
svm_model = SVC()
svm_model = svm_model.fit(X_train, y_train)

In [19]:
y_pred = lg_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.66      0.70      0.68      1307
           1       0.84      0.81      0.83      2497

    accuracy                           0.78      3804
   macro avg       0.75      0.76      0.75      3804
weighted avg       0.78      0.78      0.78      3804



# Neural Netowrks

In [20]:
import tensorflow as tf

In [26]:
def train_model(X_train, y_train, epochs, batch_size, num_nodes, dropout_prob, lr):
    nn_model = tf.keras.Sequential([
        tf.keras.layers.Dense(num_nodes, activation='relu', input_shape=(10,)),
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(num_nodes, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid'),

    ])

    nn_model.compile(optimizer=tf.keras.optimizers.Adam(lr), loss='binary_crossentropy',
                    metrics=['accuracy'])
    
    history = nn_model.fit(
        X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2, verbose = 0
    )
    return nn_model, history


In [27]:
model, history = train_model(X_train, y_train, 100, 32, 64, 0.2, 0.001)
y_pred = model.predict(X_test)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 310us/step


In [29]:
y_pred = (y_pred > 0.5).astype(int).reshape(-1,)

In [30]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.77      0.81      1307
           1       0.88      0.93      0.91      2497

    accuracy                           0.87      3804
   macro avg       0.87      0.85      0.86      3804
weighted avg       0.87      0.87      0.87      3804

