In [None]:
import pandas as pd
import numpy as np

In [None]:
# First look at the raw file. It ships without a header row, so `header=None`
# stops pandas from consuming the first record as column labels (the columns
# are then simply numbered 0..N-1).
# NOTE(review): the absolute path is machine-specific; adjust it when running elsewhere.
data = pd.read_csv('/Users/arya/Desktop/gamma vs hadrons/data/processed/magic04.data', header=None)
data.head()

In [None]:
# Show the column labels pandas assigned when the file was read.
data.columns

In [None]:
# Call the method: the bare attribute `data.info` only displays the bound-method
# repr instead of the dtype / non-null summary.
data.info()

In [None]:
# Call the method: the bare attribute `data.describe` only displays the
# bound-method repr instead of the summary statistics table.
data.describe()

In [None]:
# Column labels for the data file: ten features followed by the target
# column "class". They are supplied explicitly through `names=`.
cols = [
    "fLength",
    "fWidth",
    "fSize",
    "fConc",
    "fConc1",
    "fAsym",
    "fM3Long",
    "fM3Trans",
    "fAlpha",
    "fDist",
    "class",
]
data = pd.read_csv(
    '/Users/arya/Desktop/gamma vs hadrons/data/processed/magic04.data',
    names=cols,
)
data.head()

In [None]:
# Encode the target as integers: gamma ('g') -> 1, hadron ('h') -> 0.
# A single mapping plus an explicit cast yields a true integer column instead
# of relying on the implicit object->int downcasting of chained `replace`
# calls. Values that are already encoded pass through unchanged, so the cell
# is safe to re-run; any unexpected label makes the cast fail loudly.
class_codes = {'g': 1, 'h': 0}
data['class'] = data['class'].map(lambda value: class_codes.get(value, value)).astype(int)
data.head()

In [None]:
import matplotlib.pyplot as plt

# Overlay the per-class distribution of every feature (all columns except the
# trailing target column) to see which features separate gammas from hadrons.
is_gamma = data['class'] == 1
is_hadron = data['class'] == 0

for label in cols[:-1]:
    # Use one set of bin edges for both classes; otherwise each histogram picks
    # its own range and bin width and the overlaid bars are not comparable.
    bin_edges = np.histogram_bin_edges(data[label], bins=10)
    plt.hist(data.loc[is_gamma, label], bins=bin_edges, label='gamma', alpha=0.5, color='red', density=True)
    plt.hist(data.loc[is_hadron, label], bins=bin_edges, label='hadron', alpha=0.5, color='blue', density=True)
    plt.title(label)
    # density=True normalises each histogram to unit area, so the y-axis is a
    # probability density rather than a probability.
    plt.ylabel('Probability density')
    plt.xlabel(label)
    plt.legend()
    plt.show()


In [None]:
# Shuffle once with a fixed seed so the 60/20/20 train/validation/test split is
# reproducible across kernel restarts, then slice by position. Plain `iloc`
# slicing avoids `np.split` on a DataFrame, which goes through the deprecated
# `DataFrame.swapaxes`.
shuffled = data.sample(frac=1, random_state=42)
train_end = int(0.6 * len(shuffled))
valid_end = int(0.8 * len(shuffled))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

In [None]:
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

In [None]:
def dataframe_scaler(dataframe, oversample= False):
    x = dataframe[dataframe.columns[:-1]].values
    y = dataframe[dataframe.columns[-1]].values

    scaler = StandardScaler()
    x = scaler.fit_transform(x)

    if oversample==True:
        ros = RandomOverSampler()
        x, y = ros.fit_resample(x,y)

    data = np.hstack((x, np.reshape(y, (-1,1))))

    return data, x,y

In [None]:
train, x_train, y_train = dataframe_scaler(train, oversample=True)
valid, x_valid, y_valid = dataframe_scaler(valid, oversample=False)
test, x_test, y_test = dataframe_scaler(test, oversample=False)
print(len(x_train), len(y_train))

K Nearest Neighbours

In [None]:
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours baseline (k = 9): fit on the training arrays and score
# the held-out test split.
knn_model = KNeighborsClassifier(n_neighbors=9).fit(x_train, y_train)
y_pred = knn_model.predict(x_test)

# Report first, then the raw predictions and true labels, one per line.
print(classification_report(y_test, y_pred), y_pred, y_test, sep='\n')

Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes baseline: fit on the training arrays, then report
# precision / recall / F1 on the test split.
nb_model = GaussianNB().fit(x_train, y_train)
y_pred = nb_model.predict(x_test)
nb_report = classification_report(y_test, y_pred)
print(nb_report)

Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline with default settings: fit on the training
# arrays, then report precision / recall / F1 on the test split.
lr_model = LogisticRegression().fit(x_train, y_train)
y_pred = lr_model.predict(x_test)
lr_report = classification_report(y_test, y_pred)
print(lr_report)


Support Vector Machines

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with default settings: fit on the training arrays,
# then report precision / recall / F1 on the test split.
svm_model = SVC().fit(x_train, y_train)
y_pred = svm_model.predict(x_test)
svm_report = classification_report(y_test, y_pred)
print(svm_report)

In [None]:
import tensorflow as tf

In [None]:
import sys
# Put the project's source directories on the import path so the helper
# modules below can be imported. Each directory is inserted at index 1 just
# before the import that needs it; keep this order, since a later insert takes
# precedence over an earlier one during module lookup.
# NOTE(review): the absolute paths are machine-specific; adjust them when
# running the notebook elsewhere.
sys.path.insert(1,'/Users/arya/Desktop/gamma vs hadrons/src')
from plot_history import plot_history
sys.path.insert(1,'/Users/arya/Desktop/gamma vs hadrons/src/modeling')
from train import train_model

In [None]:
# The imported plot_history helper plots the training history: loss and accuracy per epoch, for both the training and validation data. A reference copy of it is kept in the string below.

'''
def plot_history(history):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
    ax1.plot(history.history["loss"], label="loss")
    ax1.plot(history.history["val_loss"], label="val_loss")
    ax1.set_xlabel("Epoch")
    ax1.set_ylabel("Binary crossentropy")
    ax1.grid(True)
    ax2.plot(history.history["accuracy"], label="accuracy")
    ax2.plot(history.history["val_accuracy"], label="val_accuracy")
    ax2.set_xlabel("Epoch")
    ax2.set_ylabel("Accuracy")
    ax2.grid(True)
    plt.show()

'''

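# The imported train_model helper builds and trains a generic TensorFlow neural network with tunable hyperparameters: number of nodes, dropout rate, batch size, learning rate and number of epochs. A reference copy of it is kept in the string below.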

'''
def train_model(x_train, y_train, num_nodes, dropout, batch_size, lr, epochs):
    nn_model = tf.keras.models.Sequential(
        [
            tf.keras.layers.Dense(num_nodes, activation="relu", input_shape=(10,)),
            tf.keras.layers.Dropout(dropout),
            tf.keras.layers.Dense(num_nodes, activation="relu"),
            tf.keras.layers.Dropout(dropout),
            tf.keras.layers.Dense(1, activation="sigmoid"),
        ]
    )

    nn_model.compile(
        optimizer=tf.keras.optimizers.Adam(lr),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=["accuracy"],
    )
    history = nn_model.fit(
        x_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2
    )

    return nn_model, history
'''


In [None]:
from itertools import product

# Exhaustive grid search over the network hyperparameters. Every combination
# is trained for `epochs` epochs and scored on the validation split; the model
# with the lowest validation loss is kept together with the hyperparameters
# that produced it.
least_val_loss = float('inf')
least_loss_model = None
least_loss_params = None
epochs = 100

# `product` varies the right-most list fastest, i.e. the same order as nesting
# the four loops num_nodes -> dropout -> batch_size -> lr.
for num_nodes, dropout, batch_size, lr in product(
    [16, 32, 64], [0, 0.2], [32, 64, 128], [0.01, 0.05, 0.001]
):
    model, history = train_model(
        x_train,
        y_train,
        num_nodes=num_nodes,
        dropout=dropout,
        batch_size=batch_size,
        lr=lr,
        epochs=epochs,
    )
    print(f"{num_nodes} nodes, dropout {dropout}, lr {lr}, batch size {batch_size}")
    plot_history(history)
    # evaluate() returns the loss first, followed by the compiled metrics.
    val_loss = model.evaluate(x_valid, y_valid)[0]
    print(f'Validation loss: {val_loss}')

    if val_loss < least_val_loss:
        # Remember the hyperparameters as well as the model, so the winning
        # configuration can actually be retrieved afterwards.
        least_val_loss = val_loss
        least_loss_model = model
        least_loss_params = {
            'num_nodes': num_nodes,
            'dropout': dropout,
            'batch_size': batch_size,
            'lr': lr,
            'epochs': epochs,
        }

print(f'Best validation loss: {least_val_loss} with parameters {least_loss_params}')



In [35]:
# Score the test split with the network that achieved the lowest validation
# loss. The model outputs are thresholded at 0.5 and flattened to a 1-D array
# of 0/1 labels before building the report.
test_scores = least_loss_model.predict(x_test)
y_pred = (test_scores > 0.5).astype(int).ravel()

nn_report = classification_report(y_test, y_pred)
print(nn_report)

[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 328us/step
              precision    recall  f1-score   support

           0       0.92      0.68      0.78      1347
           1       0.85      0.97      0.90      2457

    accuracy                           0.86      3804
   macro avg       0.88      0.82      0.84      3804
weighted avg       0.87      0.86      0.86      3804

