In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Column names are supplied here rather than read from the file:
# ten feature columns followed by the "class" label column.
cols = [
  "fLength", "fWidth", "fSize", "fConc", "fConc1",
  "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist",
  "class",
]
df = pd.read_csv("magic04.data", names=cols)
df.head()  # preview of the first five rows

In [12]:
# Encode the label as an integer: "g" -> 1, everything else -> 0
# (the two groups are plotted below as "gamma" and "hadron").
# The dtype guard makes the cell safe to re-run: once the column is already
# integer, comparing it with "g" again would silently turn every label into 0.
if not pd.api.types.is_integer_dtype(df["class"]):
  df["class"] = (df["class"] == "g").astype(int)

In [None]:
# Preview again to check the "class" column after the integer conversion above.
df.head()

In [None]:
# One figure per feature column: overlay the density-normalized histograms of
# the two classes (class 1 drawn in blue as "gamma", class 0 in red as "hadron").
for feature in cols[:-1]:
  for class_value, colour, name in [(1, "blue", "gamma"), (0, "red", "hadron")]:
    values = df.loc[df["class"] == class_value, feature]
    plt.hist(values, color = colour, label = name, alpha = 0.7, density = True)
  plt.title(feature)
  plt.xlabel(feature)
  plt.ylabel("probability")
  plt.legend()
  plt.show()

#Train, validation, and test datasets

In [None]:
# Shuffle the rows once, then cut the shuffled frame by position:
#   train = first 60% of the rows
#   valid = rows between the 60% and 80% marks
#   test  = the remaining 20%
# The shuffle is seeded so that Restart & Run All reproduces the same split.
# Plain .iloc slicing replaces np.split, which on a DataFrame goes through the
# deprecated DataFrame.swapaxes.
shuffled = df.sample(frac = 1, random_state = 42)
train_end, valid_end = int(0.6*len(df)), int(0.8*len(df))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

In [16]:
def scale_dataset(dataframe, oversample = False, scaler = None, random_state = None):
  """Standardize the feature columns of `dataframe` and optionally oversample it.

  Every column except the last is treated as a feature; the last column is the label.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Features followed by the label in the final column.
  oversample : bool, default False
      When True, resample with RandomOverSampler after scaling.
  scaler : object with a `transform` method, optional
      An already-fitted scaler (e.g. a StandardScaler fitted on the training
      features). When given, it is only applied, never re-fitted, so validation
      and test data can be standardized with the training statistics. When
      omitted, a new StandardScaler is fitted on `dataframe` itself, which is
      the original behaviour.
  random_state : int or None, default None
      Forwarded to RandomOverSampler so the resampling can be made reproducible.

  Returns
  -------
  data : numpy.ndarray
      Scaled features with the label appended as the last column.
  X : numpy.ndarray
      Scaled (and possibly resampled) features.
  y : array-like
      Labels aligned with the rows of `X`.
  """
  X = dataframe[dataframe.columns[:-1]] # feature matrix (all columns but the last)
  y = dataframe[dataframe.columns[-1]] # label vector (last column)

  if scaler is None:
    # No scaler supplied: fit a fresh one on this data.
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
  else:
    # Reuse the caller's fitted scaler without re-fitting it.
    X = scaler.transform(X)

  if oversample:
    ros = RandomOverSampler(random_state = random_state)
    X, y = ros.fit_resample(X, y)

  # Convert the labels to a plain ndarray before reshaping them into a column,
  # so the reshape does not depend on how NumPy dispatches on a pandas Series.
  data = np.hstack((X, np.asarray(y).reshape(-1, 1)))

  return data, X, y

In [17]:
# Standardize each split; only the training split is oversampled.
# NOTE: scale_dataset is called without a shared scaler, so a fresh StandardScaler
# is fitted on every call and valid/test are standardized with their own
# statistics rather than those of the training split.
# NOTE: this rebinds train/valid/test from DataFrames to NumPy arrays, so the cell
# cannot be re-run without first re-running the split cell above.
train, X_train, y_train = scale_dataset(train, oversample = True)
valid, X_valid, y_valid = scale_dataset(valid, oversample = False)
test, X_test, y_test = scale_dataset(test, oversample = False)

#kNN

In [18]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report # for obtaining the classification report

In [None]:
# k-nearest-neighbours classifier (k = 4) fitted on the training features/labels.
knn_model = KNeighborsClassifier(n_neighbors = 4).fit(X_train, y_train)
knn_model  # fit() returns the estimator itself, so the cell still displays it

In [20]:
# Predict labels for the test features with the fitted kNN model.
# NOTE: `y_pred` is reused by every model section below, so it always holds the
# output of whichever prediction cell ran last.
y_pred = knn_model.predict(X_test)

In [None]:
# Print the classification report comparing the test labels with the current
# contents of `y_pred` (expected here: the kNN predictions from the previous cell).
print(classification_report(y_test, y_pred))

#Naive Bayes

In [22]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian naive Bayes classifier fitted on the training features/labels.
gaussian = GaussianNB().fit(X_train, y_train)
gaussian  # fit() returns the estimator itself, so the cell still displays it

In [24]:
# Predict labels for the test features with the fitted naive Bayes model
# (overwrites the `y_pred` left by the previous model section).
y_pred = gaussian.predict(X_test)

In [None]:
# Print the classification report comparing the test labels with the current
# contents of `y_pred` (expected here: the naive Bayes predictions from the previous cell).
print(classification_report(y_test, y_pred))

#Logistic Regression

In [26]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression classifier (default settings) fitted on the training features/labels.
log_model = LogisticRegression().fit(X_train, y_train)
log_model  # fit() returns the estimator itself, so the cell still displays it

In [28]:
# Predict labels for the test features with the fitted logistic regression model
# (overwrites the `y_pred` left by the previous model section).
y_pred = log_model.predict(X_test)

In [None]:
# Print the classification report comparing the test labels with the current
# contents of `y_pred` (expected here: the logistic regression predictions from the previous cell).
print(classification_report(y_test, y_pred))

#SVM

In [30]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier (default settings) fitted on the training features/labels.
svm_model = SVC().fit(X_train, y_train)
svm_model  # fit() returns the estimator itself, so the cell still displays it

In [None]:
# Predict labels for the test features with the fitted SVM (overwriting the
# `y_pred` left by the previous model section), then print the classification
# report against the test labels.
y_pred = svm_model.predict(X_test)
print(classification_report(y_test, y_pred))

#Neural Networks

In [33]:
import tensorflow as tf

In [45]:
def plot_history(history):
  """Plot the training curves stored in a Keras-style History object.

  Draws two side-by-side panels: loss vs. validation loss on the left and
  accuracy vs. validation accuracy on the right.

  Parameters
  ----------
  history : object
      Must expose a `history` dict with the keys 'loss', 'val_loss',
      'accuracy' and 'val_accuracy', each mapping to a per-epoch sequence.
      The accuracy keys are present when the model was compiled with
      metrics=['accuracy'] and fitted with validation data.
  """
  fig1, (ax1, ax2) = plt.subplots(1, 2, figsize=(10,4))

  # Left panel: loss curves.
  ax1.plot(history.history['loss'], label='loss')
  ax1.plot(history.history['val_loss'], label='val_loss')
  ax1.set_xlabel('epochs')
  ax1.set_ylabel('Binary crossentropy')
  ax1.grid(True)
  ax1.legend() # labels are passed above, so show them

  # Right panel: accuracy curves (previously this panel re-plotted the loss
  # while being labelled "Accuracy").
  ax2.plot(history.history['accuracy'], label='accuracy')
  ax2.plot(history.history['val_accuracy'], label='val_accuracy')
  ax2.set_xlabel('epochs')
  ax2.set_ylabel('Accuracy')
  ax2.grid(True)
  ax2.legend()

  plt.show()

In [43]:
def train_model(X_train, y_train, num_nodes, dropout_prob, lr, batch_size, epochs, validation_data=None):
  """Build, compile and fit a small binary classifier.

  The network has two ReLU hidden layers of `num_nodes` units, each followed by
  dropout, and a single sigmoid output unit. It is trained with Adam on binary
  cross-entropy and tracks accuracy.

  Parameters
  ----------
  X_train, y_train : array-like
      Training features (2-D, one row per sample) and labels.
  num_nodes : int
      Units in each hidden layer.
  dropout_prob : float
      Dropout rate applied after each hidden layer.
  lr : float
      Learning rate passed to Adam.
  batch_size, epochs : int
      Forwarded to `fit`.
  validation_data : tuple, optional
      `(X_val, y_val)` to validate on. When omitted, the last 20% of the
      training arrays is held out via `validation_split=0.2`, which is the
      original behaviour.
      NOTE(review): Keras takes that fraction from the end of the arrays before
      shuffling; if `X_train` was oversampled, the tail may consist largely of
      resampled rows. Confirm, and pass an untouched validation set if so.

  Returns
  -------
  (model, history)
      The fitted model and the History object returned by `fit`.
  """
  # Derive the input width from the data instead of hard-coding 10 features.
  n_features = np.shape(X_train)[1]

  nn_model = tf.keras.Sequential([
      tf.keras.layers.Dense(num_nodes, activation='relu', input_shape=(n_features,)),
      tf.keras.layers.Dropout(dropout_prob),
      tf.keras.layers.Dense(num_nodes, activation='relu'),
      tf.keras.layers.Dropout(dropout_prob),
      tf.keras.layers.Dense(1, activation='sigmoid')
  ])

  nn_model.compile(optimizer=tf.keras.optimizers.Adam(lr), loss='binary_crossentropy', metrics=['accuracy'])

  # Use the caller's validation set when given, otherwise hold out a fraction.
  if validation_data is None:
    validation_kwargs = {'validation_split': 0.2}
  else:
    validation_kwargs = {'validation_data': validation_data}

  history = nn_model.fit(
    X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0, **validation_kwargs
  )
  return nn_model, history

In [None]:
# Exhaustive search over layer width, dropout, learning rate and batch size.
# Every configuration is trained, its curves are plotted, and the model with
# the smallest loss on the validation split is kept.
least_val_loss = float('inf')
least_loss_model = None
epochs=100 # training epochs per configuration

# Same iteration order as four nested loops: batch_size varies fastest.
search_grid = [
  (num_nodes, dropout_prob, lr, batch_size)
  for num_nodes in [16, 32, 64]
  for dropout_prob in [0, 0.2]
  for lr in [0.01, 0.005, 0.001]
  for batch_size in [32, 64, 128]
]

for num_nodes, dropout_prob, lr, batch_size in search_grid:
  print(f"{num_nodes} nodes, dropout {dropout_prob}, batch_size {batch_size}, lr {lr}")
  model, history= train_model(X_train, y_train, num_nodes, dropout_prob, lr, batch_size, epochs)
  plot_history(history)
  val_loss = model.evaluate(X_valid, y_valid)[0] # evaluate() returns [loss, accuracy]; keep the loss
  if val_loss < least_val_loss:
    least_val_loss, least_loss_model = val_loss, model

In [None]:
# Score the test features with the best network from the search, then turn the
# raw outputs into hard 0/1 labels by thresholding at 0.5 and flattening to 1-D.
nn_scores = least_loss_model.predict(X_test)
y_pred = (nn_scores > 0.5).astype(int).reshape(-1)
y_pred