## MSDS 7331 - Case Study 6 - Predicting the Existence of New Particles
Daniel Crouthamel

Sophia Wu

Fabio Savorgnan

Bo Yun

In [1]:
import pandas as pd
import tensorflow as tf 
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
import os

# Introduction

The intention of this study is to predict the existence of new particles using a dense neural network.

# Business Understanding

We received an abrupt request from our client in the superconductors industry to help predict the existence of new particles using a dense neural network. The goal should be to maximized accuracy. The client requires a write up on the design of the network, along with information to indicate that the model was trained appropriately.

# Data Engineering

In [2]:
# Load Data
df = pd.read_csv('data/all_train.csv.gz')

In [3]:
# Load Dataset and profile it, only need to do once - so comment out.
# from pandas_profiling import ProfileReport
# profile = ProfileReport(df, title="New Particles EDA", minimal=True)
# profile.to_file(output_file="NewParticlesEDA.html")

Summarize dataset: 100%|██████████| 38/38 [01:59<00:00,  3.13s/it, Completed]
Generate report structure: 100%|██████████| 1/1 [00:22<00:00, 22.39s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  1.59it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 504.12it/s]


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7000000 entries, 0 to 6999999
Data columns (total 29 columns):
 #   Column   Dtype  
---  ------   -----  
 0   # label  float64
 1   f0       float64
 2   f1       float64
 3   f2       float64
 4   f3       float64
 5   f4       float64
 6   f5       float64
 7   f6       float64
 8   f7       float64
 9   f8       float64
 10  f9       float64
 11  f10      float64
 12  f11      float64
 13  f12      float64
 14  f13      float64
 15  f14      float64
 16  f15      float64
 17  f16      float64
 18  f17      float64
 19  f18      float64
 20  f19      float64
 21  f20      float64
 22  f21      float64
 23  f22      float64
 24  f23      float64
 25  f24      float64
 26  f25      float64
 27  f26      float64
 28  mass     float64
dtypes: float64(29)
memory usage: 1.5 GB


In [4]:
df.describe()

Unnamed: 0,# label,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f18,f19,f20,f21,f22,f23,f24,f25,f26,mass
count,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,...,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0,7000000.0
mean,0.5001256,0.01612528,0.0004770022,2.686578e-05,0.01056081,-0.0001050026,0.002765919,0.01815953,2.510948e-05,0.000434587,...,0.01164789,-0.0001127097,7.686731e-05,0.0002909202,0.01228774,0.009778378,0.005269844,-0.001760961,0.01533136,1000.107
std,0.5,1.004417,0.9974864,1.00008,0.9956003,0.999867,1.000957,0.9867746,0.9965867,1.000007,...,1.002725,1.000038,1.000033,1.00017,1.010477,1.005418,1.00999,0.9844511,0.9822799,353.4255
min,0.0,-1.960549,-2.365355,-1.732165,-9.980274,-1.732137,-1.054221,-3.034787,-2.757853,-1.732359,...,-1.728284,-2.281867,-1.731758,-0.5736825,-3.631608,-4.729473,-20.62223,-3.452634,-2.632761,500.0
25%,0.0,-0.7288206,-0.7332548,-0.8656704,-0.6092291,-0.8658025,-1.054221,-0.7566092,-0.7014146,-0.8656543,...,-0.742363,-0.7206846,-0.8656855,-0.5736825,-0.5417942,-0.5115522,-0.354387,-0.6925097,-0.7943804,750.0
50%,1.0,-0.03930319,0.0008523957,0.0003199154,0.01963316,-0.0005070131,-0.005983562,-0.1499527,-0.0001067553,0.001384781,...,-0.08992496,-6.735953e-05,-0.0004424527,-0.5736825,-0.160276,-0.3144032,-0.3265228,-0.3570301,-0.0882864,1000.0
75%,1.0,0.6900799,0.7347832,0.8659464,0.6798818,0.8657646,0.8504885,0.768669,0.7013194,0.8665976,...,0.6423185,0.7204921,0.8659566,-0.5736825,0.4812194,0.1634892,-0.2337671,0.4753128,0.7610846,1250.0
max,1.0,4.378282,2.365287,1.73237,4.148023,1.731978,4.482618,3.720345,2.75859,1.73145,...,5.866367,2.282217,1.73274,1.743123,7.29342,9.333287,14.99064,5.277313,4.44469,1500.0


In [5]:
df["# label"].value_counts()

1.0    3500879
0.0    3499121
Name: # label, dtype: int64

The information found in the Pandas Profile report reveals the following.

* 7 Million rows of data
* 28 features and 1 target
* No missing data
* Our reponse, # label, is nearly balanced, close to 50%.
* We have 4 features in which there are only two values, which we could consider treating as binary/categorical.
  * f9
  * f13
  * f17
  * f21
* The remaing features are distrubuted normally or have a uniform distribution.

In [6]:
# Relabel target
df.rename(columns={'# label': 'target'}, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7000000 entries, 0 to 6999999
Data columns (total 29 columns):
 #   Column  Dtype  
---  ------  -----  
 0   target  float64
 1   f0      float64
 2   f1      float64
 3   f2      float64
 4   f3      float64
 5   f4      float64
 6   f5      float64
 7   f6      float64
 8   f7      float64
 9   f8      float64
 10  f9      float64
 11  f10     float64
 12  f11     float64
 13  f12     float64
 14  f13     float64
 15  f14     float64
 16  f15     float64
 17  f16     float64
 18  f17     float64
 19  f18     float64
 20  f19     float64
 21  f20     float64
 22  f21     float64
 23  f22     float64
 24  f23     float64
 25  f24     float64
 26  f25     float64
 27  f26     float64
 28  mass    float64
dtypes: float64(29)
memory usage: 1.5 GB


# Model Preparation

## Create training and test sets

In [7]:
scaler = StandardScaler()

X = df.loc[:, df.columns != 'target'].values

y = df['target'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=30)

scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Setup code for tensorboard use later

In [8]:
root_logdir = os.path.join(os.curdir, "my_logs")

def get_run_logdir():
    import time
    run_id = time.strftime("run_%Y_%m_%d-%H_%M_%S")
    return os.path.join(root_logdir, run_id)

run_logdir = get_run_logdir()

# Print current path
print(os.getcwd())
print(run_logdir)

tensorboard_cb = tf.keras.callbacks.TensorBoard(run_logdir)

d:\Projects\MSDS-7333-QTW
.\my_logs\run_2021_11_07-07_05_03


## Define the model

In [9]:
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28,)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(.2, input_shape=(2,)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(.2, input_shape=(2,)),
    tf.keras.layers.Dense(1, activation='linear')
])

## Compile the model and define metrics

In [10]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=['accuracy'])

## Define an early stopping callback

This is a callback that will stop the training when the validation loss stops improving.

In [11]:
from tensorflow.keras.callbacks import EarlyStopping
safety= EarlyStopping(monitor='val_loss', patience=3)

## Fit the model

In [12]:
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test), 
          callbacks=[tensorboard_cb, safety], batch_size=1000)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [13]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 28)                0         
_________________________________________________________________
dense (Dense)                (None, 128)               3712      
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
Total params: 20,353
Trainable params: 20,353
Non-trainable params: 0
____________________________________________________

# Model Evaluation

In [14]:
model.evaluate(X_test, y_test, batch_size=1000)



[0.3328903615474701, 0.8563519716262817]

## Tensorsboard

In [15]:
%load_ext tensorboard