# Import software libraries and load the dataset #

In [1]:
import sys                             # Read system parameters.
import os                              # Interact with the operating system.
import numpy as np                     # Work with multi-dimensional arrays and matrices.
import pandas as pd                    # Manipulate and analyze data.
import matplotlib as mpl               # Create 2D charts.
import matplotlib.pyplot as plt
import sklearn                         # Perform data mining and analysis.
from sklearn.utils import shuffle
import VisualizeNN as VisNN            # Create neural network visualizations.
from time import time                  # Calculate training time.

# Report the interpreter and library versions this notebook runs against.
print('Libraries used in this project:')
version_lines = (
    ('- Python {}', sys.version),
    ('- NumPy {}', np.__version__),
    ('- pandas {}', pd.__version__),
    ('- Matplotlib {}', mpl.__version__),
    ('- scikit-learn {}\n', sklearn.__version__),
)
for template, version in version_lines:
    print(template.format(version))

# Locate the data directory and list what it contains.
PROJECT_ROOT_DIR = "."
DATA_PATH = os.path.join(PROJECT_ROOT_DIR, "occupancy_data")
print('Data files in this project:', os.listdir(DATA_PATH))

# Read the training and test files into separate DataFrames.
data_raw_file_train = os.path.join(DATA_PATH, 'train.csv')
data_raw_file_test = os.path.join(DATA_PATH, 'test.csv')
data_raw = pd.read_csv(data_raw_file_train)
data_raw_test = pd.read_csv(data_raw_file_test)

# Confirm how many records came from each file.
for frame, source in ((data_raw, data_raw_file_train), (data_raw_test, data_raw_file_test)):
    print('Loaded {} records from {}.'.format(len(frame), source))

Libraries used in this project:
- Python 3.7.6 | packaged by conda-forge | (default, Mar 23 2020, 23:03:20) 
[GCC 7.3.0]
- NumPy 1.16.2
- pandas 0.24.2
- Matplotlib 3.0.3
- scikit-learn 0.20.3

Data files in this project: ['train.csv', 'test.csv']
Loaded 8143 records from ./occupancy_data/train.csv.
Loaded 2665 records from ./occupancy_data/test.csv.


# Get acquainted with the dataset #

In [None]:
# Shuffle both datasets with a fixed seed so every run sees the same order,
# then renumber the rows so the index no longer reflects the file order.
data_raw = shuffle(data_raw.copy(), random_state = 765).reset_index(drop = True)
data_raw_test = shuffle(data_raw_test.copy(), random_state = 765).reset_index(drop = True)

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
data_raw.info()             # View data types and see if there are missing entries.
data_raw.head(10)           # View first 10 records.

# Examine the distribution of various features

In [None]:
# Use Matplotlib to plot figures.
%matplotlib inline
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

data_raw.hist(figsize=(20,15));
plt.figure();

# Examine a general summary of statistics #

In [None]:
# Compute the summary statistics, then print them with three decimal places.
# The display option only applies inside the `with` block.
summary = data_raw.describe()
with pd.option_context('float_format', '{:.3f}'.format):
    print(summary)

# Split the label from the datasets

In [None]:
# The data already comes as separate training and test files, so the only
# split needed here is features versus label.

# 'Occupancy' is the dependent variable (the value to be predicted); it is
# kept out of the feature tables and stored in its own label DataFrames.
label_columns = ['Occupancy']
training_columns = ['Date', 'Temperature', 'RelativeHumidity', 'Light', 'CO2', 'HumidityRatio']

# Copy the selected columns so later edits never touch the raw DataFrames.
X_train = data_raw[training_columns].copy()
y_train = data_raw[label_columns].copy()
X_test = data_raw_test[training_columns].copy()
y_test = data_raw_test[label_columns].copy()

# Compare the number of rows and columns in the original data to the training and test sets.
print(f'Original set:        {data_raw.shape}')
print('------------------------------')
print(f'Training features:   {X_train.shape}')
print(f'Test features:       {X_test.shape}')
print(f'Training labels:     {y_train.shape}')
print(f'Test labels:         {y_test.shape}')

# Convert the `Date` column to datetime format for processing

In [None]:
# Parse the 'Date' column of both feature tables into datetime values so the
# individual components (day, hour, ...) can be read through `.dt`.
for frame in (X_train, X_test):
    frame['Date'] = pd.to_datetime(frame['Date'])

X_train.head()

# Determine which datetime components have unique values

In [None]:
# For each datetime component, print the distinct values found in the
# training timestamps.
date_parts = X_train['Date'].dt
component_templates = (
    ('Unique years:   {}', 'year'),
    ('Unique months:  {}', 'month'),
    ('Unique days:    {}', 'day'),
    ('Unique hours:   {}', 'hour'),
    ('Unique minutes: {}', 'minute'),
    ('Unique seconds: {}', 'second'),
)
for template, component in component_templates:
    print(template.format(getattr(date_parts, component).unique()))

# Perform common preparation on the training and test sets

In [None]:
# Perform common cleaning and feature engineering tasks on datasets.
def prep_dataset(X):
    """Add 'Day', 'Hour' and 'Minute' feature columns derived from 'Date'.

    The input is left untouched: the new columns are added to a copy, so the
    function is safe to call on a DataFrame the caller still needs.

    Args:
        X: DataFrame with a datetime-typed 'Date' column.

    Returns:
        A new DataFrame with the columns of ``X`` followed by 'Day', 'Hour'
        and 'Minute', each stored as float64.
    """
    result = X.copy()

    # FEATURE ENGINEERING

    # Extract days, hours, and minutes from the timestamp.
    timestamps = result['Date'].dt
    for column, component in (('Day', 'day'), ('Hour', 'hour'), ('Minute', 'minute')):
        result[column] = getattr(timestamps, component).astype('float64')

    return result

# Run the shared feature engineering on copies of both feature tables.
X_train, X_test = (prep_dataset(frame.copy()) for frame in (X_train, X_test))

X_train.head()

# Drop columns that won't be used for training

In [None]:
# Drop unused columns from datasets.
def drop_unused(X):
    """Return ``X`` without the 'Date' column.

    'Date' has already been divided up into separate component columns, so
    the raw timestamp is not used for training.
    """
    return X.drop(columns=['Date'])

# Show the training columns on either side of the drop for comparison.
columns_before = list(X_train.columns)
print('Columns before drop:\n\n{}\n'.format(columns_before))

X_train = drop_unused(X_train.copy())
columns_after = list(X_train.columns)
print('Columns after drop:\n\n{}\n'.format(columns_after))

# Apply the same drop to the test features.
X_test = drop_unused(X_test.copy())

# Standardize the features

In [None]:
def standardize(X):
    """Return a z-scored copy of ``X``.

    Each column is centred on its own mean and divided by its own standard
    deviation. A column whose standard deviation is exactly zero (every value
    identical) is only centred, so it becomes all zeros instead of the NaNs
    that a division by zero would produce.

    Args:
        X: DataFrame of numeric feature columns.

    Returns:
        A new DataFrame with the same columns and index as ``X``; ``X`` itself
        is not modified.
    """
    result = X.copy()

    for feature in X.columns:
        spread = X[feature].std()
        # Dividing by a zero spread would fill the column with NaN.
        scale = spread if spread != 0 else 1.0
        result[feature] = (X[feature] - X[feature].mean()) / scale  # z-score formula.

    return result

# Scale the test features with the TRAINING set's mean and standard deviation.
# Standardizing the test set with its own statistics would put the two sets on
# different scales, so the model would be evaluated on inputs that are shifted
# relative to what it was trained on.
train_mean = X_train.mean()
train_std = X_train.std().replace(0, 1)  # Avoid dividing by zero for a constant column.
X_test = (X_test - train_mean) / train_std

# The statistics above were taken before this call, i.e. from the unscaled data.
X_train = standardize(X_train)

print('The features have been standardized.')

In [None]:
# Print summary statistics of the standardized training features with two
# decimal places; the display option only applies inside the `with` block.
standardized_summary = X_train.describe()
with pd.option_context('float_format', '{:.2f}'.format):
    print(standardized_summary)

# Train an MLP model

In [None]:
from sklearn.neural_network import MLPClassifier

# One hidden layer with two neurons. The trailing comma matters: `(2)` is just
# the integer 2, whereas `(2,)` is the one-element tuple the parameter name
# implies (one entry per hidden layer).
mlp = MLPClassifier(hidden_layer_sizes = (2,),
                    activation = 'relu',
                    solver = 'adam',
                    alpha = 0.0001,
                    learning_rate_init = 0.001,
                    max_iter = 500,
                    tol = 1e-4,
                    n_iter_no_change = 10,
                    verbose = True,
                    random_state = 87)

# The labels are a single-column DataFrame; flatten them to a 1-D vector.
mlp.fit(X_train, np.ravel(y_train))

# Flatten the test labels the same way so fit() and score() receive labels of
# the same shape.
score = mlp.score(X_test, np.ravel(y_test))

print('Accuracy: {:.0f}%'.format(score * 100))

# Visualize the loss minimization through gradient descent

In [None]:
def plot_loss(model):
    """Plot the values in ``model.loss_curve_`` on the current Matplotlib axes.

    Args:
        model: Fitted estimator exposing a ``loss_curve_`` attribute.
    """
    losses = model.loss_curve_

    axes = plt.gca()
    axes.plot(losses)
    axes.set_title('GD Loss Minimization')
    axes.set_xlabel('Steps')
    axes.set_ylabel('Loss')
    
# Plot the loss curve of the MLP fitted above.
plot_loss(mlp)

# Visualize the neural network architecture

In [None]:
def nn_diagram(X, y, model, show_weights):
    """Draw the layer structure of a fitted network with VisualizeNN.

    Args:
        X: Feature table; its column count is used as the input-layer size.
        y: Labels; for a 2-D table the column count is used as the
            output-layer size, and a 1-D vector counts as a single output.
        model: Fitted estimator exposing ``hidden_layer_sizes`` and ``coefs_``.
        show_weights: When truthy, the learned weights are passed to the
            drawing as well.
    """
    # A 1-D label vector has no second dimension to read a column count from.
    n_outputs = y.shape[1] if np.ndim(y) > 1 else 1

    # Create structure of network from dataset shapes and hidden layer sizes.
    layer_sizes = np.hstack(([X.shape[1]], np.asarray(model.hidden_layer_sizes), [n_outputs]))

    # Only plot weights if specified.
    if show_weights:
        network = VisNN.DrawNN(layer_sizes, model.coefs_)
    else:
        network = VisNN.DrawNN(layer_sizes)

    network.draw()
    
# Draw the network structure only (show_weights is False).
nn_diagram(X_train, y_train, mlp, False)

# Retrieve the neuron weights and bias terms and redraw the network architecture

In [None]:
# Print each learned parameter array under its heading. The extra '\n'
# argument leaves a blank line before the next heading.
parameter_sections = (
    ('Weights between input layer and hidden layer:', mlp.coefs_[0]),
    ('Weights between hidden layer and output layer:', mlp.coefs_[1]),
    ('Bias terms between input layer and hidden layer:', mlp.intercepts_[0]),
)
for heading, values in parameter_sections:
    print(heading)
    print(values, '\n')

# The last section is printed without the trailing blank line.
print('Bias terms between hidden layer and output layer:')
print(mlp.intercepts_[1])

In [None]:
# Redraw the network, this time passing the learned weights (show_weights is True).
nn_diagram(X_train, y_train, mlp, True)

# Fit an MLP model using grid search with cross-validation

In [None]:
from sklearn.model_selection import GridSearchCV

# Base estimator; the grid below supplies the remaining hyperparameters.
mlp = MLPClassifier(
    alpha=0.0001,
    learning_rate_init=0.001,
    max_iter=500,
    tol=1e-4,
    n_iter_no_change=10,
    random_state=87,
)

# Candidate values to try. The layer sizes are plain integers: parentheses
# without a trailing comma do not create a tuple.
grid = {
    'hidden_layer_sizes': [5, 6],
    'activation': ['logistic', 'tanh', 'relu'],
    'solver': ['sgd', 'adam'],
}

# NOTE(review): `iid` was deprecated in scikit-learn 0.22 and removed in 0.24;
# drop the argument when upgrading past the version this notebook records.
search = GridSearchCV(mlp, param_grid=grid, scoring='accuracy', cv=5, iid=False)

# Time the whole search.
start = time()
search.fit(X_train, np.ravel(y_train))
end = time()
train_time = end - start

print('Grid search took {:.2f} seconds to find an optimal fit.'.format(train_time))
print(search.best_params_)

In [None]:
# Score the fitted grid search on the test set and report it as a percentage.
score = search.score(X_test, y_test)

print('Accuracy: {:.0f}%'.format(score * 100))

# Visualize the loss minimization of the optimized model

In [None]:
# Plot the loss curve of the estimator the grid search selected.
plot_loss(search.best_estimator_)

# Visualize the network structure of the optimized model

In [None]:
# Draw the selected estimator's network, including its learned weights.
nn_diagram(X_train, y_train, search.best_estimator_, True)

# Examine the model's predictions on the test set

In [None]:
# Show example predictions with the test data.
results = data_raw_test.copy()
results['PredictedOccupancy'] = search.predict(X_test)
results.rename(columns = {'Occupancy': 'ActualOccupancy'}, inplace = True)  # Clarify ground truth column.
results.head(50)