In [None]:
import pandas as pd
# Load the raw dataset from the CSV file and report its (rows, columns) shape
csv_path = 'DMSP_data.csv'
data = pd.read_csv(csv_path)
data.shape

In [None]:
# Print the frame summary; verbose=True requests the full per-column listing
data.info(verbose=True)

In [None]:
# Creating new features from the Datetimes column
# The column is parsed once with pandas' vectorized datetime parser instead of
# splitting every string six times in row-wise Python lambdas. Parsing also
# validates the values: a malformed timestamp raises here rather than
# producing a silently wrong feature.
timestamps = pd.to_datetime(data['Datetimes'])

# Each component is stored as a float, the dtype the features had before
for part in ['year', 'month', 'day', 'hour', 'minute', 'second']:
    data[part] = getattr(timestamps.dt, part).astype(float)

# Dropping the 'Datetimes' column
data = data.drop(columns=['Datetimes'])

In [None]:
# Summary statistics of the features derived from the timestamp
datetime_features = ['year', 'month', 'day', 'hour', 'minute', 'second']
data[datetime_features].describe()

In [None]:
# Remove the constant 'second' column from the frame (in place)
del data['second']

In [None]:
import numpy as np
# Log-transforming the target
# np.log is applied to the whole column at once rather than element by
# element through a Python lambda.
# NOTE(review): a zero flux value maps to -inf and a negative one to NaN;
# presumably the flux is strictly positive -- confirm before training.
data['ELE_TOTAL_ENERGY_FLUX_LOG'] = np.log(data['ELE_TOTAL_ENERGY_FLUX'])
# Dropping the original target
data = data.drop(columns=['ELE_TOTAL_ENERGY_FLUX'])

In [None]:
from sklearn.model_selection import train_test_split
# Train-test split
# A fixed seed makes the shuffled split -- and every result computed from it
# further down -- identical on each run of the notebook.
RANDOM_STATE = 42

X = data.drop(columns='ELE_TOTAL_ENERGY_FLUX_LOG') # Independent variables
y = data['ELE_TOTAL_ENERGY_FLUX_LOG'] # Target variable (log-transformed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=RANDOM_STATE
)

# Resetting index
# Rebinding to new objects (instead of inplace=True) avoids mutating the
# subsets returned by the split.
X_train, X_test, y_train, y_test = (
    subset.reset_index(drop=True)
    for subset in (X_train, X_test, y_train, y_test)
)

In [None]:
# Min-max normalization of predictors in the training set
for col in X_train.columns:
    if X_train[col].dtypes == 'int64' or X_train[col].dtypes == 'float64': # Checking if the column is numerical
        if X_train[col].nunique() > 1: # Checking if the column is non-constant
            X_train = X_train.apply(lambda iterator: ((iterator.max() - iterator)/(iterator.max() - iterator.min())).round(2))
            #X_train[col] = (X_train[col] - X_train[col].min()) / (X_train[col].max() - X_train[col].min())

In [None]:
result.head()

In [None]:
# Min-max normalization of predictors in the test set
for col in X_test.columns:
    if X_test[col].dtypes == 'int64' or X_test[col].dtypes == 'float64': # Checking if the column is numerical
        if X_test[col].nunique() > 1: # Checking if the column is non-constant
            X_test = X_test.apply(lambda iterator: ((iterator.max() - iterator)/(iterator.max() - iterator.min())).round(2)) 
            #X_test[col] = (X_test[col] - X_test[col].min()) / (X_test[col].max() - X_test[col].min())

In [None]:
from sklearn.decomposition import PCA
# PCA-fitting the training set
# The normalization step stores its result back into X_train; no variable
# named X_train_norm is ever created, so the fit uses X_train.
# The number of components is capped at the data dimensions because PCA
# cannot extract more components than min(n_samples, n_features).
n_components = min(20, *X_train.shape)
pca = PCA(n_components=n_components)
pca.fit(X_train)

# PCA summary
print(pd.Series({"Number of components": "{}".format(pca.n_components_),
                 "Explained variance ratio": "{:.4g}%".format(pca.explained_variance_ratio_.sum()*100)
                }).to_string())

In [None]:
# Build the fully connected regression network layer by layer
# NOTE(review): Sequential and Dense are not imported in any visible cell --
# confirm that a Keras import cell exists elsewhere in the notebook.
n_features = X_train.shape[1]
hidden_units = [64, 32, 16, 4]

model = Sequential()
# The first layer also declares the input width (one input per predictor column)
model.add(Dense(units=128, input_dim=n_features, activation='relu'))
for n_units in hidden_units:
    model.add(Dense(units=n_units, activation='relu'))
# Single linear output unit for the regression target
model.add(Dense(units=1, activation='linear'))
model.summary()

In [None]:
# Configure the optimizers and compile the model with a mean-squared-error loss
adam_settings = dict(
    learning_rate=0.001,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-07,
    amsgrad=False,
)
sgd_settings = dict(learning_rate=0.01, momentum=0.9, nesterov=False)

adam_opt = Adam(**adam_settings)
sgd_opt = SGD(**sgd_settings) # Created but not passed to compile() in this cell
model.compile(optimizer=adam_opt, loss='mean_squared_error')

In [None]:
# Train the network; validation_split holds out the given fraction of the
# training data passed to fit() for validation
fit_settings = {'epochs': 120, 'batch_size': 32, 'validation_split': 0.2}
history = model.fit(X_train, y_train, **fit_settings)