# Setting up the environment

In [1]:
# importing the libraries

import os
import numpy as np
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# notebook-wide configuration

# project root: the parent of the current working directory
root_dir = os.path.split(os.path.abspath(os.getcwd()))[0]

# pandas display: show every column, but cap the number of printed rows
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 10)

# plotting defaults: figure size and seaborn theme
plt.rcParams.update({"figure.figsize": (16, 8)})
sns.set_style("darkgrid")

---
# 1. Importing the Data

In [3]:
%%time
# read the train and test CSV files from <root_dir>/data (train first, then test)

train_df, test_df = (
    pd.read_csv(os.path.join(root_dir, "data", csv_name))
    for csv_name in ("train_3.csv", "test_3.csv")
)

CPU times: user 1min 13s, sys: 4.37 s, total: 1min 17s
Wall time: 1min 57s


# 2. Machine Learning
## 2.1. DNN Regressor
- Separate the label from the predictor variables
- Min-max scale the numerical columns
- Convert the raw dataset into dense tensors
- 

In [4]:
%%time
# preview the first five rows of the training frame
train_df.head(n=5)

CPU times: user 960 µs, sys: 60 µs, total: 1.02 ms
Wall time: 254 µs


Unnamed: 0,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1,abs_distance,x_dist,y_dist,z_dist,x_0_mean_diff,x_0_median_diff,x_0_max_diff,x_0_min_diff,x_0_std_diff,y_0_mean_diff,y_0_median_diff,y_0_max_diff,y_0_min_diff,y_0_std_diff,z_0_mean_diff,z_0_median_diff,z_0_max_diff,z_0_min_diff,z_0_std_diff,x_1_mean_diff,x_1_median_diff,x_1_max_diff,x_1_min_diff,x_1_std_diff,y_1_mean_diff,y_1_median_diff,y_1_max_diff,y_1_min_diff,y_1_std_diff,z_1_mean_diff,z_1_median_diff,z_1_max_diff,z_1_min_diff,z_1_std_diff,abs_distance_mean_diff,abs_distance_median_diff,abs_distance_max_diff,abs_distance_min_diff,abs_distance_std_diff,x_dist_mean_diff,x_dist_median_diff,x_dist_max_diff,x_dist_min_diff,x_dist_std_diff,y_dist_mean_diff,y_dist_median_diff,y_dist_max_diff,y_dist_min_diff,y_dist_std_diff,z_dist_mean_diff,z_dist_median_diff,z_dist_max_diff,z_dist_min_diff,z_dist_std_diff
0,1,0,0,84.8076,0,0.00215,-0.006031,0.001976,0,-0.012698,1.085804,0.008001,1.091953,0.014849,-1.091835,-0.006025,0.082779,0.056849,9.38009,-9.237039,1.769698,-0.180928,-0.243785,9.720501,-9.488128,2.160387,0.056651,0.019745,7.635602,-9.136741,1.579822,0.025013,0.173496,1.132061,-1.165503,0.586586,-1.088871,-1.08614,0.030076,-2.200839,-0.423324,1.084899,1.086026,1.239941,1.0529,-0.001182,-0.996411,-1.062451,7.32894,-9.769829,0.363588,-0.214122,-0.295486,8.638816,-8.447354,1.843773,1.153529,1.102349,7.888135,-7.347847,2.32487,-0.004587,-0.051053,1.119004,-1.111128,0.658694
1,2,0,0,84.8074,0,1.011731,1.463751,0.000277,0,-0.012698,1.085804,0.008001,1.091952,1.024429,0.377947,-0.007724,-0.926801,-0.952732,8.37051,-10.24662,0.760118,-1.65071,-1.713567,8.250718,-10.957911,0.690605,0.058351,0.021444,7.637301,-9.135041,1.581521,0.025013,0.173496,1.132061,-1.165503,0.586586,-1.088871,-1.08614,0.030076,-2.200839,-0.423324,1.084899,1.086026,1.239941,1.0529,-0.001182,-0.99641,-1.06245,7.328941,-9.769827,0.363589,-1.223703,-1.305067,7.629236,-9.456935,0.834192,-0.316253,-0.367434,6.418352,-8.817629,0.855088,-0.002888,-0.049353,1.120704,-1.109429,0.660394
2,3,0,0,84.8093,0,-0.540815,1.447527,-0.876644,0,-0.012698,1.085804,0.008001,1.091946,-0.528117,0.361722,-0.884645,0.625745,0.599814,9.923056,-8.694074,2.312664,-1.634485,-1.697343,8.266943,-10.941686,0.70683,0.935271,0.898365,8.514222,-8.258121,2.458441,0.025013,0.173496,1.132061,-1.165503,0.586586,-1.088871,-1.08614,0.030076,-2.200839,-0.423324,1.084899,1.086026,1.239941,1.0529,-0.001182,-0.996405,-1.062445,7.328946,-9.769822,0.363595,0.328843,0.247479,9.181782,-7.904389,2.386738,-0.300029,-0.351209,6.434577,-8.801405,0.871312,0.874033,0.827567,1.997624,-0.232508,1.537314
3,4,0,0,84.8095,0,-0.523814,1.437933,0.906397,0,-0.012698,1.085804,0.008001,1.091948,-0.511115,0.352128,0.898396,0.608743,0.582813,9.906054,-8.711075,2.295662,-1.624892,-1.687749,8.276537,-10.932092,0.716423,-0.84777,-0.884676,6.731181,-10.041162,0.6754,0.025013,0.173496,1.132061,-1.165503,0.586586,-1.088871,-1.08614,0.030076,-2.200839,-0.423324,1.084899,1.086026,1.239941,1.0529,-0.001182,-0.996406,-1.062446,7.328945,-9.769823,0.363594,0.311842,0.230478,9.16478,-7.92139,2.369737,-0.290435,-0.341615,6.444171,-8.791811,0.880906,-0.909008,-0.955474,0.214583,-2.015549,-0.245727
4,2,0,0,171.22,0,-0.027803,2.198949,0.014154,0,-0.013324,1.132466,0.008276,1.066598,-0.014479,1.066484,0.005878,0.112732,0.086802,9.410043,-9.207086,1.799652,-2.385908,-2.448766,7.51552,-11.693109,-0.044593,0.044473,0.007567,7.623424,-9.148919,1.567644,0.025639,0.174122,1.132687,-1.164877,0.587212,-1.135532,-1.132801,-0.016585,-2.247501,-0.469986,1.084624,1.085751,1.239666,1.052626,-0.001457,-0.971056,-1.037096,7.354295,-9.744474,0.388943,-0.184795,-0.266159,8.668143,-8.418027,1.8731,-1.00479,-1.055971,5.729815,-9.506166,0.166551,-0.01649,-0.062956,1.107101,-1.123031,0.646791


In [31]:
# split the training frame into the target column and everything else
y_train = train_df.loc[:, "scalar_coupling_constant"]        # labels
x_train = train_df.drop("scalar_coupling_constant", axis=1)  # predictors

# the combined frame is not used after the split; drop its name binding
del train_df

# summary of the predictor columns and their dtypes
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4658147 entries, 0 to 4658146
Data columns (total 65 columns):
atom_index_0                int64
atom_index_1                int64
type                        int64
atom_0                      int64
x_0                         float64
y_0                         float64
z_0                         float64
atom_1                      int64
x_1                         float64
y_1                         float64
z_1                         float64
abs_distance                float64
x_dist                      float64
y_dist                      float64
z_dist                      float64
x_0_mean_diff               float64
x_0_median_diff             float64
x_0_max_diff                float64
x_0_min_diff                float64
x_0_std_diff                float64
y_0_mean_diff               float64
y_0_median_diff             float64
y_0_max_diff                float64
y_0_min_diff                float64
y_0_std_diff                float

In [33]:
# Every column stored with the same integer dtype as "atom_index_0" is treated
# as categorical; all remaining columns are treated as continuous.
#
# The dtype is read from the column metadata instead of materialising each
# column as a NumPy array and inspecting element [1]: that avoids one array
# conversion per column and also works on frames with fewer than two rows.
boo = x_train["atom_index_0"].dtype.type   # scalar type of the reference integer column

categorical_columns = [
    name for name, dtype in zip(x_train.columns, x_train.dtypes) if dtype.type == boo
]
continuous_columns = [name for name in x_train.columns if name not in categorical_columns]

In [None]:
# MinMaxScaler (note the spelling: "Scaler") rescales each feature to a fixed
# range, [0, 1] by default.
from sklearn.preprocessing import MinMaxScaler

# scaling the continuous columns
mms = 