In [1]:
import calendar
import datetime

import holidays
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

### Retrieve the data 

In [2]:
# Load all files (they must be in a `data` directory located next to the notebook).
# Every table is stored under the base name of its CSV file, so a key can never
# point at a different file than the one it announces (the dipole moments and
# magnetic shielding tensors used to be swapped).
_DATASET_NAMES = (
    'dipole_moments',
    'magnetic_shielding_tensors',
    'mulliken_charges',
    'potential_energy',
    'sample_submission',
    'scalar_coupling_contributions',
    'structures',
    'train',
    'test',
)
data_load = {name: pd.read_csv(f'./data/{name}.csv') for name in _DATASET_NAMES}

Remark: apart from the train features, these data are not provided for the test dataset.

Should we use those as generative features for the scalar coupling constant?

In [3]:
# Preview the first ten rows of the training table.
# We have a molecule of methane.
# Strangely, the scalar couplings between equivalent atoms (the molecule should be
# symmetric) are slightly different (7 per 100,000).
data_load['train'].iloc[:10]

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074
5,5,dsgdb9nsd_000001,2,3,2JHH,-11.2541
6,6,dsgdb9nsd_000001,2,4,2JHH,-11.2548
7,7,dsgdb9nsd_000001,3,0,1JHC,84.8093
8,8,dsgdb9nsd_000001,3,4,2JHH,-11.2543
9,9,dsgdb9nsd_000001,4,0,1JHC,84.8095


In [4]:
# Confirm the observation above with the molecule structure table:
# keep a short alias for it and show its first rows.
structures = data_load['structures']
structures.head(n=5)

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397


#### Features description

* dipole_moments.csv - contains the molecular electric dipole moments. These are three-dimensional vectors that indicate the charge distribution in the molecule. The first column (molecule_name) contains the name of the molecule; the second to fourth columns are the X, Y and Z components of the dipole moment, respectively.
* magnetic_shielding_tensors.csv - contains the magnetic shielding tensors for all atoms in the molecules. The first column (molecule_name) contains the molecule name, the second column (atom_index) contains the index of the atom in the molecule, and the third to eleventh columns contain the XX, YX, ZX, XY, YY, ZY, XZ, YZ and ZZ elements of the tensor/matrix, respectively.
* mulliken_charges.csv - contains the Mulliken charges for all atoms in the molecules. The first column (molecule_name) contains the name of the molecule, the second column (atom_index) contains the index of the atom in the molecule, and the third column (mulliken_charge) contains the Mulliken charge of the atom.
* potential_energy.csv - contains the potential energy of the molecules. The first column (molecule_name) contains the name of the molecule, the second column (potential_energy) contains the potential energy of the molecule.
* scalar_coupling_contributions.csv - The scalar coupling constants in train.csv (or corresponding files) are a sum of four terms; scalar_coupling_contributions.csv contains all of these terms. The first column (molecule_name) is the name of the molecule, the second (atom_index_0) and third (atom_index_1) columns are the atom indices of the atom pair, the fourth column indicates the type of coupling, the fifth column (fc) is the Fermi contact contribution, the sixth column (sd) is the spin-dipolar contribution, the seventh column (pso) is the paramagnetic spin-orbit contribution and the eighth column (dso) is the diamagnetic spin-orbit contribution.

### Types of Coupling considered

In [None]:
# Number of training rows per coupling type (groupby orders the `type` keys).
plt1 = data_load['train'].groupby('type').id.count()

# `fig1` is the matplotlib Axes returned by the pandas bar plot; label it so the
# chart can be read on its own when the notebook is skimmed.
fig1 = plt1.plot.bar(rot = 45)
fig1.set_xlabel('Coupling type')
fig1.set_ylabel('Number of couplings')
fig1.set_title('Coupling types in the train set');

# up to 3J coupling, only H-C, H-N, H-H coupling

In [None]:
# Same per-type row count, this time on the test table
data_load['test'].groupby('type')['id'].count()

# The picture is the same for the test set :)

In [None]:
# Pairwise view of the coupling constant and the first atom index, coloured by
# coupling type (seaborn is imported as `sns` with the other imports at the top).
# NOTE(review): this plots every row of the train table; consider sampling if it is too slow.
sns.pairplot(data_load['train'][["type", "scalar_coupling_constant", "atom_index_0"]], hue="type", height=5)

# The coupling type seems to be a rather good predictor of the scalar coupling constant,
# except for 1JHC and 1JHH which have large distributions
# NOTE(review): "1JHH" does not appear among the types seen elsewhere in this notebook;
# confirm which type was meant.

# biggest molecule seems to have 29 atoms

### Do we have missing values?

In [None]:
# Count the missing entries of the training table, column by column
data_load['train'].isnull().sum()

# No, we don't

#### Molecule structures provided

In [39]:
# Count the distinct molecule names in the structures, train and test tables.
print('Total number of molecules provided:', structures['molecule_name'].nunique())
print('Total number of molecules in the train set:', data_load['train']['molecule_name'].nunique())
print('Total number of molecules in the test set:', data_load['test']['molecule_name'].nunique())
# Structures provide all molecules for the train and test data sets
# (the train and test counts add up to the structures count).

Total number of molecules provided: 130775
Total number of molecules in the train set: 85003
Total number of molecules in the test set: 45772
