In [9]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [1]:
import sys
sys.path.append("..") # Adds higher directory to python modules path.
from utilities import aggregate_feature_calculators
from utilities import aggregate_feature_calculators_setting as aggcal
from utilities.parallel import Parallel

In [2]:
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook, tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR, SVR
from sklearn.metrics import mean_absolute_error
from sklearn import preprocessing
pd.options.display.precision = 15

import lightgbm as lgb
import xgboost as xgb
import time
import datetime
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold
from sklearn import metrics
from sklearn import linear_model
import gc
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from IPython.display import HTML
import json
import altair as alt

import copy

import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

alt.renderers.enable('notebook')

RendererRegistry.enable('notebook')

In [3]:
# Resolve the competition data folder. The membership test inspects
# '../data/input', so the folder it selects has to live under that same root;
# the previous value dropped the '../' prefix and therefore pointed at a path
# relative to the notebook directory instead of its parent.
input_root = '../data/input'
file_folder = f'{input_root}/champs-scalar-coupling' if 'champs-scalar-coupling' in os.listdir(input_root) else input_root
train = pd.read_csv(f'{file_folder}/train.csv')
test = pd.read_csv(f'{file_folder}/test.csv')

In [4]:
# pd.set_option('display.max_columns', 200)
# pd.set_option('display.max_colwidth', 200)

In [4]:
# Load the first feature tables and keep one row per
# (id, molecule_name, atom_index_0, atom_index_1) combination.
pair_key = ['id','molecule_name','atom_index_0','atom_index_1']
feat1_train = pd.read_pickle('../data/feats/feats1_train.pkl').drop_duplicates(subset=pair_key)
feat1_test = pd.read_pickle('../data/feats/feats1_test.pkl').drop_duplicates(subset=pair_key)

In [5]:
feat1_train.shape, feat1_test.shape

((4658147, 95), (2505542, 94))

In [6]:
# Second feature tables (joined onto the first ones by molecule_name further down).
feat2_train, feat2_test = [
    pd.read_pickle(path)
    for path in ('../data/feats/feats2_train.pkl', '../data/feats/feats2_test.pkl')
]

In [7]:
feat2_train.shape, feat2_test.shape

((85003, 1090), (45772, 1090))

In [8]:
def _map_atom_charges(df, charges, atom_idx):
    df = pd.merge(df, charges, how = 'left', left_on  = ['molecule_name', f'atom_index_{atom_idx}'], right_on = ['molecule_name',  'atom_index'])
    df = df.drop('atom_index', axis=1)
    df = df.rename(columns={'charge': f'atom_index_{atom_idx}_charge'})
    return df

In [10]:
# Left-join the second feature table onto the first one by molecule_name.
# Only the test split is merged here; the train-side merge stays disabled:
# feats = feat1_train.merge(feat2_train, how='left', on=['molecule_name'])
feats_test = feat1_test.merge(feat2_test, how='left', on=['molecule_name'])

In [11]:
# Third feature table, test split only; the train-side load is left disabled.
# feat3_train = pd.read_pickle('../data/feats/feats3_train.pkl')
feat3_test = pd.read_pickle('../data/feats/feats3_test.pkl')

In [13]:
feat3_test.shape

(2505542, 15)

In [14]:
# Join the third feature table on the pair key. No `how` is given, so this is
# pandas' default inner join: pairs missing from feat3_test would be dropped.
feats_test = feats_test.merge(feat3_test, on=['molecule_name','atom_index_0','atom_index_1'])

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype covering their range.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    inspected. Integer columns are cast to the first of int8, int16, int32,
    int64 whose limits contain the column's min and max. Other numeric columns
    are cast to float16, float32 or, failing both, float64.

    The bounds checks are inclusive, so a column that reaches exactly a dtype's
    limit (e.g. -128..127) still gets that dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Columns are reassigned on this object, so the caller's
        frame is modified.
    verbose : bool, default True
        When true, print the resulting memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Notes
    -----
    The float path only checks the value range. float16 has far less precision
    than float32/float64, so values are rounded when a column is cast to it.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # If min/max are NaN (empty column) no candidate matches and the
            # column is left untouched.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NaN or infinite extremes fail every range check and fall through
            # to float64.
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes does not divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [24]:
# feats_test = reduce_mem_usage(feats_test)

Mem. usage decreased to 6069.26 Mb (3.3% reduction)


In [15]:
# Fourth feature table: a single file with no train/test suffix. It is joined
# onto the pair table via its 'atom_index' column by _map_feats.
feat4 = pd.read_pickle('../data/feats/feats4.pkl')

In [16]:
def _map_feats(feat1, feat2, atom_idx):
    df = pd.merge(feat1, feat2, how = 'left', left_on  = ['molecule_name', f'atom_index_{atom_idx}'], right_on = ['molecule_name',  'atom_index'])
    df = df.drop('atom_index', axis=1)
    df = df.rename(columns={'bonds2_n_bonds': f'atom_index_{atom_idx}_bonds2_n_bonds', 
                           'bonds2_bonds_std': f'atom_index_{atom_idx}_bonds2_bonds_std',
                           'bonds2_bonds_mean': f'atom_index_{atom_idx}_bonds2_bonds_mean',
                           'bonds2_bond_lengths_mean': f'atom_index_{atom_idx}_bonds2_bond_lengths_mean',
                           'bonds2_bond_lengths_std': f'atom_index_{atom_idx}_bonds2_bond_lengths_std'})
    return df

In [17]:
# Attach the per-atom features once for each end of the pair (atom 0, then atom 1).
for atom_idx in (0, 1):
    feats_test = _map_feats(feats_test, feat4, atom_idx)

In [18]:
# Fifth feature table, test split only; the train-side load is left disabled.
# feat5_train = pd.read_pickle('../data/feats/feats5_train.pkl')
feat5_test = pd.read_pickle('../data/feats/feats5_test.pkl')

In [19]:
# Join the fifth feature table by row id (default inner join). Test split only;
# the train-side merge stays disabled:
# feats = feats.merge(feat5_train, on='id')
feats_test = feats_test.merge(feat5_test, on='id')

In [23]:
# Print every column name that occurs more than once after the merges.
# The loop variables are deliberately named `index` and `count`: a later cell
# displays them.
for index, count in feats_test.columns.value_counts().items():
    if count>1:
        print(index)

In [26]:
# List columns carrying the '_x' / '_y' suffixes that pandas appends when both
# sides of a merge share a non-key column name.
for col in feats_test.columns:
    if col.endswith(('_x', '_y')):
        print(col)

dist_x
dist_y
eem2015ba_dipole_moment_x
eem2015ba_dipole_moment_y
eem2015bm_dipole_moment_x
eem2015bm_dipole_moment_y
eem2015bn_dipole_moment_x
eem2015bn_dipole_moment_y
eem2015ha_dipole_moment_x
eem2015ha_dipole_moment_y
eem2015hm_dipole_moment_x
eem2015hm_dipole_moment_y
eem2015hn_dipole_moment_x
eem2015hn_dipole_moment_y
eem_dipole_moment_x
eem_dipole_moment_y
gasteiger_dipole_moment_x
gasteiger_dipole_moment_y
mmff94_dipole_moment_x
mmff94_dipole_moment_y
qeq_dipole_moment_x
qeq_dipole_moment_y
qtpie_dipole_moment_x
qtpie_dipole_moment_y
atom_x
atom_y


In [24]:
# Shows the loop variables left over from the duplicate-column check: the last
# (column name, count) pair that loop visited. This only works if that loop has
# already run in the current kernel.
index, count

('qeq_dipole_moment_impulse_factor_', 1)

In [28]:
feats_test.shape

(2505542, 1211)

In [None]:
# Save the merged test features. The train-side save is left disabled.
# NOTE(review): the last cell of the notebook writes to this same path again
# after the cleaning steps, so this file is an intermediate snapshot.
# feats.to_pickle('../data/feats/feats_train.pkl')
feats_test.to_pickle('../data/feats/feats_test.pkl')

In [36]:
# Train features are read from a previously saved pickle; the train-side merges
# earlier in this notebook are commented out.
# NOTE(review): the final cell writes the cleaned frame back to this same path,
# so a re-run from a fresh kernel would load the already-cleaned frame here.
# Confirm that is intended, or save the cleaned frame under a different name.
feats_train = pd.read_pickle('../data/feats/feats_train.pkl')

In [37]:
feats_train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom_0,x_0,y_0,z_0,...,atom_y,atom_index_1_bonds2_n_bonds,atom_index_1_bonds2_bonds_std,atom_index_1_bonds2_bonds_mean,atom_index_1_bonds2_bond_lengths_mean,atom_index_1_bonds2_bond_lengths_std,bonds3_bond_angle_axis,bonds3_bond_angle_plane,bonds3_flatness,bonds3_size
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8125,H,0.00214958190918,-0.006031036376953,0.001976013183594,...,C,4,1.118033988749895,2.5,1.091949701309204,2.76246783e-06,34.46072338003748,52.08433623486488,0.443763023754605,5
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.2578125,H,0.00214958190918,-0.006031036376953,0.001976013183594,...,H,1,0.0,0.0,1.091951608657837,0.0,0.804151194742914,73.35136904563353,0.443763023754605,5
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2578125,H,0.00214958190918,-0.006031036376953,0.001976013183594,...,H,1,0.0,0.0,1.091946363449097,0.0,44.69664263498447,39.073645954716966,0.443763023754605,5
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2578125,H,0.00214958190918,-0.006031036376953,0.001976013183594,...,H,1,0.0,0.0,1.091947555541992,0.0,44.1604906421022,20.12427527341661,0.443763023754605,5
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8125,H,1.01171875,1.4638671875,0.000276565551758,...,C,4,1.118033988749895,2.5,1.091949701309204,2.76246783e-06,36.06906130000368,50.8589533184898,0.443763023754605,5


In [56]:
'id' in feats_train.columns

True

In [54]:
# Report every training column whose dtype is not one of the plain numeric dtypes.
numerics = ['int16', 'int8', 'int32', 'int64', 'float16', 'float32', 'float64']
for col, col_type in feats_train.dtypes.items():
    if col_type not in numerics:
        print(col, col_type)

molecule_name object
type object
atom_0 object
atom_1 object
type_0 object
type_1 object
bond_type object
atom_x object
atom_y object


In [55]:
feats_train[['molecule_name', 'type', 'atom_0', 'atom_1', 'type_0', 'type_1', 'bond_type', 'atom_x', 'atom_y']].head()

Unnamed: 0,molecule_name,type,atom_0,atom_1,type_0,type_1,bond_type,atom_x,atom_y
0,dsgdb9nsd_000001,1JHC,H,C,1,JHC,1.0CH,H,C
1,dsgdb9nsd_000001,2JHH,H,H,2,JHH,none,H,H
2,dsgdb9nsd_000001,2JHH,H,H,2,JHH,none,H,H
3,dsgdb9nsd_000001,2JHH,H,H,2,JHH,none,H,H
4,dsgdb9nsd_000001,1JHC,H,C,1,JHC,1.0CH,H,C


In [30]:
# Print which of the categorical columns are absent from feat1_train.
categorical_columns = ['atom_0', 'atom_1', 'type_0', 'type_1', 'bond_type']
present_columns = feat1_train.columns
for col in categorical_columns:
    if col in present_columns:
        continue
    print(col)

bond_type


In [31]:
# Integer-encode each categorical column into its own 'int_<column>' feature.
# The encoder is fitted on feat1_train and applied to the test frame.
#
# Fix: the target name was built with 'int_'.format(col), which has no
# placeholder and always evaluates to 'int_'. Every iteration overwrote the
# same column and only the last encoding ('type_1') survived.
# NOTE(review): if the cached feats_train.pkl was produced with the old name,
# regenerate it so the train and test columns match.
for col in ['atom_0', 'atom_1', 'type_0', 'type_1']: #, 'bond_type'
    le = preprocessing.LabelEncoder()
    le.fit(feat1_train[col].tolist())
#     feats_train[f'int_{col}'] = le.transform(feats_train[col].tolist())
    feats_test[f'int_{col}'] = le.transform(feats_test[col].tolist())

In [59]:
# Remove the object-dtype columns from both splits.
object_columns = ['molecule_name', 'type', 'atom_0', 'atom_1', 'type_0', 'type_1', 'bond_type', 'atom_x', 'atom_y']
feats_train = feats_train.drop(columns=object_columns)
feats_test = feats_test.drop(columns=object_columns)

In [67]:
# For every training column with at least one null, record the null count and
# the min/max over its remaining finite values (infinities are treated as
# missing for the min/max only).
nullcolumns = []
null_counts = feats_train.isnull().sum()
for col, nullcount in null_counts[null_counts != 0].items():
    finite_values = feats_train[col].replace([np.inf, -np.inf], np.nan).dropna()
    nullcolumns.append({
        'feature': col,
        'nullcount': nullcount,
        'min': finite_values.min(),
        'max': finite_values.max(),
    })

In [69]:
# One row per feature that has at least one null: its name, null count, and min/max.
df_nullcolumns = pd.DataFrame(nullcolumns)

In [72]:
# Features that are null on every training row. The comparison uses the actual
# row count instead of the hard-coded 4658147, so the selection stays correct
# if the training frame changes size. An empty summary (no column has nulls)
# yields an empty list instead of a KeyError.
fullnullcolumns = (
    df_nullcolumns[df_nullcolumns['nullcount']==len(feats_train)]['feature'].tolist()
    if not df_nullcolumns.empty else []
)

In [73]:
# Discard the features that are null on every training row, from both splits.
feats_train = feats_train.drop(fullnullcolumns, axis=1)
feats_test = feats_test.drop(fullnullcolumns, axis=1)

In [81]:
# Show the summary rows for features that have some nulls but are not entirely null.
df_nullcolumns[~df_nullcolumns['feature'].isin(fullnullcolumns)]

Unnamed: 0,feature,max,min,nullcount
0,molecule_atom_index_0_x_1_std,2.06640625,7.283687591552734e-05,4
1,molecule_atom_index_0_y_1_mean_div,971766.9209867893,-1077306.585180925,24
2,molecule_atom_index_0_y_1_std,2.041015625,0.0,4
3,molecule_atom_index_0_z_1_std,2.07421875,0.0,4
4,molecule_atom_index_0_dist_std,1.5927734375,0.2198486328125,4
5,molecule_atom_index_0_dist_std_diff,0.49951171875,-3.203125,4
6,molecule_atom_index_0_dist_std_div,1.45703125,0.10015869140625,4
7,molecule_atom_index_1_dist_std,1.859375,0.0,284076
8,molecule_atom_index_1_dist_std_diff,0.7587890625,-3.73828125,284076
9,molecule_atom_index_1_dist_std_div,1.71875,0.0,284076


In [82]:
# Turn +/-infinity into NaN in both splits.
infinities = [np.inf, -np.inf]
feats_train = feats_train.replace(infinities, np.nan)
feats_test = feats_test.replace(infinities, np.nan)

In [84]:
# Replace every remaining NaN with 0 in both splits.
feats_train = feats_train.fillna(value=0)
feats_test = feats_test.fillna(value=0)

In [85]:
# Write the cleaned feature frames to disk.
# NOTE(review): feats_train.pkl is also the file this notebook loads feats_train
# from, so this overwrites the notebook's own input with the cleaned frame.
for frame, target_path in (
    (feats_train, '../data/feats/feats_train.pkl'),
    (feats_test, '../data/feats/feats_test.pkl'),
):
    frame.to_pickle(target_path)