In [1]:
%%HTML
<style>
   /* Override the widths of the notebook, menubar and main-toolbar containers. */
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [1]:
import sys
sys.path.append("../..") # Adds higher directory to python modules path.
from utilities import aggregate_feature_calculators
from utilities import aggregate_feature_calculators_setting as aggcal
from utilities.parallel import Parallel
from utilities.dfdb import DFDB

from utilities.process.pqueue import *
from utilities.process.pnode import *
from utilities.process.putilities import *

In [2]:
import numpy as np
import pandas as pd
import os
import time
import datetime
import json
import copy
import gc
import warnings
from tqdm import tqdm_notebook, tqdm

import optuna

import lightgbm as lgb
import xgboost as xgb

from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold,TimeSeriesSplit, GroupKFold
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR, SVR
from sklearn.metrics import mean_absolute_error
from sklearn import preprocessing

import eli5
from eli5.sklearn import PermutationImportance

import networkx as nx

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Folder holding the raw input CSV files; the listing is displayed as the cell output.
csv_file_folder =  '../../data/input'
os.listdir(csv_file_folder)

['structures',
 'magnetic_shielding_parameters.csv',
 'train.csv',
 'dipole_moments.csv',
 'magnetic_shielding_tensors.csv',
 'mulliken_charges.csv',
 'potential_energy.csv',
 'sample_submission.csv',
 'scalar_coupling_contributions.csv',
 'structures.csv',
 'test.csv']

In [4]:
# Folder holding the feature pickles; display only the visible ``*.pkl`` entries.
file_folder = '../../data/feature'
[
    entry
    for entry in os.listdir(file_folder)
    if entry.endswith('.pkl') and not entry.startswith('.')
]

['angles-and-distances_test.pkl',
 'angles-and-distances_train.pkl',
 'brute-force-feature-engineering_test.pkl',
 'eem_test.pkl',
 'giba-r-data-table-simple-features-0-991-lb_test.pkl',
 'giba-r-data-table-simple-features-0-991-lb_train.pkl',
 'brute-force-feature-engineering_train.pkl',
 'keras-neural-net-for-champs_test.pkl',
 'keras-neural-net-for-champs_train.pkl',
 'eem_train.pkl',
 'molecular-properties-eda-and-models_test.pkl',
 'molecular-properties-eda-and-models_train.pkl',
 'molecule-with-openbabel_test.pkl',
 'molecule-with-openbabel_train.pkl',
 'dataset-with-number-of-bonds-between-atoms_test.pkl',
 'brute-force-feature-engineering-mini_test.pkl',
 'eachtype_train.pkl',
 'eachtype_test.pkl',
 'giba-r-data-table-simple-features-1-17-lb_test.pkl',
 'dataset-with-number-of-bonds-between-atoms_train.pkl',
 'brute-force-feature-engineering-mini_train.pkl',
 'giba-r-data-table-simple-features-1-17-lb_train.pkl',
 'bonds-from-structure-data_train.pkl',
 'coulomb-interaction-spe

In [5]:
# Feature pickles to merge, in merge order (the order determines the order in
# which feature columns are appended to the train/test frames).
file_list = [
    'giba-r-data-table-simple-features-1-17-lb_train.pkl',
    'eem_train.pkl',
    'coulomb-interaction-speed-up_train.pkl',
    'dataset-with-number-of-bonds-between-atoms_test.pkl',
    'keras-neural-net-for-champs_train.pkl',
    'angles-and-distances_test.pkl',
    'keras-neural-net-for-champs_test.pkl',
    'brute-force-feature-engineering-mini_train.pkl',
    'dataset-with-number-of-bonds-between-atoms_train.pkl',
    'coulomb-interaction-speed-up_test.pkl',
    'bonds-from-structure-data_train.pkl',
    'eem_test.pkl',
    'molecule-with-openbabel_train.pkl',
    'molecular-properties-eda-and-models_train.pkl',
    'brute-force-feature-engineering-mini_test.pkl',
    'molecular-properties-eda-and-models_test.pkl',
    'bonds-from-structure-data_test.pkl',
    'molecule-with-openbabel_test.pkl',
    'giba-r-data-table-simple-features-1-17-lb_test.pkl',
    'angles-and-distances_train.pkl',
]
print(len(file_list))

20


In [6]:
trail = []  # shared debug sink: prepare_data appends frames whose overlapping columns disagree


def _report_overlap_mismatch(df_feature, df_base, col, base_key, trail):
    """Log, without ever raising, when a column present in both frames disagrees.

    The two frames are compared position by position, so both are expected to
    be sorted by ``id``. Rows where the feature side is NaN are not counted as
    disagreements. On a real disagreement both frames are appended to ``trail``
    (under ``'df_feature_i'`` and ``base_key``) so they can be inspected later.
    """
    feature_values = df_feature[col].values
    base_values = df_base[col].values
    if feature_values.shape != base_values.shape:
        # Positional comparison is meaningless when the row counts differ.
        print(col, f'(shape differs: {feature_values.shape} vs {base_values.shape})')
        return
    try:
        mismatch = np.where(feature_values != base_values)[0]
        if mismatch.shape[0] == 0:
            return
        if not (~np.isnan(feature_values[mismatch])).any():
            return
        if str in (type(base_values[0]), type(feature_values[0])):
            return
    except (TypeError, ValueError) as exc:
        # np.isnan rejects object/string arrays; keep the check best-effort
        # but say why the column could not be verified.
        print(col, f'(values differ but could not be compared: {exc})')
        return
    trail.append({'df_feature_i': df_feature, base_key: df_base})
    print(col, mismatch, [f'{v1}:{v2}' for v1, v2 in zip(feature_values[mismatch], base_values[mismatch])])
    print(col)


def _merge_feature_file(df_base, feature_path, base_key, trail):
    """Merge the not-yet-present columns of one feature pickle into ``df_base``.

    Returns ``(merged_frame, new_columns)`` where ``new_columns`` lists the
    added columns in the order they appear in the feature file. The merge is
    an inner join on ``id``; the result is sorted by ``id`` with a fresh index.
    """
    # NOTE(security): pd.read_pickle can execute arbitrary code; only load
    # feature files that come from a trusted source.
    df_feature = pd.read_pickle(feature_path).sort_values(by=['id'])
    df_feature = df_feature.reset_index(drop=True)

    existing = set(df_base.columns.tolist())
    feature_columns = df_feature.columns.tolist()
    # An ordered list (instead of a set) keeps the merged column order
    # identical from one run to the next.
    new_columns = [col for col in feature_columns if col not in existing]
    for col in feature_columns:
        if col in existing:
            _report_overlap_mismatch(df_feature, df_base, col, base_key, trail)

    merged = pd.merge(df_base, df_feature[new_columns + ['id']], on='id')
    merged = merged.sort_values(by=['id']).reset_index(drop=True)
    return merged, new_columns


def prepare_data(feature_folder='../../data/feature', csv_file_folder='../../data/input', feature_file_list=None, trail=trail):
    """Build the merged train/test frames, or load them from the on-disk cache.

    Parameters
    ----------
    feature_folder : str
        Folder containing the ``*_train.pkl`` / ``*_test.pkl`` feature files.
        The cache files ``df_train.gzde`` / ``df_test.gzde`` are read from and
        written to this same folder.
    csv_file_folder : str
        Folder containing ``train.csv``, ``test.csv`` and
        ``scalar_coupling_contributions.csv``.
    feature_file_list : list of str, optional
        Feature file names to merge, in order. ``None`` means every entry of
        ``feature_folder``.
    trail : list
        Debug sink receiving the frames of any overlapping column whose values
        disagree between a feature file and the base frame.

    Returns
    -------
    (df_train, df_test) : tuple of pandas.DataFrame
        Frames with ``id`` renamed to ``index`` and ``molecule_name`` renamed
        to ``group``, non-numeric columns label-encoded, and inf/NaN replaced
        by 0.

    Notes
    -----
    The cache is keyed on ``feature_folder`` only: if it exists it is returned
    as-is, whatever ``feature_file_list`` is. Delete the cache files to force
    a rebuild with a different feature selection.
    """
    train_cache = os.path.join(feature_folder, 'df_train.gzde')
    test_cache = os.path.join(feature_folder, 'df_test.gzde')
    # Uncompressed names probed by earlier versions of this function.
    legacy_train_cache = os.path.join(feature_folder, 'df_train')
    legacy_test_cache = os.path.join(feature_folder, 'df_test')

    if os.path.exists(train_cache) and os.path.exists(test_cache):
        print(f'=========================load from temp===============================')
        # The '.gzde' suffix is not recognised by pandas, so the compression
        # has to be named explicitly when reading the files back.
        df_train = pd.read_pickle(train_cache, compression='gzip')
        df_test = pd.read_pickle(test_cache, compression='gzip')
        return df_train, df_test
    if os.path.exists(legacy_train_cache) and os.path.exists(legacy_test_cache):
        print(f'=========================load from temp===============================')
        df_train = pd.read_pickle(legacy_train_cache)
        df_test = pd.read_pickle(legacy_test_cache)
        return df_train, df_test

    df_train = pd.read_csv(f"{csv_file_folder}/train.csv")
    df_test = pd.read_csv(f"{csv_file_folder}/test.csv")
    scalar_coupling_contributions = pd.read_csv(f'{csv_file_folder}/scalar_coupling_contributions.csv')

    # Attach the per-pair contribution columns of
    # scalar_coupling_contributions.csv to the training rows.
    pair_keys = ['molecule_name', 'atom_index_0', 'atom_index_1', 'type']
    df_train = pd.merge(df_train, scalar_coupling_contributions, how='left',
                        left_on=pair_keys, right_on=pair_keys)

    print(df_train.shape, df_test.shape)

    if feature_file_list is None:
        feature_file_list = os.listdir(feature_folder)

    for f in feature_file_list:
        print(f'========================={f}===============================')
        if not f.endswith('.pkl') or f.startswith('.'):
            continue
        stem = f[:-4]
        feature_path = f'{feature_folder}/{f}'
        if stem.endswith('train'):
            df_train, new_columns = _merge_feature_file(df_train, feature_path, 'df_train', trail)
            print('train add', f, set(new_columns))
        if stem.endswith('test'):
            df_test, new_columns = _merge_feature_file(df_test, feature_path, 'df_test', trail)
            print('test add', f, set(new_columns))

    print(f'=========================encode label===============================')
    numerics = ['int16', 'int8', 'int32', 'int64', 'float16', 'float32', 'float64']
    for col in df_train.columns:
        col_type = df_train[col].dtypes
        if not col_type in numerics:
            print(col, df_train[col].unique())
            # Fit on train + test together so both share one encoding.
            le = LabelEncoder()
            le.fit(list(df_train[col].values) + list(df_test[col].values))
            df_train[col] = le.transform(list(df_train[col].values))
            df_test[col] = le.transform(list(df_test[col].values))
            print(le.classes_)

    print(f'=========================fill nan inf===============================')
    df_train = df_train.replace([np.inf, -np.inf], np.nan)
    df_train = df_train.fillna(0)
    df_test = df_test.replace([np.inf, -np.inf], np.nan)
    df_test = df_test.fillna(0)

    print(f'=========================rename===============================')
    df_train = df_train.rename(columns={'id': 'index'}) #'scalar_coupling_constant': 'y'
    df_test = df_test.rename(columns={'id': 'index'})
    df_train = df_train.rename(columns={'molecule_name':'group'})
    df_test = df_test.rename(columns={'molecule_name':'group'})
    # Only the test frame is renamed here.
    # NOTE(review): presumably the train features already use the
    # atom_index_* names - confirm against the feature files.
    df_test = df_test.rename(columns={'cycle_size_mean_x':'atom_index_0_cycle_size_mean',
                            'cycle_size_mean_y':'atom_index_1_cycle_size_mean',
                           'n_cycle_x':'atom_index_0_n_cycle',
                           'n_cycle_y':'atom_index_1_n_cycle'})

    df_train = df_train.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    # Write the cache where the next call will look for it: same folder,
    # same file names, same compression as the load branch above.
    print(f'=========================save tmp===============================')
    df_train.to_pickle(train_cache, compression='gzip')
    df_test.to_pickle(test_cache, compression='gzip')
    return df_train, df_test

In [7]:
# Assemble the train/test frames from the feature files selected in file_list.
df_train, df_test = prepare_data(feature_file_list=file_list)

(4658147, 10) (2505542, 5)
train add giba-r-data-table-simple-features-1-17-lb_train.pkl {'NH', 'inv_dist1E', 'linkM1', 'ID', 'adN2', 'yukawa_H.y', 'coulomb_H.x', 'distN0', 'inv_dist1R', 'distC0', 'yukawa_C.y', 'atom_index_1.1', 'structure_z_1', 'E0', 'structure_x_0', 'coulomb_N.y', 'vander_O.y', 'adC4', 'structure_y_1', 'structure_z_0', 'typei', 'coulomb_C.x', 'yukawa_O.x', 'inv_distP', 'vander_H.x', 'structure_atom_0', 'sd_molecule_atom_1_dist_xyz', 'E1', 'vander_N.y', 'linkN', 'structure_x_1', 'structure_atom_1', 'coulomb_F.y', 'inv_dist1', 'inv_dist0R', 'link0', 'inv_dist0', 'inv_distPR', 'mean_molecule_atom_0_dist_xyz', 'distH0', 'pos', 'linkM0', 'yukawa_N.x', 'structure_y_0', 'adH2', 'yukawa_H.x', 'vander_C.x', 'vander_F.x', 'adH3', 'R0', 'sd_molecule_atom_0_dist_xyz', 'coulomb_O.x', 'coulomb_O.y', 'NN', 'N2', 'vander_N.x', 'NF', 'yukawa_C.x', 'adC1', 'inv_dist0E', 'yukawa_N.y', 'adH4', 'adN4', 'max_molecule_atom_0_dist_xyz', 'adC3', 'adN1', 'max_molecule_atom_1_dist_xyz', 'link1

train add dataset-with-number-of-bonds-between-atoms_train.pkl {'nbond', 'error', 'atom_index_0_charge', 'atom_index_0_n_cycle', 'atom_index_1_charge', 'atom_index_0_cycle_size_mean', 'L2dist', 'bond_type', 'atom_index_1_n_cycle', 'atom_index_1_cycle_size_mean', 'is_found_bond'}
test add coulomb-interaction-speed-up_test.pkl {'dist_C_2_y', 'dist_H_2_y', 'dist_C_2_x', 'dist_C_3_x', 'dist_O_2_y', 'dist_H_1_x', 'dist_F_0_y', 'dist_O_3_x', 'dist_H_2_x', 'dist_O_1_y', 'dist_O_0_x', 'dist_F_4_y', 'dist_H_3_x', 'dist_C_4_x', 'dist_N_1_x', 'dist_C_1_y', 'dist_N_4_x', 'dist_F_2_x', 'dist_C_4_y', 'dist_N_3_x', 'dist_N_4_y', 'dist_H_4_y', 'dist_F_1_x', 'dist_C_0_x', 'dist_H_4_x', 'dist_F_4_x', 'dist_N_1_y', 'dist_O_1_x', 'dist_F_0_x', 'dist_C_0_y', 'dist_C_3_y', 'dist_F_2_y', 'dist_N_0_y', 'dist_N_0_x', 'dist_H_0_y', 'dist_N_2_x', 'dist_F_3_x', 'dist_O_4_y', 'dist_H_0_x', 'dist_N_3_y', 'dist_H_3_y', 'dist_O_3_y', 'dist_O_4_x', 'dist_H_1_y', 'dist_F_1_y', 'dist_N_2_y', 'dist_O_0_y', 'dist_F_3_y', 

molecule_name ['dsgdb9nsd_000001' 'dsgdb9nsd_000002' 'dsgdb9nsd_000003' ...
 'dsgdb9nsd_133881' 'dsgdb9nsd_133882' 'dsgdb9nsd_133884']
['dsgdb9nsd_000001' 'dsgdb9nsd_000002' 'dsgdb9nsd_000003' ...
 'dsgdb9nsd_133883' 'dsgdb9nsd_133884' 'dsgdb9nsd_133885']
type ['1JHC' '2JHH' '1JHN' '2JHN' '2JHC' '3JHH' '3JHC' '3JHN']
['1JHC' '1JHN' '2JHC' '2JHH' '2JHN' '3JHC' '3JHH' '3JHN']
structure_atom_0 ['H']
['H']
structure_atom_1 ['C' 'H' 'N']
['C' 'H' 'N']
molecule_name.1 ['dsgdb9nsd_000001' 'dsgdb9nsd_000002' 'dsgdb9nsd_000003' ...
 'dsgdb9nsd_133881' 'dsgdb9nsd_133882' 'dsgdb9nsd_133884']
['dsgdb9nsd_000001' 'dsgdb9nsd_000002' 'dsgdb9nsd_000003' ...
 'dsgdb9nsd_133883' 'dsgdb9nsd_133884' 'dsgdb9nsd_133885']
atom_1 ['C' 'H' 'N']
['C' 'H' 'N']
atom_0 ['H']
['H']
bond_type ['1.0CH' 'none' '1.0HN']
['1.0CH' '1.0HN' 'none']
type_1 ['JHC' 'JHH' 'JHN']
['JHC' 'JHH' 'JHN']
type_0 ['1' '2' '3']
['1' '2' '3']
tertiary_atom_17 [nan 'H' 'HO' 'C3' 'O3' 'O2' 'C1' 'N1' 'C2' 'Nam' 'N3' 'N2' 'Car' 'Nar'
 'Npl'