In [1]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
df = pd.read_csv('../../data/feature/giba-r-data-table-simple-features-1-17-lb.r.csv')

In [3]:
file_folder =  '../../data/input'
os.listdir(file_folder)

['test.csv',
 'structures',
 'sample_submission.csv',
 'magnetic_shielding_tensors.csv',
 'mulliken_charges.csv',
 'potential_energy.csv',
 'scalar_coupling_contributions.csv',
 'dipole_moments.csv',
 'structures.csv',
 'train.csv']

In [4]:
train = pd.read_csv("{}/train.csv".format(file_folder))
test = pd.read_csv("{}/test.csv".format(file_folder))

In [5]:
print(df.shape)
df.head()

(7163689, 101)


Unnamed: 0,molecule_name,atom_index_1,atom_index_0,id,type,scalar_coupling_constant,ID,structure_atom_0,structure_x_0,structure_y_0,...,adC4,adN1,adN2,adN3,adN4,NC,NH,NN,NF,NO
0,dsgdb9nsd_000001,0,1,0,1JHC,84.8076,1,H,0.00215,-0.006031,...,,,,,,1.0,4,,,
1,dsgdb9nsd_000001,0,2,4,1JHC,84.8074,1,H,1.011731,1.463751,...,,,,,,1.0,4,,,
2,dsgdb9nsd_000001,0,3,7,1JHC,84.8093,1,H,-0.540815,1.447527,...,,,,,,1.0,4,,,
3,dsgdb9nsd_000001,0,4,9,1JHC,84.8095,1,H,-0.523814,1.437933,...,,,,,,1.0,4,,,
4,dsgdb9nsd_000001,2,1,1,2JHH,-11.257,1,H,0.00215,-0.006031,...,,,,,,1.0,4,,,


In [6]:
useless_columns = [
"molecule_name",
"atom_index_0",
"atom_index_1", 
"type",
"scalar_coupling_constant", 
"ID", 
"structure_atom_0",
"structure_atom_1", 
"structure_x_0",
"structure_y_0",
"structure_z_0",  
"structure_x_1",
"structure_y_1",
"structure_z_1",
"typei",
"pos",
"R0",
"R1",
"E0",
"E1"]

In [7]:
confirmed_useless_columns = []
for col in useless_columns:
    if col in train.columns:
        confirmed_useless_columns.append(col)

In [8]:
df_train = df[df['id'].isin(train.id.values)].drop(columns=confirmed_useless_columns)
df_test = df[df['id'].isin(test.id.values)].drop(columns=confirmed_useless_columns)

In [9]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [10]:
df_train = reduce_mem_usage(df_train)

Mem. usage decreased to 946.22 Mb (72.6% reduction)


In [11]:
df_test = reduce_mem_usage(df_test)

Mem. usage decreased to 508.96 Mb (72.6% reduction)


In [12]:
nullcolumns = []   
for col, nullcount in zip(df_train.columns, df_train.isnull().sum()):
    if nullcount!=0:
        series_ = df_train[col].replace([np.inf, -np.inf], np.nan).dropna()
        nullcolumns.append({'feature':col, 'nullcount':nullcount, 'min':series_.min(), 'max':series_.max()})

df_nullcolumns = pd.DataFrame(nullcolumns)

In [13]:
df_nullcolumns

Unnamed: 0,feature,max,min,nullcount
0,sd_molecule_atom_0_dist_xyz,1.592773,0.219849,4
1,sd_molecule_atom_1_dist_xyz,1.859375,0.0,284076
2,distC0,3.164062,1.066406,1297678
3,distH0,3.171875,1.513672,3689500
4,distN0,3.861328,1.001953,4329116
5,distC1,20.15625,0.199341,1297678
6,distH1,21.28125,0.294434,3689500
7,distN1,13.125,0.072388,4329116
8,adH1,3.177734,1.601562,1346576
9,adH2,3.177734,1.619141,2969337


In [14]:
df_train.to_pickle('../../data/feature/giba-r-data-table-simple-features-1-17-lb_train.pkl')

In [15]:
df_test.to_pickle('../../data/feature/giba-r-data-table-simple-features-1-17-lb_test.pkl')