In [1]:
import numpy as np
import pandas as pd

import lightgbm as lgb
import xgboost as xgb
import time

from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold, KFold
from catboost import CatBoostRegressor

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Load the pre-computed feature table that every later cell works on.
train = pd.read_csv(filepath_or_buffer="featurized_train.csv")

In [3]:
# Show the columns available right after loading.
train.columns.tolist()

['id',
 'molecule_name',
 'atom_index_0',
 'atom_index_1',
 'type',
 'scalar_coupling_constant',
 'atom_1_type',
 'atom_1_hybridization',
 'pi_bonds',
 'graph_distance',
 'graph_smile',
 'angle',
 'dihedral',
 'sum_electronegativity_inbetwen',
 'sum_electronegativity_neghb',
 'donor_groups_in_neighb',
 'aceptor_groups_in_neighb',
 'posIonizable_groups_in_neighb',
 'aromatic_groups_in_neighb',
 'hydrophobe_groups_in_neighb',
 'lumpedHydrophobe_groups_in_neighb',
 'negIonizable_groups_in_neighb',
 'sigma_bonds']

In [4]:
# Columns that receive a smoothed target-mean encoding (one `<name>_smthmean` column each).
to_meanencode = [
    # properties of the second atom and of the bond path
    'atom_1_type',
    'atom_1_hybridization',
    'pi_bonds',
    'graph_smile',
    # feature-group counts in the neighbourhood
    'donor_groups_in_neighb',
    'aceptor_groups_in_neighb',
    'posIonizable_groups_in_neighb',
    'aromatic_groups_in_neighb',
    'hydrophobe_groups_in_neighb',
    'lumpedHydrophobe_groups_in_neighb',
    'negIonizable_groups_in_neighb',
    'sigma_bonds',
]

In [6]:
# Smoothed target (mean) encoding, computed separately for every coupling `type`:
#     smooth = (count * group_mean + m * type_mean) / (count + m)
# so categories with few labelled rows are pulled towards the mean of their coupling type.
# NOTE(review): the statistics are computed from every labelled row of `train`, i.e. the
# encoded value of a row contains that row's own target. Any cross-validation score obtained
# on the same rows is therefore optimistic -- consider an out-of-fold encoding.
m = 40  # smoothing strength: weight of the per-type mean, in pseudo-observations

mean = train.groupby(["type"]).agg({'scalar_coupling_constant': ['mean']})
mean.columns = ["mean"]
mean.reset_index(inplace=True)

for i in to_meanencode:
    print(i)  # function-call form works on Python 2 and Python 3

    # Re-running the cell must not create duplicated `<name>_smthmean_x` / `_y` columns.
    if i + '_smthmean' in train.columns:
        train = train.drop(columns=[i + '_smthmean'])

    gg = train.groupby([i, "type"]).agg({'scalar_coupling_constant': ['mean', 'count']})
    gg.columns = [i + '_mean', i + '_count']
    gg.reset_index(inplace=True)

    tmp = pd.merge(gg, mean, on=["type"], how="left")

    counts = tmp[i + '_count']
    # A group whose rows all have a NaN target has count 0 and a NaN mean. Treating that mean
    # as 0 lets the formula fall back to the per-type mean instead of propagating NaN.
    means = tmp[i + '_mean'].fillna(0)
    smooth = (counts * means + m * tmp['mean']) / (counts + m)
    tmp[i + '_smthmean'] = smooth

    # Left merge keeps every row of `train` and attaches the encoded value.
    train = pd.merge(train, tmp[[i, "type", i + "_smthmean"]], on=[i, "type"], how="left")

atom_1_type
atom_1_hybridization
pi_bonds
graph_smile
donor_groups_in_neighb
aceptor_groups_in_neighb
posIonizable_groups_in_neighb
aromatic_groups_in_neighb
hydrophobe_groups_in_neighb
lumpedHydrophobe_groups_in_neighb
negIonizable_groups_in_neighb
sigma_bonds


In [7]:
# Columns that are replaced in place by integer codes from sklearn's LabelEncoder.
to_leableencode = [
    "type",
    'atom_1_type',
    'atom_1_hybridization',
    'graph_smile']
for i in to_leableencode:
    print(i)  # function-call form works on Python 2 and Python 3
    # A fresh encoder per column; `le` holds the encoder of the last column afterwards.
    le = LabelEncoder()
    train[i] = le.fit_transform(train[i])

type
atom_1_type
atom_1_hybridization
graph_smile


In [8]:
# Show the columns again, now including the `*_smthmean` encodings.
train.columns.tolist()

['id',
 'molecule_name',
 'atom_index_0',
 'atom_index_1',
 'type',
 'scalar_coupling_constant',
 'atom_1_type',
 'atom_1_hybridization',
 'pi_bonds',
 'graph_distance',
 'graph_smile',
 'angle',
 'dihedral',
 'sum_electronegativity_inbetwen',
 'sum_electronegativity_neghb',
 'donor_groups_in_neighb',
 'aceptor_groups_in_neighb',
 'posIonizable_groups_in_neighb',
 'aromatic_groups_in_neighb',
 'hydrophobe_groups_in_neighb',
 'lumpedHydrophobe_groups_in_neighb',
 'negIonizable_groups_in_neighb',
 'sigma_bonds',
 'atom_1_type_smthmean',
 'atom_1_hybridization_smthmean',
 'pi_bonds_smthmean',
 'graph_smile_smthmean',
 'donor_groups_in_neighb_smthmean',
 'aceptor_groups_in_neighb_smthmean',
 'posIonizable_groups_in_neighb_smthmean',
 'aromatic_groups_in_neighb_smthmean',
 'hydrophobe_groups_in_neighb_smthmean',
 'lumpedHydrophobe_groups_in_neighb_smthmean',
 'negIonizable_groups_in_neighb_smthmean',
 'sigma_bonds_smthmean']

In [9]:
# Feature columns handed to the model: the raw features followed by their smoothed mean encodings.
good_columns = [
    # raw features
    'type',
    'atom_1_type',
    'atom_1_hybridization',
    'pi_bonds',
    'graph_distance',
    'graph_smile',
    'angle',
    'dihedral',
    'sum_electronegativity_inbetwen',
    'sum_electronegativity_neghb',
    'donor_groups_in_neighb',
    'aceptor_groups_in_neighb',
    'posIonizable_groups_in_neighb',
    'aromatic_groups_in_neighb',
    'hydrophobe_groups_in_neighb',
    'lumpedHydrophobe_groups_in_neighb',
    'negIonizable_groups_in_neighb',
    'sigma_bonds',
] + [
    # smoothed mean encodings
    'atom_1_type_smthmean',
    'atom_1_hybridization_smthmean',
    'pi_bonds_smthmean',
    'graph_smile_smthmean',
    'donor_groups_in_neighb_smthmean',
    'aceptor_groups_in_neighb_smthmean',
    'posIonizable_groups_in_neighb_smthmean',
    'aromatic_groups_in_neighb_smthmean',
    'hydrophobe_groups_in_neighb_smthmean',
    'lumpedHydrophobe_groups_in_neighb_smthmean',
    'negIonizable_groups_in_neighb_smthmean',
    'sigma_bonds_smthmean',
]

In [15]:
# Count the rows that originally came from the test set, i.e. the rows to predict:
# they are the ones whose scalar_coupling_constant is NaN.
train['scalar_coupling_constant'].isnull().sum()

2505542

In [17]:
# Look at the target of the last 2505542 rows.
train['scalar_coupling_constant'].tail(2505542)

2505542

In [18]:
# Rows with a NaN target are the rows to predict (the original test set); all other rows are
# used for training. Selecting by mask instead of a hard-coded row count keeps the split
# correct if the number or position of the test rows changes; row order is preserved.
is_test = train['scalar_coupling_constant'].isna()

X_train = train.loc[~is_test, good_columns].copy()
y_train = train.loc[~is_test, 'scalar_coupling_constant']
X_test = train.loc[is_test, good_columns].copy()

In [20]:
def group_mean_log_mae(y_true, y_pred, types, floor=1e-9):
    """
    Mean over groups of the log of the per-group mean absolute error.

    Metric of this competition: https://www.kaggle.com/c/champs-scalar-coupling
    Code is from this kernel: https://www.kaggle.com/uberkinder/efficient-metric

    :params: y_true - true values; the difference to y_pred must support .abs() and .groupby()
    :params: y_pred - predicted values
    :params: types - grouping key, one entry per observation
    :params: floor - lower bound applied to every per-group MAE before taking the log
    """
    absolute_errors = (y_true - y_pred).abs()
    mae_per_type = absolute_errors.groupby(types).mean()
    # Bound the MAE from below so that a perfect group does not produce log(0).
    floored_mae = mae_per_type.map(lambda value: max(value, floor))
    return np.log(floored_mae).mean()
    

def train_model_regression(X, X_test, y, params, folds, model_type='lgb', eval_metric='mae', columns=None, plot_feature_importance=False, model=None,
                               verbose=10000, early_stopping_rounds=200, n_estimators=50000):
    """
    Cross-validated training of a regression model plus fold-averaged test predictions.

    For every split yielded by ``folds.split(X)`` a model is fitted on the training part,
    scored on the validation part and used to predict ``X_test``.
    Returns dictionary with oof predictions, test predictions, scores and, if necessary, feature importances.

    :params: X - training data, pd.DataFrame or np.ndarray
    :params: X_test - test data, same kind of container as X
    :params: y - target; indexed with .iloc when X is a DataFrame and with [] when X is an ndarray
    :params: params - model parameters: keyword arguments of LGBMRegressor ('lgb'), the params of
        xgb.train ('xgb') or arguments of CatBoostRegressor.set_params ('cat'); unused for 'sklearn'
    :params: folds - splitter object providing split(X) and n_splits
    :params: model_type - type of model to use: 'lgb', 'xgb', 'sklearn' or 'cat'
    :params: eval_metric - metric to use: 'mae', 'mse' or 'group_mae'
        ('group_mae' needs a DataFrame X whose used columns include 'type')
    :params: columns - columns to use. If None - use all columns. For an ndarray X no column
        selection is done; a given value is only used as feature names
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, works only for "sklearn" model type
    :params: verbose - logging period passed to LightGBM / XGBoost
    :params: early_stopping_rounds - early stopping patience for 'lgb' and 'xgb'
    :params: n_estimators - maximum number of boosting iterations for 'lgb' and 'cat'
        ('xgb' keeps its fixed limit of 20000 rounds)

    :returns: dict with the keys 'oof', 'prediction', 'scores', 'model' (the model fitted on the
        last fold) and, for 'lgb' with plot_feature_importance=True, 'feature_importance'
    :raises: ValueError for an unknown model_type or eval_metric, for model_type='sklearn' without
        a model, and for 'group_mae' when no 'type' column is available

    """
    # to set up scoring parameters
    metrics_dict = {'mae': {'lgb_metric_name': 'mae',
                        'catboost_metric_name': 'MAE',
                        'sklearn_scoring_function': mean_absolute_error},
                    'group_mae': {'lgb_metric_name': 'mae',
                        'catboost_metric_name': 'MAE',
                        'scoring_function': group_mean_log_mae},
                    'mse': {'lgb_metric_name': 'mse',
                        'catboost_metric_name': 'MSE',
                        'sklearn_scoring_function': mean_squared_error}
                    }

    # Validate the configuration up front: a single fold can take hours, so a typo must not
    # surface only after the first model has been trained.
    if model_type not in ('lgb', 'xgb', 'sklearn', 'cat'):
        raise ValueError("Unknown model_type: %r" % (model_type,))
    if eval_metric not in metrics_dict:
        raise ValueError("Unknown eval_metric: %r" % (eval_metric,))
    if model_type == 'sklearn' and model is None:
        raise ValueError("model_type='sklearn' requires the `model` argument")

    is_array = isinstance(X, np.ndarray)
    if not is_array:
        columns = X.columns if columns is None else columns
        X_test = X_test[columns]
    # Names of the features actually used; None when an ndarray is passed without names.
    feature_names = None if columns is None else list(columns)

    if eval_metric == 'group_mae' and (is_array or 'type' not in feature_names):
        raise ValueError("eval_metric='group_mae' requires a DataFrame with a 'type' column among the used columns")

    result_dict = {}

    # out-of-fold predictions on train data
    oof = np.zeros(len(X))

    # averaged predictions on test data
    prediction = np.zeros(len(X_test))

    # list of scores on folds
    scores = []
    fold_importances = []

    # split and train on folds
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
        print("Fold %d started at %s" %(fold_n + 1, time.ctime()))
        if is_array:
            X_train, X_valid = X[train_index], X[valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        if model_type == 'lgb':
            model = lgb.LGBMRegressor(n_estimators=n_estimators, n_jobs=-1, **params)
            model.fit(X_train, y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                      verbose=verbose,
                      early_stopping_rounds=early_stopping_rounds)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)

        elif model_type == 'xgb':
            # Use the names of the selected columns, not X.columns: the two differ whenever a
            # column subset is passed.
            train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=feature_names)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=feature_names)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data,
                              num_boost_round=20000,
                              evals=watchlist,
                              early_stopping_rounds=early_stopping_rounds,
                              verbose_eval=verbose,
                              params=params)

            y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=feature_names),
                                         ntree_limit=model.best_ntree_limit)

            y_pred = model.predict(xgb.DMatrix(X_test, feature_names=feature_names),
                                   ntree_limit=model.best_ntree_limit)

        elif model_type == 'sklearn':
            # The same estimator object is refitted on every fold.
            model.fit(X_train, y_train)

            y_pred_valid = model.predict(X_valid).reshape(-1,)
            y_pred = model.predict(X_test).reshape(-1,)

        else:  # model_type == 'cat'
            model = CatBoostRegressor(iterations=n_estimators,
                                      eval_metric=metrics_dict[eval_metric]["catboost_metric_name"],
                                      loss_function=metrics_dict[eval_metric]["catboost_metric_name"])
            # Values in `params` take precedence over the constructor arguments above.
            model.set_params(**params)

            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[], use_best_model=True, verbose=False)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)

        y_pred_valid = y_pred_valid.reshape(-1,)
        oof[valid_index] = y_pred_valid

        # One scoring path for every model type, so 'group_mae' also works with 'sklearn'.
        if eval_metric != 'group_mae':
            score = metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid)
        else:
            score = metrics_dict[eval_metric]['scoring_function'](y_valid, y_pred_valid, X_valid['type'])
        scores.append(score)

        if model_type == 'sklearn':
            # sklearn estimators do not log their own progress, so report the fold score here.
            print("Fold {0}. {1}: {2:.4f}.".format(fold_n + 1, eval_metric, score))
            print('')

        prediction += y_pred

        if model_type == 'lgb' and plot_feature_importance:
            # feature importance
            if feature_names is not None:
                importance_names = feature_names
            else:
                importance_names = list(range(len(model.feature_importances_)))
            fold_importance = pd.DataFrame({"feature": importance_names,
                                            "importance": model.feature_importances_})
            fold_importance["fold"] = fold_n + 1
            fold_importances.append(fold_importance)

    prediction /= folds.n_splits

    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores
    result_dict['model'] = model

    if model_type == 'lgb':
        if plot_feature_importance:
            feature_importance = pd.concat(fold_importances, axis=0)
            # NOTE(review): every per-fold importance is divided by the number of folds here and
            # averaged over the folds again below / by the bar plot -- confirm this is intended.
            feature_importance["importance"] /= folds.n_splits
            cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

            # NOTE(review): `plt` (matplotlib.pyplot) and `sns` (seaborn) are not imported in the
            # visible import cell; they must be imported before this option is used.
            plt.figure(figsize=(16, 12));
            sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False));
            plt.title('LGB Features (avg over folds)');

            result_dict['feature_importance'] = feature_importance

    return result_dict

In [29]:
n_fold = 5
# Shuffled K-fold with a fixed seed so that the splits are reproducible.
folds = KFold(n_splits=n_fold, shuffle=True, random_state=11)
# `params` is forwarded to lgb.LGBMRegressor by the training call, so it only holds LightGBM
# settings. The former "iterations" key is a CatBoost name that LightGBM does not list among
# its aliases for the number of boosting rounds, and "early_stopping_rounds" duplicated (with
# a different value) the early_stopping_rounds argument given to train_model_regression.
# The number of estimators and the early stopping patience are set at the call site instead.
params = {'learning_rate': 0.03}


In [30]:
# WARNING: this cell takes a very long time to run.
# It trains a LightGBM gradient boosting model with a 5-fold validation strategy.
# The call asks for at most 1500 estimators per fold and for early stopping
# after 200 rounds without improvement.
result_dict_lgb = train_model_regression(
    X=X_train,
    X_test=X_test,
    y=y_train,
    params=params,
    folds=folds,
    model_type='lgb',
    eval_metric='group_mae',
    plot_feature_importance=False,
    verbose=500,
    early_stopping_rounds=200,
    n_estimators=1500,
)

Fold 1 started at Tue Jul  9 12:03:36 2019
Fold 2 started at Wed Jul 10 01:04:24 2019
Fold 3 started at Wed Jul 10 10:34:56 2019
Fold 4 started at Wed Jul 10 20:06:55 2019
Fold 5 started at Thu Jul 11 05:39:19 2019
CV mean score: 0.2230, std: 0.0023.


In [31]:
# Attach the fold-averaged predictions to the rows of the original test file.
test = pd.read_csv("test.csv")
test = test.assign(scalar_coupling_constant=result_dict_lgb["prediction"])

In [32]:
# Remove the input columns that are not part of the output file.
test = test.drop(columns=['molecule_name', 'atom_index_0', 'atom_index_1', 'type'])

In [33]:
# Write the predictions to disk without the DataFrame index.
test.to_csv(path_or_buf="submition_04.csv", index=False)