<h1 style="color:#2c3f51">Predict the likelihood of a genome sequence undergoing mutation.</h1>

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import optuna

# Metrics
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from matplotlib.colors import ListedColormap
import seaborn as sns
from cycler import cycler
from IPython.display import display
import datetime
import scipy.stats
import math
import random


from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.calibration import CalibrationDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ReduceLROnPlateau, LearningRateScheduler, EarlyStopping
from tensorflow.keras.layers import Dense, Input, InputLayer, Add, Concatenate
from tensorflow.keras.utils import plot_model

plt.rcParams['axes.prop_cycle'] = cycler(color=['#ffd700'] +
                                         plt.rcParams['axes.prop_cycle'].by_key()['color'][1:])


# Loading Data

In [3]:
train = pd.read_csv('../input/devday22-competition-datascience/train.csv')
test = pd.read_csv('../input/devday22-competition-datascience/test.csv')

print('Data Dimension\n')
train.shape, test.shape

Data Dimension



((180000, 32), (120000, 31))

# Understanding Data

- Data describe

In [4]:
train.describe()

Unnamed: 0,ID,A,B,C,D,E,F,G,H,I,...,M,N,O,P,Q,R,S,T,U,mutation
count,180000.0,180000.0,180000.0,180000.0,180000.0,180000.0,180000.0,180000.0,180000.0,180000.0,...,180000.0,180000.0,180000.0,180000.0,180000.0,180000.0,180000.0,180000.0,180000.0,180000.0
mean,150259.57235,8.118989,2.632989,0.617833,4.802733,29.696178,1.262467,23.543444,33.423717,1.60515,...,0.516286,0.474217,0.505086,0.501836,0.487837,0.5017,0.48764,0.469766,0.507951,0.2649
std,86591.625009,3.078097,4.147355,1.191218,1.514448,12.700896,2.10944,13.897618,18.991241,2.752459,...,0.214696,0.216486,0.227391,0.241277,0.211201,0.203394,0.17899,0.194466,0.203378,0.441281
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.099276,-0.045316,0.169495,-0.033695,0.005199,0.093056,0.024139,0.215575,0.097789,0.0
25%,75313.75,7.0,0.0,0.0,4.0,33.0,0.0,9.0,19.0,0.0,...,0.325804,0.291995,0.279693,0.276133,0.323797,0.353174,0.358795,0.310106,0.367429,0.0
50%,150321.0,8.0,0.0,0.0,4.0,33.0,0.0,23.0,38.0,0.0,...,0.471377,0.388573,0.480476,0.554751,0.517962,0.435187,0.410841,0.408567,0.446227,0.0
75%,225293.25,10.0,3.0,1.0,5.0,33.0,2.0,35.0,49.0,2.0,...,0.704146,0.645087,0.725755,0.735208,0.607613,0.642035,0.611452,0.58705,0.581691,1.0
max,299999.0,14.0,18.0,12.0,19.0,83.0,15.0,50.0,60.0,18.0,...,1.0166,0.951018,0.856975,0.853022,0.960912,1.035818,1.054257,1.005652,1.011331,1.0


In [5]:
test.describe()

Unnamed: 0,ID,A,B,C,D,E,F,G,H,I,...,L,M,N,O,P,Q,R,S,T,U
count,120000.0,120000.0,120000.0,120000.0,120000.0,120000.0,120000.0,120000.0,120000.0,120000.0,...,120000.0,120000.0,120000.0,120000.0,120000.0,120000.0,120000.0,120000.0,120000.0,120000.0
mean,149609.391475,8.127517,2.616475,0.612867,4.811917,29.67525,1.261375,23.459275,33.394692,1.607508,...,0.494413,0.517149,0.474261,0.50449,0.502889,0.488817,0.501796,0.488725,0.469092,0.508648
std,86618.168318,3.060814,4.11972,1.173331,1.520519,12.679975,2.123597,13.8524,19.019033,2.770016,...,0.213312,0.215083,0.216861,0.227599,0.241192,0.211534,0.20365,0.179134,0.194591,0.203416
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.08448,0.094493,-0.031298,0.168071,-0.036379,0.018907,0.090901,0.045832,0.214866,0.126711
25%,74540.75,7.0,0.0,0.0,4.0,33.0,0.0,9.0,19.0,0.0,...,0.31764,0.326394,0.29266,0.278682,0.276907,0.324116,0.3529,0.359112,0.309691,0.369417
50%,149557.5,8.0,0.0,0.0,4.0,33.0,0.0,23.0,38.0,0.0,...,0.462719,0.471137,0.385824,0.479183,0.554798,0.51827,0.43573,0.411012,0.405421,0.447079
75%,224504.25,10.0,3.0,1.0,5.0,33.0,2.0,35.0,49.0,2.0,...,0.695031,0.705315,0.644702,0.725804,0.735256,0.609925,0.641976,0.614888,0.586463,0.582367
max,299992.0,14.0,18.0,12.0,19.0,83.0,15.0,50.0,60.0,18.0,...,1.005793,1.005743,0.952187,0.858578,0.846413,0.966553,1.027458,1.055885,1.005392,1.006479


- Data info

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180000 entries, 0 to 179999
Data columns (total 32 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   ID         180000 non-null  int64  
 1   sequence1  180000 non-null  object 
 2   sequence2  180000 non-null  object 
 3   sequence3  180000 non-null  object 
 4   sequence4  180000 non-null  object 
 5   sequence5  180000 non-null  object 
 6   sequence6  180000 non-null  object 
 7   sequence7  180000 non-null  object 
 8   sequence8  180000 non-null  object 
 9   sequence9  180000 non-null  object 
 10  A          180000 non-null  int64  
 11  B          180000 non-null  int64  
 12  C          180000 non-null  int64  
 13  D          180000 non-null  int64  
 14  E          180000 non-null  int64  
 15  F          180000 non-null  int64  
 16  G          180000 non-null  int64  
 17  H          180000 non-null  int64  
 18  I          180000 non-null  int64  
 19  J          180000 non-n

In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 31 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   ID         120000 non-null  int64  
 1   sequence1  120000 non-null  object 
 2   sequence2  120000 non-null  object 
 3   sequence3  120000 non-null  object 
 4   sequence4  120000 non-null  object 
 5   sequence5  120000 non-null  object 
 6   sequence6  120000 non-null  object 
 7   sequence7  120000 non-null  object 
 8   sequence8  120000 non-null  object 
 9   sequence9  120000 non-null  object 
 10  A          120000 non-null  int64  
 11  B          120000 non-null  int64  
 12  C          120000 non-null  int64  
 13  D          120000 non-null  int64  
 14  E          120000 non-null  int64  
 15  F          120000 non-null  int64  
 16  G          120000 non-null  int64  
 17  H          120000 non-null  int64  
 18  I          120000 non-null  int64  
 19  J          120000 non-n

- Check missing values

In [8]:
train.isnull().sum().any()

False

In [9]:
test.isnull().sum().any()

False

- Feature correlation

In [10]:
#train.corr

# Label encode

We are going to encode objects columns

In [11]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

Object columns selection

In [12]:
obj_cols = [col for col in train.columns if train[col].dtype == 'object']
obj_cols

['sequence1',
 'sequence2',
 'sequence3',
 'sequence4',
 'sequence5',
 'sequence6',
 'sequence7',
 'sequence8',
 'sequence9']

Encoding label

In [13]:
for col in obj_cols:
    train[col] = le.fit_transform(train[col])
    test[col] = le.transform(test[col])

# Split Data

In [14]:
# training Data
X = train.drop(['ID','mutation'], axis=1)
y = train['mutation']

# Test data
test = test.drop(['ID'], axis=1)

X.shape, test.shape

((180000, 30), (120000, 30))

In [15]:
features = X.columns.tolist()
features

['sequence1',
 'sequence2',
 'sequence3',
 'sequence4',
 'sequence5',
 'sequence6',
 'sequence7',
 'sequence8',
 'sequence9',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U']

# Modeling with Keras

In [16]:
def my_model():
    """Simple sequential neural network with four hidden layers.
    
    Returns a (not yet compiled) instance of tensorflow.keras.models.Model.
    """
    activation = 'swish'
    inputs = Input(shape=(len(features)))
    x = Dense(128, kernel_regularizer=tf.keras.regularizers.l2(40e-6),
              activation=activation,
             )(inputs)
    x = Dense(64, kernel_regularizer=tf.keras.regularizers.l2(40e-6),
              activation=activation,
             )(x)
    x = Dense(64, kernel_regularizer=tf.keras.regularizers.l2(40e-6),
              activation=activation,
             )(x)
    x = Dense(16, kernel_regularizer=tf.keras.regularizers.l2(40e-6),
              activation=activation,
             )(x)
    x = Dense(1, #kernel_regularizer=tf.keras.regularizers.l2(1e-6),
              activation='sigmoid',
             )(x)
    model = Model(inputs, x)
    return model

In [17]:
# Plot training history
def plot_history(history, *, n_epochs=None, plot_lr=False, title=None, bottom=None, top=None):
    """Plot (the last n_epochs epochs of) the training history
    
    Plots loss and optionally val_loss and lr."""
    plt.figure(figsize=(15, 6))
    from_epoch = 0 if n_epochs is None else max(len(history['loss']) - n_epochs, 0)
    
    # Plot training and validation losses
    plt.plot(np.arange(from_epoch, len(history['loss'])), history['loss'][from_epoch:], label='Training loss')
    try:
        plt.plot(np.arange(from_epoch, len(history['loss'])), history['val_loss'][from_epoch:], label='Validation loss')
        best_epoch = np.argmin(np.array(history['val_loss']))
        best_val_loss = history['val_loss'][best_epoch]
        if best_epoch >= from_epoch:
            plt.scatter([best_epoch], [best_val_loss], c='r', label=f'Best val_loss = {best_val_loss:.5f}')
        if best_epoch > 0:
            almost_epoch = np.argmin(np.array(history['val_loss'])[:best_epoch])
            almost_val_loss = history['val_loss'][almost_epoch]
            if almost_epoch >= from_epoch:
                plt.scatter([almost_epoch], [almost_val_loss], c='orange', label='Second best val_loss')
    except KeyError:
        pass
    if bottom is not None: plt.ylim(bottom=bottom)
    if top is not None: plt.ylim(top=top)
    plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend(loc='lower left')
    if title is not None: plt.title(title)
        
    # Plot learning rate
    if plot_lr and 'lr' in history:
        ax2 = plt.gca().twinx()
        ax2.plot(np.arange(from_epoch, len(history['lr'])), np.array(history['lr'][from_epoch:]), color='g', label='Learning rate')
        ax2.set_ylabel('Learning rate')
        ax2.legend(loc='upper right')
        
    plt.show()

# Cross validation

In [18]:
from sklearn.metrics import roc_auc_score


In [19]:
%%time
# Cross-validation of the classifier

EPOCHS = 200
EPOCHS_COSINEDECAY = 150
CYCLES = 1
VERBOSE = 0 # set to 0 for less output, or to 2 for more output
DIAGRAMS = True
USE_PLATEAU = True
BATCH_SIZE = 512
ONLY_FIRST_FOLD = False

# see https://keras.io/getting_started/faq/#how-can-i-obtain-reproducible-results-using-keras-during-development
np.random.seed(1)
random.seed(1)
tf.random.set_seed(1)

def fit_model(X_tr, y_tr, X_va=None, y_va=None, run=0):
    """Scale the data, fit a model, plot the training history and optionally validate the model
    
    Returns a trained instance of tensorflow.keras.models.Model.
    
    As a side effect, updates y_va_pred, history_list and score_list.
    """
    global y_va_pred
    start_time = datetime.datetime.now()
    
    scaler = StandardScaler()
    X_tr = scaler.fit_transform(X_tr)
    
    if X_va is not None:
        X_va = scaler.transform(X_va)
        validation_data = (X_va, y_va)
    else:
        validation_data = None

    # Define the learning rate schedule and EarlyStopping
    lr_start=0.01
    #if USE_PLATEAU and X_va is not None: # use early stopping
    epochs = EPOCHS
    lr = ReduceLROnPlateau(monitor="val_loss", factor=0.7, 
                           patience=4, verbose=VERBOSE)
    es = EarlyStopping(monitor="val_loss",
                       patience=12, 
                       verbose=1,
                       mode="min", 
                       restore_best_weights=True)
    callbacks = [lr, es, tf.keras.callbacks.TerminateOnNaN()]

        
    # Construct and compile the model
    model = my_model()
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr_start),
                  metrics='AUC',
                  loss=tf.keras.losses.BinaryCrossentropy())

    # Train the model
    history = model.fit(X_tr, y_tr, 
                        validation_data=validation_data, 
                        epochs=epochs,
                        verbose=VERBOSE,
                        batch_size=BATCH_SIZE,
                        shuffle=True,
                        callbacks=callbacks)

    history_list.append(history.history)
    callbacks, es, lr, history = None, None, None, None
    
    if X_va is None:
        print(f"Training loss: {history_list[-1]['loss'][-1]:.4f}")
    else:
        lastloss = f"Training loss: {history_list[-1]['loss'][-1]:.4f} | Val loss: {history_list[-1]['val_loss'][-1]:.4f}"
        
        # Inference for validation
        y_va_pred = model.predict(X_va, batch_size=len(X_va), verbose=0)
        #oof_list[run][val_idx] = y_va_pred
        
        # Evaluation: Execution time, loss and AUC
        score = roc_auc_score(y_va, y_va_pred)
        print(f"Fold {run}.{fold} | {str(datetime.datetime.now() - start_time)[-12:-7]}"
              f" | {lastloss} | AUC: {score:.5f}")
        score_list.append(score)
        
        if DIAGRAMS and fold == 0 and run == 0:
            # Plot training history
            plot_history(history_list[-1], 
                         title=f"Learning curve (validation AUC = {score:.5f})",
                         plot_lr=True)

            # Plot y_true vs. y_pred
            plt.figure(figsize=(10, 4))
            plt.hist(y_va_pred[y_va == 0], bins=np.linspace(0, 1, 21),
                     alpha=0.5, density=True)
            plt.hist(y_va_pred[y_va == 1], bins=np.linspace(0, 1, 21),
                     alpha=0.5, density=True)
            plt.xlabel('y_pred')
            plt.ylabel('density')
            plt.title('OOF Predictions')
            plt.show()

    return model, scaler



CPU times: user 124 µs, sys: 0 ns, total: 124 µs
Wall time: 129 µs


In [20]:
history_list = []
score_list = []
kf = KFold(n_splits=3)
"""
for fold, (idx_tr, idx_va) in enumerate(kf.split(train)):
    X_tr = train.iloc[idx_tr][features]
    X_va = train.iloc[idx_va][features]
    y_tr = train.iloc[idx_tr].mutation
    y_va = train.iloc[idx_va]['mutation']
    
    fit_model(X_tr, y_tr, X_va, y_va)
    if ONLY_FIRST_FOLD: break # we only need the first fold

print(f"OOF AUC:                       {np.mean(score_list):.5f}")
# Fold 0.0 | 03:57 | Training loss: 0.3497 | Val loss: 0.3842 | AUC: 0.87392

# Fold 0.0 | 00:59 | Training loss: 0.3649 | Val loss: 0.3730 | AUC: 0.87974

# Fold 0.0 | 00:42 | Training loss: 0.3586 | Val loss: 0.3754 | AUC: 0.88020
"""

'\nfor fold, (idx_tr, idx_va) in enumerate(kf.split(train)):\n    X_tr = train.iloc[idx_tr][features]\n    X_va = train.iloc[idx_va][features]\n    y_tr = train.iloc[idx_tr].mutation\n    y_va = train.iloc[idx_va][\'mutation\']\n    \n    fit_model(X_tr, y_tr, X_va, y_va)\n    if ONLY_FIRST_FOLD: break # we only need the first fold\n\nprint(f"OOF AUC:                       {np.mean(score_list):.5f}")\n# Fold 0.0 | 03:57 | Training loss: 0.3497 | Val loss: 0.3842 | AUC: 0.87392\n\n# Fold 0.0 | 00:59 | Training loss: 0.3649 | Val loss: 0.3730 | AUC: 0.87974\n\n# Fold 0.0 | 00:42 | Training loss: 0.3586 | Val loss: 0.3754 | AUC: 0.88020\n'

# Test prediction

In [21]:
%%time
# Create submission
print(f"{len(features)} features")

X_tr = train[features]
y_tr = train.mutation

pred_list = []
for seed in range(1):
    # see https://keras.io/getting_started/faq/#how-can-i-obtain-reproducible-results-using-keras-during-development
    np.random.seed(seed)
    random.seed(seed)
    tf.random.set_seed(seed)
    model, scaler = fit_model(X_tr, y_tr, run=seed)
    pred_list.append(model.predict(scaler.transform(test[features]),
                                                        batch_size=len(test), verbose=0))
    print(f"{seed:2}", pred_list[-1])
print()

30 features


2022-05-11 08:55:47.472874: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2022-05-11 08:55:47.711443: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Training loss: 0.3691
 0 [[0.37057388]
 [0.07098329]
 [0.3166237 ]
 ...
 [0.75658727]
 [0.04025546]
 [0.70558023]]

CPU times: user 8min 1s, sys: 1min 30s, total: 9min 31s
Wall time: 5min 26s


# Submission

In [22]:
submission = pd.read_csv('../input/devday22-competition-datascience/sample_submission.csv')
submission.head()

Unnamed: 0,ID,mutation
0,1,0.5
1,3,0.5
2,4,0.5
3,6,0.5
4,8,0.5


In [23]:
submission['mutation'] = np.array(pred_list).mean(axis=0)
submission.to_csv('submission.csv', index=False)
submission

Unnamed: 0,ID,mutation
0,1,0.370574
1,3,0.070983
2,4,0.316624
3,6,0.333139
4,8,0.022045
...,...,...
119995,299983,0.088370
119996,299984,0.712463
119997,299990,0.756587
119998,299991,0.040255


<center>
    <h2 style="color:#2c3f51">Thanks for reading...</h2>