# Test whether the positive:negative ratio of the training data affects the predictor

In [79]:
# Autoreload modules
%load_ext autoreload
%autoreload 2

# Add local path
import sys
sys.path.append("/Users/Jphild/Documents/Papers/_***Thesis/MRP7Pred/")

import pandas as pd
import numpy as np

from pandas import DataFrame
from numpy import ndarray
from typing import Union, Any, Dict, Tuple, List

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### 1. Load the positive and negative datasets

In [2]:
# Read the labelled compound tables (paths are relative to the notebook).
df_pos = pd.read_csv("../data/positive.csv")
df_neg = pd.read_csv("../data/negative.csv")

# Class sizes and the positive:negative ratio of the full dataset.
n_pos_total, n_neg_total = len(df_pos), len(df_neg)
n_pos_total, n_neg_total, n_pos_total / n_neg_total

(53, 63, 0.8412698412698413)

### 2. Create training data with different pos:neg ratios

In [44]:
def split_pos_neg(
    df_pos: DataFrame,
    df_neg: DataFrame,
    n_pos: int,
    n_neg: int,
    random_state: Union[int, None] = None
) -> DataFrame:
    """Draw a labelled sample with a fixed number of positives and negatives.

    Rows are sampled without replacement from each input frame, stacked with
    the positives first, and reduced to the columns ``name``, ``smiles`` and
    ``label`` (1 for rows taken from ``df_pos``, 0 for rows from ``df_neg``).

    Args:
        df_pos: Positive compounds; must contain the columns
            ``compound_name`` and ``Canonical SMILES``.
        df_neg: Negative compounds with the same two columns.
        n_pos: Number of rows to draw from ``df_pos``.
        n_neg: Number of rows to draw from ``df_neg``.
        random_state: Optional seed. When given, the draw is reproducible and
            independent of numpy's global random state. When ``None``
            (default) the global ``np.random`` state is used, exactly as
            before this parameter existed.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and ``n_pos + n_neg`` rows.

    Raises:
        ValueError: If a requested sample size is negative or larger than the
            corresponding input frame.
        KeyError: If the expected name / SMILES columns are missing.
    """
    # Fail early with a readable message; numpy would raise the same exception
    # type but without saying which class was over-requested.
    for kind, frame, n_requested in (
        ("positive", df_pos, n_pos),
        ("negative", df_neg, n_neg),
    ):
        if not 0 <= n_requested <= len(frame):
            raise ValueError(
                f"Cannot sample {n_requested} {kind} rows without replacement "
                f"from {len(frame)} available."
            )

    # A private generator keeps seeded runs from disturbing the global state;
    # without a seed we fall back to the module-level functions (legacy path).
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    chosen_idx_pos = rng.choice(len(df_pos), replace=False, size=n_pos)
    chosen_idx_neg = rng.choice(len(df_neg), replace=False, size=n_neg)

    # Labels follow the concat order below: positives first, then negatives.
    label = [1] * n_pos + [0] * n_neg
    df = pd.concat([
        df_pos.iloc[chosen_idx_pos, :],
        df_neg.iloc[chosen_idx_neg, :]
    ], axis=0)
    df = df.rename(columns={'compound_name': 'name', 'Canonical SMILES': 'smiles'})
    df = df[["name", "smiles"]].reset_index(drop=True)
    df["label"] = label
    return df

In [45]:
# Scenario 1 (pos >> neg): sample 50 positives and 10 negatives.
df_more_pos = split_pos_neg(df_pos, df_neg, n_pos=50, n_neg=10)

# Row count of the sampled frame.
df_more_pos.shape[0]

60

In [46]:
df_more_pos.head()

Unnamed: 0,name,smiles,label
0,﻿17-betaEstradiol,CC12CCC3C(C1CCC2O)CCC4=C3C=CC(=C4)O,1
1,Sulfinpyrazone,C1=CC=C(C=C1)N2C(=O)C(C(=O)N2C3=CC=CC=C3)CCS(=...,1
2,gemcitabine,C1=CN(C(=O)N=C1N)C2C(C(C(O2)CO)O)(F)F,1
3,PD-173074,CCN(CC)CCCCNC1=NC2=NC(=C(C=C2C=N1)C3=CC(=CC(=C...,1
4,﻿Glycolithocholate-3-sulfate,CC(CCC(=O)NCC(=O)O)C1CCC2C1(CCC3C2CCC4C3(CCC(C...,1


In [47]:
# Scenario 2 (pos == neg): sample 30 positives and 30 negatives.
df_balance = split_pos_neg(df_pos, df_neg, n_pos=30, n_neg=30)

# Sanity-check the size of the frame built in this cell.
len(df_balance)

60

In [48]:
df_balance.head()

Unnamed: 0,name,smiles,label
0,﻿Zaprinast,CCCOC1=CC=CC=C1C2=NC3=NNN=C3C(=O)N2,1
1,17beta-Estradiol 3-sulfate-17-(beta-D-glucuron...,CC12CCC3C(C1CCC2OC4C(C(C(C(O4)C(=O)[O-])O)O)O)...,1
2,"﻿16alpha,17beta-Estriol 16-(beta-d-glucuronide)",CC12CCC3C(C1CC(C2O)OC4C(C(C(C(O4)C(=O)O)O)O)O)...,1
3,PD-173074,CCN(CC)CCCCNC1=NC2=NC(=C(C=C2C=N1)C3=CC(=CC(=C...,1
4,Methotrexate,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,1


In [49]:
# Scenario 3 (pos << neg): sample 10 positives and 50 negatives.
df_more_neg = split_pos_neg(df_pos, df_neg, n_pos=10, n_neg=50)

# Row count of the sampled frame.
df_more_neg.shape[0]

60

In [50]:
df_more_neg.head()

Unnamed: 0,name,smiles,label
0,vinblastine,CCC1(CC2CC(C3=C(CCN(C2)C1)C4=CC=CC=C4N3)(C5=C(...,1
1,gemcitabine,C1=CN(C(=O)N=C1N)C2C(C(C(O2)CO)O)(F)F,1
2,Verapamil,CC(C)C(CCCN(C)CCC1=CC(=C(C=C1)OC)OC)(C#N)C2=CC...,1
3,Etoposide,CC1OCC2C(O1)C(C(C(O2)OC3C4COC(=O)C4C(C5=CC6=C(...,1
4,AlstolucinesB,CC(=O)C1CN2CCC34C2CC1C(=C3NC5=CC=CC=C45)C(=O)OC,1


### 3. Train models

In [97]:
from grid import grid
from utils import DATA, OUTPUT, plot_roc_auc, get_scoring, get_current_time, ensure_folder
from preprocess import load_data, featurize_and_split
from train import train

from MRP7Pred import MRP7Pred

def start_train(df, ratio, model_filename, print_log=False):
    """Featurize ``df``, train a model, and print its test-set metrics.

    Args:
        df: Labelled compounds, passed unchanged to ``featurize_and_split``.
        ratio: Split ratio forwarded to ``featurize_and_split``.
        model_filename: Forwarded to ``train`` as ``model_dir``.
        print_log: Forwarded to ``train``.

    Returns:
        None. Progress, the selected model and its scores are only printed.
    """
    # The splitter also hands back train/test sample names; they are unused here.
    _, _, X_train, y_train, X_test, y_test = featurize_and_split(df, ratio=ratio)

    print("Start training ...", end="", flush=True)
    best_model = train(X_train, y_train, model_dir=model_filename, print_log=print_log)
    print("Done!")

    print(f"Best model:\n{best_model}")

    print("Evaluate model on test data ... ", end="", flush=True)
    holdout_score = best_model.score(X_test, y_test)
    predicted_labels = best_model.predict(X_test)
    # Keep only the second probability column of each prediction row.
    second_column_scores = [row[1] for row in best_model.predict_proba(X_test)]
    print(f"Done! Score: {holdout_score}")

    print("Getting full score set ... ", end="", flush=True)
    full_scores = get_scoring(y_test, second_column_scores, predicted_labels)
    print(f"Done! Full scores: {full_scores}")

In [99]:
# Training / testing split ratio, forwarded by start_train to
# featurize_and_split(df, ratio=...).
# NOTE(review): each 60-row run below reports 30 test samples
# (tp + fp + tn + fn), so 0.5 appears to give an even split - confirm in preprocess.
RATIO = 0.5

## 1. pos >> neg
## pos: neg = 50 : 10

In [100]:
start_train(df_more_pos, RATIO, "./pos_neg_ratio/more_positive")

Featurzing data ... 

100%|██████████| 60/60 [00:02<00:00, 22.29it/s]

Done!
Spliting training and test data ... Done!
Start training ...




Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1877s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done  70 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done  92 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 118 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done 128 out of 135 | elapsed:   10.0s remaining:    0.5s


Best score: 0.8666666666666668
Done!
Best model:
Pipeline(steps=[('sclr', DummyScaler(scaler=StandardScaler())),
                ('clf', SVC(C=1, gamma=0.001, probability=True))])
Evaluate model on test data ... Done! Score: 0.8
Getting full score set ... Done! Full scores: {'stats': {'tp': 24.0, 'fp': 6.0, 'tn': 0.0, 'fn': 0.0}, 'score': {'roc_auc': 0.1736111111111111, 'accuracy': 0.8}}


[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   10.6s finished


In [101]:
# Test on unknown data
import glob

# Load the newest model in this experiment's directory instead of a hard-coded
# timestamped file, so a re-run of the training cell is what gets evaluated.
# Assumes train() writes best_model_<YYYYMMDD-HHMMSS>.pkl into model_dir (as the
# previously hard-coded path suggests); that suffix sorts chronologically as text.
more_positive_models = sorted(glob.glob("./pos_neg_ratio/more_positive/best_model_*.pkl"))
if not more_positive_models:
    raise FileNotFoundError(
        "No best_model_*.pkl found in ./pos_neg_ratio/more_positive; run the training cell first."
    )
m7p_more_positive = MRP7Pred(more_positive_models[-1])
df_more_positive_pred = m7p_more_positive.predict("../data/unknown.csv")

# (number predicted positive, number predicted negative)
len(df_more_positive_pred[df_more_positive_pred["pred"]==1]), len(df_more_positive_pred[df_more_positive_pred["pred"]==0])

Loading trained model ... 

  0%|          | 0/80 [00:00<?, ?it/s]

Done!
Generating features ... 


100%|██████████| 80/80 [00:03<00:00, 22.92it/s]

Done!
Start predicting ...Done!
Writing output ...




Done!


(79, 1)

## 2. pos == neg
## pos: neg = 30 : 30

In [102]:
start_train(df_balance, RATIO, "./pos_neg_ratio/balance")

Featurzing data ... 

100%|██████████| 60/60 [00:04<00:00, 14.15it/s]

Done!
Spliting training and test data ... Done!
Start training ...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0355s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0355s.) Setting batch_size=4.


Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done  80 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 113 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:    5.6s


Best score: 0.9
Done!
Best model:
Pipeline(steps=[('sclr', DummyScaler(scaler=StandardScaler())),
                ('clf', SVC(C=50, gamma=0.0001, probability=True))])
Evaluate model on test data ... Done! Score: 0.7666666666666667
Getting full score set ... Done! Full scores: {'stats': {'tp': 13.0, 'fp': 4.0, 'tn': 10.0, 'fn': 3.0}, 'score': {'roc_auc': 0.7678571428571428, 'accuracy': 0.7666666666666667}}


[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:    6.2s finished


In [103]:
# Test on unknown data
import glob

# Load the newest model in this experiment's directory instead of a hard-coded
# timestamped file, so a re-run of the training cell is what gets evaluated.
# Assumes train() writes best_model_<YYYYMMDD-HHMMSS>.pkl into model_dir (as the
# previously hard-coded path suggests); that suffix sorts chronologically as text.
balance_models = sorted(glob.glob("./pos_neg_ratio/balance/best_model_*.pkl"))
if not balance_models:
    raise FileNotFoundError(
        "No best_model_*.pkl found in ./pos_neg_ratio/balance; run the training cell first."
    )
m7p_balance = MRP7Pred(balance_models[-1])
df_balance_pred = m7p_balance.predict("../data/unknown.csv")

# (number predicted positive, number predicted negative)
len(df_balance_pred[df_balance_pred["pred"]==1]), len(df_balance_pred[df_balance_pred["pred"]==0])

Loading trained model ... 

  0%|          | 0/80 [00:00<?, ?it/s]

Done!
Generating features ... 


100%|██████████| 80/80 [00:03<00:00, 20.70it/s]

Done!
Start predicting ...




Done!
Writing output ...Done!


(65, 15)

## 3. pos << neg
## pos: neg = 10 : 50

In [104]:
start_train(df_more_neg, RATIO, "./pos_neg_ratio/more_neg")

Featurzing data ... 

100%|██████████| 60/60 [00:04<00:00, 14.52it/s]

Done!
Spliting training and test data ... Done!
Start training ...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Batch computation too fast (0.0851s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 128 out of 135 | elapsed:    5.1s remaining:    0.3s


Best score: 0.9333333333333333
Done!
Best model:
Pipeline(steps=[('sclr', DummyScaler(scaler=MinMaxScaler())),
                ('clf',
                 MLPClassifier(alpha=1e-05, hidden_layer_sizes=12,
                               solver='lbfgs'))])
Evaluate model on test data ... Done! Score: 0.7666666666666667
Getting full score set ... Done! Full scores: {'stats': {'tp': 2.0, 'fp': 4.0, 'tn': 21.0, 'fn': 3.0}, 'score': {'roc_auc': 0.552, 'accuracy': 0.7666666666666667}}


[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:    5.8s finished


In [105]:
# Test on unknown data
import glob

# Load the newest model in this experiment's directory instead of a hard-coded
# timestamped file, so a re-run of the training cell is what gets evaluated.
# Assumes train() writes best_model_<YYYYMMDD-HHMMSS>.pkl into model_dir (as the
# previously hard-coded path suggests); that suffix sorts chronologically as text.
more_neg_models = sorted(glob.glob("./pos_neg_ratio/more_neg/best_model_*.pkl"))
if not more_neg_models:
    raise FileNotFoundError(
        "No best_model_*.pkl found in ./pos_neg_ratio/more_neg; run the training cell first."
    )
m7p_more_neg = MRP7Pred(more_neg_models[-1])
df_more_neg_pred = m7p_more_neg.predict("../data/unknown.csv")

# (number predicted positive, number predicted negative)
len(df_more_neg_pred[df_more_neg_pred["pred"]==1]), len(df_more_neg_pred[df_more_neg_pred["pred"]==0])

Loading trained model ... 

  4%|▍         | 3/80 [00:00<00:02, 26.49it/s]

Done!
Generating features ... 


100%|██████████| 80/80 [00:02<00:00, 28.20it/s]

Done!
Start predicting ...Done!
Writing output ...




Done!


(22, 58)

# Conclusions
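If the training set contains many more positives than negatives, the trained model tends to predict unknown compounds as positive, and vice versa. On the 80 unknown compounds, the positive-heavy model (50:10) predicted 79 positive and 1 negative, the balanced model (30:30) predicted 65 positive and 15 negative, and the negative-heavy model (10:50) predicted 22 positive and 58 negative.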