# 기본 세팅

In [3]:
import os
import numpy as np

import pandas as pd
from pandas import DataFrame
# Widen pandas' display limits so that wide/long frames can be inspected.
# Both options use their fully-qualified keys: a bare 'max_rows' is treated as
# a pattern and can match more than one option on newer pandas releases
# (e.g. 'styler.render.max_rows'), which raises OptionError.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 30)

from collections import Counter
from tqdm import tqdm, tqdm_notebook

import warnings
warnings.filterwarnings(action='ignore')
from __future__ import print_function

#smiles 분자코드 변환 라이브러리 rdkit 중 일부 사용
from rdkit.Chem import MolFromSmiles, Descriptors
from rdkit.Chem.AllChem import GetHashedAtomPairFingerprintAsBitVect
from rdkit.Chem.AllChem import GetHashedTopologicalTorsionFingerprintAsBitVect
# from rdkit.Chem.Draw import IPythonConsole

import lightgbm as lgbm # lightgbm 부스팅 알고리즘 사용

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, f1_score, accuracy_score

# F1 score wrapped as a custom LightGBM evaluation metric
def lgb_f1_score(y_hat, data):
    """Compute the F1 score in the shape LightGBM expects from ``feval``.

    Args:
        y_hat: Predicted scores for the evaluated dataset. Scores of 0.5 or
            more are counted as the positive class (presumably these are
            probabilities from the binary objective -- confirm if a custom
            objective is ever used).
        data: Dataset-like object exposing ``get_label()`` with the true
            labels.

    Returns:
        A tuple ``('f1', score, True)``: metric name, metric value, and a
        flag telling LightGBM that higher values are better.
    """
    y_true = data.get_label()
    # f1_score needs hard class labels. Use the same inclusive 0.5 cut-off as
    # the rest of this file; np.round would send exactly 0.5 to class 0.
    y_pred = (np.asarray(y_hat) >= 0.5).astype(int)
    return 'f1', f1_score(y_true, y_pred), True



# EDA

In [1]:
train = pd.read_csv("train.csv")
# train.head()

# Add two more SMILES-to-bit-vector encodings on top of the three existing ones.

# Encoding 1: hashed atom-pair fingerprint, 1024 bits per molecule.
# The bit rows are gathered as plain lists and converted to a frame once;
# building and concatenating one single-row DataFrame per molecule is far
# slower and produces the same integer-labelled columns 0..1023.
atom_pair_rows = [
    list(GetHashedAtomPairFingerprintAsBitVect(
        MolFromSmiles(smiles), nBits=1024, includeChirality=True))
    for smiles in tqdm(train['SMILES'])
]
result = DataFrame(atom_pair_rows)

# Encoding 2: hashed topological-torsion fingerprint, 1024 bits per molecule.
# Rows are collected first and turned into a single frame in one step.
torsion_rows = [
    list(GetHashedTopologicalTorsionFingerprintAsBitVect(
        MolFromSmiles(smiles), nBits=1024, includeChirality=True))
    for smiles in tqdm(train['SMILES'])
]

# Rename every column from <n> to 'k<n>' in a single pass (renaming one column
# per iteration copies the whole frame each time). The prefix presumably keeps
# these columns distinct from the other integer-named fingerprint columns.
result11 = DataFrame(torsion_rows).add_prefix('k')
    

# Extra per-molecule descriptor values computed by RDKit from the SMILES strings.

# Parse every SMILES string once and reuse the molecule for all descriptors.
# MolFromSmiles is called directly because only that name (not the `Chem`
# package) is imported in this file.
mols = [MolFromSmiles(smiles) for smiles in train['SMILES']]

BJ = [Descriptors.BalabanJ(mol) for mol in mols]

BCT = [Descriptors.BertzCT(mol) for mol in mols]

Ch = [Descriptors.Chi0(mol) for mol in mols]

FDM = [Descriptors.FpDensityMorgan1(mol) for mol in mols]

HM = [Descriptors.HeavyAtomMolWt(mol) for mol in mols]

IPC = [Descriptors.Ipc(mol) for mol in mols]

MR = [Descriptors.MolMR(mol) for mol in mols]

SMR = [Descriptors.SMR_VSA10(mol) for mol in mols]

SV = [Descriptors.SlogP_VSA10(mol) for mol in mols]

TS = [Descriptors.TPSA(mol) for mol in mols]

# One row per molecule, one column per descriptor (column order is kept).
new = DataFrame(
    list(zip(IPC, MR, TS, SMR, Ch, FDM, HM, BJ, BCT, SV)),
    columns=['IPC', 'MR', 'TS', 'SMR', 'Ch', 'FDM', 'HM', 'BJ', 'BCT', 'SV'],
)
    
train2 = train.copy()

# 'SMILES' holds the raw strings and 'label' is the response variable, so both
# are removed before the float64 cast: the cast fails on string data, and the
# label must not end up among the predictors.
# NOTE(review): assumes train.csv has no other non-numeric columns -- confirm.
train_features = train2.drop(columns=['SMILES', 'label'])
train_total = pd.concat(
    [train_features, result, result11, new], axis=1
).reset_index(drop=True)
train_total = train_total.astype(np.float64)

## The test data is processed the same way (only the names differ), so that code is omitted.

# Final modelling dataset
# (1024 * 5) (five bit-vector encodings of 1024 columns each) + 4 (original
# variables) + 10 (added descriptors) = 5134 predictors
train_set = train_total.iloc[:, :5134]
train_label = train['label']  # response variable, kept separate
# NOTE(review): predict_total is built by the omitted test-data code; it must
# expose the same feature columns as train_set -- confirm.
test_set = predict_total.copy()

# 모델링

In [2]:
NUM_BOOST_ROUND = 10000  # upper bound on boosting rounds per fold
n_splits = 10            # number of stratified folds
SEED = 1217              # single seed shared by the model and the fold splitter

# Parameters were tuned by starting from a baseline and changing one value at a time.
lgbm_param = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'random_state': SEED,  # reuse SEED rather than repeating the literal
    'learning_rate': 0.065,
    'subsample': 0.9,
    'colsample_bytree': 0.8,
    'early_stopping_rounds': 200,
    'subsample_freq': 3,
    'reg_lambda': 2,
    'reg_alpha': 5,
    'num_leaves': 1023,
    'scale_pos_weight': 1.25
}

# Containers for the results
evals_result = {}   # filled by lgbm.train with the per-round metric history
train_f1 = []       # F1 on the training part of each fold
valid_f1 = []       # F1 on the held-out part of each fold
final_test = np.zeros(test_set.shape[0])  # running sum of test predictions over folds

kfolds = StratifiedKFold(n_splits=n_splits, random_state=SEED, shuffle=True)

# Run the stratified K-fold training loop
for ind, (trn_ind, val_ind) in tqdm_notebook(
        enumerate(kfolds.split(X=train_set, y=train_label)), total=n_splits):

    # split() yields row positions, so features and labels are both selected
    # positionally; label-based [] only works while the index is 0..n-1.
    X_train, y_train = train_set.iloc[trn_ind], train_label.iloc[trn_ind]
    X_valid, y_valid = train_set.iloc[val_ind], train_label.iloc[val_ind]

    dtrain = lgbm.Dataset(X_train, y_train)
    dvalid = lgbm.Dataset(X_valid, y_valid, reference=dtrain)

    # NOTE(review): verbose_eval / evals_result are keyword arguments of older
    # LightGBM releases; newer releases expect callbacks instead -- confirm
    # against the installed version before upgrading.
    model = lgbm.train(lgbm_param,
                       dtrain,
                       NUM_BOOST_ROUND,
                       valid_sets=(dtrain, dvalid),
                       valid_names=('train', 'valid'),
                       verbose_eval=200,
                       feval=lgb_f1_score,
                       evals_result=evals_result)

    # F1 on the training part of this fold (0.5 cut-off, inclusive)
    train_x_predict = model.predict(X_train)
    train_x_predict_binary = (np.asarray(train_x_predict) >= 0.5).astype(int)
    train_f1.append(f1_score(y_train, train_x_predict_binary))

    # F1 on the held-out part of this fold
    valid_x_predict = model.predict(X_valid)
    valid_x_predict_binary = (np.asarray(valid_x_predict) >= 0.5).astype(int)
    valid_f1.append(f1_score(y_valid, valid_x_predict_binary))

    # Accumulate test predictions; they are averaged over the folds later.
    test_pred = model.predict(test_set)
    final_test += test_pred

    print('=' * 80)

In [None]:
# Diagnostics on the training part of the most recent fold.
y_pred = model.predict(X_train)
print(roc_auc_score(y_true=y_train, y_score=y_pred))

# Hard labels: scores of 0.5 or more count as class 1.
train_pred2 = np.where(np.asarray(y_pred) >= 0.5, 1, 0).tolist()

print(Counter(train_pred2))                       # predicted class counts
print(confusion_matrix(y_train, train_pred2))
print(classification_report(y_train, train_pred2, digits=4))

In [None]:
# Diagnostics on the held-out part of the most recent fold.
valid_pred = model.predict(X_valid)
print(roc_auc_score(y_true=y_valid, y_score=valid_pred))

# Hard labels: scores of 0.5 or more count as class 1.
valid_pred2 = np.where(np.asarray(valid_pred) >= 0.5, 1, 0).tolist()

print(Counter(valid_pred2))                       # predicted class counts
print(confusion_matrix(y_valid, valid_pred2))
print(classification_report(y_valid, valid_pred2, digits=4))

In [5]:
# Average the accumulated per-fold test predictions.
real_final = final_test / n_splits

# Hard labels: averaged scores of 0.5 or more count as class 1.
test_predict = np.where(np.asarray(real_final) >= 0.5, 1, 0).tolist()
print(Counter(test_predict))

# Submission frame: the SMILES column plus the predicted label.
real_pred = DataFrame(predict['SMILES']).assign(label=test_predict)

real_pred.to_csv("최종결과물.csv", index=False)