In [1]:
import pandas as pd
import random
import os
import numpy as np

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings("ignore")

def get_x_y(df):
    if 'class' in df.columns:
        df_x = df.drop(columns=['id', 'class'])
        df_y = df['class']
        return df_x, df_y
    else:
        df_x = df.drop(columns=['id'])
        return df_x

train = pd.read_csv("train.csv")

test = pd.read_csv("test.csv")

train_x, train_y = get_x_y(train)
test_x = get_x_y(test)

snp_info = pd.read_csv("snp_info.csv")

class CFG:
    SEED = 42
    
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(CFG.SEED) # Seed 고정    

use_train = train_x
use_test = test_x

In [2]:
use_train

Unnamed: 0,father,mother,gender,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15
0,0,0,0,2,G G,A G,A A,G A,C A,A A,A A,G G,A A,G G,A G,A A,A A,A A,A A
1,0,0,0,2,A G,A G,C A,A A,A A,A G,A A,G A,A A,A G,A A,G A,G G,A A,A A
2,0,0,0,2,G G,G G,A A,G A,C C,G G,A A,G A,G A,A G,A A,A A,A A,A A,A A
3,0,0,0,1,A A,G G,A A,G A,A A,G G,G G,A A,G G,A G,G G,G G,G G,A A,G G
4,0,0,0,2,G G,G G,C C,A A,C C,A A,A A,A A,A A,G G,A A,A A,A G,A A,G A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,0,0,0,2,A G,A G,A A,G A,C C,A G,A A,G A,A A,G G,A G,G A,A A,A A,A A
258,0,0,0,2,G G,A A,C A,A A,A A,A G,G A,G A,A A,A G,A G,A A,A G,A A,G A
259,0,0,0,1,A G,G G,A A,G A,A A,A G,G G,G A,G A,A A,G G,G G,G G,C A,G G
260,0,0,0,1,A A,G G,A A,G A,A A,G G,G G,A A,G A,A G,A G,G A,G G,C A,G G


# 유전자 다형성

In [3]:
class_le = preprocessing.LabelEncoder()
snp_le = preprocessing.LabelEncoder()
snp_col = [f'SNP_{str(x).zfill(2)}' for x in range(1,16)]

snp_data = []
for col in snp_col:
    snp_data += list(use_train[col].values)
    
train_y = class_le.fit_transform(train_y)
snp_le.fit(snp_data)

use_train = use_train.drop(["father","mother","gender"],axis = 1)
use_test = use_test.drop(["father","mother","gender"],axis = 1)

use_train["snp_poly"] = 0
use_test["snp_poly"] = 0

count_list = []

for j in range(len(use_train)):
    count = 0
    for col in use_train.columns[1:-1]:
        a =use_train[col].str.split(" ")[j][0]
        b =use_train[col].str.split(" ")[j][1]
        
        if a != b:
            count += 1
    
    count_list.append(count/15)

use_train["snp_poly"] = count_list
    
count_list = []
    
for j in range(len(use_test)):
    count = 0
    for col in use_test.columns[1:-1]:
        a =use_test[col].str.split(" ")[j][0]
        b =use_test[col].str.split(" ")[j][1]
        
        if a != b:
            count += 1
    
    count_list.append(count/15)    


use_test["snp_poly"] = count_list

# 컬럼 원핫 인코딩

In [4]:
train_x = train_x.drop(["father","mother","gender"],axis = 1)
test_x = test_x.drop(["father","mother","gender"],axis = 1)


for j in range(len(train_x)):
    train_x["SNP_15"][j] = "1" + str(train_x["SNP_15"][j])

for j in range(len(test_x)):
    test_x["SNP_15"][j] = "1" + str(test_x["SNP_15"][j])
    
for j in range(len(train_x)):
    train_x["SNP_02"][j] = "2" + str(train_x["SNP_02"][j])

for j in range(len(test_x)):
    test_x["SNP_02"][j] = "2" + str(test_x["SNP_02"][j])
    
for j in range(len(train_x)):
    train_x["SNP_01"][j] = "3" + str(train_x["SNP_01"][j])
    train_x["SNP_03"][j] = "3" + str(train_x["SNP_03"][j])
    train_x["SNP_10"][j] = "3" + str(train_x["SNP_10"][j])
    train_x["SNP_12"][j] = "3" + str(train_x["SNP_12"][j])
    train_x["SNP_13"][j] = "3" + str(train_x["SNP_13"][j])
    train_x["SNP_14"][j] = "3" + str(train_x["SNP_14"][j])

for j in range(len(test_x)):
    test_x["SNP_01"][j] = "3" + str(test_x["SNP_01"][j])
    test_x["SNP_03"][j] = "3" + str(test_x["SNP_03"][j])
    test_x["SNP_10"][j] = "3" + str(test_x["SNP_10"][j])
    test_x["SNP_12"][j] = "3" + str(test_x["SNP_12"][j])
    test_x["SNP_13"][j] = "3" + str(test_x["SNP_13"][j])
    test_x["SNP_14"][j] = "3" + str(test_x["SNP_14"][j])
    
for j in range(len(train_x)):
    train_x["SNP_04"][j] = "4" + str(train_x["SNP_04"][j])
    train_x["SNP_05"][j] = "4" + str(train_x["SNP_05"][j])
    train_x["SNP_06"][j] = "4" + str(train_x["SNP_06"][j])
    train_x["SNP_07"][j] = "4" + str(train_x["SNP_07"][j])
    train_x["SNP_08"][j] = "4" + str(train_x["SNP_08"][j])
    train_x["SNP_09"][j] = "4" + str(train_x["SNP_09"][j])

for j in range(len(test_x)):
    test_x["SNP_04"][j] = "4" + str(test_x["SNP_04"][j])
    test_x["SNP_05"][j] = "4" + str(test_x["SNP_05"][j])
    test_x["SNP_06"][j] = "4" + str(test_x["SNP_06"][j])
    test_x["SNP_07"][j] = "4" + str(test_x["SNP_07"][j])
    test_x["SNP_08"][j] = "4" + str(test_x["SNP_08"][j])
    test_x["SNP_09"][j] = "4" + str(test_x["SNP_09"][j])

for j in range(len(train_x)):
    train_x["SNP_11"][j] = "5" + str(train_x["SNP_11"][j])

for j in range(len(test_x)):
    test_x["SNP_11"][j] = "5" + str(test_x["SNP_11"][j])

class_le = preprocessing.LabelEncoder()
snp_le = preprocessing.LabelEncoder()
snp_col = [f'SNP_{str(x).zfill(2)}' for x in range(1,16)]

snp_data = []
for col in snp_col:
    snp_data += list(train_x[col].values)
    
train_y = class_le.fit_transform(train_y)
snp_le.fit(snp_data)

for col in train_x.columns:
    if col in snp_col:
        train_x[col] = snp_le.transform(train_x[col])
        test_x[col] = snp_le.transform(test_x[col])

        
train_x["snp_poly"] = use_train["snp_poly"]
test_x["snp_poly"] = use_test["snp_poly"]

In [5]:
max_value = snp_info["pos"].max()
min_value = snp_info["pos"].min()

minmax_list = []

for i in snp_info["pos"]:
    minmax_list.append(1 - ((i-min_value) / (max_value-min_value)))
    
minmax_list

[0.5399589933134101,
 0.8572861515759044,
 0.5218218298880182,
 0.4217195526137484,
 0.33495913681510514,
 0.32111655205493717,
 0.3103296207210382,
 0.27244152976799485,
 0.21154800629129755,
 0.5635189936192853,
 0.0,
 0.4088281261053408,
 0.3577215709533632,
 0.21449630888659688,
 1.0]

In [6]:
# max_value = snp_info["pos"].max()
# min_value = snp_info["pos"].min()

# minmax_list = []

# for i in snp_info["pos"]:
#     minmax_list.append(1 - ((i-min_value) / (max_value-min_value)))
    
# minmax_list

In [7]:
from tqdm import tqdm

distance_x = pd.DataFrame()
distance_y = pd.DataFrame()

snp_col = [f'SNP_{str(x).zfill(2)}' for x in range(1,16)]

for col in snp_col:
    distance_x[col] = train[col]
    distance_y[col] = train[col]

for i in tqdm(snp_col):
    for j in range(len(train)):
        distance_x[i][j] = train[i][j][0]
        distance_y[i][j] = train[i][j][2]

distance_x["class"] = train["class"]
distance_y["class"] = train["class"]
A1 = ""
A2 = ""
B1 = ""
B2 = ""
C1 = ""
C2 = ""

for i in distance_x.columns:
    snp = distance_x[distance_x["class"] == "A"][i].value_counts()
    snplarge = snp.nlargest(1).index[0]

    A1 += snplarge
    
for i in distance_y.columns:
    snp = distance_y[distance_y["class"] == "A"][i].value_counts()
    snplarge = snp.nlargest(1).index[0]

    A2 += snplarge
    
for i in distance_x.columns:
    snp = distance_x[distance_x["class"] == "B"][i].value_counts()
    snplarge = snp.nlargest(1).index[0]

    B1 += snplarge
    
for i in distance_y.columns:
    snp = distance_y[distance_y["class"] == "B"][i].value_counts()
    snplarge = snp.nlargest(1).index[0]

    B2 += snplarge
    
for i in distance_x.columns:
    snp = distance_x[distance_x["class"] == "C"][i].value_counts()
    snplarge = snp.nlargest(1).index[0]

    C1 += snplarge
    
for i in distance_y.columns:
    snp = distance_y[distance_y["class"] == "C"][i].value_counts()
    snplarge = snp.nlargest(1).index[0]

    C2 += snplarge
    
print("A1: ", A1, 
    "A2: ", A2, 
      "B1:", B1, 
      "B2:", B2, 
      "C1:", C1, 
      "C2:", C2)

mat_dis = pd.DataFrame()
for col in snp_col:
    mat_dis[col] = train[col]

for col in range(len(mat_dis.columns)):
    for i in range(len(distance_x)):
        if distance_x[mat_dis.columns[col]][i] != A1[col]:
            mat_dis[mat_dis.columns[col]][i] = 1 * minmax_list[col]
        else:
            mat_dis[mat_dis.columns[col]][i] = 0

a = mat_dis.sum(axis = "columns")
train_x["distance1"] = a                

mat_dis = pd.DataFrame()
for col in snp_col:
    mat_dis[col] = train[col]

for col in range(len(mat_dis.columns)):
    for i in range(len(distance_x)):
        if distance_x[mat_dis.columns[col]][i] != B1[col]:
            mat_dis[mat_dis.columns[col]][i] = 1 * minmax_list[col]
        else:
            mat_dis[mat_dis.columns[col]][i] = 0

a = mat_dis.sum(axis = "columns")
train_x["distance2"] = a     

mat_dis = pd.DataFrame()

for col in snp_col:
    mat_dis[col] = train[col]
            
for col in range(len(mat_dis.columns)):
    for i in range(len(distance_x)):
        if distance_x[mat_dis.columns[col]][i] != C1[col]:
            mat_dis[mat_dis.columns[col]][i] = 1 * minmax_list[col]
        else:
            mat_dis[mat_dis.columns[col]][i] = 0

a = mat_dis.sum(axis = "columns")
train_x["distance3"] = a    
            
mat_dis2 = pd.DataFrame()
snp_col = [f'pre_SNP_{str(x).zfill(2)}' for x in range(1,16)]

for col in snp_col:
    mat_dis2[col] = train[col[4:]]

for col in range(len(mat_dis2.columns)):
    for i in range(len(distance_y)):
        if distance_y[mat_dis2.columns[col][4:]][i] != A2[col]:
            mat_dis2[mat_dis2.columns[col]][i] = 1 * minmax_list[col]
        else:
            mat_dis2[mat_dis2.columns[col]][i] = 0

a = mat_dis2.sum(axis = "columns")
train_x["distance4"] = a    

mat_dis2 = pd.DataFrame()    


for col in range(len(mat_dis2.columns)):
    for i in range(len(distance_y)):
        if distance_y[mat_dis2.columns[col][4:]][i] != B2[col]:
            mat_dis2[mat_dis2.columns[col]][i] = 1 * minmax_list[col]
        else:
            mat_dis2[mat_dis2.columns[col]][i] = 0
            
a = mat_dis.sum(axis = "columns")
train_x["distance5"] = a    
            
mat_dis2 = pd.DataFrame()    

for col in range(len(mat_dis2.columns)):
    for i in range(len(distance_y)):
        if distance_y[mat_dis2.columns[col][4:]][i] != C2[col]:
            mat_dis2[mat_dis2.columns[col]][i] = 1 * minmax_list[col]
        else:
            mat_dis2[mat_dis2.columns[col]][i] = 0
            
a = mat_dis.sum(axis = "columns")
train_x["distance6"] = a

100%|███████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 29.07it/s]


A1:  AGAGAGGAGAGGGCGA A2:  AGAGAGGAAGGGGAGA B1: GACGCAAGAGAAAAAB B2: GGAACGAGAGGAGAAB C1: GACACAAGAAAAAAGC C2: GGAAAGAAAGAAGAAC


In [8]:
from tqdm import tqdm

distance_x = pd.DataFrame()
distance_y = pd.DataFrame()

snp_col = [f'SNP_{str(x).zfill(2)}' for x in range(1,16)]

for col in snp_col:
    distance_x[col] = test[col]
    distance_y[col] = test[col]

for i in tqdm(snp_col):
    for j in range(len(test)):
        distance_x[i][j] = test[i][j][0]
        distance_y[i][j] = test[i][j][2]

mat_dis = pd.DataFrame()
for col in snp_col:
    mat_dis[col] = test[col]

for col in range(len(mat_dis.columns)):
    for i in range(len(distance_x)):
        if distance_x[mat_dis.columns[col]][i] != A1[col]:
            mat_dis[mat_dis.columns[col]][i] = 1 * minmax_list[col]
        else:
            mat_dis[mat_dis.columns[col]][i] = 0

a = mat_dis.sum(axis = "columns")
test_x["distance1"] = a                

mat_dis = pd.DataFrame()
for col in snp_col:
    mat_dis[col] = train[col]

for col in range(len(mat_dis.columns)):
    for i in range(len(distance_x)):
        if distance_x[mat_dis.columns[col]][i] != B1[col]:
            mat_dis[mat_dis.columns[col]][i] = 1 * minmax_list[col]
        else:
            mat_dis[mat_dis.columns[col]][i] = 0

a = mat_dis.sum(axis = "columns")
test_x["distance2"] = a     

mat_dis = pd.DataFrame()

for col in snp_col:
    mat_dis[col] = train[col]
            
for col in range(len(mat_dis.columns)):
    for i in range(len(distance_x)):
        if distance_x[mat_dis.columns[col]][i] != C1[col]:
            mat_dis[mat_dis.columns[col]][i] = 1 * minmax_list[col]
        else:
            mat_dis[mat_dis.columns[col]][i] = 0

a = mat_dis.sum(axis = "columns")
test_x["distance3"] = a    
            
mat_dis2 = pd.DataFrame()
snp_col = [f'pre_SNP_{str(x).zfill(2)}' for x in range(1,16)]

for col in snp_col:
    mat_dis2[col] = train[col[4:]]

for col in range(len(mat_dis2.columns)):
    for i in range(len(distance_y)):
        if distance_y[mat_dis2.columns[col][4:]][i] != A2[col]:
            mat_dis2[mat_dis2.columns[col]][i] = 1 * minmax_list[col]
        else:
            mat_dis2[mat_dis2.columns[col]][i] = 0

a = mat_dis2.sum(axis = "columns")
test_x["distance4"] = a    

mat_dis2 = pd.DataFrame()    


for col in range(len(mat_dis2.columns)):
    for i in range(len(distance_y)):
        if distance_y[mat_dis2.columns[col][4:]][i] != B2[col]:
            mat_dis2[mat_dis2.columns[col]][i] = 1 * minmax_list[col]
        else:
            mat_dis2[mat_dis2.columns[col]][i] = 0
            
a = mat_dis.sum(axis = "columns")
test_x["distance5"] = a    
            
mat_dis2 = pd.DataFrame()    

for col in range(len(mat_dis2.columns)):
    for i in range(len(distance_y)):
        if distance_y[mat_dis2.columns[col][4:]][i] != C2[col]:
            mat_dis2[mat_dis2.columns[col]][i] = 1 * minmax_list[col]
        else:
            mat_dis2[mat_dis2.columns[col]][i] = 0
            
a = mat_dis.sum(axis = "columns")
test_x["distance6"] = a

for col in test_x.columns:
    test_x[col] = pd.to_numeric(test_x[col])

100%|███████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 43.70it/s]


In [9]:
from sklearn.model_selection import train_test_split
import warnings

####################
# GBM을 빼야 하나 #
# CAT도 빼야 하나 #
##################

warnings.filterwarnings("ignore")

# for col in dis.columns:
#     dis[col] = pd.to_numeric(dis[col])

# trainX, testX, trainY, testY = train_test_split(dis, train_y, test_size = 0.2)

trainX = train_x
trainY = train_y

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

xgb = XGBClassifier()
lgb = LGBMClassifier()
gbm = GradientBoostingClassifier()
cat = CatBoostClassifier()
clf = RandomForestClassifier()
lreg = LogisticRegression()

# 최적의 파라미터 값 찾아보기
from sklearn.model_selection import GridSearchCV
# XGB
param_xgb = {"max_depth": [10,30,50], # 6
              "min_child_weight" : [1,3,6,10], # 1
              "n_estimators": [0, 100,200,300,500] #
              }    
# LGB
param_lgb = {"learning_rate" : [0.01,0.1,0.2,0.3,0.4,0.5], # 0.1
             "max_depth" : [5, 10, 25, 50, 75], # -1
             "num_leaves" : [20, 50, 100,200, 300, 500], # 31
             "n_estimators" : [100, 200, 300,400, 500], # 100
              }
# GBM              
# param_gbm = {"max_depth" : [4,5,6,7,8,9,10],
#              "learning_rate" : [0.01,0.1,0.2,0.3,0.4,0.5],
#              "n_estimators" : [100,200,300,500]
#               }
# CAT
# param_cat = {"depth" : [6,4,5,7,8,9,10],
#           "iterations" : [250,100, 10, 50],
#           "learning_rate" : [0.001,0.01,0.1,0.2,0.3], 
#           "l2_leaf_reg" : [2,5,10,20,30],
#           "border_count" : [254]
#           }
# cls
param_clf = {
    'max_depth' : [3,5,10,15,20, 30],
    'max_features' : [3,5,10, 15, 30, 50],
    'n_estimators' : [80, 100, 150, 200]
}

# Logistic
param_lreg = { 'C' : [1.0, 3, 5, 7, 10,15, 25],
              'max_iter': [10, 50, 200, 100, 300]
              }

gscv_xgb = GridSearchCV (estimator = xgb, param_grid = param_xgb, scoring ='accuracy', cv = None, refit=True, n_jobs=1, verbose=1)
gscv_lgb = GridSearchCV (estimator = lgb, param_grid = param_lgb, scoring ='accuracy', cv = None, refit=True, n_jobs=1, verbose=1)
# gscv_gbm = GridSearchCV (estimator = gbm, param_grid = param_gbm, scoring ='accuracy', cv = 3, refit=True, n_jobs=1, verbose=1)
# gscv_cat = GridSearchCV (estimator = cat, param_grid = param_cat, scoring ='accuracy', cv = 3, refit=True, n_jobs=1, verbose=1)
gscv_lreg = GridSearchCV (estimator = lreg, param_grid = param_lreg, scoring ='accuracy', cv = None, refit=True, n_jobs=1, verbose=1)
gscv_clf = GridSearchCV (estimator = clf, param_grid = param_clf, scoring ='accuracy', cv = None, refit=True, n_jobs=1, verbose=1)
gscv_xgb.fit(trainX, trainY)
gscv_lgb.fit(trainX, trainY)
# gscv_gbm.fit(trainX, trainY)
# gscv_cat.fit(trainX, trainY)
gscv_lreg.fit(trainX, trainY)
gscv_clf.fit(trainX, trainY)

print("="*30)
print('clf 파라미터: ', gscv_clf.best_params_)
print('clf 예측 정확도: {:.4f}'.format(gscv_clf.best_score_))
print("="*30)
print('LGB 파라미터: ', gscv_lgb.best_params_)
print('LGB 예측 정확도: {:.4f}'.format(gscv_lgb.best_score_))
print("="*30)
print('XGB 파라미터: ', gscv_xgb.best_params_)
print('XGB 예측 정확도: {:.4f}'.format(gscv_xgb.best_score_))
print("="*30)
# print('GBM 파라미터: ', gscv_gbm.best_params_)
# print('GBM 예측 정확도: {:.4f}'.format(gscv_gbm.best_score_))
# print("="*30)
# print('CAT 파라미터: ', gscv_cat.best_params_)
# print('CAT 예측 정확도: {:.4f}'.format(gscv_cat.best_score_))
# print("="*30)
print('Lreg 파라미터: ', gscv_lreg.best_params_)
print('Lreg 예측 정확도: {:.4f}'.format(gscv_lreg.best_score_))
print("="*30)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


KeyboardInterrupt: 

In [146]:
train_x

Unnamed: 0,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,...,SNP_13,SNP_14,SNP_15,snp_poly,distance1,distance2,distance3,distance4,distance5,distance6
0,2,11,4,6,16,14,12,12,17,12,...,6,6,0,0.266667,5.392205,0.521822,2.507060,3.632116,2.507060,2.507060
1,2,7,4,8,12,12,13,12,16,12,...,11,6,0,0.466667,4.130760,2.626706,2.641468,2.680836,2.641468,2.641468
2,2,11,5,6,16,15,17,12,16,16,...,6,6,0,0.266667,3.438735,2.475292,3.333492,3.373517,3.333492,3.333492
3,1,6,5,6,16,12,17,17,12,17,...,11,6,2,0.133333,0.214496,5.699531,4.557731,0.633268,4.557731,4.557731
4,2,11,5,9,12,15,12,12,12,12,...,7,6,1,0.133333,4.206019,2.551447,1.693247,3.858734,1.693247,1.693247
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,2,7,4,6,16,15,13,12,16,12,...,6,6,0,0.466667,4.443418,1.470609,3.455847,3.373517,3.455847,3.455847
258,2,11,3,8,12,12,13,16,16,12,...,7,6,1,0.533333,4.126939,2.630527,0.645289,3.538122,0.645289,0.645289
259,1,7,5,6,16,12,13,17,16,16,...,11,8,2,0.400000,0.593558,5.320469,4.178669,1.525198,4.178669,4.178669
260,1,6,5,6,16,12,17,17,12,16,...,11,8,2,0.400000,0.000000,5.914027,4.772227,0.830548,4.772227,4.772227


In [148]:
from sklearn.model_selection import train_test_split

trainX, testX, trainY, testY = train_test_split(train_x, train_y, test_size = 0.3)

from sklearn.metrics import plot_confusion_matrix, confusion_matrix
import matplotlib.pyplot as plt
import warnings
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression

warnings.filterwarnings("ignore")

clf = RandomForestClassifier(random_state=CFG.SEED,
                            max_depth = 20,
                            max_features = 3,
                            n_estimators = 80)
lgbm = LGBMClassifier(n_estimators=400,
                            learning_rate = 0.05,
                            num_leaves = 20,
                            max_depth = 10
                     )
xgbc = XGBClassifier(random_state=CFG.SEED,
                     min_child_weight = 3,
                    max_depth = 10,
                    n_estimators = 500)


lreg = LogisticRegression(C =15,
                        max_iter =200)

clf.fit(trainX, trainY)
y_pred = clf.predict(testX)
print("RandomForest Result")
print(confusion_matrix(testY, y_pred))
print()


lgbm.fit(trainX, trainY)
y_pred = lgbm.predict(testX)
print("LightGBM Result")
print(confusion_matrix(testY, y_pred))
print()


xgbc.fit(trainX, trainY)
y_pred = xgbc.predict(testX)
print("XGBC Result")
print(confusion_matrix(testY, y_pred))
print()

lreg.fit(trainX, trainY)
y_pred = lreg.predict(testX)
print("lreg Result")
print(confusion_matrix(testY, y_pred))
print()

RandomForest Result
[[19  0  0]
 [ 0 35  0]
 [ 0  4 21]]

LightGBM Result
[[19  0  0]
 [ 0 33  2]
 [ 0  2 23]]

XGBC Result
[[19  0  0]
 [ 0 34  1]
 [ 0  0 25]]

lreg Result
[[19  0  0]
 [ 0 35  0]
 [ 0  0 25]]



In [10]:
from lightgbm import LGBMClassifier
lgbm = LGBMClassifier(n_estimators=400,
                            learning_rate = 0.01,
                            num_leaves = 20,
                            max_depth = 10
                     )

lgbm.fit(train_x, train_y)

preds = lgbm.predict(test_x)

submit = pd.read_csv('sample_submission.csv')

submit['class'] = class_le.inverse_transform(preds)

submit.to_csv('newdistance_LGBM.csv', index=False)

submit1 = pd.read_csv("newdistance_LGBM.csv")

submit1["class"].value_counts()

1    84
0    50
2    41
Name: class, dtype: int64

In [166]:
from sklearn.linear_model import LogisticRegression
lreg = LogisticRegression(C =7,
                        max_iter =50)

lreg.fit(train_x, train_y)

new_preds = lreg.predict(test_x)


transform = {0:"A", 1:"B", 2:"C"}

preds = []
for i in range(len(new_preds)):
    preds.append(transform[new_preds[i]])

submit = pd.read_csv('sample_submission.csv')

# submit['class'] = class_le.inverse_transform(preds)

submit['class'] = preds

submit.to_csv('all_mix750_Lreg_best.csv', index=False)

submit1 = pd.read_csv("all_mix750_Lreg_best.csv")

submit1["class"].value_counts()

B    83
A    51
C    41
Name: class, dtype: int64

In [167]:
from sklearn.linear_model import LogisticRegression
lreg = LogisticRegression(C =15,
                        max_iter =200)

lreg.fit(train_x, train_y)

new_preds = lreg.predict(test_x)


transform = {0:"A", 1:"B", 2:"C"}

preds = []
for i in range(len(new_preds)):
    preds.append(transform[new_preds[i]])

submit = pd.read_csv('sample_submission.csv')

# submit['class'] = class_le.inverse_transform(preds)

submit['class'] = preds

submit.to_csv('all_mix15200_Lreg_best.csv', index=False)

submit1 = pd.read_csv("all_mix15200_Lreg_best.csv")

submit1["class"].value_counts()

B    85
A    51
C    39
Name: class, dtype: int64

In [11]:
clf = RandomForestClassifier(random_state=CFG.SEED,
                            max_depth = 10,
                            max_features = 3,
                            n_estimators = 150)

clf.fit(train_x, train_y)

preds = clf.predict(test_x)

submit = pd.read_csv('sample_submission.csv')

submit['class'] = class_le.inverse_transform(preds)

submit.to_csv('new_distance2_clf_LGBM.csv', index=False)

submit1 = pd.read_csv("new_distance2_clf_LGBM.csv")

submit1["class"].value_counts()

1    87
0    50
2    38
Name: class, dtype: int64

In [12]:
xgbc = XGBClassifier(random_state=CFG.SEED,
                     min_child_weight = 3,
                    max_depth = 10,
                    n_estimators = 500)
xgbc.fit(train_x, train_y)

preds = xgbc.predict(test_x)

submit = pd.read_csv('sample_submission.csv')

submit['class'] = class_le.inverse_transform(preds)

submit.to_csv('new_distance2_xgbc.csv', index=False)

submit1 = pd.read_csv("new_distance2_xgbc.csv")

submit1["class"].value_counts()

1    84
0    51
2    40
Name: class, dtype: int64

# number2

In [13]:
import pandas as pd
import random
import os
import numpy as np

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings("ignore")

def get_x_y(df):
    if 'class' in df.columns:
        df_x = df.drop(columns=['id', 'class'])
        df_y = df['class']
        return df_x, df_y
    else:
        df_x = df.drop(columns=['id'])
        return df_x

train = pd.read_csv("train.csv")

test = pd.read_csv("test.csv")

train_x, train_y = get_x_y(train)
test_x = get_x_y(test)

snp_info = pd.read_csv("snp_info.csv")

class CFG:
    SEED = 42
    
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(CFG.SEED) # Seed 고정    


In [14]:
train_x["snp_poly"] = 0
test_x["snp_poly"] = 0

train_x = train_x.drop(["father","mother","gender"],axis = 1)
test_x = test_x.drop(["father","mother","gender"],axis = 1)

count_list = []

for j in range(len(train_x)):
    count = 0
    for col in train_x.columns[1:-1]:
        a =train_x[col].str.split(" ")[j][0]
        b =train_x[col].str.split(" ")[j][1]
        
        if a != b:
            count += 1
    
    count_list.append(count/15)

train_x["snp_poly"] = count_list
    
count_list = []
    
for j in range(len(test_x)):
    count = 0
    for col in test_x.columns[1:-1]:
        a =test_x[col].str.split(" ")[j][0]
        b =test_x[col].str.split(" ")[j][1]
        
        if a != b:
            count += 1
    
    count_list.append(count/15)    


test_x["snp_poly"] = count_list

In [134]:
train_x

Unnamed: 0,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15,snp_poly
0,2,G G,A G,A A,G A,C A,A A,A A,G G,A A,G G,A G,A A,A A,A A,A A,0.266667
1,2,A G,A G,C A,A A,A A,A G,A A,G A,A A,A G,A A,G A,G G,A A,A A,0.466667
2,2,G G,G G,A A,G A,C C,G G,A A,G A,G A,A G,A A,A A,A A,A A,A A,0.266667
3,1,A A,G G,A A,G A,A A,G G,G G,A A,G G,A G,G G,G G,G G,A A,G G,0.133333
4,2,G G,G G,C C,A A,C C,A A,A A,A A,A A,G G,A A,A A,A G,A A,G A,0.133333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,2,A G,A G,A A,G A,C C,A G,A A,G A,A A,G G,A G,G A,A A,A A,A A,0.466667
258,2,G G,A A,C A,A A,A A,A G,G A,G A,A A,A G,A G,A A,A G,A A,G A,0.533333
259,1,A G,G G,A A,G A,A A,A G,G G,G A,G A,A A,G G,G G,G G,C A,G G,0.400000
260,1,A A,G G,A A,G A,A A,G G,G G,A A,G A,A G,A G,G A,G G,C A,G G,0.400000


In [15]:
class_le = preprocessing.LabelEncoder()
snp_le = preprocessing.LabelEncoder()
snp_col = [f'SNP_{str(x).zfill(2)}' for x in range(1,16)]

snp_data = []
for col in snp_col:
    snp_data += list(train_x[col].values)
    
train_y = class_le.fit_transform(train_y)
snp_le.fit(snp_data)

for col in train_x.columns:
    if col in snp_col:
        train_x[col] = snp_le.transform(train_x[col])
        test_x[col] = snp_le.transform(test_x[col])


In [16]:
max_value = snp_info["pos"].max()
min_value = snp_info["pos"].min()

minmax_list = []

for i in snp_info["pos"]:
    minmax_list.append(1 - ((i-min_value) / (max_value-min_value)))
    
minmax_list

[0.5399589933134101,
 0.8572861515759044,
 0.5218218298880182,
 0.4217195526137484,
 0.33495913681510514,
 0.32111655205493717,
 0.3103296207210382,
 0.27244152976799485,
 0.21154800629129755,
 0.5635189936192853,
 0.0,
 0.4088281261053408,
 0.3577215709533632,
 0.21449630888659688,
 1.0]

In [17]:
from tqdm import tqdm

distance_x = pd.DataFrame()
distance_y = pd.DataFrame()

snp_col = [f'SNP_{str(x).zfill(2)}' for x in range(1,16)]

for col in snp_col:
    distance_x[col] = train[col]
    distance_y[col] = train[col]

for i in tqdm(snp_col):
    for j in range(len(train)):
        distance_x[i][j] = train[i][j][0]
        distance_y[i][j] = train[i][j][2]

distance_x["class"] = train["class"]
distance_y["class"] = train["class"]
A1 = ""
A2 = ""
B1 = ""
B2 = ""
C1 = ""
C2 = ""

for i in distance_x.columns:
    snp = distance_x[distance_x["class"] == "A"][i].value_counts()
    snplarge = snp.nlargest(1).index[0]

    A1 += snplarge
    
for i in distance_y.columns:
    snp = distance_y[distance_y["class"] == "A"][i].value_counts()
    snplarge = snp.nlargest(1).index[0]

    A2 += snplarge
    
for i in distance_x.columns:
    snp = distance_x[distance_x["class"] == "B"][i].value_counts()
    snplarge = snp.nlargest(1).index[0]

    B1 += snplarge
    
for i in distance_y.columns:
    snp = distance_y[distance_y["class"] == "B"][i].value_counts()
    snplarge = snp.nlargest(1).index[0]

    B2 += snplarge
    
for i in distance_x.columns:
    snp = distance_x[distance_x["class"] == "C"][i].value_counts()
    snplarge = snp.nlargest(1).index[0]

    C1 += snplarge
    
for i in distance_y.columns:
    snp = distance_y[distance_y["class"] == "C"][i].value_counts()
    snplarge = snp.nlargest(1).index[0]

    C2 += snplarge
    
print("A1: ", A1, 
    "A2: ", A2, 
      "B1:", B1, 
      "B2:", B2, 
      "C1:", C1, 
      "C2:", C2)

mat_dis = pd.DataFrame()
for col in snp_col:
    mat_dis[col] = train[col]

for col in range(len(mat_dis.columns)):
    for i in range(len(distance_x)):
        if distance_x[mat_dis.columns[col]][i] != A1[col]:
            mat_dis[mat_dis.columns[col]][i] = 1 * minmax_list[col]
        else:
            mat_dis[mat_dis.columns[col]][i] = 0

a = mat_dis.sum(axis = "columns")
train_x["distance1"] = a                

mat_dis = pd.DataFrame()
for col in snp_col:
    mat_dis[col] = train[col]

for col in range(len(mat_dis.columns)):
    for i in range(len(distance_x)):
        if distance_x[mat_dis.columns[col]][i] != B1[col]:
            mat_dis[mat_dis.columns[col]][i] = 1 * minmax_list[col]
        else:
            mat_dis[mat_dis.columns[col]][i] = 0

a = mat_dis.sum(axis = "columns")
train_x["distance2"] = a     

mat_dis = pd.DataFrame()

for col in snp_col:
    mat_dis[col] = train[col]
            
for col in range(len(mat_dis.columns)):
    for i in range(len(distance_x)):
        if distance_x[mat_dis.columns[col]][i] != C1[col]:
            mat_dis[mat_dis.columns[col]][i] = 1 * minmax_list[col]
        else:
            mat_dis[mat_dis.columns[col]][i] = 0

a = mat_dis.sum(axis = "columns")
train_x["distance3"] = a    
            
mat_dis2 = pd.DataFrame()
snp_col = [f'pre_SNP_{str(x).zfill(2)}' for x in range(1,16)]

for col in snp_col:
    mat_dis2[col] = train[col[4:]]

for col in range(len(mat_dis2.columns)):
    for i in range(len(distance_y)):
        if distance_y[mat_dis2.columns[col][4:]][i] != A2[col]:
            mat_dis2[mat_dis2.columns[col]][i] = 1 * minmax_list[col]
        else:
            mat_dis2[mat_dis2.columns[col]][i] = 0

a = mat_dis2.sum(axis = "columns")
train_x["distance4"] = a    

mat_dis2 = pd.DataFrame()    


for col in range(len(mat_dis2.columns)):
    for i in range(len(distance_y)):
        if distance_y[mat_dis2.columns[col][4:]][i] != B2[col]:
            mat_dis2[mat_dis2.columns[col]][i] = 1 * minmax_list[col]
        else:
            mat_dis2[mat_dis2.columns[col]][i] = 0
            
a = mat_dis.sum(axis = "columns")
train_x["distance5"] = a    
            
mat_dis2 = pd.DataFrame()    

for col in range(len(mat_dis2.columns)):
    for i in range(len(distance_y)):
        if distance_y[mat_dis2.columns[col][4:]][i] != C2[col]:
            mat_dis2[mat_dis2.columns[col]][i] = 1 * minmax_list[col]
        else:
            mat_dis2[mat_dis2.columns[col]][i] = 0
            
a = mat_dis.sum(axis = "columns")
train_x["distance6"] = a

100%|███████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 28.67it/s]


A1:  AGAGAGGAGAGGGCGA A2:  AGAGAGGAAGGGGAGA B1: GACGCAAGAGAAAAAB B2: GGAACGAGAGGAGAAB C1: GACACAAGAAAAAAGC C2: GGAAAGAAAGAAGAAC


In [38]:
from sklearn.model_selection import train_test_split
import warnings

####################
# GBM을 빼야 하나 #
# CAT도 빼야 하나 #
##################

warnings.filterwarnings("ignore")

# for col in dis.columns:
#     dis[col] = pd.to_numeric(dis[col])

# trainX, testX, trainY, testY = train_test_split(dis, train_y, test_size = 0.2)

trainX = train_x
trainY = train_y

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

xgb = XGBClassifier()
lgb = LGBMClassifier()
gbm = GradientBoostingClassifier()
cat = CatBoostClassifier()
clf = RandomForestClassifier()
lreg = LogisticRegression()

# 최적의 파라미터 값 찾아보기
from sklearn.model_selection import GridSearchCV
# XGB
param_xgb = {"max_depth": [10,30,50], # 6
              "min_child_weight" : [1,3,6,10], # 1
              "n_estimators": [0, 100,200,300,500] #
              }    
# LGB
param_lgb = {"learning_rate" : [0.01,0.1,0.2,0.3,0.4,0.5], # 0.1
             "max_depth" : [5, 10, 25, 50, 75], # -1
             "num_leaves" : [20, 50, 100,200, 300, 500], # 31
             "n_estimators" : [100, 200, 300,400, 500], # 100
              }
# GBM              
# param_gbm = {"max_depth" : [4,5,6,7,8,9,10],
#              "learning_rate" : [0.01,0.1,0.2,0.3,0.4,0.5],
#              "n_estimators" : [100,200,300,500]
#               }
# CAT
# param_cat = {"depth" : [6,4,5,7,8,9,10],
#           "iterations" : [250,100, 10, 50],
#           "learning_rate" : [0.001,0.01,0.1,0.2,0.3], 
#           "l2_leaf_reg" : [2,5,10,20,30],
#           "border_count" : [254]
#           }
# cls
param_clf = {
    'max_depth' : [3,5,10,15,20, 30],
    'max_features' : [3,5,10, 15, 30, 50],
    'n_estimators' : [80, 100, 150, 200]
}

# Logistic
param_lreg = { 'C' : [1.0, 3, 5, 7, 10,15, 25],
              'max_iter': [10, 50, 200, 100, 300]
              }

gscv_xgb = GridSearchCV (estimator = xgb, param_grid = param_xgb, scoring ='accuracy', cv = None, refit=True, n_jobs=1, verbose=1)
gscv_lgb = GridSearchCV (estimator = lgb, param_grid = param_lgb, scoring ='accuracy', cv = None, refit=True, n_jobs=1, verbose=1)
# gscv_gbm = GridSearchCV (estimator = gbm, param_grid = param_gbm, scoring ='accuracy', cv = 3, refit=True, n_jobs=1, verbose=1)
# gscv_cat = GridSearchCV (estimator = cat, param_grid = param_cat, scoring ='accuracy', cv = 3, refit=True, n_jobs=1, verbose=1)
gscv_lreg = GridSearchCV (estimator = lreg, param_grid = param_lreg, scoring ='accuracy', cv = None, refit=True, n_jobs=1, verbose=1)
gscv_clf = GridSearchCV (estimator = clf, param_grid = param_clf, scoring ='accuracy', cv = None, refit=True, n_jobs=1, verbose=1)
gscv_xgb.fit(trainX, trainY)
gscv_lgb.fit(trainX, trainY)
# gscv_gbm.fit(trainX, trainY)
# gscv_cat.fit(trainX, trainY)
gscv_lreg.fit(trainX, trainY)
gscv_clf.fit(trainX, trainY)

print("="*30)
print('clf 파라미터: ', gscv_clf.best_params_)
print('clf 예측 정확도: {:.4f}'.format(gscv_clf.best_score_))
print("="*30)
print('LGB 파라미터: ', gscv_lgb.best_params_)
print('LGB 예측 정확도: {:.4f}'.format(gscv_lgb.best_score_))
print("="*30)
print('XGB 파라미터: ', gscv_xgb.best_params_)
print('XGB 예측 정확도: {:.4f}'.format(gscv_xgb.best_score_))
print("="*30)
# print('GBM 파라미터: ', gscv_gbm.best_params_)
# print('GBM 예측 정확도: {:.4f}'.format(gscv_gbm.best_score_))
# print("="*30)
# print('CAT 파라미터: ', gscv_cat.best_params_)
# print('CAT 예측 정확도: {:.4f}'.format(gscv_cat.best_score_))
# print("="*30)
print('Lreg 파라미터: ', gscv_lreg.best_params_)
print('Lreg 예측 정확도: {:.4f}'.format(gscv_lreg.best_score_))
print("="*30)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
Fitting 5 folds for each of 900 candidates, totalling 4500 fits
Fitting 5 folds for each of 35 candidates, totalling 175 fits
Fitting 5 folds for each of 144 candidates, totalling 720 fits
clf 파라미터:  {'max_depth': 30, 'max_features': 3, 'n_estimators': 80}
clf 예측 정확도: 0.9695
LGB 파라미터:  {'learning_rate': 0.4, 'max_depth': 5, 'n_estimators': 100, 'num_leaves': 20}
LGB 예측 정확도: 0.9504
XGB 파라미터:  {'max_depth': 10, 'min_child_weight': 3, 'n_estimators': 200}
XGB 예측 정확도: 0.9429
Lreg 파라미터:  {'C': 1.0, 'max_iter': 10}
Lreg 예측 정확도: 0.9465


In [19]:
from sklearn.model_selection import train_test_split

trainX, testX, trainY, testY = train_test_split(train_x, train_y, test_size = 0.3)

from sklearn.metrics import plot_confusion_matrix, confusion_matrix
import matplotlib.pyplot as plt
import warnings
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression

warnings.filterwarnings("ignore")

clf = RandomForestClassifier(random_state=CFG.SEED,
                            max_depth = 30,
                            max_features = 3,
                            n_estimators = 80)
lgbm = LGBMClassifier(n_estimators=400,
                            learning_rate = 0.01,
                            num_leaves = 20,
                            max_depth = 10
                     )
xgbc = XGBClassifier(random_state=CFG.SEED,
                     min_child_weight = 3,
                    max_depth = 10,
                    n_estimators = 500)


lreg = LogisticRegression(C =7,
                        max_iter =50)

clf.fit(trainX, trainY)
y_pred = clf.predict(testX)
print("RandomForest Result")
print(confusion_matrix(testY, y_pred))
print()


lgbm.fit(trainX, trainY)
y_pred = lgbm.predict(testX)
print("LightGBM Result")
print(confusion_matrix(testY, y_pred))
print()


xgbc.fit(trainX, trainY)
y_pred = xgbc.predict(testX)
print("XGBC Result")
print(confusion_matrix(testY, y_pred))
print()

lreg.fit(trainX, trainY)
y_pred = lreg.predict(testX)
print("lreg Result")
print(confusion_matrix(testY, y_pred))
print()

RandomForest Result
[[20  0  0]
 [ 0 29  2]
 [ 0  3 25]]

LightGBM Result
[[20  0  0]
 [ 0 30  1]
 [ 0  3 25]]

XGBC Result
[[20  0  0]
 [ 0 29  2]
 [ 0  4 24]]

lreg Result
[[20  0  0]
 [ 0 29  2]
 [ 0  3 25]]



In [43]:
test_x

Unnamed: 0,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15,snp_poly
0,1,1,5,0,4,0,1,5,4,4,1,1,4,5,2,4,0.666667
1,2,5,1,3,5,3,0,0,0,0,5,1,0,0,0,0,0.133333
2,2,5,1,0,0,2,1,0,0,0,1,0,4,5,0,5,0.333333
3,2,5,1,2,0,3,0,0,0,0,5,0,4,1,0,0,0.266667
4,1,0,5,0,5,0,5,5,0,5,1,5,4,5,0,5,0.133333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,2,1,5,3,0,2,1,0,5,0,5,5,0,0,0,4,0.266667
171,2,5,0,0,0,2,1,0,0,0,1,0,0,1,0,4,0.333333
172,2,5,0,0,0,2,1,0,0,0,5,1,0,1,0,5,0.266667
173,2,1,5,2,4,3,5,0,4,0,5,1,0,0,0,0,0.333333


In [20]:
from lightgbm import LGBMClassifier
lgbm = LGBMClassifier(n_estimators=400,
                            learning_rate = 0.01,
                            num_leaves = 20,
                            max_depth = 10
                     )

lgbm.fit(train_x, train_y)

preds = lgbm.predict(test_x)

submit = pd.read_csv('sample_submission.csv')

submit['class'] = class_le.inverse_transform(preds)

submit.to_csv('allmix_withoutcm_LGBM.csv', index=False)

submit1 = pd.read_csv("allmix_withoutcm_LGBM.csv")

submit1["class"].value_counts()

ValueError: Number of features of the model must match the input. Model n_features_ is 23 and input n_features is 17

In [48]:
from sklearn.linear_model import LogisticRegression
lreg = LogisticRegression(C =7,
                        max_iter =50)

lreg.fit(train_x, train_y)

preds = lreg.predict(test_x)

submit = pd.read_csv('sample_submission.csv')

submit['class'] = class_le.inverse_transform(preds)

submit.to_csv('allmix_withoutcm_Lreg.csv', index=False)

submit1 = pd.read_csv("allmix_withoutcm_Lreg.csv")

submit1["class"].value_counts()

B    85
A    51
C    39
Name: class, dtype: int64

In [49]:
clf = RandomForestClassifier(random_state=CFG.SEED,
                            max_depth = 30,
                            max_features = 3,
                            n_estimators = 80)

clf.fit(train_x, train_y)

preds = clf.predict(test_x)

submit = pd.read_csv('sample_submission.csv')

submit['class'] = class_le.inverse_transform(preds)

submit.to_csv('allmix_withoutcm_clf.csv', index=False)

submit1 = pd.read_csv("allmix_withoutcm_clf.csv")

submit1["class"].value_counts()

B    87
A    50
C    38
Name: class, dtype: int64