In [8]:
!pip3 install lightgbm scikit-learn tqdm catboost optuna xgboost ipywidgets imblearn
# 모델을 위해 필요한 라이브러리 설치(우수 예시에만 있는 부분)



[0m

In [14]:
import pandas as pd
import sklearn
import numpy as np
import random
import os
from imblearn.over_sampling import BorderlineSMOTE
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import SuccessiveHalvingPruner
from catboost import CatBoostClassifier, Pool

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import StratifiedKFold


In [15]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed=42
seed_everything(seed) # Seed 고정

In [16]:
df_train = pd.read_csv("train.csv") # 학습용 데이터
df_test = pd.read_csv(r"submission(0.767).csv") # 테스트 데이터(제출파일의 데이터)

In [17]:
def category_all(df_train, df_test):
    label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_position",
    "response_corporate",
    "expected_timeline",
    'customer_idx',
    'lead_owner',
    'id_strategic_ver',
    'it_strategic_ver',
    'idit_strategic_ver',
    'ver_cus',
    'ver_pro',
    ]
    for i in label_columns:
        df_train[i] = df_train[i].astype('category')
        df_test[i] = df_test[i].astype('category')
    return df_train, df_test    


def label_encoding(series: pd.Series) -> pd.Series:
    """범주형 데이터를 시리즈 형태로 받아 숫자형 데이터로 변환합니다."""
    
    my_dict = {}

    # 모든 요소를 문자열로 변환
    series = series.astype(str)

    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx
    series = series.map(my_dict)

    return series

        
def manage_label_encoding(df_train, df_test):
    tmp = len(df_train)
    label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_position",
    "response_corporate",
    "expected_timeline",
   'customer_idx',
   'lead_owner',
    'id_strategic_ver',
    'it_strategic_ver',
    'idit_strategic_ver',
    'ver_cus',
    'ver_pro',
    ]
    
    df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

    for col in label_columns:
        df_all[col] = label_encoding(df_all[col])
    
    for col in label_columns:  
        df_train[col] = df_all.iloc[: tmp][col]
        df_test[col] = df_all.iloc[tmp :][col]

    return df_train, df_test

def preprocessing(df_train, df_test):
    df_train.drop_duplicates(keep='first', inplace=True)
    df_train.reset_index(inplace=True)
    df_train.drop('index',inplace=True, axis=1)
    df_test_id_temp = df_test['id'].copy()
    df_all = pd.concat([df_train,df_test.drop('id',axis=1)])
    df_all.drop('customer_country.1',axis=1,inplace=True)
    df_all.id_strategic_ver.fillna(0., inplace=True)
    df_all.it_strategic_ver.fillna(0., inplace=True)
    df_all.idit_strategic_ver.fillna(0., inplace=True)
    df_all.business_subarea.fillna('NAN', inplace=True)
    df_all.com_reg_ver_win_rate.fillna(0,inplace=True)
    df_all['customer_type'] = df_all['customer_type'].replace({
        'End-Customer':'End Customer'
    })
    df_all.customer_type.fillna('NAN',inplace=True)
    df_all.historical_existing_cnt.fillna(0, inplace=True)
    
    df_all.product_subcategory.fillna('NAN', inplace=True)
    
    df_all.product_modelname.fillna('NAN', inplace=True)

    df_all['expected_timeline'] = df_all['expected_timeline'].replace({
        'less_than_3_months': 'less than 3 months',
        'being followed up': 'being followed up.',
        'less than 6 months': '3 months ~ 6 months',
        '3_months_~_6_months' : '3 months ~ 6 months'
    })

    df_all.expected_timeline.fillna('NAN',inplace=True)
    df_all.ver_win_rate_x.fillna(0,inplace=True)
    df_all.ver_win_ratio_per_bu.fillna(0,inplace=True)
    df_all.business_area.fillna('NAN',inplace=True)
    df_all.customer_job.fillna('NAN', inplace=True)
    df_all.product_category.fillna('NAN', inplace=True)
    
    #처리못한부분
    #customer_country
    df_all.customer_country.fillna('NAN', inplace=True)
    #inquiry_type
    df_all.inquiry_type.fillna('NAN', inplace=True)

    label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_position",
    "response_corporate",
    "expected_timeline",
    ]
    
    ###############################
    #detect special char and to lower
    for i in label_columns:
        df_all[i] = df_all[i].str.lower()
        df_all[i] = df_all[i].str.replace(pat=r'[^\w]', repl=r'', regex=True)
        
    #to int
    df_all['id_strategic_ver'] = df_all['id_strategic_ver'].astype('int64')
    df_all['it_strategic_ver'] = df_all['it_strategic_ver'].astype('int64')
    df_all['idit_strategic_ver'] = df_all['idit_strategic_ver'].astype('int64')
    df_all['lead_desc_length'] = df_all['lead_desc_length'].astype('int64')
    df_all['historical_existing_cnt'] = df_all['historical_existing_cnt'].astype('int64')
    #df_all['bant_submit'] = (df_all['bant_submit']*4).astype('int64')
    
    #seperate
    tmp_len = len(df_train)
    df_train = df_all.iloc[: tmp_len ]
    df_test = df_all.iloc[tmp_len :]
    df_test = pd.concat([df_test,df_test_id_temp],axis=1)
    
    #RobustScaler
    robustScaler = RobustScaler()
    df_train['ver_win_rate_x'] = robustScaler.fit_transform(df_train[['ver_win_rate_x']])
    df_test['ver_win_rate_x'] = robustScaler.transform(df_test[['ver_win_rate_x']])
    robustScaler = RobustScaler()
    df_train['ver_win_ratio_per_bu'] = robustScaler.fit_transform(df_train[['ver_win_ratio_per_bu']])
    df_test['ver_win_ratio_per_bu'] = robustScaler.transform(df_test[['ver_win_ratio_per_bu']])
    ##############################
    
    return df_train, df_test


df_train, df_test = preprocessing(df_train, df_test)
df_train, df_test = category_all(df_train, df_test)
rate = ((len(df_train)-sum(df_train['is_converted']))/sum(df_train['is_converted']))
x_train, y_train = df_train.drop("is_converted",axis=1), list(df_train["is_converted"])
x_val, y_val = df_test.drop(["is_converted", 'id'], axis=1), list(df_test["is_converted"])

In [19]:
def objective(trial):
    params = {
        "objective": "binary:logistic",
        'scale_pos_weight': rate,
        'random_state':seed,
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "learning_rate": trial.suggest_uniform('learning_rate', 0.0001, 0.1),
        'n_estimators': trial.suggest_int("n_estimators", 300, 3000, 200),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.4, 1.0),
        "colsample_bynode": trial.suggest_float("colsample_bynode", 0.4, 1.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-2, 1.0),
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-2, 1.0),
        "gamma": trial.suggest_float("gamma", 0.01, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'eta': trial.suggest_float('eta', 0.007, 0.013), # Most important parameter.
    }
    
    score = []
    clf = XGBClassifier(**params, tree_method='hist', device='cuda', enable_categorical=True)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_val)
    score = f1_score(y_val,y_pred,labels=[True,False])
    return score

# Hyperparameter Tuning
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=seed), pruner=SuccessiveHalvingPruner())
study.optimize(objective, n_trials=None)

[I 2024-02-12 12:22:12,952] A new study created in memory with name: no-name-75ef0ff8-3310-4917-9e48-c094e590a046
[I 2024-02-12 12:22:20,253] Trial 0 finished with value: 0.0 and parameters: {'max_depth': 7, 'learning_rate': 0.09507635921035062, 'n_estimators': 2300, 'colsample_bytree': 0.759195090518222, 'colsample_bylevel': 0.4936111842654619, 'colsample_bynode': 0.49359671220172163, 'reg_lambda': 0.01306673923805328, 'reg_alpha': 0.5399484409787431, 'gamma': 0.6051038616257767, 'min_child_weight': 15, 'eta': 0.007123506965774815}. Best is trial 0 with value: 0.0.
[I 2024-02-12 12:22:31,141] Trial 1 finished with value: 0.002254791431792559 and parameters: {'max_depth': 15, 'learning_rate': 0.08326101981596214, 'n_estimators': 700, 'colsample_bytree': 0.5090949803242604, 'colsample_bylevel': 0.5100427059120604, 'colsample_bynode': 0.5825453457757226, 'reg_lambda': 0.11207606211860566, 'reg_alpha': 0.0730953983591291, 'gamma': 0.2983168487960615, 'min_child_weight': 13, 'eta': 0.00783

KeyboardInterrupt: 

In [13]:
best_param = study.best_params
best_param['scale_pos_weight'] = rate
best_param["objective"] = "binary:logistic"
best_param['random_state'] = seed
best_param

{'max_depth': 6,
 'learning_rate': 0.036699548145039805,
 'n_estimators': 1500,
 'colsample_bytree': 0.8711055768358081,
 'colsample_bylevel': 0.5198042692950159,
 'colsample_bynode': 0.708540663048167,
 'reg_lambda': 0.15304852121831464,
 'reg_alpha': 0.012385137298860933,
 'gamma': 0.611469403382424,
 'min_child_weight': 4,
 'eta': 0.007390309557911677,
 'scale_pos_weight': 11.073593073593074,
 'objective': 'binary:logistic',
 'random_state': 42}

In [43]:
best_models = []
best_param = {
    #'scale_pos_weight':rate,
    'objective':"binary:logistic",
    'random_state':seed
}
for all_x_train, all_y_train in zip(all_x_trains, all_y_trains):
    best_model = XGBClassifier(**best_param,tree_method='hist', device='cuda', enable_categorical=True)
    best_model.fit(all_x_train, all_y_train)
    best_models.append(best_model)

In [44]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [47]:
train_pred = [best_model.predict_proba(df_train.drop("is_converted",axis=1)) for best_model in best_models]
y = list(df_train['is_converted'])
train_pred = np.mean(train_pred[:],axis=0)
threshold = min([num[1] if y[i] else 1 for i, num in enumerate(train_pred)])
train_y_pred = [True if i[1]>=0.5 else False for i in train_pred]
get_clf_eval(y, train_y_pred)

오차행렬:
 [[ 4387   233]
 [   51 51109]]

정확도: 0.9949
정밀도: 0.9885
재현율: 0.9496
F1: 0.9686


In [48]:
threshold

0.012190523

In [49]:
categorical_features = [
                                            "customer_country",
                                            "business_subarea",
                                            "business_area",
                                            "business_unit",
                                            "customer_type",
                                            "enterprise",
                                            "customer_job",
                                            "inquiry_type",
                                            "product_category",
                                            "product_subcategory",
                                            "product_modelname",
                                            "customer_position",
                                            "response_corporate",
                                            "expected_timeline",
                                            'customer_idx',
                                            'lead_owner',
                                            'lead_owner',
                                            'id_strategic_ver',
                                            'it_strategic_ver',
                                            'idit_strategic_ver',
                                            'ver_cus',
                                            'ver_pro',
                                        ]
# 
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted", "id"], axis=1)

In [50]:
a_pred = [best_model.predict_proba(x_test) for best_model in best_models]
pred = np.mean(a_pred, axis=0)
pred = [True if i[1]>=0.2 else False for i in pred]
#pred = np.argmax(t_pred, axis=1)
sum(pred)

19

In [22]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

In [31]:
real = pd.read_csv(r'submission(0.767).csv')
get_clf_eval(real['is_converted'], pred)

오차행렬:
 [[   3 1769]
 [   5 3494]]

정확도: 0.6634
정밀도: 0.3750
재현율: 0.0017
F1: 0.0034


In [32]:
custom_pred = [True if (real['is_converted'][i] or pred[i]) else False for i in range(len(real))]   
sum(custom_pred)

1777

In [33]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_feature_importance(importance,names,model_type):
    
    feature_importance = np.array(importance)
    feature_names = np.array(names)
    data={'feature_names':feature_names,'feature_importance':feature_importance}
    fi_df = pd.DataFrame(data)
    
    fi_df.sort_values(by=['feature_importance'], ascending=False,inplace=True)

    plt.figure(figsize=(10,8))

    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])

    plt.title(model_type + ' Feature Importance')
    plt.xlabel('Feature Importance')
    plt.ylabel('Feature Names')
for i in range(10):
    plot_feature_importance(best_models[i].get_feature_importance(),df_test.drop(['is_converted','id'],axis=1).columns,f'CATBOOST-{i}')

AttributeError: 'XGBClassifier' object has no attribute 'get_feature_importance'

In [34]:
# all_train_pool = []
# all_val_pool = []
# all_y_val = []
for i in range(10):
    train_pred = np.mean([best_model.predict_proba(all_x_vals[i]) for best_model in best_models], axis=0)
    train_pred = np.argmax(train_pred, axis=1)
    get_clf_eval(all_y_vals[i], train_pred)

오차행렬:
 [[ 462    0]
 [ 110 5006]]

정확도: 0.9803
정밀도: 0.8077
재현율: 1.0000
F1: 0.8936
오차행렬:
 [[ 461    1]
 [  95 5021]]

정확도: 0.9828
정밀도: 0.8291
재현율: 0.9978
F1: 0.9057
오차행렬:
 [[ 461    1]
 [  95 5021]]

정확도: 0.9828
정밀도: 0.8291
재현율: 0.9978
F1: 0.9057
오차행렬:
 [[ 462    0]
 [ 105 5011]]

정확도: 0.9812
정밀도: 0.8148
재현율: 1.0000
F1: 0.8980
오차행렬:
 [[ 461    1]
 [ 100 5016]]

정확도: 0.9819
정밀도: 0.8217
재현율: 0.9978
F1: 0.9013
오차행렬:
 [[ 462    0]
 [  93 5023]]

정확도: 0.9833
정밀도: 0.8324
재현율: 1.0000
F1: 0.9086
오차행렬:
 [[ 461    1]
 [ 110 5006]]

정확도: 0.9801
정밀도: 0.8074
재현율: 0.9978
F1: 0.8925
오차행렬:
 [[ 462    0]
 [ 103 5013]]

정확도: 0.9815
정밀도: 0.8177
재현율: 1.0000
F1: 0.8997
오차행렬:
 [[ 462    0]
 [  93 5023]]

정확도: 0.9833
정밀도: 0.8324
재현율: 1.0000
F1: 0.9086
오차행렬:
 [[ 462    0]
 [  92 5024]]

정확도: 0.9835
정밀도: 0.8339
재현율: 1.0000
F1: 0.9094
