In [19]:
!pip3 install lightgbm scikit-learn tqdm catboost optuna xgboost ipywidgets imblearn
# 모델을 위해 필요한 라이브러리 설치(우수 예시에만 있는 부분)



[0m

In [20]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import random
import os
import sklearn
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')


In [21]:
def seed_everything(seed):
    """Seed the random number sources used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, and stores
    the seed as a string in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed shared by all of the generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed = 42
seed_everything(seed)  # fix the seed for the rest of the notebook

### 데이터 셋 읽어오기

In [22]:
# Load the training data and the test data (the rows of the submission file).
df_train, df_test = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "submission.csv")
)

In [23]:
def category_all(df_train, df_test):
    """Cast the categorical feature columns of both frames to ``category`` dtype.

    For every column the categories are derived from the train and test values
    together, so both frames end up with the *same* categorical dtype and a
    given value maps to the same integer code in train and in test.

    Both frames are modified in place and also returned.

    Args:
        df_train: Training frame containing every column in ``label_columns``.
        df_test: Test frame containing every column in ``label_columns``.

    Returns:
        The ``(df_train, df_test)`` pair with the listed columns converted.

    Raises:
        KeyError: If either frame lacks one of the listed columns.
    """
    label_columns = [
        "customer_country",
        "business_subarea",
        "business_area",
        "business_unit",
        "customer_type",
        "enterprise",
        "customer_job",
        "inquiry_type",
        "product_category",
        "product_subcategory",
        "product_modelname",
        "customer_position",
        "response_corporate",
        "expected_timeline",
    ]
    for col in label_columns:
        # Casting each frame on its own would build the categories (and hence
        # the integer codes) from that frame's values only, so the same value
        # could be encoded differently in train and test.
        shared_dtype = pd.concat(
            [df_train[col], df_test[col]], ignore_index=True
        ).astype('category').dtype
        df_train[col] = df_train[col].astype(shared_dtype)
        df_test[col] = df_test[col].astype(shared_dtype)
    return df_train, df_test


def label_encoding(series: pd.Series) -> pd.Series:
    """Encode a categorical series as integer labels.

    Every value is first converted to its string form; the distinct strings
    are sorted and each value is replaced by the position of its string in
    that sorted order.

    Args:
        series: Categorical values to encode.

    Returns:
        A series with the same index whose values are the integer codes.
    """
    as_text = series.astype(str)
    code_of = {text: code for code, text in enumerate(sorted(as_text.unique()))}
    return as_text.map(code_of)

        
def manage_label_encoding(df_train, df_test):
    """Label-encode the categorical columns of train and test with one shared mapping.

    The listed columns of both frames are stacked, each stacked column is
    encoded with ``label_encoding`` (so a value gets the same code in both
    frames), and the codes are written back to the original frames.

    Both frames are modified in place and also returned.

    Args:
        df_train: Training frame containing every column in ``label_columns``.
        df_test: Test frame containing every column in ``label_columns``.

    Returns:
        The ``(df_train, df_test)`` pair with the listed columns replaced by
        integer codes.
    """
    label_columns = [
        "customer_country",
        "business_subarea",
        "business_area",
        "business_unit",
        "customer_type",
        "enterprise",
        "customer_job",
        "inquiry_type",
        "product_category",
        "product_subcategory",
        "product_modelname",
        "customer_position",
        "response_corporate",
        "expected_timeline",
    ]
    n_train = len(df_train)

    # ignore_index gives the stacked frame a unique 0..n-1 index instead of
    # repeating the index labels of the two inputs.
    df_all = pd.concat(
        [df_train[label_columns], df_test[label_columns]], ignore_index=True
    )

    for col in label_columns:
        encoded = label_encoding(df_all[col])
        # Write back by position (plain arrays) rather than by index label, so
        # the result does not depend on the inputs having unique indexes.
        df_train[col] = encoded.iloc[:n_train].to_numpy()
        df_test[col] = encoded.iloc[n_train:].to_numpy()

    return df_train, df_test

def preprocessing(df_train, df_test):
    df_train.drop_duplicates(keep='first', inplace=True)
    df_train.reset_index(inplace=True)
    df_train.drop('index',inplace=True, axis=1)
    df_test_id_temp = df_test['id'].copy()
    df_all = pd.concat([df_train,df_test.drop('id',axis=1)])
    df_all.drop('customer_country.1',axis=1,inplace=True)
    df_all.id_strategic_ver.fillna(0., inplace=True)
    df_all.it_strategic_ver.fillna(0., inplace=True)
    df_all.idit_strategic_ver.fillna(0., inplace=True)
    df_all.business_subarea.fillna('NAN', inplace=True)
    df_all.com_reg_ver_win_rate.fillna(0,inplace=True)
    df_all['customer_type'] = df_all['customer_type'].replace({
        'End-Customer':'End Customer'
    })
    df_all.customer_type.fillna('NAN',inplace=True)
    df_all.historical_existing_cnt.fillna(0, inplace=True)
    
    df_all.product_subcategory.fillna('NAN', inplace=True)
    
    df_all.product_modelname.fillna('NAN', inplace=True)

    df_all['expected_timeline'] = df_all['expected_timeline'].replace({
        'less_than_3_months': 'less than 3 months',  # merge into 'less than 3 months'
        'being followed up': 'being followed up.',  # merge into 'being followed up.'
        'less than 6 months': '3 months ~ 6 months',  # merge into '3 months ~ 6 months'
        '3_months_~_6_months' : '3 months ~ 6 months'
    })

    df_all.expected_timeline.fillna('NAN',inplace=True)
    df_all.ver_win_rate_x.fillna(0,inplace=True)
    df_all.ver_win_ratio_per_bu.fillna(0,inplace=True)
    df_all.business_area.fillna('NAN',inplace=True)
    df_all.customer_job.fillna('NAN', inplace=True)
    df_all.product_category.fillna('NAN', inplace=True)
    
    #처리못한부분
    #customer_country
    df_all.customer_country.fillna('NAN', inplace=True)
    #inquiry_type
    df_all.inquiry_type.fillna('NAN', inplace=True)

    label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_position",
    "response_corporate",
    "expected_timeline",
    ]
    
    for i in label_columns:
        df_all[i] = df_all[i].str.lower()
        df_all[i] = df_all[i].str.replace(pat=r'[^\w]', repl=r'', regex=True)
        
    tmp_len = len(df_train)
    df_train = df_all.iloc[: tmp_len ]
    df_test = df_all.iloc[tmp_len :]
    df_test = pd.concat([df_test,df_test_id_temp],axis=1)
    return df_train, df_test
# Clean both frames, then cast their categorical columns to the category dtype.
df_train, df_test = category_all(*preprocessing(df_train, df_test))

In [24]:
from supervised.automl import AutoML

# Binary classifier search scored by F1, validated with shuffled, stratified
# 5-fold cross-validation that reuses the notebook seed.
automl = AutoML(
    ml_task="binary_classification",
    mode="Compete",
    eval_metric='f1',
    # NOTE(review): presumably None removes the overall time budget - confirm
    # against the mljar-supervised documentation.
    total_time_limit=None,
    features_selection=True,  # enable feature selection
    boost_on_errors=True,     # enable boosting on errors
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 5,
        "shuffle": True,
        "stratify": True,
        "random_seed": seed,
    },
)

In [25]:
# Shuffle the training rows reproducibly, then fit on every column except the target.
df_train = sklearn.utils.shuffle(df_train, random_state=seed)
automl.fit(
    df_train.drop(columns='is_converted'),
    df_train['is_converted'],
)

Linear algorithm was disabled.
AutoML directory: AutoML_3
The task is binary_classification with evaluation metric f1
AutoML will use algorithms: ['Decision Tree', 'Random Forest', 'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network', 'Nearest Neighbors']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'mix_encoding', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
'Decision Tree' Traceback (most recent call last):
  File "/home/server4/Desktop/compet/supervised/tuner/mljar_tuner.py", line 209, in generate_params
    return self.simple_algorithms_params(models_cnt)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/server4/Desktop/compet/supervised/tuner/mljar_tuner.py", line 727, in simple_algorithms_params
    params = self._get_mode

2024-02-09 06:34:24,492 supervised.exceptions ERROR No models produced. 
Please check your data or submit a Github issue at https://github.com/mljar/mljar-supervised/issues/new.


There was an error during 8_Xgboost training.
Please check AutoML_3/errors.md for details.
There was an error during 26_CatBoost training.
Please check AutoML_3/errors.md for details.
There was an error during 35_RandomForest training.
Please check AutoML_3/errors.md for details.
There was an error during 44_ExtraTrees training.
Please check AutoML_3/errors.md for details.
There was an error during 53_NeuralNetwork training.
Please check AutoML_3/errors.md for details.
There was an error during 62_NearestNeighbors training.
Please check AutoML_3/errors.md for details.
There was an error during 18_LightGBM training.
Please check AutoML_3/errors.md for details.
There was an error during 9_Xgboost training.
Please check AutoML_3/errors.md for details.
There was an error during 27_CatBoost training.
Please check AutoML_3/errors.md for details.
There was an error during 36_RandomForest training.
Please check AutoML_3/errors.md for details.
There was an error during 45_ExtraTrees training.
P

AutoMLException: No models produced. 
Please check your data or submit a Github issue at https://github.com/mljar/mljar-supervised/issues/new.

### 모델 성능 보기

In [41]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix, accuracy, precision, recall and F1 score.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels for the same samples.
            NOTE(review): the ``None`` default looks unintended - every metric
            call below receives ``y_pred`` as-is.

    Returns:
        None; the report is written to standard output.
    """
    positive_first = [True, False]
    confusion = confusion_matrix(y_test, y_pred, labels=positive_first)
    # (output template, score) pairs in print order: accuracy, precision,
    # recall, F1.
    report_lines = (
        ("\n정확도: {:.4f}", accuracy_score(y_test, y_pred)),
        ("정밀도: {:.4f}", precision_score(y_test, y_pred, labels=positive_first)),
        ("재현율: {:.4f}", recall_score(y_test, y_pred)),
        ("F1: {:.4f}", f1_score(y_test, y_pred, labels=positive_first)),
    )

    print("오차행렬:\n", confusion)
    for template, score in report_lines:
        print(template.format(score))

In [44]:
# x_train / y_train are not defined in any visible earlier cell, so on a fresh
# kernel this cell raised NameError. Derive them from the frame the model was
# fitted on. These are training-set scores and therefore optimistic.
x_train = df_train.drop('is_converted', axis=1)
y_train = df_train['is_converted']
pred = automl.predict(x_train)
get_clf_eval(y_train, pred)

오차행렬:
 [[ 4825    25]
 [   19 54430]]

정확도: 0.9993
정밀도: 0.9961
재현율: 0.9948
F1: 0.9955


## 4. 제출하기

### 테스트 데이터 예측

In [45]:
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted", "id"], axis=1)
x_test

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,...,customer_position,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner
0,0.00,11,2,0.073248,47466,9,0,53.0,,,...,74,43,271,1,0,0.001183,0.049840,10,25,278
1,0.25,15156,3,,5405,9,1,,,,...,74,50,271,0,0,0.000013,,12,62,437
2,1.00,53,2,,13597,29,1,,,,...,64,18,246,0,0,0.000060,0.131148,4,37,874
3,0.50,893,2,0.118644,17204,33,0,,,,...,74,50,267,0,0,0.001183,0.049840,10,86,194
4,1.00,1402,2,0.074949,2329,9,0,2.0,1.0,,...,79,43,246,1,1,0.003079,0.064566,0,28,167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5266,0.50,10650,0,,40292,33,0,10.0,,,...,9,43,271,0,0,,,6,86,97
5267,0.25,16640,3,,47466,33,0,0.0,,,...,74,50,271,0,0,,,6,86,438
5268,0.75,1079,0,,46227,29,0,,,,...,64,43,246,0,0,,,6,86,97
5269,0.00,24,3,,45667,9,1,,,,...,74,12,271,0,0,,,6,86,429


In [47]:
# Fill remaining gaps with 0 in the non-categorical columns only: the label
# columns were cast to the category dtype earlier, and pandas rejects a fill
# value (0) that is not one of a categorical column's categories.
non_categorical = x_test.select_dtypes(exclude='category').columns
test_pred = automl.predict(x_test.fillna(dict.fromkeys(non_categorical, 0)))
sum(test_pred)  # number of rows predicted as True

495

### 제출 파일 작성

In [16]:
# Re-read the submission file (df_test now holds the preprocessed data) and
# replace its target column with the predictions.
df_sub = pd.read_csv("submission.csv").assign(is_converted=test_pred)

# Save the submission file.
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**