In [7]:
# ====================================================
# Library
# ====================================================
import sys
import os
import gc
import re
import unicodedata
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
from glob import glob
from pathlib import Path
import joblib
import pickle
import itertools
from tqdm.auto import tqdm
import category_encoders as ce
import torch
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GroupKFold
from sklearn.metrics import log_loss, roc_auc_score, matthews_corrcoef, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import xgboost as xgb
from catboost import Pool, CatBoostRegressor, CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from keras.layers import BatchNormalization
from keras.layers import Activation
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.optimizers import Adam
from keras.models import load_model
from keras.callbacks import Callback
from keras.models import clone_model
from sklearn.linear_model import LogisticRegression


# Raise the pandas display limits (up to 1000 columns / 100 rows) for shown frames.
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 100

# ====================================================
# Configurations
# ====================================================
class CFG:
    """Notebook-wide configuration: identifiers, paths, CV settings and model parameters.

    Values are read as class attributes (e.g. ``CFG.seed``, ``CFG.target_col``).
    Every per-model parameter dict takes its random seed from ``seed`` so that
    changing the seed in one place affects all models consistently.
    """
    # Run identifiers.
    VER = 1
    AUTHOR = 'Naoki'
    COMPETITION = 'SC2024'
    # Directory locations.
    DATA_PATH = Path('/data')
    OOF_DATA_PATH = Path('/oof')
    MODEL_DATA_PATH = Path('/models')
    SUB_DATA_PATH = Path('/submission')
    # Names of the methods to run.
    METHOD_LIST = ['lightgbm','catboost']
    # Cross-validation / training settings.
    seed = 42
    n_folds = 7
    target_col = 'ProdTaken'
    metric = 'AUC'
    metric_maximize_flag = True
    num_boost_round = 500
    early_stopping_round = 200
    verbose = 25
    classification_lgb_params = {
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.05,
        'seed': seed,
    }
    classification_xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'learning_rate': 0.05,
        'random_state': seed,
    }

    classification_cat_params = {
        'learning_rate': 0.05,
        'iterations': num_boost_round,
        'random_seed': seed,
    }
    classification_adaboost_params = {
        'n_estimators': 100,
        'learning_rate': 1.0,
        # Tied to `seed` like the other parameter dicts (was a hard-coded 42).
        'random_state': seed,
    }

    # Per-model blending weights, keyed by method name.
    model_weight_dict = {'adaboost': 0.10,'lightgbm': 0.25, 'xgboost': 0.10, 'catboost': 0.25}
    
# ====================================================
# Seed everything
# ====================================================
def seed_everything(seed):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch,
    and exports ``PYTHONHASHSEED``.

    Args:
        seed: integer seed applied to every generator.

    Note:
        ``PYTHONHASHSEED`` is read by the interpreter at start-up, so setting
        it here only influences processes spawned afterwards, not the hash
        randomization of the already-running kernel.
    """
    random.seed(seed)
    np.random.seed(seed)
    # torch is imported by this notebook but was previously left unseeded.
    torch.manual_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

# Seed the random number generators once, using the seed configured in CFG.
seed_everything(CFG.seed)


# ====================================================
# Metric
# ====================================================
# AUC

# ====================================================
# LightGBM Metric
# ====================================================
def lgb_metric(y_pred, y_true):
    """Custom evaluation function returning the macro-averaged F1 score.

    Args:
        y_pred: array of predicted scores; values >= 0.5 are treated as class 1.
        y_true: object exposing ``get_label()`` that returns the true labels
            (presumably a LightGBM dataset -- confirm against the caller).

    Returns:
        Tuple ``('f1score', score, CFG.metric_maximize_flag)``.
    """
    labels = y_true.get_label()
    hard_predictions = np.where(y_pred >= 0.5, 1, 0)
    score = f1_score(labels, hard_predictions, average='macro')
    return 'f1score', score, CFG.metric_maximize_flag

# ====================================================
# XGBoost Metric
# ====================================================
def xgb_metric(y_pred, y_true):
    """Custom evaluation function returning the macro-averaged F1 score.

    Args:
        y_pred: array of predicted scores; values >= 0.5 are treated as class 1.
        y_true: object exposing ``get_label()`` that returns the true labels
            (presumably an XGBoost data matrix -- confirm against the caller).

    Returns:
        Tuple ``('f1score', score)``.
    """
    labels = y_true.get_label()
    hard_predictions = np.where(y_pred >= 0.5, 1, 0)
    score = f1_score(labels, hard_predictions, average='macro')
    return 'f1score', score

In [8]:
# Load the raw train and test tables, using the first CSV column as the index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('data/train.csv', 'data/test.csv')
)
def Preprocessing(train_df, test_df):
    # Age numeric
    # 漢数字とアラビア数字のマッピング
    kanji_to_num = {'一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,'十': 10, '百': 100, '千': 1000, '万': 10000,'零': 0, '〇': 0}
    def kanji_to_arabic(kanji):
        """Convert a string of kanji numerals into an integer.

        Characters that are not keys of ``kanji_to_num`` are skipped. Digits
        (values below 10) accumulate positionally; a unit (10 and above)
        multiplies the digits accumulated so far (or 1 if there are none) and
        adds the product to the running total.

        Args:
            kanji: iterable of characters, e.g. a string such as '二十五'.

        Returns:
            The integer value; 0 when no known numeral character is present.
        """
        total = 0
        pending = 0
        for symbol in kanji:
            if symbol not in kanji_to_num:
                continue
            amount = kanji_to_num[symbol]
            if amount >= 10:
                # A bare unit such as '十' counts as 1 x unit.
                total += (pending or 1) * amount
                pending = 0
            else:
                pending = pending * 10 + amount
        return total + pending
    '''
    def process_age(age):
        if age is None or str(age) == 'nan':
            return None
        age = unicodedata.normalize('NFKC', age)
        age = ''.join([c for c in age if c.isdigit() or c in kanji_to_num])
        if age.isdigit():
            return int(age)
        return kanji_to_arabic(age)
    
    def process_age(age):
        if age is None or str(age) == 'nan':
            return None
        age = unicodedata.normalize('NFKC', age)
        if '代' in age:
            return None  # 年代を欠損値として扱う意味あった
        age = ''.join([c for c in age if c.isdigit() or c in kanji_to_num])
        if age.isdigit():
            return int(age)
        return kanji_to_arabic(age)
    '''
    
    def process_age(age):
        """Parse a raw age value into an integer age.

        Handles, in order:
          * missing values (``None`` / NaN) -> ``None``;
          * values that are already numeric -> ``int(age)``;
          * decade expressions containing '代' -> a fixed representative age
            for the 10s-60s, ``None`` for any other decade;
          * Arabic digits (after NFKC normalization) -> ``int``;
          * kanji numerals -> converted with ``kanji_to_arabic``.

        Args:
            age: raw cell value (string, number, ``None`` or NaN).

        Returns:
            The age as ``int``, or ``None`` when it is missing or cannot be parsed.
        """
        if age is None or str(age) == 'nan':
            return None
        # A column read as purely numeric yields floats/ints rather than
        # strings; unicodedata.normalize would raise TypeError on those.
        if isinstance(age, (int, float, np.integer, np.floating)):
            return int(age)
        age = unicodedata.normalize('NFKC', str(age))
        if '代' in age:
            # Substitute a representative value for each supported decade.
            for decade, representative in (
                ('10代', 19), ('20代', 28), ('30代', 36),
                ('40代', 43), ('50代', 52), ('60代', 60),
            ):
                if decade in age:
                    return representative
            return None  # any other decade is treated as missing
        age = ''.join(c for c in age if c.isdigit() or c in kanji_to_num)
        if not age:
            # Nothing numeric survived the filter: report missing rather than age 0.
            return None
        if age.isdigit():
            return int(age)
        return kanji_to_arabic(age)

    
    # TypeofContact categorical(dummy)
    def TypeofContact_to_dummy(str):
        """Encode the contact type as a 0/1 flag.

        Args:
            str: raw contact-type value.

        Returns:
            1 for 'Self Enquiry', 0 for 'Company Invited', otherwise ``None``.
        """
        for label, code in (('Self Enquiry', 1), ('Company Invited', 0)):
            if str == label:
                return code
        return None
        
    # CityTier 順序尺度
    
    # DurationOfPitch numeric
    def convert_to_minutes(duration):
        """Convert a raw pitch duration into minutes.

        Args:
            duration: raw cell value such as '15分' (minutes), '120秒'
                (seconds), a plain number, ``None`` or NaN.

        Returns:
            The duration in minutes as ``float``; ``None`` when the value is
            missing or cannot be parsed.
        """
        if pd.isnull(duration):
            return None
        # Stringify so numeric cells can be handled by the same code path.
        text = str(duration)
        if '分' in text:
            return float(text.replace('分', ''))
        if '秒' in text:
            # Seconds are converted to (fractional) minutes.
            return float(text.replace('秒', '')) / 60
        # Unit-less values were previously dropped silently (implicit None);
        # interpret them as minutes, the unit this function returns.
        try:
            return float(text)
        except ValueError:
            return None

    # Occupation categorical
       
    # Gender categorical
    def Gender_dealing(gender):
        """Encode a free-form gender string as a 0/1 flag.

        The text is NFKC-normalized, upper-cased and stripped of all
        whitespace before matching, so variants such as 'Fe Male' or
        full-width letters are recognized.

        Args:
            gender: raw cell value.

        Returns:
            1 for female, 0 for male, ``None`` when the value is missing, not
            a string, or not recognizable.
        """
        # Missing / non-string cells would make unicodedata.normalize raise TypeError.
        if not isinstance(gender, str):
            return None
        # Normalize to half-width, upper-case.
        gender = unicodedata.normalize('NFKC', gender).upper().strip()
        # Drop every whitespace character, including inner ones.
        gender = ''.join(gender.split())

        # 'FEMALE' must be tested first because it contains 'MALE'.
        if 'FEMALE' in gender:
            return 1
        if 'MALE' in gender:
            return 0
        return None
        
    # NumberOfPersonVisiting numeric
    
    # NumberOfFollowups numeric
    def NumberOfFollowups_dealing(input_int):
        """Scale down follow-up counts that are 100 or larger.

        Args:
            input_int: numeric follow-up count.

        Returns:
            ``input_int / 100`` when ``input_int >= 100``; otherwise the value
            unchanged (NaN fails the comparison and is returned as is).
        """
        return input_int / 100 if input_int >= 100 else input_int
    
    # ProductPitched categorical
    # Designation categorical
    def standardize_str(input_str):
        """Canonicalize a noisy category label.

        The label is NFKC-normalized, lower-cased, stripped of all whitespace,
        and a set of look-alike characters is replaced by the plain ASCII
        letters they imitate.

        Args:
            input_str: raw cell value.

        Returns:
            The cleaned string. Non-string values (``None`` / NaN) are returned
            unchanged instead of raising ``TypeError``.
        """
        # Missing cells cannot be normalized; pass them through untouched.
        if not isinstance(input_str, str):
            return input_str
        # Normalize to half-width, lower-case.
        input_str = unicodedata.normalize('NFKC', input_str).lower().strip()
        # Remove whitespace and map special symbols to plain letters.
        input_str = ''.join(input_str.split())
        input_str = input_str.replace('|', 'l').replace('×', 'x').replace('𝘤', 'c').replace('𝖺', 'a').replace('𝙳', 'd')
        # Replace further look-alike characters with ordinary Latin letters.
        input_str = input_str.replace('ᗞ', 'd').replace('𐊡', 'a').replace('𝘳', 'r').replace('ꓢ', 's').replace('ı', 'i')
        input_str = input_str.replace('β', 'b').replace('в', 'b').replace('с', 'c').replace('տ', 's').replace('ς', 'c')
        input_str = input_str.replace('ꭰ', 'd').replace('ε', 'e').replace('ι', 'i').replace('α', 'a').replace('ո', 'n')
        input_str = input_str.replace('ѕ', 's').replace('μ', 'm').replace('е', 'e').replace('а', 'a').replace('ѵ', 'v')
        # Repair the 'aasic' spelling that can remain after the substitutions.
        input_str = input_str.replace('aasic', 'basic')
        return input_str
    
    # PreferredPropertyStar 順序尺度
    
    # NumberOfTrips numeric
    def NumberOfTrips_dealing(str):
        """Convert a raw trip-frequency value into trips per year.

        Recognized forms: '半年に<N>回' (N per half year -> 2N),
        '年に<N>回' (N per year -> N), '四半期に<N>回' (N per quarter -> 4N),
        or a plain number.

        Args:
            str: raw cell value (string, number, ``None`` or NaN).

        Returns:
            Trips per year as ``int``, or ``None`` for missing values.

        Raises:
            ValueError: if a string value cannot be parsed as an integer.
        """
        if pd.isnull(str):
            return None
        # Numeric cells (column read as purely numeric) do not support the
        # substring tests below and would raise TypeError.
        if isinstance(str, (int, float, np.integer, np.floating)):
            return int(str)
        # '半年に' must be tested before '年に', which is a substring of it.
        for prefix, periods_per_year in (('半年に', 2), ('年に', 1), ('四半期に', 4)):
            if prefix in str:
                return periods_per_year * int(str.replace(prefix, '').replace('回', ''))
        return int(str)
        
    # Passport categorical(dummy)
    
    # PitchSatisfactionScore 順序尺度だけど間隔尺度的要素あり
    
    # MonthlyIncome numeric
    def MonthlyIncome_dealing(input_str):
        """Convert a raw monthly-income value into a plain number.

        Values containing '月収' or '万円' are multiplied by 10000 after those
        markers are removed; anything else is parsed directly as a float.

        Args:
            input_str: raw cell value (string, number, ``None`` or NaN).

        Returns:
            The income as ``float``, or ``None`` for missing values.

        Raises:
            ValueError: if a string value cannot be parsed as a number.
        """
        if pd.isnull(input_str):
            return None
        # Numeric cells (column read as purely numeric) do not support the
        # substring tests below and would raise TypeError.
        if isinstance(input_str, (int, float, np.integer, np.floating)):
            return float(input_str)
        if '月収' in input_str:
            return 10000 * float(input_str.replace('月収', '').replace('万円', ''))
        if '万円' in input_str:
            return 10000 * float(input_str.replace('万円', ''))
        return float(input_str)
        
    # customer_info
    def customer_info_dealing(input_str):
        """Normalize the free-form customer_info text into a comma-separated string.

        The text is NFKC-normalized, lower-cased and stripped; the listed
        separator characters are turned into spaces; then the first two
        whitespace runs that sit between non-space characters are replaced by
        commas.

        Args:
            input_str: raw customer_info string.

        Returns:
            The normalized string with at most two commas inserted.
        """
        cleaned = unicodedata.normalize('NFKC', input_str).lower().strip()
        # Turn every separator variant into a plain space.
        for separator in ('/', '／', '、', '　', '\u3000', '\t', '\n'):
            cleaned = cleaned.replace(separator, ' ')
        # Only the first two gaps become commas; later whitespace is kept.
        return re.sub(r'(?<=\S)\s+(?=\S)', ',', cleaned, count=2)
    
    # married categorical
    def married_dealing(input_str):
        """Translate a Japanese marital-status label into an English category.

        Args:
            input_str: marital-status label.

        Returns:
            'unmarried', 'single', 'divorced' or 'married'; ``None`` for any
            other value.
        """
        translations = (
            ('未婚', 'unmarried'),
            ('独身', 'single'),
            ('離婚済み', 'divorced'),
            ('結婚済み', 'married'),
        )
        return next(
            (english for japanese, english in translations if input_str == japanese),
            None,
        )
    # car_possesion categorical
    def car_possesion_dealing(input_str):
        """Encode a car-ownership label as a 0/1 flag.

        Args:
            input_str: car-ownership label (or an already-encoded 0 / 1).

        Returns:
            0 when the value denotes no car, 1 when it denotes owning a car,
            ``None`` for any other value.
        """
        labels_by_flag = (
            (0, ['車未所持', '自動車未所有', '自家用車なし', '乗用車なし', '車なし', '車保有なし', 0]),
            (1, ['車所持', '自動車所有', '自家用車あり', '乗用車所持', '車保有', '車あり', 1]),
        )
        for flag, labels in labels_by_flag:
            if input_str in labels:
                return flag
        return None
        
    # offspring: numeric child count; an unknown count is encoded as -99
    def offspring_dealing(input_str):
        """Convert a children-count label into a number.

        Args:
            input_str: children-count label; may be ``None`` / NaN when the
                source text had no such part.

        Returns:
            1, 2 or 3 when the label contains that digit (checked in this
            order), 0 for the "no children" labels, -99 for the "unknown"
            labels, and ``None`` for missing or unrecognized values.
        """
        # Missing cells do not support the substring tests below and would
        # raise TypeError.
        if not isinstance(input_str, str):
            return None
        for digit in ('1', '2', '3'):
            if digit in input_str:
                return int(digit)
        if input_str in ['子供なし', '子供無し', '子供ゼロ', '非育児家庭', '無子']:
            return 0
        if input_str in ['子供の数不明', '不明', 'わからない', '子育て状況不明', '子の数不詳']:
            return -99
        return None
      
    def function_apply(input_df):
        """Apply every per-column cleaning helper to a copy of the frame.

        Each listed column is cleaned element-wise with its helper. The
        normalized ``customer_info`` text is then split on commas into
        ``married``, ``car_possesion`` and ``offspring``, the original column
        is dropped, and the three new columns are encoded.

        Args:
            input_df: raw DataFrame containing the columns referenced below.

        Returns:
            A new, cleaned DataFrame; ``input_df`` itself is not modified.
        """
        df = input_df.copy()
        column_cleaners = {
            'Age': process_age,
            'TypeofContact': TypeofContact_to_dummy,
            'DurationOfPitch': convert_to_minutes,
            'Gender': Gender_dealing,
            'NumberOfFollowups': NumberOfFollowups_dealing,
            'ProductPitched': standardize_str,
            'NumberOfTrips': NumberOfTrips_dealing,
            'Designation': standardize_str,
            'MonthlyIncome': MonthlyIncome_dealing,
            'customer_info': customer_info_dealing,
        }
        for column, cleaner in column_cleaners.items():
            df[column] = df[column].apply(cleaner)

        # Split the normalized text into its three comma-separated parts.
        info_columns = ['married', 'car_possesion', 'offspring']
        df[info_columns] = df['customer_info'].str.split(',', n=2, expand=True)
        df = df.drop(['customer_info'],axis=1)

        info_cleaners = (married_dealing, car_possesion_dealing, offspring_dealing)
        for column, cleaner in zip(info_columns, info_cleaners):
            df[column] = df[column].apply(cleaner)
        return df
    def miss_dealing(train_df,test_df):
        """Fill missing values in the train and test frames.

        ``TypeofContact`` gaps are filled with -1. The numeric columns
        ``Age``, ``DurationOfPitch``, ``NumberOfFollowups``, ``NumberOfTrips``
        and ``MonthlyIncome`` are filled with the median computed over train
        and test combined.

        Args:
            train_df: training DataFrame containing the columns above.
            test_df: test DataFrame containing the columns above.

        Returns:
            Tuple ``(train, test)`` of filled copies. The input frames are
            left untouched (previously they were modified in place).
        """
        # Work on copies so the caller's frames are not mutated.
        train_filled = train_df.copy()
        test_filled = test_df.copy()
        combined = pd.concat([train_filled, test_filled], ignore_index=True)
        for col in ['TypeofContact']:
            train_filled[col] = train_filled[col].fillna(-1)
            test_filled[col] = test_filled[col].fillna(-1)
        for col in ['Age','DurationOfPitch','NumberOfFollowups','NumberOfTrips','MonthlyIncome']:
            # Median over train + test, taken before any filling.
            median = combined[col].median()
            train_filled[col] = train_filled[col].fillna(median)
            test_filled[col] = test_filled[col].fillna(median)
        return train_filled, test_filled
        
     #特徴量作成
    def make_features(input_df):
        """Feature-engineering hook; currently adds no features.

        Args:
            input_df: DataFrame to derive features from.

        Returns:
            A copy of ``input_df`` with no changes applied.
        """
        return input_df.copy()
    
    

    def encoding(train_df,test_df):
        """Label-encode and one-hot-encode the categorical columns.

        ``TypeofContact`` is label-encoded with an encoder fitted on train and
        test together; the result is assigned back onto the frames that were
        passed in. The remaining categorical columns are one-hot encoded with
        an encoder fitted on the training features (target column excluded)
        and then applied to the test frame.

        Args:
            train_df: training DataFrame including ``CFG.target_col``.
            test_df: test DataFrame.

        Returns:
            Tuple ``(train, test)`` of encoded frames; the training frame has
            the target column appended after the encoded features.
        """
        # Label encoding
        for label_col in ['TypeofContact']:
            label_encoder = LabelEncoder()
            all_values = pd.concat([train_df[label_col], test_df[label_col]], axis=0)
            label_encoder.fit(all_values)
            train_df[label_col] = label_encoder.transform(train_df[label_col])
            test_df[label_col] = label_encoder.transform(test_df[label_col])

        # One-hot encoding
        onehot_cols = ['CityTier','Occupation','ProductPitched','PreferredPropertyStar','Designation','married']
        target = train_df[CFG.target_col]
        train_features = train_df.drop([CFG.target_col],axis=1)
        onehot_encoder = ce.OneHotEncoder(cols=onehot_cols,use_cat_names=True)
        train_features = onehot_encoder.fit_transform(train_features)
        test_df = onehot_encoder.transform(test_df)
        train_df = pd.concat([train_features,target],axis=1)
        return train_df, test_df
        
    
    
    # Clean both frames with the per-column helpers.
    # NOTE(review): miss_dealing, make_features and encoding are defined above
    # but are not invoked here, so the returned frames are cleaned only
    # (no imputation, feature creation or encoding) -- confirm this is intended.
    train_applied = function_apply(train_df)
    test_applied = function_apply(test_df)
    # Persist the cleaned frames as CSV files under data/.
    train_applied.to_csv('data/train_processed.csv')
    test_applied.to_csv('data/test_processed.csv')
    
    return train_applied, test_applied
    
# Run the preprocessing pipeline and print both resulting frames.
train_df, test_df = Preprocessing(train_df,test_df)
print(train_df, test_df, sep='\n')

       Age  TypeofContact  CityTier  DurationOfPitch      Occupation  Gender  \
id                                                                             
0     50.0            1.0         2             15.0  Large Business       0   
1     56.0            0.0         1             14.0        Salaried       0   
2      NaN            1.0         1             10.0  Large Business       1   
3     37.0            1.0         2             18.0  Small Business       1   
4     48.0            0.0         3             17.0  Small Business       1   
...    ...            ...       ...              ...             ...     ...   
3484  40.0            1.0         2             26.0        Salaried       0   
3485  43.0            1.0         1              9.0  Large Business       0   
3486  31.0            1.0         1             14.0  Small Business       1   
3487  56.0            0.0         2             15.0        Salaried       0   
3488  42.0            1.0         1     