In [1]:
import pandas as pd
# Load the training and test tables; the first CSV column becomes the row index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('train_v7.csv', 'test_v5.csv')
)

In [7]:
!pip install catboost
!pip install xgboost
!pip install category_encoders
!pip install numpy_typing


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Collecting numpy_typing
  Downloading numpy_typing-1.1.1.tar.gz (6.9 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproje

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np
# Separate the target column from the feature columns.
y = df_train['ProdTaken']  # target
X = df_train.drop(columns=['ProdTaken', 'id'])  # features

# Stratified 80/20 split; the 20% hold-out is bound to X_test / y_test here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Stratified 80/20 split of X / y into a training part and a validation part.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# 新しい特徴量を作成する関数
def enhanced_preprocess_data(X):
    """Drop unused columns and add one-hot encoded income / age / pitch-duration buckets.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Must contain ``Age`` and ``DurationOfPitch`` and should
        contain ``MonthlyIncome`` (see the fallback below).

    Returns
    -------
    pandas.DataFrame
        A new frame (``X`` is not modified) without ``NumberOfFollowups``,
        ``MonthlyIncome``, ``id`` and ``Unnamed: 0``, and with dummy columns for
        ``income_level``, ``age_group`` and ``pitch_duration_group`` (the first
        category of each is dropped).
    """
    # Read the income from X itself *before* the column is dropped. Looking it
    # up in the global df_train by index (as was done previously) returns
    # training incomes - or raises KeyError - when X is the test frame.
    if 'MonthlyIncome' in X.columns:
        income = X['MonthlyIncome']
    else:
        # Backward-compatible fallback for callers that already removed the
        # column; only meaningful when X's index refers to rows of df_train.
        income = df_train.loc[X.index, 'MonthlyIncome']

    # Remove columns that are not used as features.
    X = X.drop(columns=['NumberOfFollowups', 'MonthlyIncome', 'id', 'Unnamed: 0'], errors='ignore')

    def categorize_income(value):
        """Map an income value to 'low' (< 200000), 'medium' (< 400000) or 'high'."""
        # NOTE(review): NaN fails both comparisons and therefore lands in 'high'.
        if value < 200000:
            return 'low'
        elif value < 400000:
            return 'medium'
        else:
            return 'high'

    # A fixed category list makes get_dummies emit the same income_level_*
    # columns for every split, even if a split lacks one of the labels. The
    # order (high, low, medium) matches the alphabetical order get_dummies used
    # before, so the dropped first level is still 'high'.
    income_dtype = pd.CategoricalDtype(categories=['high', 'low', 'medium'])
    X['income_level'] = income.apply(categorize_income).astype(income_dtype)

    # Age buckets: (0, 25], (25, 40], (40, 60], (60, 100].
    X['age_group'] = pd.cut(X['Age'], bins=[0, 25, 40, 60, 100], labels=['young', 'adult', 'mid_age', 'senior'])

    # DurationOfPitch buckets: (0, 300], (300, 600], (600, 900], (900, 1200].
    X['pitch_duration_group'] = pd.cut(X['DurationOfPitch'], bins=[0, 300, 600, 900, 1200], labels=['short', 'medium', 'long', 'very_long'])

    # One-hot encode the three bucket columns, dropping the first level of each.
    X = pd.get_dummies(X, columns=['income_level', 'age_group', 'pitch_duration_group'], drop_first=True)

    return X

# Run the preprocessing on the training, validation and test frames (in that order).
X_train_selected, X_valid_selected, X_test_selected = (
    enhanced_preprocess_data(frame) for frame in (X_train, X_valid, df_test)
)

In [None]:
# 必要に応じて元データから `MonthlyIncome` 列を復元する
def advanced_feature_engineering(X, original_data):
    """Add income-derived features to a preprocessed feature frame.

    Parameters
    ----------
    X : pandas.DataFrame
        Preprocessed features; must contain ``Designation`` and every column
        of the global ``X_train_selected``.
    original_data : pandas.DataFrame
        Frame that still holds ``MonthlyIncome``; used (aligned on the index)
        when ``X`` no longer has that column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``X_train_selected`` followed by
        ``MonthlyIncome``, ``Designation_Income`` and ``Income_log``.
        ``X`` itself is left untouched.

    Raises
    ------
    KeyError
        If ``X`` lacks ``Designation`` or a column of ``X_train_selected``.
    """
    # Work on a copy. Previously the new columns were written into the
    # caller's frame, and the final column filter only kept them because that
    # mutation also changed the global X_train_selected.
    X = X.copy()

    # Restore MonthlyIncome from the original data when it has been dropped
    # (the assignment aligns on the index).
    if 'MonthlyIncome' not in X.columns:
        X['MonthlyIncome'] = original_data['MonthlyIncome']

    # Interaction term and log transform.
    # NOTE(review): assumes Designation is numeric - confirm against the input data.
    X['Designation_Income'] = X['Designation'] * X['MonthlyIncome']
    X['Income_log'] = np.log1p(X['MonthlyIncome'])

    # Align with the training columns, then append the engineered ones
    # explicitly so they are kept regardless of what X_train_selected contains.
    engineered = ['MonthlyIncome', 'Designation_Income', 'Income_log']
    base = [col for col in X_train_selected.columns if col not in engineered]
    X = X[base + engineered]

    return X

In [None]:
import lightgbm as lgb

# Rank the features with a default LightGBM classifier and keep only those
# whose importance exceeds a threshold (this cell does no target encoding).
model = lgb.LGBMClassifier()
model.fit(X_train_selected, y_train)
importances = pd.Series(model.feature_importances_, index=X_train_selected.columns)
importances = importances.sort_values(ascending=False)
print("Feature importances:", importances)

# Features with an importance of 5 or less are discarded.
selected_features = importances[importances > 5].index

# .copy() gives independent frames, so later cells that add columns to them do
# not write into a slice of the previous frame (SettingWithCopyWarning).
X_train_selected = X_train_selected[selected_features].copy()
X_valid_selected = X_valid_selected[selected_features].copy()
X_test_selected = X_test_selected[selected_features].copy()

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameter values explored by the grid search.
param_grid = dict(
    num_leaves=[31, 50, 100],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[100, 200, 500],
)

# 5-fold grid search over a LightGBM classifier, scored by ROC AUC.
model = lgb.LGBMClassifier()
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    verbose=1,
)
grid_search.fit(X_train_selected, y_train)

# Report the winning configuration and its cross-validated score.
print("Best parameters:", grid_search.best_params_)
print("Best AUC score:", grid_search.best_score_)

# Keep a handle on the best estimator exposed by the search.
best_model = grid_search.best_estimator_

In [None]:
import category_encoders as ce
import numpy as np

# Add the income-derived features to each split (train, then valid, then test).
X_train_advanced, X_valid_advanced, X_test_advanced = (
    advanced_feature_engineering(selected, source)
    for selected, source in (
        (X_train_selected, X_train),
        (X_valid_selected, X_valid),
        (X_test_selected, df_test),
    )
)

# Target-encode three columns: fit on the training split only, then apply the
# fitted encoder to the validation and test splits.
encoder = ce.TargetEncoder(cols=['TypeofContact', 'Occupation', 'ProductPitched'])
X_train_encoded = encoder.fit_transform(X_train_advanced, y_train)
X_valid_encoded, X_test_encoded = (
    encoder.transform(frame) for frame in (X_valid_advanced, X_test_advanced)
)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import numpy as np

# Hold out 20% of the training split to monitor early stopping.
# The target-encoded features are used here because the final evaluation below
# predicts on X_valid_encoded; training on the un-encoded X_train_advanced would
# feed the model differently scaled values at prediction time.
X_train_split, X_eval_split, y_train_split, y_eval_split = train_test_split(
    X_train_encoded, y_train, test_size=0.2, random_state=42
)

# LightGBM datasets (free_raw_data=False keeps the raw data available).
train_data = lgb.Dataset(X_train_split, label=y_train_split, free_raw_data=False)
valid_data = lgb.Dataset(X_eval_split, label=y_eval_split, reference=train_data, free_raw_data=False)

# Model parameters.
params = {
    'objective': 'binary',
    'learning_rate': 0.08,
    'num_leaves': 20,
    'metric': 'auc'
}

no_improve_rounds = 50   # stop after this many rounds without AUC improvement
max_boost_rounds = 500   # upper bound on boosting rounds

# Train once and let LightGBM's early-stopping callback watch the eval split.
# The previous loop called lgb.train(..., num_boost_round=num_round, init_model=...)
# on every pass; per the LightGBM docs num_boost_round counts *additional* rounds
# when continuing from init_model, so the model grew by num_round trees per pass
# and the recorded best_iteration did not correspond to a tree count.
lgb_model = lgb.train(
    params,
    train_data,
    num_boost_round=max_boost_rounds,
    valid_sets=[valid_data],
    valid_names=['eval'],
    callbacks=[lgb.early_stopping(stopping_rounds=no_improve_rounds)],
)

# best_iteration is reused by later cells; fall back to the full model if unset.
best_iteration = lgb_model.best_iteration or lgb_model.current_iteration()
best_score = roc_auc_score(
    y_eval_split, lgb_model.predict(X_eval_split, num_iteration=best_iteration)
)

# Fewer trees than the upper bound means the callback stopped training early.
if lgb_model.current_iteration() < max_boost_rounds:
    print(f"Early stopping at iteration {best_iteration} with best AUC score {best_score}")

# Score the validation split using the best number of iterations.
valid_pred = lgb_model.predict(X_valid_encoded, num_iteration=best_iteration)
print("Final AUC score on validation set with manual early stopping:", roc_auc_score(y_valid, valid_pred))

In [None]:
# 新しい特徴量を追加する関数
def enhanced_feature_engineering(df):
    """Add interaction / ratio features and one-hot encoded age groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Designation``, ``MonthlyIncome``, ``NumberOfTrips``,
        ``DurationOfPitch``, ``NumberOfPersonVisiting`` and ``Age``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``Designation_MonthlyIncome``, ``Trips_Per_Pitch``,
        ``Income_Per_Person`` and ``AgeGroup_*`` dummy columns added (first age
        group dropped). The input frame is not modified.
    """
    # Copy first: previously the new columns (including the temporary AgeGroup
    # column) were also written into the caller's frame.
    df = df.copy()

    # Interaction and ratio features.
    df['Designation_MonthlyIncome'] = df['Designation'] * df['MonthlyIncome']
    df['Trips_Per_Pitch'] = df['NumberOfTrips'] / (df['DurationOfPitch'] + 1)  # +1 avoids division by zero
    df['Income_Per_Person'] = df['MonthlyIncome'] / (df['NumberOfPersonVisiting'] + 1)

    # Age buckets: (0, 25], (25, 40], (40, 60], (60, 100], then one-hot encoded.
    df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 25, 40, 60, 100], labels=['young', 'adult', 'mid_age', 'senior'])
    df = pd.get_dummies(df, columns=['AgeGroup'], drop_first=True)

    return df

# Apply the extra feature engineering to the encoded train / valid / test frames.
X_train_enhanced, X_valid_enhanced, X_test_enhanced = (
    enhanced_feature_engineering(encoded)
    for encoded in (X_train_encoded, X_valid_encoded, X_test_encoded)
)

In [None]:
from sklearn.ensemble import VotingClassifier
import xgboost as xgb
import catboost as cb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Base estimators for the voting ensemble.
lgb_model = lgb.LGBMClassifier(learning_rate=0.1, n_estimators=100, num_leaves=31)
xgb_model = xgb.XGBClassifier(learning_rate=0.1, n_estimators=100)
cat_model = cb.CatBoostClassifier(learning_rate=0.1, iterations=100, verbose=0)
logreg_model = LogisticRegression(max_iter=1000)
# Seeded so the ensemble score is reproducible from run to run.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Soft-voting ensemble over the five estimators.
ensemble_model = VotingClassifier(estimators=[
    ('lgb', lgb_model), ('xgb', xgb_model), ('cat', cat_model),
    ('logreg', logreg_model), ('rf', rf_model)
], voting='soft')

# Fit on the selected training features and score the validation split.
ensemble_model.fit(X_train_selected, y_train)
valid_pred = ensemble_model.predict_proba(X_valid_selected)[:, 1]
print("AUC score on validation set with ensemble:", roc_auc_score(y_valid, valid_pred))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

# Fit each model on the enhanced training features.
lgb_model = lgb.LGBMClassifier(**grid_search.best_params_)
lgb_model.fit(X_train_enhanced, y_train)

# n_estimators reuses best_iteration from the earlier LightGBM training cell.
xgb_model = xgb.XGBClassifier(learning_rate=0.1, max_depth=6, n_estimators=best_iteration)
xgb_model.fit(X_train_enhanced, y_train)

cat_model = cb.CatBoostClassifier(learning_rate=0.1, iterations=100, verbose=0)
cat_model.fit(X_train_enhanced, y_train)

# Additional models; seeded so the blended score is reproducible.
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train_enhanced, y_train)

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_enhanced, y_train)

# Positive-class probabilities of every model on the validation split.
lgb_pred = lgb_model.predict_proba(X_valid_enhanced)[:, 1]
xgb_pred = xgb_model.predict_proba(X_valid_enhanced)[:, 1]
cat_pred = cat_model.predict_proba(X_valid_enhanced)[:, 1]
gb_pred = gb_model.predict_proba(X_valid_enhanced)[:, 1]
rf_pred = rf_model.predict_proba(X_valid_enhanced)[:, 1]

# Weighted blend (weights sum to 1.0; LightGBM and XGBoost are weighted highest).
ensemble_pred = 0.4 * lgb_pred + 0.3 * xgb_pred + 0.15 * cat_pred + 0.1 * gb_pred + 0.05 * rf_pred
print("AUC score on validation set with expanded ensemble:", roc_auc_score(y_valid, ensemble_pred))

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

# 10-fold stratified cross-validation (shuffled, fixed seed), scored by ROC AUC.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
lgb_scores, xgb_scores, cat_scores = (
    cross_val_score(estimator, X_train_enhanced, y_train, cv=skf, scoring='roc_auc')
    for estimator in (lgb_model, xgb_model, cat_model)
)

# Mean AUC across the folds for each model.
for label, fold_scores in (
    ("LGBM", lgb_scores),
    ("XGBoost", xgb_scores),
    ("CatBoost", cat_scores),
):
    print(label + " Stratified K-Fold AUC:", fold_scores.mean())

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Stacking ensemble: five base learners combined by a logistic-regression
# meta-model, with 5-fold cross-validation inside the stacker.
stacking_model = StackingClassifier(
    estimators=[
        ('lgb', lgb_model),
        ('xgb', xgb_model),
        ('cat', cat_model),
        # Seeded so the stacked predictions are reproducible.
        ('gb', GradientBoostingClassifier(random_state=42)),
        ('rf', RandomForestClassifier(random_state=42))
    ],
    final_estimator=LogisticRegression(max_iter=1000),  # meta-model
    cv=5
)

# Fit on the enhanced training features and score the validation split.
stacking_model.fit(X_train_enhanced, y_train)
stacking_pred = stacking_model.predict_proba(X_valid_enhanced)[:, 1]
print("AUC score on validation set with stacking ensemble:", roc_auc_score(y_valid, stacking_pred))

In [None]:
from pathlib import Path

# Positive-class probabilities of the stacking model on the test features.
test_stacking_pred = stacking_model.predict_proba(X_test_enhanced)[:, 1]

# Submission frame: one row per test id with its predicted probability.
submission = pd.DataFrame({'id': df_test['id'], 'prediction': test_stacking_pred})

# Create the output directory first; to_csv does not create missing folders.
submission_path = Path('evaluate/submission_v13.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)