In [1]:
import pandas as pd

# csvの読み取り
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

train_data.head()

Unnamed: 0,Age,Gender,T_Bil,D_Bil,ALP,ALT_GPT,AST_GOT,TP,Alb,AG_ratio,disease
0,59,Male,0.786886,0.150498,220.178691,13.467617,21.729246,6.815731,3.112276,1.006802,0
1,69,Male,1.003987,0.195625,221.218413,51.033462,64.752323,6.889608,3.051521,0.751346,1
2,65,Male,0.65714,0.081287,320.770533,12.625011,30.61318,5.947767,2.489167,0.774952,0
3,65,Male,0.906822,0.214173,369.27816,34.347597,54.510085,6.967183,3.613837,0.988155,1
4,22,Female,1.734959,0.197706,222.782025,20.572891,170.010177,5.837537,3.068697,1.026654,0


In [2]:
#欠損値の確認
missing_values = train_data.isnull().sum()
missing_values

Age         0
Gender      0
T_Bil       0
D_Bil       0
ALP         0
ALT_GPT     0
AST_GOT     0
TP          0
Alb         0
AG_ratio    0
disease     0
dtype: int64

In [3]:
# Encode the 'Gender' column (Male: 1, Female: 0)
train_data['Gender'] = train_data['Gender'].map({'Male': 1, 'Female': 0})

train_data.head()

Unnamed: 0,Age,Gender,T_Bil,D_Bil,ALP,ALT_GPT,AST_GOT,TP,Alb,AG_ratio,disease
0,59,1,0.786886,0.150498,220.178691,13.467617,21.729246,6.815731,3.112276,1.006802,0
1,69,1,1.003987,0.195625,221.218413,51.033462,64.752323,6.889608,3.051521,0.751346,1
2,65,1,0.65714,0.081287,320.770533,12.625011,30.61318,5.947767,2.489167,0.774952,0
3,65,1,0.906822,0.214173,369.27816,34.347597,54.510085,6.967183,3.613837,0.988155,1
4,22,0,1.734959,0.197706,222.782025,20.572891,170.010177,5.837537,3.068697,1.026654,0


In [4]:
from sklearn.preprocessing import StandardScaler

# スケールの初期化
scaler = StandardScaler()

# スケーリングする特徴量の選択し
features_to_scale = ['Age', 'T_Bil', 'D_Bil', 'ALP', 'ALT_GPT', 'AST_GOT', 'TP', 'Alb', 'AG_ratio']

#スケーリングの適用
train_data[features_to_scale] = scaler.fit_transform(train_data[features_to_scale])

train_data.head()

Unnamed: 0,Age,Gender,T_Bil,D_Bil,ALP,ALT_GPT,AST_GOT,TP,Alb,AG_ratio,disease
0,0.753431,1,-0.335955,-0.290949,-0.259863,-0.195492,-0.356975,-0.257757,-0.724042,-0.606631,0
1,1.361556,1,-0.247057,-0.262171,-0.254671,0.062088,0.033929,-0.169826,-0.832608,-1.731472,1
2,1.118306,1,-0.389083,-0.335085,0.242516,-0.20127,-0.276256,-1.290839,-1.837506,-1.627531,0
3,1.118306,1,-0.286844,-0.250342,0.484775,-0.052323,-0.059131,-0.077493,0.172224,-0.688739,1
4,-1.496631,0,0.052258,-0.260844,-0.246862,-0.146773,0.990291,-1.422039,-0.801915,-0.519213,0


In [5]:

import lightgbm as lgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score

# データの準備
X = train_data.drop('disease', axis=1)
y = train_data['disease']

# 訓練用データと検証用データの分割
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# LightGBMモデルの初期化
lgbm_model = lgb.LGBMClassifier(random_state=42)

#  モデルの訓練
lgbm_model.fit(X_train, y_train)

# 検証データに対する予測
y_val_pred_proba = lgbm_model.predict_proba(X_val)[:, 1]

# AUCスコアの計算
auc_score = roc_auc_score(y_val, y_val_pred_proba)

# AUCスコアの表示
auc_score


[LightGBM] [Info] Number of positive: 301, number of negative: 379
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001242 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1859
[LightGBM] [Info] Number of data points in the train set: 680, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.442647 -> initscore=-0.230426
[LightGBM] [Info] Start training from score -0.230426


0.9674626448819996

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Re-define X and y
X = train_data.drop('disease', axis=1)
y = train_data['disease']

# Re-split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the RandomForest model
rf_model = RandomForestClassifier(random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predict probabilities on the validation set
y_val_pred_proba_rf = rf_model.predict_proba(X_val)[:, 1]

# Calculate the AUC score for RandomForest
auc_score_rf = roc_auc_score(y_val, y_val_pred_proba_rf)

# Display the AUC score
print(auc_score_rf)

0.9733975701717636


In [7]:
# 'Gender'列を数値に変換
test_data['Gender'] = test_data['Gender'].map({'Male': 1, 'Female': 0})

# 予測の再計算
test_predictions_proba = rf_model.predict_proba(test_data)[:, 1]

# 提出用データの作成
submission = pd.DataFrame({
    'id': test_data.index,
    'predicted_probability': test_predictions_proba
})

# CSVファイルとして保存
submission_file_path = 'submission.csv'
submission.to_csv(submission_file_path, index=False, header=False)

# CSVファイルの確認
import pandas as pd
pd.read_csv(submission_file_path)

Unnamed: 0,0,0.79
0,1,0.76
1,2,0.79
2,3,0.79
3,4,0.76
4,5,0.79
...,...,...
344,345,0.76
345,346,0.79
346,347,0.79
347,348,0.79
