##### ▶ライブラリ

In [1]:
import pandas as pd
import numpy as np
import requests
import io
import math
import copy
from scipy.stats import gmean
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import patches
from pipeline_functions import PipelineFunctions
pf = PipelineFunctions()
from view_functions import ViewFunctions
vf = ViewFunctions()

import umap
from sklearn.manifold import TSNE
import seaborn as sns
palette = ['#CC521D', '#4F4AD7', '#39AE3D']

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

##### ▶KNNを検証する

In [4]:
from sklearn.neighbors import KNeighborsClassifier

# KNN grid search over the feature files listed in PARAM_COMBO.
# For every (ptcnt_type, ptcnt, X_col) combination this cell:
#   1. loads the matching feature CSV and makes a stratified train+validation / test split,
#   2. runs K-fold cross-validation for every n_neighbors value in KNN_PARAMETER,
#   3. refits on the full train+validation split and scores the held-out test split,
#   4. writes one result CSV per combination (one row per n_neighbors value).

### Fixed parameters ###
TRAIN_TEST_SPLIT_SEED = 1
KFOLD_SHUFFLE_SEED = 1
KFOLD_SHUFFLE_LABEL = 'saito_label'
Y_COL = 'is_good_saito'
K = 5

### Parameters that vary ###
# NOTE(review): [220:] skips the first 220 combinations -- presumably to resume an
# interrupted run; remove the slice before a full re-run.
PARAM_COMBO = pd.read_csv('../temp/param_combo.csv', delimiter=',', index_col=0)[220:]
PARAM_COMBO_GRAD = pd.read_csv('../temp/param_combo_grad.csv', delimiter=',', index_col=0)

### Hyperparameters ###
KNN_PARAMETER = pd.read_csv('../temp/knn_parameter.csv', delimiter=',', index_col=0)

for _, combo in PARAM_COMBO.iterrows():
    ### Parameter combination ###
    ptcnt_type, ptcnt, input_col = combo['ptcnt_type'], combo['ptcnt'], combo['X_col']

    ### Result container: hyperparameter columns first, then one list per metric ###
    param_result_dict = {col: KNN_PARAMETER[col] for col in KNN_PARAMETER}
    for k in range(K):
        for metric in ('recall', 'precision', 'f1', 'accuracy'):
            param_result_dict[f'kfold_{k}_{metric}'] = []
    for metric in ('recall', 'precision', 'f1', 'accuracy'):
        param_result_dict[f'test_{metric}'] = []

    data = pd.read_csv(f'../temp/eachpt_feature/{ptcnt_type}_{ptcnt}_{input_col}.csv', delimiter=',', index_col=0)
    X = data.copy()
    y = data[Y_COL]
    # Stratified split: 725 rows for training+validation, the remainder for testing
    # (the original note describes this as 725:310).
    train_valid, test, y_train_valid, y_test = train_test_split(X, y, train_size=725, shuffle=True, stratify=y, random_state=TRAIN_TEST_SPLIT_SEED)

    ### Explanatory / target columns ###
    X_col = [col for col in train_valid if 'feature_' in col]
    y_col = Y_COL

    ### K-fold splits, prepared once per feature file ###
    # The splits and the scaling do not depend on n_neighbors, and the fixed
    # random_state makes the folds identical on every call, so they are built once here.
    skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=KFOLD_SHUFFLE_SEED)
    folds = []
    for train_index, valid_index in skf.split(train_valid, train_valid[KFOLD_SHUFFLE_LABEL]):
        train, valid = train_valid.iloc[train_index], train_valid.iloc[valid_index]
        # Min-max normalisation: min/max are learned on the training fold only and
        # then applied to the validation fold (MinMaxScaler scales each column independently).
        scaler = MinMaxScaler()
        fold_train_X = scaler.fit_transform(train[X_col])
        fold_valid_X = scaler.transform(valid[X_col])
        folds.append((fold_train_X, train[y_col].to_numpy(), fold_valid_X, valid[y_col].to_numpy()))

    ### Final train / test arrays (min/max learned on the whole train+validation split) ###
    scaler = MinMaxScaler()
    train_valid_X = scaler.fit_transform(train_valid[X_col])
    test_X = scaler.transform(test[X_col])
    train_valid_y = train_valid[y_col].to_numpy()
    test_y = test[y_col].to_numpy()

    for _, knn_row in KNN_PARAMETER.iterrows():
        ### Hyperparameter ###
        n_neighbors = int(knn_row['n_neighbors'])

        ### Cross-validation ###
        for k, (train_X, train_y, valid_X, valid_y) in enumerate(folds):
            model = KNeighborsClassifier(n_neighbors=n_neighbors)
            model.fit(train_X, train_y)
            valid_pred = model.predict(valid_X)
            # average=None returns one score per class; index 1 is the second label
            # in sorted order (label 1 when the target is coded 0/1).
            param_result_dict[f'kfold_{k}_recall'].append(recall_score(valid_y, valid_pred, average=None)[1])
            param_result_dict[f'kfold_{k}_precision'].append(precision_score(valid_y, valid_pred, average=None)[1])
            param_result_dict[f'kfold_{k}_f1'].append(f1_score(valid_y, valid_pred, average=None)[1])
            param_result_dict[f'kfold_{k}_accuracy'].append(accuracy_score(valid_y, valid_pred))

        ### Test evaluation ###
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
        # FIX: fit on the full train+validation split. The previous code fitted on
        # train_X / train_y, i.e. the arrays left over from the last CV fold.
        model.fit(train_valid_X, train_valid_y)
        test_pred = model.predict(test_X)
        param_result_dict['test_recall'].append(recall_score(test_y, test_pred, average=None)[1])
        param_result_dict['test_precision'].append(precision_score(test_y, test_pred, average=None)[1])
        param_result_dict['test_f1'].append(f1_score(test_y, test_pred, average=None)[1])
        param_result_dict['test_accuracy'].append(accuracy_score(test_y, test_pred))

    param_result_aftertrain = pd.DataFrame(param_result_dict)
    param_result_aftertrain.to_csv(f'../temp/result_knn/{ptcnt_type}_{ptcnt}_{input_col}.csv')
    print(ptcnt_type, ptcnt)
    print('==================================')

1 58
1 58
1 58
1 58
1 59


  _warn_prf(average, modifier, msg_start, len(result))


1 59
1 59
1 59
1 60


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


1 60
1 60
1 60
1 61
1 61
1 61
1 61
1 62
1 62
1 62
1 62
1 63
1 63
1 63
1 63
1 64


  _warn_prf(average, modifier, msg_start, len(result))


1 64
1 64
1 64
1 65
1 65
1 65
1 65
1 66
1 66
1 66
1 66
1 67
1 67
1 67
1 67
1 68
1 68
1 68
1 68
1 69
1 69
1 69
1 69
1 70


  _warn_prf(average, modifier, msg_start, len(result))


1 70
1 70
1 70
1 71


  _warn_prf(average, modifier, msg_start, len(result))


1 71
1 71
1 71
1 72


  _warn_prf(average, modifier, msg_start, len(result))


1 72
1 72
1 72
1 73
1 73
1 73
1 73
1 74
1 74
1 74
1 74
1 75
1 75
1 75
1 75
1 76


  _warn_prf(average, modifier, msg_start, len(result))


1 76
1 76
1 76
1 77


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


1 77
1 77
1 77
1 78
1 78
1 78
1 78
1 79


  _warn_prf(average, modifier, msg_start, len(result))


1 79
1 79
1 79
1 80
1 80
1 80
1 80
1 81
1 81
1 81
1 81
1 82


  _warn_prf(average, modifier, msg_start, len(result))


1 82
1 82
1 82
1 83
1 83
1 83
1 83
2 3
2 3
2 3
2 3
2 4
2 4
2 4
2 4
2 5
2 5
2 5
2 5
2 6
2 6
2 6
2 6
2 7
2 7
2 7
2 7
2 8
2 8
2 8
2 8
2 9
2 9
2 9
2 9
2 10
2 10
2 10
2 10
2 11
2 11
2 11
2 11
2 12
2 12
2 12
2 12
2 13
2 13
2 13
2 13
2 14
2 14
2 14
2 14
2 15
2 15
2 15
2 15
2 16
2 16
2 16
2 16
2 17
2 17
2 17
2 17
2 18
2 18
2 18
2 18
2 19
2 19
2 19
2 19
2 20
2 20
2 20
2 20
2 21
2 21
2 21
2 21
2 22
2 22
2 22
2 22
2 23
2 23
2 23
2 23
2 24
2 24
2 24
2 24
2 25
2 25
2 25
2 25
2 26
2 26
2 26
2 26
2 27
2 27
2 27
2 27
2 28
2 28
2 28
2 28
2 29
2 29
2 29
2 29
2 30
2 30
2 30
2 30
2 31


  _warn_prf(average, modifier, msg_start, len(result))


2 31
2 31
2 31
2 32
2 32
2 32
2 32
2 33
2 33
2 33
2 33
2 34
2 34
2 34
2 34
2 35
2 35
2 35
2 35
2 36
2 36
2 36
2 36
2 37
2 37
2 37
2 37
2 38


  _warn_prf(average, modifier, msg_start, len(result))


2 38
2 38
2 38
2 39
2 39
2 39
2 39
2 40
2 40
2 40
2 40
2 41
2 41
2 41
2 41
2 42
2 42
2 42
2 42
2 43
2 43
2 43
2 43
2 44


  _warn_prf(average, modifier, msg_start, len(result))


2 44
2 44
2 44
2 45
2 45
2 45
2 45
2 46
2 46
2 46
2 46
2 47


  _warn_prf(average, modifier, msg_start, len(result))


2 47
2 47
2 47
2 48
2 48
2 48
2 48
2 49


  _warn_prf(average, modifier, msg_start, len(result))


2 49
2 49
2 49
2 50
2 50
2 50
2 50
2 51
2 51
2 51
2 51
2 52


  _warn_prf(average, modifier, msg_start, len(result))


2 52
2 52
2 52
2 53
2 53
2 53
2 53
2 54
2 54
2 54
2 54
2 55


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


2 55
2 55
2 55
2 56
2 56
2 56
2 56
2 57


  _warn_prf(average, modifier, msg_start, len(result))


2 57
2 57
2 57
2 58
2 58
2 58
2 58
2 59
2 59
2 59
2 59
2 60


  _warn_prf(average, modifier, msg_start, len(result))


2 60
2 60
2 60
2 61
2 61
2 61
2 61
2 62


  _warn_prf(average, modifier, msg_start, len(result))


2 62
2 62
2 62
2 63
2 63
2 63
2 63
2 64


  _warn_prf(average, modifier, msg_start, len(result))


2 64
2 64
2 64
2 65
2 65
2 65
2 65
2 66


  _warn_prf(average, modifier, msg_start, len(result))


2 66
2 66
2 66
2 67


  _warn_prf(average, modifier, msg_start, len(result))


2 67
2 67
2 67
2 68
2 68
2 68
2 68
2 69


  _warn_prf(average, modifier, msg_start, len(result))


2 69
2 69
2 69
2 70
2 70
2 70
2 70
2 71
2 71
2 71
2 71
2 72
2 72
2 72
2 72
2 73
2 73
2 73
2 73
2 74


  _warn_prf(average, modifier, msg_start, len(result))


2 74
2 74
2 74
2 75
2 75
2 75
2 75
2 76
2 76
2 76
2 76
2 77
2 77
2 77
2 77
2 78
2 78
2 78
2 78
2 79
2 79
2 79
2 79
2 80
2 80
2 80
2 80
2 81
2 81
2 81
2 81
2 82


  _warn_prf(average, modifier, msg_start, len(result))


2 82
2 82
2 82
2 83


  _warn_prf(average, modifier, msg_start, len(result))


2 83
2 83
2 83


In [5]:
from sklearn.neighbors import KNeighborsClassifier

# KNN grid search over the gradient feature files listed in PARAM_COMBO_GRAD.
# For every (ptcnt_type, ptcnt, X_col) combination this cell:
#   1. loads the matching CSV from eachpt_feature_grad and makes a stratified
#      train+validation / test split,
#   2. runs K-fold cross-validation for every n_neighbors value in KNN_PARAMETER,
#   3. refits on the full train+validation split and scores the held-out test split,
#   4. writes one result CSV per combination (one row per n_neighbors value).

### Fixed parameters ###
TRAIN_TEST_SPLIT_SEED = 1
KFOLD_SHUFFLE_SEED = 1
KFOLD_SHUFFLE_LABEL = 'saito_label'
Y_COL = 'is_good_saito'
K = 5

### Parameters that vary ###
PARAM_COMBO = pd.read_csv('../temp/param_combo.csv', delimiter=',', index_col=0)
# NOTE(review): [2:] skips the first two combinations -- presumably because they were
# already processed; remove the slice before a full re-run.
PARAM_COMBO_GRAD = pd.read_csv('../temp/param_combo_grad.csv', delimiter=',', index_col=0)[2:]

### Hyperparameters ###
KNN_PARAMETER = pd.read_csv('../temp/knn_parameter.csv', delimiter=',', index_col=0)

for _, combo in PARAM_COMBO_GRAD.iterrows():
    ### Parameter combination ###
    ptcnt_type, ptcnt, input_col = combo['ptcnt_type'], combo['ptcnt'], combo['X_col']

    ### Result container: hyperparameter columns first, then one list per metric ###
    param_result_dict = {col: KNN_PARAMETER[col] for col in KNN_PARAMETER}
    for k in range(K):
        for metric in ('recall', 'precision', 'f1', 'accuracy'):
            param_result_dict[f'kfold_{k}_{metric}'] = []
    for metric in ('recall', 'precision', 'f1', 'accuracy'):
        param_result_dict[f'test_{metric}'] = []

    data = pd.read_csv(f'../temp/eachpt_feature_grad/{ptcnt_type}_{ptcnt}_{input_col}.csv', delimiter=',', index_col=0)
    X = data.copy()
    y = data[Y_COL]
    # Stratified split: 725 rows for training+validation, the remainder for testing
    # (the original note describes this as 725:310).
    train_valid, test, y_train_valid, y_test = train_test_split(X, y, train_size=725, shuffle=True, stratify=y, random_state=TRAIN_TEST_SPLIT_SEED)

    ### Explanatory / target columns ###
    X_col = [col for col in train_valid if 'feature_' in col]
    y_col = Y_COL

    ### K-fold splits, prepared once per feature file ###
    # The splits and the scaling do not depend on n_neighbors, and the fixed
    # random_state makes the folds identical on every call, so they are built once here.
    skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=KFOLD_SHUFFLE_SEED)
    folds = []
    for train_index, valid_index in skf.split(train_valid, train_valid[KFOLD_SHUFFLE_LABEL]):
        train, valid = train_valid.iloc[train_index], train_valid.iloc[valid_index]
        # Min-max normalisation: min/max are learned on the training fold only and
        # then applied to the validation fold (MinMaxScaler scales each column independently).
        scaler = MinMaxScaler()
        fold_train_X = scaler.fit_transform(train[X_col])
        fold_valid_X = scaler.transform(valid[X_col])
        folds.append((fold_train_X, train[y_col].to_numpy(), fold_valid_X, valid[y_col].to_numpy()))

    ### Final train / test arrays (min/max learned on the whole train+validation split) ###
    scaler = MinMaxScaler()
    train_valid_X = scaler.fit_transform(train_valid[X_col])
    test_X = scaler.transform(test[X_col])
    train_valid_y = train_valid[y_col].to_numpy()
    test_y = test[y_col].to_numpy()

    for _, knn_row in KNN_PARAMETER.iterrows():
        ### Hyperparameter ###
        n_neighbors = int(knn_row['n_neighbors'])

        ### Cross-validation ###
        for k, (train_X, train_y, valid_X, valid_y) in enumerate(folds):
            model = KNeighborsClassifier(n_neighbors=n_neighbors)
            model.fit(train_X, train_y)
            valid_pred = model.predict(valid_X)
            # average=None returns one score per class; index 1 is the second label
            # in sorted order (label 1 when the target is coded 0/1).
            param_result_dict[f'kfold_{k}_recall'].append(recall_score(valid_y, valid_pred, average=None)[1])
            param_result_dict[f'kfold_{k}_precision'].append(precision_score(valid_y, valid_pred, average=None)[1])
            param_result_dict[f'kfold_{k}_f1'].append(f1_score(valid_y, valid_pred, average=None)[1])
            param_result_dict[f'kfold_{k}_accuracy'].append(accuracy_score(valid_y, valid_pred))

        ### Test evaluation ###
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
        # FIX: fit on the full train+validation split. The previous code fitted on
        # train_X / train_y, i.e. the arrays left over from the last CV fold.
        model.fit(train_valid_X, train_valid_y)
        test_pred = model.predict(test_X)
        param_result_dict['test_recall'].append(recall_score(test_y, test_pred, average=None)[1])
        param_result_dict['test_precision'].append(precision_score(test_y, test_pred, average=None)[1])
        param_result_dict['test_f1'].append(f1_score(test_y, test_pred, average=None)[1])
        param_result_dict['test_accuracy'].append(accuracy_score(test_y, test_pred))

    # NOTE(review): this is the same output directory the non-gradient run uses; a file
    # is overwritten if both combo tables contain the same (ptcnt_type, ptcnt, X_col)
    # triple -- confirm the X_col values differ.
    param_result_aftertrain = pd.DataFrame(param_result_dict)
    param_result_aftertrain.to_csv(f'../temp/result_knn/{ptcnt_type}_{ptcnt}_{input_col}.csv')
    print(ptcnt_type, ptcnt)
    print('==================================')

2 4
2 4
2 5
2 5
2 6
2 6
2 7
2 7
2 8
2 8
2 9
2 9
2 10
2 10
2 11
2 11
2 12
2 12
2 13
2 13
2 14
2 14
2 15
2 15
2 16
2 16
2 17
2 17
2 18
2 18
2 19
2 19
2 20
2 20
2 21
2 21
2 22
2 22
2 23
2 23
2 24
2 24
2 25
2 25
2 26
2 26
2 27
2 27
2 28
2 28
2 29
2 29
2 30
2 30
2 31
2 31
2 32
2 32
2 33
2 33
2 34
2 34
2 35
2 35
2 36
2 36
2 37
2 37
2 38
2 38
2 39
2 39
2 40
2 40
2 41
2 41
2 42
2 42
2 43
2 43
2 44
2 44
2 45
2 45
2 46
2 46
2 47
2 47
2 48
2 48
2 49
2 49
2 50
2 50
2 51
2 51
2 52
2 52
2 53
2 53
2 54
2 54
2 55
2 55
2 56
2 56
2 57
2 57
2 58
2 58
2 59
2 59
2 60
2 60
2 61
2 61
2 62
2 62
2 63
2 63
2 64
2 64
2 65
2 65
2 66
2 66
2 67
2 67
2 68
2 68
2 69
2 69
2 70
2 70
2 71
2 71
2 72
2 72
2 73
2 73
2 74
2 74
2 75
2 75
2 76
2 76
2 77
2 77
2 78
2 78
2 79
2 79
2 80
2 80
2 81
2 81
2 82
2 82
2 83
2 83


##### ▶ハイパーパラメータ

In [2]:
# Candidate neighbour counts for the KNN grid search: every value from 1 to 10,
# then 15, 20 and 30. Saved as a one-column table that the search cells read back.
n_neighbors = [*range(1, 11), 15, 20, 30]

knn_parameter_combo = pd.DataFrame({'n_neighbors': n_neighbors})
knn_parameter_combo.to_csv('../temp/knn_parameter.csv')

In [4]:
# Baseline: score the rule-based judgement (`is_good_rulebase`) against the
# `is_good_saito` label on the held-out test split.
# NOTE: this cell depends on ptcnt_type, ptcnt, input_col, Y_COL and
# TRAIN_TEST_SPLIT_SEED already being defined in the notebook namespace
# (they are set by the grid-search cells).
data = pd.read_csv(f'../temp/eachpt_feature/{ptcnt_type}_{ptcnt}_{input_col}.csv', delimiter=',', index_col=0)
X = data.copy()
y = data[Y_COL]
# Stratified split: 725 rows for training+validation, the remainder for testing
# (the original note describes this as 725:310).
train_valid, test, y_train_valid, y_test = train_test_split(X, y, train_size=725, shuffle=True, stratify=y, random_state=TRAIN_TEST_SPLIT_SEED)

# Confusion-matrix counts, with the rule-based output as the prediction.
pred = test['is_good_rulebase']
ans = test['is_good_saito']
tp = int(((pred == 1) & (ans == 1)).sum())
tn = int(((pred == 0) & (ans == 0)).sum())
fp = int(((pred == 1) & (ans == 0)).sum())
fn = int(((pred == 0) & (ans == 1)).sum())

# Report 0.0 instead of raising ZeroDivisionError when a denominator is empty.
recall = tp / (tp + fn) if (tp + fn) else 0.0
precision = tp / (tp + fp) if (tp + fp) else 0.0
f_measure = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0
print('TP : ', tp)
# FIX: this line was labelled 'TF'; the value is the true-negative count.
print('TN : ', tn)
print('FP : ', fp)
print('FN : ', fn)
print('recall : ', round(recall, 4))
print('precision : ', round(precision, 4))
print('f_measure : ', round(f_measure, 4))

Unnamed: 0,drawing_id,stroke_id,saito_label,is_good_saito,is_good_rulebase,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5
0,100,1035,straight,1,1,0.000000,12.144578,420.000000,197.349398,181.156627,0.000000
1,100,1062,straight,1,1,420.000000,0.000000,41.793612,0.000000,208.452088,169.238329
2,100,1071,straight,1,1,420.000000,0.000000,75.600000,0.000000,202.650000,138.600000
3,100,1092,straight,1,1,420.000000,0.000000,31.276596,0.000000,413.297872,411.063830
4,100,111,straight,1,1,0.000000,95.711253,96.305732,0.000000,411.380042,420.000000
...,...,...,...,...,...,...,...,...,...,...,...
1030,960,1801,complex,0,1,52.500000,402.500000,392.291667,0.000000,420.000000,412.708333
1031,960,424,complex,0,0,47.727273,0.000000,164.876033,334.669421,420.000000,24.876033
1032,960,551,complex,0,0,0.000000,62.412587,420.000000,238.636364,0.000000,111.608392
1033,960,84,complex,0,0,350.209205,0.753138,420.000000,55.230126,308.786611,0.000000


##### ▶テストデータだけやり直す

In [2]:
from sklearn.neighbors import KNeighborsClassifier

# Test-only rerun for the feature files listed in PARAM_COMBO.
# For every (ptcnt_type, ptcnt, X_col) combination: fit KNN on the full
# train+validation split for each n_neighbors value, score the held-out test
# split, and write one CSV per combination (one row per n_neighbors value).
# No cross-validation is performed in this cell.

### Fixed parameters ###
TRAIN_TEST_SPLIT_SEED = 1
KFOLD_SHUFFLE_SEED = 1
KFOLD_SHUFFLE_LABEL = 'saito_label'
Y_COL = 'is_good_saito'
K = 5

### Parameters that vary ###
PARAM_COMBO = pd.read_csv('../temp/param_combo.csv', delimiter=',', index_col=0)
PARAM_COMBO_GRAD = pd.read_csv('../temp/param_combo_grad.csv', delimiter=',', index_col=0)

### Hyperparameters ###
KNN_PARAMETER = pd.read_csv('../temp/knn_parameter.csv', delimiter=',', index_col=0)

for _, combo in PARAM_COMBO.iterrows():
    ### Parameter combination ###
    ptcnt_type, ptcnt, input_col = combo['ptcnt_type'], combo['ptcnt'], combo['X_col']

    ### Result container: hyperparameter columns first, then one list per test metric ###
    param_result_dict = {col: KNN_PARAMETER[col] for col in KNN_PARAMETER}
    for metric in ('recall', 'precision', 'f1', 'accuracy'):
        param_result_dict[f'test_{metric}'] = []

    data = pd.read_csv(f'../temp/eachpt_feature/{ptcnt_type}_{ptcnt}_{input_col}.csv', delimiter=',', index_col=0)
    X = data.copy()
    y = data[Y_COL]
    # Stratified split: 725 rows for training+validation, the remainder for testing
    # (the original note describes this as 725:310).
    train_valid, test, y_train_valid, y_test = train_test_split(X, y, train_size=725, shuffle=True, stratify=y, random_state=TRAIN_TEST_SPLIT_SEED)

    ### Explanatory / target columns ###
    X_col = [col for col in train_valid if 'feature_' in col]
    y_col = Y_COL

    ### Scaling, done once per feature file because it does not depend on n_neighbors ###
    # Min-max normalisation: min/max are learned on the train+validation split only
    # and then applied to the test split (MinMaxScaler scales each column independently).
    scaler = MinMaxScaler()
    train_valid_X = scaler.fit_transform(train_valid[X_col])
    test_X = scaler.transform(test[X_col])
    train_valid_y = train_valid[y_col].to_numpy()
    test_y = test[y_col].to_numpy()

    for _, knn_row in KNN_PARAMETER.iterrows():
        ### Hyperparameter ###
        n_neighbors = int(knn_row['n_neighbors'])

        ### Fit on train+validation, predict the test split ###
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
        model.fit(train_valid_X, train_valid_y)
        test_pred = model.predict(test_X)

        ### Test metrics ###
        # average=None returns one score per class; index 1 is the second label
        # in sorted order (label 1 when the target is coded 0/1).
        param_result_dict['test_recall'].append(recall_score(test_y, test_pred, average=None)[1])
        param_result_dict['test_precision'].append(precision_score(test_y, test_pred, average=None)[1])
        param_result_dict['test_f1'].append(f1_score(test_y, test_pred, average=None)[1])
        param_result_dict['test_accuracy'].append(accuracy_score(test_y, test_pred))

    param_result_aftertrain = pd.DataFrame(param_result_dict)
    param_result_aftertrain.to_csv(f'../temp/test_knn/{ptcnt_type}_{ptcnt}_{input_col}.csv')
    print(ptcnt_type, ptcnt)
    print('==================================')

1 3
1 3
1 3
1 3
1 4
1 4
1 4
1 4
1 5
1 5
1 5
1 5
1 6
1 6
1 6
1 6
1 7
1 7
1 7
1 7
1 8
1 8
1 8
1 8
1 9
1 9
1 9
1 9
1 10
1 10
1 10
1 10
1 11
1 11
1 11
1 11
1 12
1 12
1 12
1 12
1 13
1 13
1 13
1 13
1 14
1 14
1 14
1 14
1 15
1 15
1 15
1 15
1 16
1 16
1 16
1 16
1 17
1 17
1 17
1 17
1 18
1 18
1 18
1 18
1 19
1 19
1 19
1 19
1 20
1 20
1 20
1 20
1 21
1 21
1 21
1 21
1 22
1 22
1 22
1 22
1 23
1 23
1 23
1 23
1 24
1 24
1 24
1 24
1 25
1 25
1 25
1 25
1 26
1 26
1 26
1 26
1 27
1 27
1 27
1 27
1 28
1 28
1 28
1 28
1 29
1 29
1 29
1 29
1 30
1 30
1 30
1 30
1 31
1 31
1 31
1 31
1 32
1 32
1 32
1 32
1 33
1 33
1 33
1 33
1 34
1 34
1 34
1 34
1 35
1 35
1 35
1 35
1 36
1 36
1 36
1 36
1 37
1 37
1 37
1 37
1 38
1 38
1 38
1 38
1 39
1 39
1 39
1 39
1 40
1 40
1 40
1 40
1 41
1 41
1 41
1 41
1 42
1 42
1 42
1 42
1 43
1 43
1 43
1 43
1 44
1 44
1 44
1 44
1 45
1 45
1 45
1 45
1 46
1 46
1 46
1 46
1 47
1 47
1 47
1 47
1 48
1 48
1 48
1 48
1 49
1 49
1 49
1 49
1 50
1 50
1 50
1 50
1 51
1 51
1 51
1 51
1 52
1 52
1 52
1 52
1 53
1 53
1 53
1 53
1 54
1 5

In [3]:
from sklearn.neighbors import KNeighborsClassifier

# Test-only rerun for the gradient feature files listed in PARAM_COMBO_GRAD.
# For every (ptcnt_type, ptcnt, X_col) combination: fit KNN on the full
# train+validation split for each n_neighbors value, score the held-out test
# split, and write one CSV per combination (one row per n_neighbors value).
# No cross-validation is performed in this cell.

### Fixed parameters ###
TRAIN_TEST_SPLIT_SEED = 1
KFOLD_SHUFFLE_SEED = 1
KFOLD_SHUFFLE_LABEL = 'saito_label'
Y_COL = 'is_good_saito'
K = 5

### Parameters that vary ###
# NOTE(review): the [2:] slice is applied to PARAM_COMBO, which this cell never
# iterates; the loop below runs over the unsliced PARAM_COMBO_GRAD. Confirm whether
# the slice was meant for PARAM_COMBO_GRAD.
PARAM_COMBO = pd.read_csv('../temp/param_combo.csv', delimiter=',', index_col=0)[2:]
PARAM_COMBO_GRAD = pd.read_csv('../temp/param_combo_grad.csv', delimiter=',', index_col=0)

### Hyperparameters ###
KNN_PARAMETER = pd.read_csv('../temp/knn_parameter.csv', delimiter=',', index_col=0)

for _, combo in PARAM_COMBO_GRAD.iterrows():
    ### Parameter combination ###
    ptcnt_type, ptcnt, input_col = combo['ptcnt_type'], combo['ptcnt'], combo['X_col']

    ### Result container: hyperparameter columns first, then one list per test metric ###
    param_result_dict = {col: KNN_PARAMETER[col] for col in KNN_PARAMETER}
    for metric in ('recall', 'precision', 'f1', 'accuracy'):
        param_result_dict[f'test_{metric}'] = []

    data = pd.read_csv(f'../temp/eachpt_feature_grad/{ptcnt_type}_{ptcnt}_{input_col}.csv', delimiter=',', index_col=0)
    X = data.copy()
    y = data[Y_COL]
    # Stratified split: 725 rows for training+validation, the remainder for testing
    # (the original note describes this as 725:310).
    train_valid, test, y_train_valid, y_test = train_test_split(X, y, train_size=725, shuffle=True, stratify=y, random_state=TRAIN_TEST_SPLIT_SEED)

    ### Explanatory / target columns ###
    X_col = [col for col in train_valid if 'feature_' in col]
    y_col = Y_COL

    ### Scaling, done once per feature file because it does not depend on n_neighbors ###
    # Min-max normalisation: min/max are learned on the train+validation split only
    # and then applied to the test split (MinMaxScaler scales each column independently).
    scaler = MinMaxScaler()
    train_valid_X = scaler.fit_transform(train_valid[X_col])
    test_X = scaler.transform(test[X_col])
    train_valid_y = train_valid[y_col].to_numpy()
    test_y = test[y_col].to_numpy()

    for _, knn_row in KNN_PARAMETER.iterrows():
        ### Hyperparameter ###
        n_neighbors = int(knn_row['n_neighbors'])

        ### Fit on train+validation, predict the test split ###
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
        model.fit(train_valid_X, train_valid_y)
        test_pred = model.predict(test_X)

        ### Test metrics ###
        # average=None returns one score per class; index 1 is the second label
        # in sorted order (label 1 when the target is coded 0/1).
        param_result_dict['test_recall'].append(recall_score(test_y, test_pred, average=None)[1])
        param_result_dict['test_precision'].append(precision_score(test_y, test_pred, average=None)[1])
        param_result_dict['test_f1'].append(f1_score(test_y, test_pred, average=None)[1])
        param_result_dict['test_accuracy'].append(accuracy_score(test_y, test_pred))

    # NOTE(review): this is the same output directory the non-gradient test rerun uses;
    # a file is overwritten if both combo tables contain the same
    # (ptcnt_type, ptcnt, X_col) triple -- confirm the X_col values differ.
    param_result_aftertrain = pd.DataFrame(param_result_dict)
    param_result_aftertrain.to_csv(f'../temp/test_knn/{ptcnt_type}_{ptcnt}_{input_col}.csv')
    print(ptcnt_type, ptcnt)
    print('==================================')

2 4
2 4
2 5
2 5
2 6
2 6
2 7
2 7
2 8
2 8
2 9
2 9
2 10
2 10
2 11
2 11
2 12
2 12
2 13
2 13
2 14
2 14
2 15
2 15
2 16
2 16
2 17
2 17
2 18
2 18
2 19
2 19
2 20
2 20
2 21
2 21
2 22
2 22
2 23
2 23
2 24
2 24
2 25
2 25
2 26
2 26
2 27
2 27
2 28
2 28
2 29
2 29
2 30
2 30
2 31
2 31
2 32
2 32
2 33
2 33
2 34
2 34
2 35
2 35
2 36
2 36
2 37
2 37
2 38
2 38
2 39
2 39
2 40
2 40
2 41
2 41
2 42
2 42
2 43
2 43
2 44
2 44
2 45
2 45
2 46
2 46
2 47
2 47
2 48
2 48
2 49
2 49
2 50
2 50
2 51
2 51
2 52
2 52
2 53
2 53
2 54
2 54
2 55
2 55
2 56
2 56
2 57
2 57
2 58
2 58
2 59
2 59
2 60
2 60
2 61
2 61
2 62
2 62
2 63
2 63
2 64
2 64
2 65
2 65
2 66
2 66
2 67
2 67
2 68
2 68
2 69
2 69
2 70
2 70
2 71
2 71
2 72
2 72
2 73
2 73
2 74
2 74
2 75
2 75
2 76
2 76
2 77
2 77
2 78
2 78
2 79
2 79
2 80
2 80
2 81
2 81
2 82
2 82
2 83
2 83
