In [10]:
import json
from datetime import datetime

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold

from model.sequential_classfier import SequentialClassifier
from type.abbreviation import Abbreviation
from type.features import CrfFeatures, CrfLabelSequence
from pydantic import BaseModel

In [11]:
class FeaturesScore(BaseModel):
    features: list[str]
    score: float

features_score = list(map(FeaturesScore.model_validate, json.load(open('features.json'))))

In [12]:
CrfFeatures.no_use_keys = [
]

In [13]:
# 前処理した略語データを読み込む
data = list(map(Abbreviation.model_validate, json.load(open("./data/abbreviation.json", "r"))))
X = [*map(CrfFeatures.from_abbreviation, data)]
y = list(map(CrfLabelSequence.from_abbreviation, data))
# ラベルの番号振り
CrfFeatures.to_int_idx = {}
CrfFeatures.assign_feature_idx(X)
print()




In [14]:
model = SequentialClassifier(
    RandomForestClassifier(
        random_state=616,
        n_jobs=-1,
        criterion="gini",
        n_estimators=893,
        max_depth=92,
        max_features=0.08707415761760165,
        max_leaf_nodes=795,
        min_samples_split=5,
    )
)

In [15]:
kf = KFold(n_splits=5, shuffle=True, random_state=616)

In [16]:
scores: list[float] = []
for train_idx, test_idx in kf.split(X):
    X_train = [X[i] for i in train_idx]
    y_train = [y[i] for i in train_idx]
    X_test = [X[i] for i in test_idx]
    y_test = [y[i] for i in test_idx]
    model.fit(X_train, y_train)
    case_score_list = model.score(X_test, y_test)
    score = np.mean(case_score_list)
    print("score:", score)
    scores.append(score)

score_mean = np.mean(scores)

case size:  1013
feature size:  1695
case size:  1013
feature size:  1695


KeyboardInterrupt: 

In [None]:
print(score_mean)

fs = FeaturesScore(
    features=[f for f in CrfFeatures.model_json_schema()['required'] if f not in CrfFeatures.no_use_keys],
    score=score_mean,
)

features_score.append(fs)

3.6699371719657394


In [None]:
# json.dump([fs.model_dump() for fs in features_score], open('features.json', 'w'), ensure_ascii=False, indent=4)