### ロジスティック回帰

In [1]:
import pandas as pd

In [2]:
def preprocessing(results):
    df = results.copy()
    
    # 着順に数字以外の文字列が含まれているものを取り除く
    df = df[~(df['着順'].astype(str).str.contains('\D'))]
    df['着順'] = df['着順'].astype(int)
    
    # 性齢を性と年齢に分ける
    df['性'] = df['性齢'].map(lambda x: str(x)[0])
    df['年齢'] = df['性齢'].map(lambda x: str(x)[1:]).astype(int)
    
    # 馬体重を体重と体重変化に分ける
    df['体重'] = df['馬体重'].str.split('(', expand=True)[0].astype(int)
    df['体重変化'] = df['馬体重'].str.split('(', expand=True)[1].str[:-1].astype(int)
    
    # 単勝をfloatに変換
    df['単勝'] = df['単勝'].astype(float)
    
    # 不要な列を削除
    df.drop(['タイム', '着差', '調教師', '性齢', '馬体重'], axis=1, inplace=True)
    
    return df

In [3]:
# 保存したpickleファイルからデータ取得
results = pd.read_pickle('results.pickle')

In [4]:
df = preprocessing(results)

In [5]:
# 4着以下をすべて4にする
clip_rank = lambda x: x if x < 4 else 4
df['rank'] = df['着順'].map(clip_rank)

In [6]:
# 不要な列を削除
df.drop(['着順', '馬名'], axis=1, inplace=True)

In [7]:
# カテゴリ変数をダミー変数化
df_d = pd.get_dummies(df)

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# 訓練データとテストデータに分ける
X = df_d.drop(['rank'], axis=1)
y = df_d['rank']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=0)

In [10]:
from imblearn.under_sampling import RandomUnderSampler

In [11]:
# アンダーサンプリング
rank_1 = y_train.value_counts()[1]
rank_2 = y_train.value_counts()[2]
rank_3 = y_train.value_counts()[3]

rus = RandomUnderSampler(sampling_strategy={1: rank_1, 2: rank_2, 3: rank_3, 4: rank_1}, random_state=71)
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)

In [12]:
from sklearn.linear_model import LogisticRegression

In [13]:
# 訓練
model = LogisticRegression()
model.fit(X_train_rus, y_train_rus)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [14]:
# スコアを表示
print(model.score(X_train, y_train), model.score(X_test, y_test))

0.564004608574374 0.5614742501414828


In [15]:
# 予測結果を確認
y_pred = model.predict(X_test)
pred_df = pd.DataFrame({'pred': y_pred, 'actual': y_test})
pred_df[pred_df['pred'] == 1]['actual'].value_counts()

4    1754
1     684
2     612
3     474
Name: actual, dtype: int64

In [16]:
# 回帰係数の確認
coefs = pd.Series(model.coef_[0], index=X.columns).sort_values()
coefs[['枠番', '馬番', '斤量', '単勝', '人気', '年齢', '体重', '体重変化']]

枠番     -0.003471
馬番      0.002468
斤量      0.004374
単勝     -0.008760
人気     -0.104455
年齢     -0.021528
体重      0.001070
体重変化   -0.000820
dtype: float64