In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

from glob import glob
import re
import os


In [2]:
# 小数点以下n桁表示
pd.options.display.float_format = '{:.2f}'.format

In [3]:
csv_files = glob('../csv/EXA_FIRST_マイジャグラーV_*.csv')
len(csv_files), csv_files[0]

(327, '../csv\\EXA_FIRST_マイジャグラーV_2024-05-01.csv')

In [4]:
def extract_information(csv_file):
    pattern = r"(.+)_(.+)_(\d{4}-\d{2}-\d{2}).csv"

    file_name = os.path.basename(csv_file)
    # print(file_name)
    match = re.match(pattern, file_name)

    if match:
        hall_name, machine_name, date_str = match.groups()
        return hall_name, machine_name, date_str
    else:
        print(f"ファイル名の形式が一致しません {file_name}")
        print(f"ホール名: {hall_name}")
        print(f"機種名: {machine_name}")
        print(f"日付: {date_str}")

In [5]:
def calc_grape_rate(game_count, bb_count, rb_count, coins):
    bb_coins = 240
    rb_coins = 96
    cherry_coins = 2
    cherry_rate = 38.1
    replay_coins = 3
    replay_rate = 7.3
    grape_coins = 8

    in_coins = game_count * 3

    out_with_cherry = (
        bb_coins * bb_count
        + rb_coins * rb_count
        + (cherry_coins * game_count / cherry_rate)
        + (replay_coins * game_count / replay_rate)
    )
    without_cherry_rate = 0.667
    out_without_cherry = (
        bb_coins * bb_count
        + rb_coins * rb_count
        + (cherry_coins * game_count / cherry_rate * without_cherry_rate)
        + (replay_coins * game_count / replay_rate)
    )

    grape_rate_with_cherry = (
        grape_coins * game_count / (coins + in_coins - out_with_cherry)
    )
    grape_rate_without_cherry = (
        grape_coins * game_count / (coins + in_coins - out_without_cherry)
    )

    # print("フリー打ち : ", grape_rate_without_cherry)
    # print("チェリー狙い : ", grape_rate_with_cherry)
    
    return grape_rate_with_cherry, grape_rate_without_cherry


def calc_total_coins(game_count, bb_count, rb_count, grape_rate=5.9):

    BB_COINS = 240
    RB_COINS = 96
    CHERRY_COINS = 2
    CHERRY_RATE = 38.1
    REPLAY_COINS = 3
    REPLAY_RATE = 7.3
    GRAPE_COINS = 8

    in_coins = game_count * 3

    out_coins = sum([
        BB_COINS * bb_count,
        RB_COINS * rb_count,
        CHERRY_COINS * game_count / CHERRY_RATE,
        REPLAY_COINS * game_count / REPLAY_RATE,
        GRAPE_COINS * game_count / grape_rate,
        ])
    total_coins = out_coins - in_coins
    # print(total_coins)

    return total_coins

In [6]:
dataframes = []
for csv_file in csv_files:
    hall_name, machine_name, date_str = extract_information(csv_file)
    df = pd.read_csv(csv_file, encoding="utf-8-sig")
    df["date"] = date_str
    df["hall_name"] = hall_name
    df["machine_name"] = machine_name
    dataframes.append(df)
    
if dataframes:
    df = pd.concat(dataframes, ignore_index=True)
    df.to_csv('../analysis_data.csv', index=False)

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26487 entries, 0 to 26486
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   台番号           26487 non-null  int64  
 1   G数            26487 non-null  object 
 2   差枚            26487 non-null  object 
 3   BB            26487 non-null  int64  
 4   RB            26406 non-null  float64
 5   合成確率          26487 non-null  object 
 6   BB確率          26487 non-null  object 
 7   RB確率          26406 non-null  object 
 8   date          26487 non-null  object 
 9   hall_name     26487 non-null  object 
 10  machine_name  26487 non-null  object 
dtypes: float64(1), int64(2), object(8)
memory usage: 2.2+ MB


In [7]:
df["G数"] = (
    df["G数"]
    .astype(str)  # 文字列型に変換
    .str.replace(",", "", regex=True)  # カンマを削除
    .str.extract(r"(\d+)")  # 数字部分のみ抽出
    .dropna()  # NaNを削除
    .astype(int)  # 整数型に変換
)
df["差枚"] = (
    df["差枚"]
    .astype(str)  # 文字列型に変換
    .str.replace(",", "", regex=True)  # カンマを削除
    .str.extract(r"(\d+)")  # 数字部分のみ抽出
    .dropna()  # NaNを削除
    .astype(int)  # 整数型に変換
)
df["RB"] = (
    df["RB"]
    .astype(str)  # 文字列型に変換
    .str.replace(",", "", regex=True)  # カンマを削除
    .str.extract(r"(\d+)")  # 数字部分のみ抽出
    .dropna()  # NaNを削除
    .astype(int)  # 整数型に変換
)
df["合成確率"] = (
    df["合成確率"]
    .astype(str)  # 文字列型に変換
    .str.replace("1/", "", regex=True)  # カンマを削除
    .str.extract(r"(\d+)")  # 数字部分のみ抽出
    .dropna()  # NaNを削除
    .astype(float)  # 整数型に変換
)
df["BB確率"] = (
    df["BB確率"]
    .astype(str)  # 文字列型に変換
    .str.replace("1/", "", regex=True)  # カンマを削除
    .str.extract(r"(\d+)")  # 数字部分のみ抽出
    .dropna()  # NaNを削除
    .astype(float)  # 整数型に変換
)
df["RB確率"] = (
    df["RB確率"]
    .astype(str)  # 文字列型に変換
    .str.replace("1/", "", regex=True)  # カンマを削除
    .str.extract(r"(\d+)")  # 数字部分のみ抽出
    .dropna()  # NaNを削除
    .astype(float)  # 整数型に変換
)

df["date"] = pd.to_datetime(df["date"], errors="coerce")
# df["month"] = df["date"].dt.month.astype(int)
df['weekday'] = df['date'].dt.weekday.astype(int)
df['day'] = df['date'].dt.day.astype(int)
# df["last_digit_of_day"] = df["date"].dt.day.astype(str).str[-1].astype(int)
# df["g_cherry"], df["g_free"] = calc_grape_rate(df["G数"], df["BB"], df["RB"], df["差枚"])
# df["total_coins"] = calc_total_coins(df["G数"], df["BB"], df["RB"])

In [8]:
df.head()

Unnamed: 0,台番号,G数,差枚,BB,RB,合成確率,BB確率,RB確率,date,hall_name,machine_name,weekday,day
0,1001,8472,2268,37,35.0,117.0,229.0,242.0,2024-05-01,EXA_FIRST,マイジャグラーV,2,1
1,1002,5860,768,22,10.0,183.0,266.0,586.0,2024-05-01,EXA_FIRST,マイジャグラーV,2,1
2,1003,8275,2718,34,40.0,111.0,243.0,206.0,2024-05-01,EXA_FIRST,マイジャグラーV,2,1
3,1004,6090,759,22,17.0,156.0,276.0,358.0,2024-05-01,EXA_FIRST,マイジャグラーV,2,1
4,1005,6024,303,23,19.0,143.0,261.0,317.0,2024-05-01,EXA_FIRST,マイジャグラーV,2,1


In [36]:
# 対象の列
selected_features = ['G数', '差枚', '合成確率', 'BB確率', 'RB確率']
selected_features = ['G数', '差枚']
target_column = "RB確率"

# 元データの準備
df_updated = df
df_selected = df_updated[["date", "台番号"] + selected_features + [target_column]].copy()
df_selected["date"] = pd.to_datetime(df_selected["date"])
df_selected = df_selected.sort_values(["台番号", "date"])

# 過去n日分のラグ特徴を作成
n = 3
for lag in range(1, n+1):
    for col in selected_features:
        df_selected[f"{col}_lag{lag}"] = df_selected.groupby("台番号")[col].shift(lag)

# 目的変数（翌日のRB確率）
df_selected["RB確率_next"] = df_selected.groupby("台番号")["RB確率"].shift(-1)

# 欠損値を除外
df_ml = df_selected.dropna().reset_index(drop=True)

# 確認（必要なら表示）
df_ml.head()

Unnamed: 0,date,台番号,G数,差枚,RB確率,G数_lag1,差枚_lag1,G数_lag2,差枚_lag2,G数_lag3,差枚_lag3,RB確率_next
0,2024-05-04,1001,8802,547,366.0,5811.0,150.0,8280.0,926.0,8472.0,2268.0,526.0
1,2024-05-05,1001,7897,1371,526.0,8802.0,547.0,5811.0,150.0,8280.0,926.0,328.0
2,2024-05-06,1001,5590,2374,328.0,7897.0,1371.0,8802.0,547.0,5811.0,150.0,416.0
3,2024-05-07,1001,7504,1985,416.0,5590.0,2374.0,7897.0,1371.0,8802.0,547.0,1676.0
4,2024-05-08,1001,3352,1088,1676.0,7504.0,1985.0,5590.0,2374.0,7897.0,1371.0,331.0


In [37]:
# 入力用特徴量のカラムを作成（lag1〜lag5）
feature_cols = []
for lag in range(1, n+1):
    for col in selected_features:
        feature_cols.append(f"{col}_lag{lag}")

# 説明変数と目的変数を分離
X = df_ml[feature_cols]
# y = df_ml["RB確率_next"]
y = df_ml["RB確率_next"]

In [38]:
# データ分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoostモデルで学習
model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)
model.fit(X_train, y_train)

# 予測と評価
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

rmse

np.float64(251.9574569667963)

In [39]:
# セッションリセットにより df を再定義
df = df.copy()

# 予測結果と実測値をまとめたデータフレームを作成
result_df = pd.DataFrame({
    "RB確率_実測値": y_test.values,
    "RB確率_予測値": y_pred
}, index=y_test.index)

# 日付や台番号を追加（可能な場合）
if "date" in df.columns and "台番号" in df.columns:
    result_df = result_df.join(df.loc[y_test.index, ["date", "台番号"]])

# 結果の表示（上位10件）
result_df.reset_index(drop=True).head(10)


Unnamed: 0,RB確率_実測値,RB確率_予測値,date,台番号
0,381.0,398.98,2024-06-23,1018
1,297.0,395.44,2024-08-07,1084
2,404.0,490.49,2024-12-28,1026
3,329.0,309.12,2024-06-23,1082
4,450.0,454.7,2024-08-14,1085
5,423.0,516.57,2025-03-11,1022
6,356.0,325.0,2024-06-01,1097
7,453.0,365.27,2025-01-21,1081
8,400.0,396.43,2024-12-14,1020
9,594.0,445.74,2024-06-06,1089
