## datacheck

In [3]:
import os
import chardet
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

def detect_encoding(file_path):
    try:
        with open(file_path, 'rb') as f:
            result = chardet.detect(f.read(1024))
        return [file_path, result.get('encoding')]
    except Exception as e:
        print(f"{file_path} generated an exception: {e}")
        return None

def check_file_encodings(folder_path):
    encoding_data = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for root, dirs, files in os.walk(folder_path):
            for filename in files:
                if filename.endswith(".txt"):
                    file_path = os.path.join(root, filename)
                    futures.append(executor.submit(detect_encoding, file_path))
        for future in tqdm(futures, desc="Checking encodings"):
            result = future.result()
            if result:
                encoding_data.append(result)
    df = pd.DataFrame(encoding_data, columns=['File Name', 'Encoding Type'])
    return df

# Initialize or load existing summary DataFrame
try:
    df_summary = pd.read_csv('txt_encoding.csv')
except FileNotFoundError:
    df_summary = pd.DataFrame(columns=['File Name', 'Encoding Type'])


# Get all directory names under "/app/data/jrdb_txt/"
list_file_name = [name for name in os.listdir("/app/data/jrdb_txt/") if os.path.isdir(os.path.join("/app/data/jrdb_txt/", name))]

for file_name in list_file_name:
    print(f"Processing {file_name}...")
    folder_path = f"/app/data/jrdb_txt/{file_name}"
    df = check_file_encodings(folder_path)
    unique_encodings = df['Encoding Type'].unique()

    new_row = pd.DataFrame({'File Name': [file_name], 'Encoding Type': [unique_encodings]})
    df_summary = pd.concat([df_summary, new_row], ignore_index=True)

    df_summary.to_csv('txt_encoding.csv', index=False)

print("Finished processing all directories.")


Processing BAC...


Checking encodings: 100%|██████████| 2655/2655 [00:09<00:00, 289.83it/s] 


Processing CHA...


Checking encodings: 100%|██████████| 1623/1623 [00:05<00:00, 273.66it/s]


Processing CYB...


Checking encodings: 100%|██████████| 2552/2552 [00:01<00:00, 1820.29it/s]


Processing CZA...


Checking encodings: 100%|██████████| 1/1 [00:00<00:00, 118.28it/s]


Processing JOA...


Checking encodings: 100%|██████████| 2655/2655 [00:08<00:00, 307.48it/s]  


Processing KAB...


Checking encodings: 100%|██████████| 2655/2655 [00:03<00:00, 760.04it/s]


Processing KKA...


Checking encodings: 100%|██████████| 2311/2311 [00:01<00:00, 1564.69it/s]


Processing KYI...


Checking encodings: 100%|██████████| 2655/2655 [00:08<00:00, 310.59it/s]


Processing KZA...


Checking encodings: 100%|██████████| 1/1 [00:00<00:00, 118.25it/s]


Processing MZA...


Checking encodings: 100%|██████████| 1/1 [00:00<00:00, 317.22it/s]


Processing OT...


Checking encodings: 100%|██████████| 2208/2208 [00:02<00:00, 1100.63it/s]


Processing OU...


Checking encodings: 100%|██████████| 2208/2208 [00:01<00:00, 1224.83it/s]


Processing OV...


Checking encodings: 100%|██████████| 2058/2058 [00:02<00:00, 837.42it/s]


Processing OW...


Checking encodings: 100%|██████████| 2562/2562 [00:02<00:00, 1231.16it/s]


Processing OZ...


Checking encodings: 100%|██████████| 2655/2655 [00:02<00:00, 1223.15it/s]


Processing UKC...


Checking encodings: 100%|██████████| 2655/2655 [00:13<00:00, 202.44it/s]


Processing ZED...


Checking encodings: 100%|██████████| 1634/1634 [00:06<00:00, 254.37it/s]


Processing ZKB...


Checking encodings: 100%|██████████| 1973/1973 [00:08<00:00, 229.00it/s]

Finished processing all directories.





In [4]:
df_summary


Unnamed: 0,File Name,Encoding Type
0,BAC,['SHIFT_JIS']
1,BAC,[SHIFT_JIS]
2,CHA,"[MacRoman, Johab, Windows-1252]"
3,CYB,"[ascii, MacRoman]"
4,CZA,[SHIFT_JIS]
5,JOA,[SHIFT_JIS]
6,KAB,"[Windows-1252, None, MacRoman, IBM866]"
7,KKA,[ascii]
8,KYI,"[CP932, SHIFT_JIS]"
9,KZA,[SHIFT_JIS]


## type_check_single

In [82]:
import os
from pathlib import Path
import chardet  # エンコーディングを自動判別するためのライブラリ

def check_encoding(base_directory):
    base_directory_path = Path(base_directory)
    years = [f.name for f in base_directory_path.iterdir() if f.is_dir()]

    for year in years:
        directory_path = base_directory_path / year
        for filename in os.listdir(directory_path):
            if filename.endswith(".txt"):
                file_path = directory_path / filename
                with open(file_path, 'rb') as f:
                    result = chardet.detect(f.read())
                if result['encoding'] != 'SHIFT-JIS':
                    print(f"File {file_path} is not encoded in SHIFT-JIS. Detected encoding: {result['encoding']}")


## Encoding

### BAC

In [97]:
File_type = 'BAC'
year = 2022
df_a = pd.read_csv(f'/app/data/jrdb_csv/{File_type}/{File_type}_{year}.csv')
df_a


Unnamed: 0,場コード,年,回,日,Ｒ,年月日,発走時間,距離,芝ダ障害コード,右左,内外,種別,条件,記号,重量,グレード,レース名,回数,頭数,コース,開催区分,レース名短縮,レース名９文字,データ区分,１着賞金,２着賞金,３着賞金,４着賞金,５着賞金,１着算入賞金,２着算入賞金,馬券発売フラグ,WIN5フラグ,予備
0,6,22,1,1,1,20220105,955,1200,2,1,1,12,A3,0,3,,,,16,,1,,３歳未勝利,4,520,210,130,78,52,400,0,11111111,,
1,6,22,1,1,2,20220105,1025,1800,2,1,1,12,A3,20,3,,,,16,,1,,３歳未勝利,4,520,210,130,78,52,400,0,11111111,,
2,6,22,1,1,3,20220105,1055,1800,2,1,1,12,A3,100,3,,,,15,,1,,３歳未勝利,4,520,210,130,78,52,400,0,11111111,,
3,6,22,1,1,4,20220105,1125,1800,2,1,1,12,A1,100,3,,,,11,,1,,３歳新馬,4,600,240,150,90,60,400,0,11111111,,
4,6,22,1,1,5,20220105,1215,2000,1,1,1,12,05,103,3,,,,9,5.0,1,,３歳１勝クラス,4,750,300,190,110,75,500,0,11111111,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3451,9,22,6,9,8,20221228,1355,1800,2,1,1,13,05,0,4,,,,11,,2,,３歳上１勝クラス,4,760,300,190,110,76,0,0,11111111,,
3452,9,22,6,9,9,20221228,1430,1400,2,1,1,13,10,100,4,5.0,春待月賞,,16,,2,春待月賞,春待月賞,4,1500,600,380,230,150,0,0,11111111,,
3453,9,22,6,9,10,20221228,1505,2000,1,1,1,13,10,100,4,5.0,フォーチュンカップ,,14,4.0,2,フォーＣ,フォーチュンカップ,4,1500,600,380,230,150,0,0,11111111,1.0,
3454,9,22,6,9,11,20221228,1545,1800,2,1,1,13,OP,503,2,6.0,ベテルギウスステークス,,13,,2,ベテルＳ,ベテルギウスＳ,4,2300,920,580,350,230,0,0,11111111,3.0,


In [84]:
# # df_bac[df_bac['回数']=='40111000']
# # df_bac[df_bac['場コード']=='08']
# print(df_bac.columns)
# # print(df_bac.dtypes)
# print(df_bac['グレード'].unique())
# # df_bac.loc[1, 'レース名']

# # df_bac[['３着賞金']]


### CZA

In [86]:
# ディレクトリパスを指定して関数を呼び出す
check_encoding("/app/data/jrdb_txt/CZA")


File /app/data/jrdb_txt/CZA/2023/CZA231014.txt is not encoded in SHIFT-JIS. Detected encoding: CP932


In [96]:
File_type = 'CZA'
df_a = pd.read_csv(f'/app/data/jrdb_csv/{File_type}/{File_type}_2023.csv')
df_a


Unnamed: 0,調教師コード,登録抹消フラグ,登録抹消年月日,調教師名,調教師カナ,調教師名略称,所属コード,所属地域名,生年月日,初免許年,調教師コメント,コメント入力年月日,本年リーディング,本年平地成績,本年障害成績,本年特別勝数,本年重賞勝数,昨年リーディング,昨年平地成績,昨年障害成績,昨年特別勝数,昨年重賞勝数,通算平地成績,通算障害成績,データ年月日,予備
0,10014,1,19930228,中村広,ナカムラ　ヒロシ,中村広,1,美浦,19150510.0,1938.0,,,,,,,,,,,,,929 930 853 5171,70 64 41 213,20231014,
1,10017,1,19950228,元石正雄,モトイシ　マサオ,元石正,1,美浦,19180315.0,1938.0,,,,,,,,,,,,,337 345 402 3712,14 12 15 152,20231014,
2,10022,1,19920229,武平三,タケ　ヘイゾウ,武平三,2,栗東,19140520.0,1938.0,,,,,,,,,,,,,587 575 614 4268,78 82 64 375,20231014,
3,10023,1,19920229,田之上勲,タノウエ　イサオ,田之上,2,栗東,19140630.0,1938.0,,,,,,,,,,,,,580 659 659 5345,32 32 34 161,20231014,
4,10031,1,19950228,稗田敏男,ヒエダ　トシオ,稗田敏,1,美浦,19181015.0,1948.0,,,,,,,,,,,,,858 757 718 5474,46 46 46 163,20231014,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1303,70227,0,20201203,Ａ．マルチア,Ａ．マルチアリス,マルチ,3,仏国,19830523.0,2020.0,,,,,,,,,,,,,0 0 0 1,0 0 0 0,20231014,
1304,70228,0,20221202,Ｇ．ビエトリ,Ｇ．ビエトリーニ,ビエト,3,仏国,19680320.0,2022.0,,,,,,,,,0 0 0 1,0 0 0 0,0.0,0.0,0 0 0 2,0 0 0 0,20231014,
1305,70229,0,20221202,Ｊ．ハリント,Ｊ．ハリントン,ハリン,3,愛国,19470225.0,2022.0,,,,,,,,,0 0 0 1,0 0 0 0,0.0,0.0,0 0 0 1,0 0 0 0,20231014,
1306,70230,0,20221202,Ｆ．シャペ,Ｆ．シャペ,シャペ,3,仏国,19631126.0,2022.0,,,,,,,,,0 0 0 1,0 0 0 0,0.0,0.0,0 0 0 1,0 0 0 0,20231014,


### JOA

In [108]:
File_type = 'JOA'
year = 2021
df_a = pd.read_csv(f'/app/data/jrdb_csv/{File_type}/{File_type}_{year}.csv')
df_a

df_a[df_a['日']=='a']
df_a['日'].unique()


array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [105]:
# df_a[df_a['日']==2]


Unnamed: 0,場コード,年,回,日,Ｒ,馬番,血統登録番号,馬名,基準オッズ,基準複勝オッズ,CID調教素点,CID厩舎素点,CID素点,CID,LS指数,LS評価,EM,厩舎ＢＢ印,厩舎ＢＢ◎単勝回収率,厩舎ＢＢ◎連対率,騎手ＢＢ印,騎手ＢＢ◎単勝回収率,騎手ＢＢ◎連対率,予備
342,6,23,1,2,1,1,20104409,ライヴケチャップ,16.6,3.4,-0.2,13.5,4.9,10,0.0,,,,,,,,,
343,6,23,1,2,1,2,20103366,スティルディマーレ,3.1,1.4,19.5,21.2,13.6,19,17.3,,,,,,,,,
344,6,23,1,2,1,3,20105212,シェイクオンイット,136.5,22.3,-11.6,-8.4,-6.6,-2,0.0,,,,,,,,,
345,6,23,1,2,1,4,20100726,シティウォリアー,49.3,8.7,-8.4,-2.0,-3.3,2,0.0,,,,,,,,,
346,6,23,1,2,1,5,20101303,グランフォーブル,9.3,2.4,2.3,-5.1,-1.2,4,10.8,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30345,10,23,3,2,12,10,20110134,ジャスパーロイヤル,6.2,2.2,15.1,10.8,8.8,29,0.0,,,,,,,,,
30346,10,23,3,2,12,11,20104164,アネモス,12.9,3.5,-0.7,8.3,2.2,22,25.0,,,,,,,,,
30347,10,23,3,2,12,12,20110106,アドバンスファラオ,18.0,4.7,-11.2,-12.1,-7.7,12,0.0,,,,,,,,,
30348,10,23,3,2,12,13,18104367,ザモウコダマシイ,56.4,12.6,-11.2,-10.2,-7.2,13,0.0,,,,,,,,,


### KZA

In [109]:
File_type = 'KZA'
year = 2023
df_a = pd.read_csv(f'/app/data/jrdb_csv/{File_type}/{File_type}_{year}.csv')
df_a


Unnamed: 0,騎手コード,登録抹消フラグ,登録抹消年月日,騎手名,騎手カナ,騎手名略称,所属コード,所属地域名,生年月日,初免許年,見習い区分,所属厩舎,騎手コメント,コメント入力年月日,本年リーディング,本年平地成績,本年障害成績,本年特別勝数,本年重賞勝数,昨年リーディング,昨年平地成績,昨年障害成績,昨年特別勝数,昨年重賞勝数,通算平地成績,通算障害成績,データ年月日,予備
0,10007,1,19920229,増沢末夫,マスザワ　スエオ,増沢末,1,,19371020,1957,0,,,,,,,,,,,,,,2016 1719 1457 7588,0 0 0 0,20231014,
1,10011,1,19930228,徳吉一己,トクヨシ　カズミ,徳吉一,1,,19410201,1958,0,,,,,,,,,,,,,,535 536 499 3422,1 0 1 0,20231014,
2,10031,1,19930228,郷原洋行,ゴウハラ　ヒロユキ,郷原洋,1,,19440121,1962,0,,,,,,,,,,,,,,1513 1444 1256 7685,2 0 3 3,20231014,
3,10040,1,19990114,大崎昭一,オオサキ　シヨウイチ,大　崎,1,,19450115,1963,0,,,,,,,,,,,,,,962 898 901 6085,8 4 7 19,20231014,
4,10043,1,19940228,久保敏文,クボ　トシフミ,久保敏,2,,19431121,1963,0,,,,,,,,,,,,,,555 466 529 3065,7 8 10 48,20231014,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1213,70355,0,20230305,Ｂ．ムルザバ,Ｂ．ムルザバエフ,ムルザ,3,独国,19920917,2023,0,,,,,16 13 10 85,0 0 0 0,6.0,2.0,,5 5 3 20,0 0 0 0,2.0,1.0,21 18 13 105,0 0 0 0,20231014,
1214,70356,0,20230829,Ｌ．モリス,Ｌ．モリス,モリス,3,英国,19881020,2023,0,,,,,0 0 0 9,0 0 0 0,0.0,0.0,,,,,,0 0 0 9,0 0 0 0,20231014,
1215,70357,0,20230829,Ｍ．ヴェロン,Ｍ．ヴェロン,ヴェロ,3,仏国,19990208,2023,0,,,,,1 0 0 5,0 0 0 0,1.0,0.0,,,,,,1 0 0 5,0 0 0 0,20231014,
1216,70358,0,20230829,Ｒ．キング,Ｒ．キング,キング,3,英国,19900731,2023,0,,,,,1 1 0 4,0 0 0 0,1.0,0.0,,,,,,1 1 0 4,0 0 0 0,20231014,
