## datacheck

In [3]:
import os
import chardet
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

def detect_encoding(file_path):
    try:
        with open(file_path, 'rb') as f:
            result = chardet.detect(f.read(1024))
        return [file_path, result.get('encoding')]
    except Exception as e:
        print(f"{file_path} generated an exception: {e}")
        return None

def check_file_encodings(folder_path):
    encoding_data = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for root, dirs, files in os.walk(folder_path):
            for filename in files:
                if filename.endswith(".txt"):
                    file_path = os.path.join(root, filename)
                    futures.append(executor.submit(detect_encoding, file_path))
        for future in tqdm(futures, desc="Checking encodings"):
            result = future.result()
            if result:
                encoding_data.append(result)
    df = pd.DataFrame(encoding_data, columns=['File Name', 'Encoding Type'])
    return df

# Initialize or load existing summary DataFrame
try:
    df_summary = pd.read_csv('txt_encoding.csv')
except FileNotFoundError:
    df_summary = pd.DataFrame(columns=['File Name', 'Encoding Type'])


# Get all directory names under "/app/data/jrdb_txt/"
list_file_name = [name for name in os.listdir("/app/data/jrdb_txt/") if os.path.isdir(os.path.join("/app/data/jrdb_txt/", name))]

for file_name in list_file_name:
    print(f"Processing {file_name}...")
    folder_path = f"/app/data/jrdb_txt/{file_name}"
    df = check_file_encodings(folder_path)
    unique_encodings = df['Encoding Type'].unique()

    new_row = pd.DataFrame({'File Name': [file_name], 'Encoding Type': [unique_encodings]})
    df_summary = pd.concat([df_summary, new_row], ignore_index=True)

    df_summary.to_csv('txt_encoding.csv', index=False)

print("Finished processing all directories.")


Processing BAC...


Checking encodings: 100%|██████████| 2655/2655 [00:09<00:00, 289.83it/s] 


Processing CHA...


Checking encodings: 100%|██████████| 1623/1623 [00:05<00:00, 273.66it/s]


Processing CYB...


Checking encodings: 100%|██████████| 2552/2552 [00:01<00:00, 1820.29it/s]


Processing CZA...


Checking encodings: 100%|██████████| 1/1 [00:00<00:00, 118.28it/s]


Processing JOA...


Checking encodings: 100%|██████████| 2655/2655 [00:08<00:00, 307.48it/s]  


Processing KAB...


Checking encodings: 100%|██████████| 2655/2655 [00:03<00:00, 760.04it/s]


Processing KKA...


Checking encodings: 100%|██████████| 2311/2311 [00:01<00:00, 1564.69it/s]


Processing KYI...


Checking encodings: 100%|██████████| 2655/2655 [00:08<00:00, 310.59it/s]


Processing KZA...


Checking encodings: 100%|██████████| 1/1 [00:00<00:00, 118.25it/s]


Processing MZA...


Checking encodings: 100%|██████████| 1/1 [00:00<00:00, 317.22it/s]


Processing OT...


Checking encodings: 100%|██████████| 2208/2208 [00:02<00:00, 1100.63it/s]


Processing OU...


Checking encodings: 100%|██████████| 2208/2208 [00:01<00:00, 1224.83it/s]


Processing OV...


Checking encodings: 100%|██████████| 2058/2058 [00:02<00:00, 837.42it/s]


Processing OW...


Checking encodings: 100%|██████████| 2562/2562 [00:02<00:00, 1231.16it/s]


Processing OZ...


Checking encodings: 100%|██████████| 2655/2655 [00:02<00:00, 1223.15it/s]


Processing UKC...


Checking encodings: 100%|██████████| 2655/2655 [00:13<00:00, 202.44it/s]


Processing ZED...


Checking encodings: 100%|██████████| 1634/1634 [00:06<00:00, 254.37it/s]


Processing ZKB...


Checking encodings: 100%|██████████| 1973/1973 [00:08<00:00, 229.00it/s]

Finished processing all directories.





In [4]:
df_summary


Unnamed: 0,File Name,Encoding Type
0,BAC,['SHIFT_JIS']
1,BAC,[SHIFT_JIS]
2,CHA,"[MacRoman, Johab, Windows-1252]"
3,CYB,"[ascii, MacRoman]"
4,CZA,[SHIFT_JIS]
5,JOA,[SHIFT_JIS]
6,KAB,"[Windows-1252, None, MacRoman, IBM866]"
7,KKA,[ascii]
8,KYI,"[CP932, SHIFT_JIS]"
9,KZA,[SHIFT_JIS]


## type_check_single

In [82]:
import os
from pathlib import Path
import chardet  # エンコーディングを自動判別するためのライブラリ

def check_encoding(base_directory):
    base_directory_path = Path(base_directory)
    years = [f.name for f in base_directory_path.iterdir() if f.is_dir()]

    for year in years:
        directory_path = base_directory_path / year
        for filename in os.listdir(directory_path):
            if filename.endswith(".txt"):
                file_path = directory_path / filename
                with open(file_path, 'rb') as f:
                    result = chardet.detect(f.read())
                if result['encoding'] != 'SHIFT-JIS':
                    print(f"File {file_path} is not encoded in SHIFT-JIS. Detected encoding: {result['encoding']}")


## Encoding

### BAC

In [83]:
# df_bac = pd.read_csv('/app/data/jrdb_csv/BAC/BAC_1999.csv')
# df_bac


In [84]:
# # df_bac[df_bac['回数']=='40111000']
# # df_bac[df_bac['場コード']=='08']
# print(df_bac.columns)
# # print(df_bac.dtypes)
# print(df_bac['グレード'].unique())
# # df_bac.loc[1, 'レース名']

# # df_bac[['３着賞金']]


### CZA

In [86]:
# ディレクトリパスを指定して関数を呼び出す
check_encoding("/app/data/jrdb_txt/CZA")


File /app/data/jrdb_txt/CZA/2023/CZA231014.txt is not encoded in SHIFT-JIS. Detected encoding: CP932


In [87]:
df_cza = pd.read_csv('/app/data/jrdb_csv/CZA/BAC_2023.csv')
df_cza


Unnamed: 0,調教師コード,登録抹消フラグ,登録抹消年月日,調教師名,調教師カナ,調教師名略称,所属コード,所属地域名,生年月日,初免許年,調教師コメント,コメント入力年月日,本年リーディング,本年平地成績,本年障害成績,本年特別勝数,本年重賞勝数,昨年リーディング,昨年平地成績,昨年障害成績,昨年特別勝数,昨年重賞勝数,通算平地成績,通算障害成績,データ年月日,予備
0,10014,1,19930228,中村広,ナカムラ　ヒロシ,中村広,1,美浦,19150510.0,1938.0,,,,,,,,,,,,,929 930 853 5171,70 64 41 213,20231014,
1,10017,1,19950228,元石正雄,モトイシ　マサオ,元石正,1,美浦,19180315.0,1938.0,,,,,,,,,,,,,337 345 402 3712,14 12 15 152,20231014,
2,10022,1,19920229,武平三,タケ　ヘイゾウ,武平三,2,栗東,19140520.0,1938.0,,,,,,,,,,,,,587 575 614 4268,78 82 64 375,20231014,
3,10023,1,19920229,田之上勲,タノウエ　イサオ,田之上,2,栗東,19140630.0,1938.0,,,,,,,,,,,,,580 659 659 5345,32 32 34 161,20231014,
4,10031,1,19950228,稗田敏男,ヒエダ　トシオ,稗田敏,1,美浦,19181015.0,1948.0,,,,,,,,,,,,,858 757 718 5474,46 46 46 163,20231014,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1303,70227,0,20201203,Ａ．マルチア,Ａ．マルチアリス,マルチ,3,仏国,19830523.0,2020.0,,,,,,,,,,,,,0 0 0 1,0 0 0 0,20231014,
1304,70228,0,20221202,Ｇ．ビエトリ,Ｇ．ビエトリーニ,ビエト,3,仏国,19680320.0,2022.0,,,,,,,,,0 0 0 1,0 0 0 0,0.0,0.0,0 0 0 2,0 0 0 0,20231014,
1305,70229,0,20221202,Ｊ．ハリント,Ｊ．ハリントン,ハリン,3,愛国,19470225.0,2022.0,,,,,,,,,0 0 0 1,0 0 0 0,0.0,0.0,0 0 0 1,0 0 0 0,20231014,
1306,70230,0,20221202,Ｆ．シャペ,Ｆ．シャペ,シャペ,3,仏国,19631126.0,2022.0,,,,,,,,,0 0 0 1,0 0 0 0,0.0,0.0,0 0 0 1,0 0 0 0,20231014,
