## datacheck

In [3]:
import os
import chardet
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

def detect_encoding(file_path):
    try:
        with open(file_path, 'rb') as f:
            result = chardet.detect(f.read(1024))
        return [file_path, result.get('encoding')]
    except Exception as e:
        print(f"{file_path} generated an exception: {e}")
        return None

def check_file_encodings(folder_path):
    encoding_data = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for root, dirs, files in os.walk(folder_path):
            for filename in files:
                if filename.endswith(".txt"):
                    file_path = os.path.join(root, filename)
                    futures.append(executor.submit(detect_encoding, file_path))
        for future in tqdm(futures, desc="Checking encodings"):
            result = future.result()
            if result:
                encoding_data.append(result)
    df = pd.DataFrame(encoding_data, columns=['File Name', 'Encoding Type'])
    return df

# Initialize or load existing summary DataFrame
try:
    df_summary = pd.read_csv('txt_encoding.csv')
except FileNotFoundError:
    df_summary = pd.DataFrame(columns=['File Name', 'Encoding Type'])


# Get all directory names under "/app/data/jrdb_txt/"
list_file_name = [name for name in os.listdir("/app/data/jrdb_txt/") if os.path.isdir(os.path.join("/app/data/jrdb_txt/", name))]

for file_name in list_file_name:
    print(f"Processing {file_name}...")
    folder_path = f"/app/data/jrdb_txt/{file_name}"
    df = check_file_encodings(folder_path)
    unique_encodings = df['Encoding Type'].unique()

    new_row = pd.DataFrame({'File Name': [file_name], 'Encoding Type': [unique_encodings]})
    df_summary = pd.concat([df_summary, new_row], ignore_index=True)

    df_summary.to_csv('txt_encoding.csv', index=False)

print("Finished processing all directories.")


Processing BAC...


Checking encodings: 100%|██████████| 2655/2655 [00:09<00:00, 289.83it/s] 


Processing CHA...


Checking encodings: 100%|██████████| 1623/1623 [00:05<00:00, 273.66it/s]


Processing CYB...


Checking encodings: 100%|██████████| 2552/2552 [00:01<00:00, 1820.29it/s]


Processing CZA...


Checking encodings: 100%|██████████| 1/1 [00:00<00:00, 118.28it/s]


Processing JOA...


Checking encodings: 100%|██████████| 2655/2655 [00:08<00:00, 307.48it/s]  


Processing KAB...


Checking encodings: 100%|██████████| 2655/2655 [00:03<00:00, 760.04it/s]


Processing KKA...


Checking encodings: 100%|██████████| 2311/2311 [00:01<00:00, 1564.69it/s]


Processing KYI...


Checking encodings: 100%|██████████| 2655/2655 [00:08<00:00, 310.59it/s]


Processing KZA...


Checking encodings: 100%|██████████| 1/1 [00:00<00:00, 118.25it/s]


Processing MZA...


Checking encodings: 100%|██████████| 1/1 [00:00<00:00, 317.22it/s]


Processing OT...


Checking encodings: 100%|██████████| 2208/2208 [00:02<00:00, 1100.63it/s]


Processing OU...


Checking encodings: 100%|██████████| 2208/2208 [00:01<00:00, 1224.83it/s]


Processing OV...


Checking encodings: 100%|██████████| 2058/2058 [00:02<00:00, 837.42it/s]


Processing OW...


Checking encodings: 100%|██████████| 2562/2562 [00:02<00:00, 1231.16it/s]


Processing OZ...


Checking encodings: 100%|██████████| 2655/2655 [00:02<00:00, 1223.15it/s]


Processing UKC...


Checking encodings: 100%|██████████| 2655/2655 [00:13<00:00, 202.44it/s]


Processing ZED...


Checking encodings: 100%|██████████| 1634/1634 [00:06<00:00, 254.37it/s]


Processing ZKB...


Checking encodings: 100%|██████████| 1973/1973 [00:08<00:00, 229.00it/s]

Finished processing all directories.





In [4]:
df_summary


Unnamed: 0,File Name,Encoding Type
0,BAC,['SHIFT_JIS']
1,BAC,[SHIFT_JIS]
2,CHA,"[MacRoman, Johab, Windows-1252]"
3,CYB,"[ascii, MacRoman]"
4,CZA,[SHIFT_JIS]
5,JOA,[SHIFT_JIS]
6,KAB,"[Windows-1252, None, MacRoman, IBM866]"
7,KKA,[ascii]
8,KYI,"[CP932, SHIFT_JIS]"
9,KZA,[SHIFT_JIS]
