# My Cookbook 01

1. 产生一个指定大小的文件；
2. 产生随机数；
3. 产生一个指定大小的随机矩阵；
4. 产生一个文件的hash值；
5. 对一个矩阵产生hash值；
6. 转换一个csv到parquet格式；
7. 产生伪记录；
8. 测试一个复杂对象的占据内存大小；
9. 测试 GCS 的上传速度；
10. 测试 GCS 的下载速度。

In [1]:
import hashlib, csv, pyarrow, os, time
import numpy as np
import pandas as pd
import dask.dataframe as dd
from pathlib import Path
from faker import Faker
from tqdm import tqdm
from pympler import asizeof
from google.cloud import storage


In [5]:
def generate_testfile(path: str, size_mb: int = 500):
    """Create a file of ``size_mb`` MiB filled with random bytes.

    The payload is written in 1 MiB chunks so that peak memory stays
    constant instead of holding the whole file (500 MiB by default) in a
    single bytes object.

    Args:
        path: Destination file. An existing file is overwritten.
        size_mb: File size in MiB (1024 * 1024 bytes each). Defaults to 500.

    Raises:
        ValueError: If ``size_mb`` is negative.
    """
    if size_mb < 0:
        raise ValueError(f"size_mb must be non-negative, got {size_mb}")

    chunk_bytes = 1024 * 1024
    print(f"Generating {size_mb}MB test file...")
    with open(path, "wb") as f:
        for _ in range(size_mb):
            f.write(os.urandom(chunk_bytes))
    print("File created.")

In [3]:
# Draw a single random number
def generate_random(seed: int = None):
    """Return one random sample as a length-1 NumPy array.

    Args:
        seed: Seed passed to ``np.random.default_rng``. With ``None`` the
            generator is seeded unpredictably, so each call differs.

    Returns:
        A 1-element array produced by ``Generator.random``.
    """
    return np.random.default_rng(seed).random(size=1)

# Demo: an unseeded draw differs between runs, while a fixed seed always
# reproduces the same value (seed 42 is drawn twice on purpose).
unseeded_sample = generate_random()
print(f"seed is None: {unseeded_sample}")
first_seed_42 = generate_random(42)
print(f"Seed is 42: {first_seed_42}")
seed_43_sample = generate_random(43)
print(f"seed is 43: {seed_43_sample}")
second_seed_42 = generate_random(42)
print(f"Seed is 42: {second_seed_42}")
# Report the element type of the returned array.
sample_dtype = generate_random().dtype
print(f"data type is: {sample_dtype}")

seed is None: [0.12069404]
Seed is 42: [0.77395605]
seed is 43: [0.65229926]
Seed is 42: [0.77395605]
data type is: float64


In [None]:
# Generate a random matrix of a given size
def matrix_generate(m: int, n: int, seed: int = 7, dtype=np.float64) -> np.ndarray:
    """Return an ``m`` x ``n`` matrix of random values from a seeded generator.

    Args:
        m: Number of rows.
        n: Number of columns.
        seed: Seed for ``np.random.default_rng``; the same seed, shape and
            dtype reproduce the same matrix. Defaults to 7.
        dtype: Element type forwarded to ``Generator.random``.
            Defaults to ``np.float64``.

    Returns:
        The generated ``(m, n)`` array.
    """
    shape = (m, n)
    generator = np.random.default_rng(seed)
    return generator.random(size=shape, dtype=dtype)

In [15]:
# Compute the hash of a file
def get_file_hash(filename: "str | Path", chunk_size: int = 8192) -> str:
    """Return the SHA-256 hex digest of a file, read in fixed-size chunks.

    Args:
        filename: Location of the file, either a ``Path`` or a plain
            string path.
        chunk_size: Number of bytes read per iteration. Defaults to 8192.

    Returns:
        The file's SHA-256 digest as a hexadecimal string.

    Raises:
        ValueError: If ``chunk_size`` is not positive. A size of 0 would end
            the read loop immediately and yield the digest of empty input.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    sha256_hash = hashlib.sha256()
    # Path(...) accepts both str and Path, so either form of path works.
    with Path(filename).open("rb") as f:
        # iter() with a sentinel stops at the first empty read (end of file).
        for byte_block in iter(lambda: f.read(chunk_size), b""):
            sha256_hash.update(byte_block)

    return sha256_hash.hexdigest()



In [16]:
# Compute the hash of a matrix
def get_matrix_hash(matrix: np.ndarray) -> str:
    """Return the SHA-256 hex digest of an array's raw element bytes.

    Only the data buffer is hashed; shape and dtype are not mixed in, so two
    arrays with identical bytes but different shapes produce the same digest.

    Args:
        matrix: Array to hash. Non-contiguous inputs (transposes, strided
            slices) are hashed in C (row-major) element order.

    Returns:
        The digest as a hexadecimal string.
    """
    # hashlib needs a C-contiguous buffer; ascontiguousarray returns the
    # input unchanged when it already is one and copies otherwise.
    contiguous = np.ascontiguousarray(matrix)
    sha256_hash = hashlib.sha256()
    sha256_hash.update(contiguous.data)
    return sha256_hash.hexdigest()

In [34]:
def csv_to_parquet(input_file: str, output_file: str):
    """Convert a CSV file to Parquet with dask.

    The CSV is read with a block size of 32MB and written through the
    pyarrow engine with snappy compression, without the index and with a
    metadata file.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Destination path handed to ``to_parquet``.
    """
    parquet_options = {
        "engine": "pyarrow",
        "compression": "snappy",
        "write_index": False,
        "write_metadata_file": True,
    }
    csv_frame = dd.read_csv(input_file, blocksize="32MB")
    csv_frame.to_parquet(output_file, **parquet_options)

In [None]:
# Generate fake records (in-memory variant)
def fake_records_generate_EXP(num_records: int = 1_000_000):
    """Build a list of fake person records using the ``zh_CN`` Faker locale.

    Args:
        num_records: How many records to create. Defaults to 1_000_000.

    Returns:
        A list of dicts with the keys ``id``, ``name``, ``address``,
        ``email``, ``phone_number``, ``dob``, ``job_title``, ``company`` and
        ``text_sample``.
    """
    fake = Faker("zh_CN")

    def build_record():
        # Field order matches the order in which the Faker providers are called.
        return {
            "id": fake.uuid4(),
            "name": fake.name(),
            "address": fake.address(),
            "email": fake.email(),
            "phone_number": fake.phone_number(),
            "dob": fake.date_of_birth(minimum_age=18, maximum_age=90),
            "job_title": fake.job(),
            "company": fake.company(),
            "text_sample": fake.paragraph(nb_sentences=2),
        }

    # tqdm wraps the range so a progress bar is shown while generating.
    progress = tqdm(range(num_records), desc="生成记录：", unit="条")
    return [build_record() for _ in progress]

In [None]:
# Generate fake records and stream them to a CSV file
def fake_records_generate(outfile: str, num_records: int = 100_000):
    """Write ``num_records`` fake person records to a CSV file.

    Records are created with the ``zh_CN`` Faker locale and written one row
    at a time, so the full data set is never held in memory. The function
    returns nothing; the result is the file on disk.

    Errors are reported with ``print`` rather than raised, so a failure can
    leave a partially written file behind.

    Args:
        outfile: Path of the CSV file to create (UTF-8, overwritten if it
            already exists).
        num_records: Number of data rows to write. Defaults to 100_000.
    """
    fake = Faker("zh_CN")

    # Column order of the CSV header; keys of every record must match it.
    fieldnames = [
        "id",
        "name",
        "address",
        "email",
        "phone_number",
        "dob",
        "job_title",
        "company",
        "text_sample",
    ]

    try:
        # newline='' lets the csv module control line endings itself.
        with open(outfile, 'w', newline='', encoding='utf-8') as f:
            # DictWriter maps each record dict onto the header columns.
            writer = csv.DictWriter(f, fieldnames=fieldnames)

            writer.writeheader()

            for _ in tqdm(range(num_records), desc='生成并写入csv文件', unit='条'):
                record = {
                    "id": fake.uuid4(),
                    "name": fake.name(),
                    "address": fake.address(),
                    "email": fake.email(),
                    "phone_number": fake.phone_number(),
                    "dob": fake.date_of_birth(minimum_age=18, maximum_age=90),
                    "job_title": fake.job(),
                    # Was fake.ompany(): the AttributeError was swallowed by
                    # the generic handler below and no rows were written.
                    "company": fake.company(),
                    "text_sample": fake.paragraph(nb_sentences=2),
                }

                writer.writerow(record)

    except IOError as e:
        print(f"写入文件时发生错误：{e}")
    except Exception as e:
        # NOTE(review): this best-effort handler also hides programming
        # errors; consider re-raising once callers can cope with it.
        print(f"发生未知错误：{e}")

In [19]:
# Measure how much memory a complex object occupies
def get_obj_memory_usage(obj):
    """Return the size of ``obj`` as reported by ``asizeof.asizeof``.

    Args:
        obj: Any Python object to measure.

    Returns:
        The value returned by ``asizeof.asizeof(obj)``.
        NOTE(review): presumably a byte count (the notebook divides it by
        1024**2 to print MB) -- confirm against the pympler documentation.
    """
    measured_size = asizeof.asizeof(obj)
    return measured_size

In [12]:
# Measure GCS upload speed
def uploadtest_to_gcs(bucket_name: str, local_file: str, blob_file:str):
    """Upload a local file to Google Cloud Storage and print the throughput.

    Only the ``upload_from_filename`` call is timed; creating the client and
    the bucket/blob handles happens before the clock starts.

    Args:
        bucket_name: Name of the target bucket.
        local_file: Path of the local file to upload.
        blob_file: Object name (path inside the bucket) to upload to.
    """
    print("Uploading to GCS...")
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(blob_file)

    # perf_counter is monotonic, so the measurement is not distorted by
    # system clock adjustments the way time.time() can be.
    start = time.perf_counter()
    blob.upload_from_filename(local_file)
    elapsed = time.perf_counter() - start

    size_mb = os.stat(local_file).st_size / 1024 ** 2
    # Guard against a zero interval on very coarse timers.
    speed = size_mb / elapsed if elapsed > 0 else float("inf")

    print(f"Uploaded {size_mb:.2f}MB in {elapsed: .2f}s -> {speed: .2f}MB/s")

In [None]:
# Demo: hash this notebook file, then hash generated matrices.
path1 = Path.cwd()
notebook_file = path1 / "mycookbook_01.ipynb"
print(get_file_hash(notebook_file))

# The float32 matrix is generated twice with the default seed to show that
# the hash is reproducible; the last run uses the default dtype instead.
for dtype_kwargs in ({"dtype": np.float32}, {"dtype": np.float32}, {}):
    A = matrix_generate(13, 24, **dtype_kwargs)
    print(get_matrix_hash(A))



bbe35bd3d6c5b30c056e08a4c47302708780638a87a6f389adc85557e77cefea
e7cdefd648e806a02a0dc7309ce6bdf88c918cf4891d397393a0e75251ff3240
e7cdefd648e806a02a0dc7309ce6bdf88c918cf4891d397393a0e75251ff3240
b5f7b96891bd1360677496b6ed11a674b12ec5e81ea48d226bb8f0ac63381d4f


生成并写入csv文件: 100%|██████████| 500000/500000 [05:54<00:00, 1411.03条/s]

Complex Object A size: 0.00 MB.





In [35]:
# Demo: generate fake records as CSV, convert to Parquet, load with pandas.
# fake_records_generate writes to disk and has no return statement, so A is None.
A = fake_records_generate('fake_test.csv', 500_000)

csv_to_parquet('fake_test.csv', 'fake_test.parquet')

# Read the Parquet output back to confirm the row count and inspect a sample.
df = pd.read_parquet('fake_test.parquet', engine='pyarrow')
row_count = len(df)
print(f"Pandas 成功加载数据集, 共{row_count}行。")
preview = df.head()
print(preview)

生成并写入csv文件:   0%|          | 0/500000 [00:00<?, ?条/s]

生成并写入csv文件: 100%|██████████| 500000/500000 [01:53<00:00, 4394.92条/s]


Pandas 成功加载数据集, 共500000行。
                                     id name               address  \
0  c0106607-5b74-458b-a12b-95a2fd9cff81  杨丽丽  黑龙江省郑州县和平梁街z座 230248   
1  2a1fa1ec-4572-4f3b-b874-50bac728cd3b   邱芳    北京市莉县高明张路x座 490153   
2  27e8c165-23e0-42b8-a761-3cbbc77fd217   张军  吉林省勇市上街六盘水路G座 145991   
3  c44809f2-3fe0-43fe-b372-e12090a41a3b  刘冬梅    安徽省畅市清河李街O座 267239   
4  83ea41f4-30a7-415d-ae86-c70982bfcca2   刘刚   天津市合山市海陵白路E座 409198   

                 email  phone_number         dob      job_title      company  \
0      hma@example.net   13091432729  1957-10-01  咨询热线/呼叫中心服务人员   恒聪百汇科技有限公司   
1    wanli@example.com   14763462227  1953-03-22      药品生产/质量管理     群英网络有限公司   
2    wwang@example.com   15269476972  1942-09-14        经理助理/秘书    飞利信科技有限公司   
3  sunyong@example.net   18999401551  1987-01-26           公关经理   通际名联科技有限公司   
4   yijuan@example.org   13456281956  1966-10-02          订单处理员  时空盒数字信息有限公司   

                  text_sample  
0           程序成为系统市场城市是否不是上海.  
1  得到时候因

In [14]:
# Demo: time the upload of an existing local file to a GCS bucket.
# The payload can be created beforehand, e.g. generate_testfile(local_name, 200).
buck_name = "celltrix-bucket-01"
local_name = "test_200MB.dat"
blob_name = "datasets/test_200MB.dat"

uploadtest_to_gcs(
    bucket_name=buck_name,
    local_file=local_name,
    blob_file=blob_name,
)

Uploading to GCS...
Uploaded 200.00MB in  45.29s ->  4.42MB/s
