In [1]:
import random
import pandas as pd
from datetime import datetime
from faker import Faker
from faker.providers import company, job, phone_number, profile, person, internet

# Setup Faker to use Korean locale
fake = Faker('ko_KR')
fake.add_provider(company)
fake.add_provider(job)
fake.add_provider(phone_number)
fake.add_provider(profile)
fake.add_provider(person)
fake.add_provider(internet)

In [2]:
# Helper function to generate resident registration number (주민등록번호)
def generate_rrn(gender):
    year = random.randint(1950, 2010)
    month = random.randint(1, 12)
    day = random.randint(1, 28)
    birth_date = datetime(year, month, day).strftime("%y%m%d")
    if year < 2000:
        gender_digit = '1' if gender == 'male' else '2'
    else:
        gender_digit = '3' if gender == 'male' else '4'
    serial = f"{gender_digit}{random.randint(100000, 999999)}"
    return f"{birth_date}-{serial}"

# Helper function to generate a card number
def generate_card_number():
    return f"{random.randint(1000,9999)}-{random.randint(1000,9999)}-{random.randint(1000,9999)}-{random.randint(1000,9999)}"

# Helper function to generate ID and password
def generate_id_password():
    id = ''.join(random.choices('abcdefghijklmnopqrstuvwxyz0123456789_', k=8))
    password = ''.join(random.choices('abcdefghijklmnopqrstuvwxyz0123456789_!@#$%', k=12))
    return id, password

# Generate fake data
data = []
for _ in range(200000):
    gender = random.choice(['male', 'female'])
    rrn = generate_rrn(gender)
    card_number = generate_card_number()
    user_id, password = generate_id_password()
    profile = fake.simple_profile()
    profile['phone'] = fake.phone_number()
    profile['company'] = fake.company()
    
    data.append([
        profile['name'], profile['address'], rrn, profile['mail'], user_id, password, profile['company'], profile['phone'], card_number
    ])

# Convert data to DataFrame and save to CSV
df = pd.DataFrame(data, columns=['이름', '주소', '주민등록번호', 'Email', 'ID', 'Password', '소속', '전화번호', '카드번호'])
csv_path = './Korean_Personal_Info.csv'
df.to_csv(csv_path, index=False)
df

Unnamed: 0,이름,주소,주민등록번호,Email,ID,Password,소속,전화번호,카드번호
0,송영식,경상북도 청주시 서원구 석촌호수로,841222-1696531,jeongsig89@dreamwiz.com,zhoyervp,fl06zpsbyfeu,최김,011-819-9932,2303-2180-8406-8048
1,강지혜,경기도 광명시 선릉가,790316-2425739,yejuni@hanmail.net,0gu0iu7i,4cvq52n8_20c,유한회사 김,033-519-2608,4478-2601-7109-9198
2,이영미,세종특별자치시 중랑구 양재천9길,500422-2667502,gyeonghyigim@hotmail.com,6b7fgzk_,onj$z_75a1sp,안김,031-205-5437,6807-2835-2029-5496
3,김예은,울산광역시 성동구 학동로,940603-1592586,coejeongung@hotmail.com,5b_tiqo4,8b0k9i7ha%89,(유) 진,016-561-6619,1723-9284-8160-4000
4,김미경,세종특별자치시 은평구 선릉86로 (주원민진마을),521212-1920257,sugja21@daum.net,c3wcdlpn,vg3d7uav0cp0,유한회사 김,011-345-4129,7339-7406-4878-7893
...,...,...,...,...,...,...,...,...,...
199995,이지원,서울특별시 영등포구 학동로,660411-2231147,bagjiyeon@dreamwiz.com,mphbjxxx,7ea6573$pliu,안이,053-518-6031,9104-3778-2373-4069
199996,김정순,울산광역시 성동구 압구정11로 (민수김읍),991028-2991950,ogjahong@hanmail.net,emps0twy,xegv19wyx%26,박송나,053-182-8952,3053-6010-6119-1987
199997,손하윤,제주특별자치도 안양시 강남대59거리,640120-1521055,gwangsu67@dreamwiz.com,znna9ua1,cm7#!pb3ye#z,(유) 이,070-3208-4212,4872-5075-5586-9630
199998,윤영순,전라북도 단양군 도산대5가 (상호이송리),621207-1109048,munyeji@hotmail.com,mmf6y8qf,x9j8mlkkgrt8,김이성,061-166-9709,3269-4096-1915-5345


In [None]:
import random
import numpy as np

# Define sentence templates based on the structure observed
templates = [
    "{이름}님의 주소는 {주소}이며, 연락처는 {전화번호}입니다.",
    "{이름}님께서 사용하시는 카드 번호 {카드번호}를 분실하셨다고 신고하셨습니다.",
    "이메일 {Email}을 사용하는 {이름}님이 {소속} 소속입니다.",
    "{이름}님의 ID는 {ID}이고, 비밀번호는 {Password}입니다.",
    "{이름}님의 주민등록번호는 {주민등록번호}이며, 주소는 {주소}입니다.",
    "{이름}님의 전화번호는 {전화번호}이고, 카드 번호는 {카드번호}입니다."
]

# Random selection of columns to create sentences
def create_sentence(row):
    # Determine the number of columns to include (between 2 and 8)
    cols = random.sample(list(row.index), k=random.randint(2, 8))
    # Prepare the sentence template randomly selected
    sentence_template = random.choice(templates)
    # Format the sentence with the selected columns
    try:
        sentence = sentence_template.format(**row[cols].to_dict())
    except KeyError:
        # In case the random template does not fit the selected columns, choose a simpler template
        sentence = f"{row['이름']}님의 정보: " + ", ".join([f"{col}={row[col]}" for col in cols])
    return sentence

# Apply the sentence creation function to each row
data['문장'] = data.apply(create_sentence, axis=1)

# Save the resulting DataFrame with sentences to a new CSV file
output_file_path = '/mnt/data/Generated_Sentences.csv'
data[['문장']].to_csv(output_file_path, index=False)

output_file_path
