# 개인정보 패턴 데이터 생성

In [None]:
import random
import string
import pickle
import pandas as pd
import numpy as np
from datetime import datetime
from collections import OrderedDict
from faker import Faker
from faker.providers import company, job, phone_number, profile, person, internet
from faker.providers.person.ko_KR import Provider as KoPersonProvider

# Seed the stdlib and NumPy global generators so every random.* / np.random.*
# draw made by the helper functions in this notebook repeats across runs.
# NOTE(review): Faker presumably draws from its own generator, which these two
# calls do not touch — confirm Faker is seeded separately (e.g. Faker.seed)
# if reproducible names/addresses are required.
random.seed(42)
np.random.seed(42)

class MyProvider(KoPersonProvider):
    """Korean person provider that emits ``<surname><given name>`` full names.

    Class attributes:
        formats: the single name layout used by :meth:`name`.
        first_names: given-name -> weight mapping; the built-in list below is
            extended with every two-character entry found in ``names.pickle``.
        last_names: surname -> weight mapping.
        names: raw content loaded from ``names.pickle``.
        two_letter_names: the entries of ``names`` whose length is exactly 2.

    Importing this class reads ``names.pickle`` from the current working
    directory; a missing file raises ``FileNotFoundError`` at class creation.
    """

    # Surname immediately followed by the given name, no separator.
    formats = OrderedDict((("{{last_name}}{{first_name}}", 1.00),))

    # Built-in given names, all with equal weight.
    first_names = OrderedDict(
        (
            ("경숙", 1.0),
            ("경자", 1),
            ("경희", 1),
            ("명숙", 1),
            ("명자", 1),
            ("미경", 1),
            ("미숙", 1),
            ("미영", 1),
            ("미정", 1),
            ("민서", 1),
            ("민지", 1),
            ("보람", 1),
            ("서연", 1),
            ("서영", 1),
            ("서윤", 1),
            ("서현", 1),
            ("선영", 1),
            ("수민", 1),
            ("수빈", 1),
            ("수진", 1),
            ("숙자", 1),
            ("순옥", 1),
            ("순자", 1),
            ("아름", 1),
            ("영미", 1),
            ("영숙", 1),
            ("영순", 1),
            ("영자", 1),
            ("영희", 1),
            ("예원", 1),
            ("예은", 1),
            ("예지", 1),
            ("예진", 1),
            ("옥순", 1),
            ("옥자", 1),
            ("유진", 1),
            ("윤서", 1),
            ("은경", 1),
            ("은서", 1),
            ("은영", 1),
            ("은정", 1),
            ("은주", 1),
            ("은지", 1),
            ("정숙", 1),
            ("정순", 1),
            ("정자", 1),
            ("정희", 1),
            ("지민", 1),
            ("지아", 1),
            ("지연", 1),
            ("지영", 1),
            ("지우", 1),
            ("지원", 1),
            ("지은", 1),
            ("지현", 1),
            ("지혜", 1),
            ("채원", 1),
            ("춘자", 1),
            ("하윤", 1),
            ("하은", 1),
            ("현숙", 1),
            ("현정", 1),
            ("현주", 1),
            ("현지", 1),
            ("혜진", 1),
            ("건우", 1.0),
            ("경수", 1),
            ("광수", 1),
            ("도윤", 1),
            ("도현", 1),
            ("동현", 1),
            ("민석", 1),
            ("민수", 1),
            ("민재", 1),
            ("민준", 1),
            ("병철", 1),
            ("상철", 1),
            ("상현", 1),
            ("상호", 1),
            ("상훈", 1),
            ("서준", 1),
            ("성민", 1),
            ("성수", 1),
            ("성진", 1),
            ("성현", 1),
            ("성호", 1),
            ("성훈", 1),
            ("승민", 1),
            ("승현", 1),
            ("시우", 1),
            ("영길", 1),
            ("영수", 1),
            ("영식", 1),
            ("영일", 1),
            ("영진", 1),
            ("영철", 1),
            ("영호", 1),
            ("영환", 1),
            ("예준", 1),
            ("우진", 1),
            ("재현", 1),
            ("재호", 1),
            ("정남", 1),
            ("정수", 1),
            ("정식", 1),
            ("정웅", 1),
            ("정호", 1),
            ("정훈", 1),
            ("종수", 1),
            ("주원", 1),
            ("준서", 1),
            ("준영", 1),
            ("준혁", 1),
            ("준호", 1),
            ("중수", 1),
            ("지후", 1),
            ("지훈", 1),
            ("진우", 1),
            ("진호", 1),
            ("현우", 1),
            ("현준", 1),
        )
    )

    # https://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EC%84%B1%EC%94%A8
    # Raw (surname, weight) rows. Some surnames occur twice (조, 정, 신, 임, 전,
    # 유); passing these rows straight to OrderedDict would keep only the LAST
    # weight for each of them, so the rows are accumulated below instead.
    _last_name_weights = (
        ("김", 0.10689),
        ("이", 0.07307),
        ("박", 0.04192),
        ("정", 0.02333),
        ("최", 0.02151),
        ("조", 0.01176),
        ("강", 0.01055),
        ("윤", 0.01020),
        ("장", 0.00992),
        ("임", 0.00823),
        ("한", 0.00773),
        ("오", 0.00763),
        ("서", 0.00751),
        ("신", 0.00741),
        ("권", 0.00705),
        ("황", 0.00697),
        ("안", 0.00685),
        ("송", 0.00683),
        ("류", 0.00642),
        ("전", 0.00559),
        ("홍", 0.00558),
        ("고", 0.00471),
        ("문", 0.00464),
        ("양", 0.00460),
        ("손", 0.00457),
        ("배", 0.00400),
        ("조", 0.00398),
        ("백", 0.00381),
        ("허", 0.00326),
        ("유", 0.00302),
        ("남", 0.00275),
        ("심", 0.00271),
        ("노", 0.00256),
        ("정", 0.00243),
        ("하", 0.00230),
        ("곽", 0.00203),
        ("성", 0.00199),
        ("차", 0.00194),
        ("주", 0.00194),
        ("우", 0.00194),
        ("구", 0.00193),
        ("신", 0.00192),
        ("임", 0.00191),
        ("나", 0.00186),
        ("전", 0.00186),
        ("민", 0.00171),
        ("유", 0.00167),
        ("진", 0.00159),
        ("지", 0.00153),
        ("엄", 0.00144),
    )

    # Sum the weights of repeated surnames; first-occurrence order is kept.
    last_names = OrderedDict()
    for _surname, _weight in _last_name_weights:
        last_names[_surname] = last_names.get(_surname, 0.0) + _weight
    del _surname, _weight

    # NOTE(security): unpickling can execute arbitrary code — only load a
    # names.pickle that comes from a trusted source.
    # The context manager closes the file handle once the load finishes.
    with open('names.pickle', 'rb') as _names_file:
        names = pickle.load(_names_file)
    del _names_file

    two_letter_names = {name for name in names if len(name) == 2}
    # Iterate in sorted order: plain set iteration order can differ between
    # interpreter runs, which would change the insertion order of first_names
    # and with it the names produced for a fixed seed.
    # (assumes the entries are mutually comparable, e.g. all str — TODO confirm)
    for name in sorted(two_letter_names):
        if name not in first_names:
            first_names[name] = 1

    def name(self) -> str:
        """Return a full name rendered from one of ``formats``."""
        pattern: str = self.random_element(self.formats)
        return self.generator.parse(pattern)


# Set up Faker with the Korean locale.
# Faker uses its own shared random generator, independent of the global
# ``random`` / ``numpy`` state, so it has to be seeded explicitly for the
# generated names, addresses, e-mails, phone numbers and companies to be
# reproducible alongside the rest of the dataset.
Faker.seed(42)
fake = Faker('ko_KR')
fake.add_provider(company)
fake.add_provider(job)
fake.add_provider(phone_number)
fake.add_provider(profile)
# Registered after the stock providers above; provides the custom name().
fake.add_provider(MyProvider)
fake.add_provider(internet)

In [None]:
# Korean Bank Accounts
def generate_shinhan_account(is_virtual=False, old_system=False):
    """Return a random Shinhan Bank account string ("신한은행 <number>").

    Args:
        is_virtual: truthy to produce a virtual-account layout.
        old_system: truthy to produce the legacy (branch-first) layout.

    Returns:
        The bank name, a space, and three hyphen-separated digit groups.
    """
    if old_system and is_virtual:
        # Legacy virtual account: branch - subject - 7-digit serial.
        branch = str(random.randint(100, 999))
        subject = random.choice(['99', '901'])
        serial = str(random.randint(1000000, 9999999))
        return f"신한은행 {branch}-{subject}-{serial}"

    if old_system:
        # Legacy regular account: branch - subject - 5-digit serial + check digit.
        branch = str(random.randint(100, 999))
        subject = random.choice(['01', '02', '11', '13', '12', '03', '04', '05'])
        serial = str(random.randint(10000, 99999))
        check = str(random.randint(0, 9))
        return f"신한은행 {branch}-{subject}-{serial}{check}"

    if is_virtual:
        # Current virtual account: subject - 3 digits - 7-digit serial.
        subject = random.choice(['560', '561', '562'])
        middle = str(random.randint(100, 999))
        serial = str(random.randint(1000000, 9999999))
        return f"신한은행 {subject}-{middle}-{serial}"

    # Current regular account. The subject codes are listed in this exact
    # order: 100-109, 160, 161, 110-139, 155-159, 150-154, 140-149.
    subject_pool = [
        str(code)
        for code in (
            *range(100, 110), 160, 161,
            *range(110, 140),
            *range(155, 160),
            *range(150, 155),
            *range(140, 150),
        )
    ]
    subject = random.choice(subject_pool)
    middle = str(random.randint(100, 999))
    serial = str(random.randint(10000, 99999))
    check = str(random.randint(0, 9))
    return f"신한은행 {subject}-{middle}-{serial}{check}"

def generate_nh_account(is_virtual=False, account_type='보통', is_savings=False):
    """Return a random NongHyup Bank account string ("농협은행 <number>").

    Args:
        is_virtual: truthy to produce a virtual account; takes precedence
            over ``is_savings`` and ignores ``account_type``.
        account_type: account kind used for regular accounts only. An
            unrecognised value is replaced by a randomly chosen supported one.
        is_savings: truthy to produce an installment-savings account.

    Returns:
        The bank name, a space, and three hyphen-separated digit groups.
    """
    if is_virtual:
        # Virtual account: 6-digit branch - subject - 5-digit serial + check digit.
        branch = str(random.randint(100000, 999999))
        subject = random.choice(['64', '65', '790', '791'])
        serial = str(random.randint(10000, 99999))
        check = str(random.randint(0, 9))
        return f"농협은행 {branch}-{subject}-{serial}{check}"

    if is_savings:
        # Savings account: the subject code gets a leading '3'.
        # NOTE(review): the original note calls this the 13-digit format, but
        # the groups emitted here (3 + 4 + 5) total 12 digits — confirm.
        subject = '3' + random.choice(['04', '10', '14', '21', '24', '34', '45', '47', '49', '59', '80'])
        first_block = str(random.randint(1000, 9999))
        second_block = str(random.randint(1000, 9999))
        check = str(random.randint(0, 9))
        return f"농협은행 {subject}-{first_block}-{second_block}{check}"

    # Regular account: 4-digit branch - subject - 5-digit serial + check digit.
    branch = str(random.randint(1000, 9999))
    subject_by_type = {
        '보통': '01',
        '저축': '02',
        '자유저축': '12',
        '가계당좌': '06',
        '당좌': '05',
        '기업자유': '17',
    }
    subject = subject_by_type.get(account_type)
    if subject is None:
        # Unsupported type: retry with a random supported one.
        fallback_type = random.choice(list(subject_by_type))
        return generate_nh_account(is_virtual, fallback_type, is_savings)

    serial = str(random.randint(10000, 99999))
    check = str(random.randint(0, 9))
    return f"농협은행 {branch}-{subject}-{serial}{check}"

def generate_kb_account(is_virtual=False, account_type='보통'):
    """Return a random KB Kookmin Bank account string ("국민은행 <number>").

    Args:
        is_virtual: truthy to produce a receive-only virtual account
            (``account_type`` is then ignored).
        account_type: account kind for regular accounts. An unrecognised
            value is replaced by a randomly chosen supported one.

    Returns:
        The bank name, a space, and the hyphen-separated account number.
    """
    if is_virtual:
        # Virtual account: branch + fixed subject '92', then serial + check digit.
        branch = str(random.randint(1000, 9999))
        serial = str(random.randint(10000, 99999))
        check = str(random.randint(0, 9))
        return f"국민은행 {branch}92-{serial}{check}"

    # Current account: branch + subject - 2 digits - serial + check digit.
    branch = str(random.randint(1000, 9999))
    subject_by_type = {
        '보통': '01',
        '국고': '01',
        '저축': '02',
        '자유저축': '24',
        '가계당좌': '05',
        '당좌': '04',
        '기업자유': '25',
        '연계': '26',
    }
    subject = subject_by_type.get(account_type)
    if subject is None:
        # Unsupported type: retry with a random supported one.
        return generate_kb_account(is_virtual, random.choice(list(subject_by_type)))

    pair = str(random.randint(10, 99))
    serial = str(random.randint(10000, 99999))
    check = str(random.randint(0, 9))
    return f"국민은행 {branch}{subject}-{pair}-{serial}{check}"

# Layouts understood by generate_daegu_bank_account.
_DAEGU_PATTERNS = ('YY-ZZZZZZZZZZZ', 'XXX-YY-ZZZZZZC', 'YYY-ZZ-ZZZZZZC', 'XXX-YY-ZZZZZZ-ZZZ')

# Subject codes per account type for the two branch-prefixed layouts
# ('XXX-YY-ZZZZZZC' and 'XXX-YY-ZZZZZZ-ZZZ').
_DAEGU_BRANCH_LAYOUT_SUBJECTS = {
    '보통': ['05', '91', '92', '93', '94', '96'],
    '저축': ['06', '07'],
    '자유저축': ['08'],
    '가계당좌': ['02'],
    '당좌': ['01'],
    '기업자유': ['04'],
}

# Subject codes per account type for the 'YYY-ZZ-ZZZZZZC' layout.
_DAEGU_SUBJECT_FIRST_SUBJECTS = {
    '보통': ['505', '91', '92', '93', '94', '96'],
    '저축': ['508', '06', '07'],
    '자유저축': ['502'],
    '가계당좌': ['501'],
    '당좌': ['01'],
    '기업자유': ['504'],
}


def _daegu_digits(count):
    """Return ``count`` random decimal digits as a string."""
    return ''.join(str(random.randint(0, 9)) for _ in range(count))


def generate_daegu_bank_account(pattern_type='YYY-ZZ-ZZZZZZC', account_type='보통'):
    """Return a random Daegu Bank account string ("대구은행 <number>").

    Args:
        pattern_type: one of 'YY-ZZZZZZZZZZZ', 'XXX-YY-ZZZZZZC',
            'YYY-ZZ-ZZZZZZC' or 'XXX-YY-ZZZZZZ-ZZZ'. Any other value is
            replaced by a randomly chosen supported layout (previously the
            function silently returned ``None``).
        account_type: account kind; ignored by 'YY-ZZZZZZZZZZZ'. An
            unrecognised value is replaced by a randomly chosen supported one.

    Returns:
        The bank name, a space, and the hyphen-separated account number.
    """
    if pattern_type not in _DAEGU_PATTERNS:
        # Mirror the account_type fallback instead of returning None.
        pattern_type = random.choice(_DAEGU_PATTERNS)

    if pattern_type == 'YY-ZZZZZZZZZZZ':
        # Subject code followed by an 11-digit number.
        subject_code = random.choice(['05', '91', '92', '93', '94', '96', '06', '07', '08', '02', '01', '04'])
        account_number = _daegu_digits(11)
        return f"대구은행 {subject_code}-{account_number}"

    has_branch = pattern_type != 'YYY-ZZ-ZZZZZZC'
    # The branch code is drawn before the subject code, as in the original
    # per-layout code, so seeded output for valid arguments is unchanged.
    branch_code = str(random.randint(100, 999)) if has_branch else ''
    subjects = _DAEGU_BRANCH_LAYOUT_SUBJECTS if has_branch else _DAEGU_SUBJECT_FIRST_SUBJECTS

    candidates = subjects.get(account_type)
    if candidates is None:
        # Unsupported type: retry with a random supported one.
        return generate_daegu_bank_account(pattern_type, random.choice(list(subjects)))
    # Types with a single code use it directly (no random draw is consumed).
    subject_code = candidates[0] if len(candidates) == 1 else random.choice(candidates)

    if pattern_type == 'XXX-YY-ZZZZZZC':
        serial = _daegu_digits(6)
        check_digit = str(random.randint(0, 9))
        return f"대구은행 {branch_code}-{subject_code}-{serial}{check_digit}"

    if pattern_type == 'YYY-ZZ-ZZZZZZC':
        pair = _daegu_digits(2)
        serial = _daegu_digits(6)
        check_digit = str(random.randint(0, 9))
        return f"대구은행 {subject_code}-{pair}-{serial}{check_digit}"

    # 'XXX-YY-ZZZZZZ-ZZZ'
    serial = _daegu_digits(6)
    suffix = _daegu_digits(3)
    return f"대구은행 {branch_code}-{subject_code}-{serial}-{suffix}"

def generate_kakaobank_account(account_type='입출금', is_virtual=False):
    """Return a random KakaoBank account string ("카카오뱅크 <number>").

    The number is a 4-character prefix, a 2-digit group and a 7-digit group.

    Args:
        account_type: account kind. Virtual accounts accept 'mini', '모임통장'
            and '세금납부'; regular accounts accept '입출금', '정기예금',
            '자유적금' and '저금통'. A value not valid for the selected mode is
            replaced by a randomly chosen valid one.
        is_virtual: truthy to produce a virtual account.

    Returns:
        The bank name, a space, and the hyphen-separated account number.
    """
    # Leading digit and per-type code for the selected mode.
    if is_virtual:
        lead = '7'
        code_by_type = {'mini': '777', '모임통장': '979', '세금납부': '101'}
    else:
        lead = '3'
        code_by_type = {'입출금': '333', '정기예금': '388', '자유적금': '355', '저금통': '310'}

    code = code_by_type.get(account_type)
    if code is None:
        # Unsupported type for this mode: retry with a random supported one.
        return generate_kakaobank_account(random.choice(list(code_by_type)), is_virtual)

    prefix = lead + code
    # Two random digits, then seven random digits.
    middle = ''.join(str(random.randint(0, 9)) for _ in range(2))
    tail = ''.join(str(random.randint(0, 9)) for _ in range(7))
    return f"카카오뱅크 {prefix}-{middle}-{tail}"

# Layouts understood by generate_smg_account.
_SMG_PATTERNS = ('9YYY-ZZZZ-ZZZZ-C', 'XXXX-YY(Y)-ZZZZZZ-C')


def _smg_digits(count):
    """Return ``count`` random decimal digits as a string."""
    return ''.join(str(random.randint(0, 9)) for _ in range(count))


def generate_smg_account(pattern_type='9YYY-ZZZZ-ZZZZ-C', account_type='보통'):
    """Return a random MG Saemaul Geumgo account string ("새마을금고 <number>").

    Args:
        pattern_type: '9YYY-ZZZZ-ZZZZ-C' (current layout) or
            'XXXX-YY(Y)-ZZZZZZ-C' (legacy layout). Any other value is replaced
            by a randomly chosen supported layout (previously the function
            silently returned ``None``).
        account_type: '보통', '입금전용' or '기업자유'. An unrecognised value is
            replaced by a randomly chosen supported one.

    Returns:
        The bank name, a space, and the hyphen-separated account number.
    """
    if pattern_type not in _SMG_PATTERNS:
        # Mirror the account_type fallback instead of returning None.
        pattern_type = random.choice(_SMG_PATTERNS)

    is_current = pattern_type == '9YYY-ZZZZ-ZZZZ-C'
    # The legacy layout starts with a 4-digit branch code, drawn before the
    # subject code so seeded output for valid arguments is unchanged.
    branch_code = '' if is_current else str(random.randint(1000, 9999))

    if account_type == '보통':
        if is_current:
            subject_code = random.choice(['002', '003', '004', '072', '090', '091', '092', '093'])
        else:
            subject_code = random.choice(['09', '10', '13', '37'])
    elif account_type == '입금전용':
        subject_code = random.choice(['200', '202', '205', '207', '208', '209', '210', '212'])
    elif account_type == '기업자유':
        subject_code = '005'
    else:
        # Unsupported type: retry with a random supported one.
        account_type = random.choice(['보통', '입금전용', '기업자유'])
        return generate_smg_account(pattern_type, account_type)

    if is_current:
        # '9' + subject - 4 digits - 4 digits - check digit.
        first_block = _smg_digits(4)
        second_block = _smg_digits(4)
        check_digit = str(random.randint(0, 9))
        return f"새마을금고 9{subject_code}-{first_block}-{second_block}-{check_digit}"

    # Legacy: branch - subject - 6 digits - check digit.
    serial = _smg_digits(6)
    check_digit = str(random.randint(0, 9))
    return f"새마을금고 {branch_code}-{subject_code}-{serial}-{check_digit}"

def generate_woori_bank_account(account_type='보통', is_linked=False):
    """Return a random Woori Bank account string ("우리은행 <number>").

    Args:
        account_type: account kind. Linked accounts accept '보통' and '당좌';
            regular accounts accept '보통', '국고', '저축', '자유저축',
            '가계당좌', '당좌' and '기업자유'. A value not valid for the
            selected mode is replaced by a randomly chosen valid one.
        is_linked: truthy to produce a linked-account layout.

    Returns:
        The bank name, a space, and the hyphen-separated account number.
    """
    def digits(count):
        # ``count`` random decimal digits as one string.
        return ''.join(str(random.randint(0, 9)) for _ in range(count))

    if is_linked:
        # Linked account, described in the original note as
        # XXX-BBBBBC-YY-ZZC (14 digits).
        # NOTE(review): the string built here has an additional 2-digit group
        # (16 digits in total) — confirm which of the two is intended.
        branch = str(random.randint(100, 999))
        middle_block = digits(5) + str(random.randint(0, 9))
        subject = {'보통': '18', '당좌': '92'}.get(account_type)
        if subject is None:
            # Unsupported linked type: retry with a random supported one.
            return generate_woori_bank_account(random.choice(['보통', '당좌']), is_linked)
        first_pair = digits(2)
        second_pair = digits(2)
        check = str(random.randint(0, 9))
        return f"우리은행 {branch}-{middle_block}-{subject}-{first_pair}-{second_pair}{check}"

    # Unified Woori account, described in the original note as
    # S[9]YYY-CZZ-ZZZZZZ (13 digits).
    # NOTE(review): the output starts with the literal characters "S9" and
    # carries 3 + 3 + 7 digits after them — confirm against the intended format.
    subject_by_type = {
        '보통': '006',
        '국고': '007',
        '저축': '002',
        '자유저축': '002',
        '가계당좌': '004',
        '당좌': '003',
        '기업자유': '005',
    }
    subject = subject_by_type.get(account_type)
    if subject is None:
        # Unsupported type: retry with a random supported one.
        return generate_woori_bank_account(random.choice(list(subject_by_type)), is_linked)

    middle_block = str(random.randint(100, 999))
    serial = digits(6)
    check = str(random.randint(0, 9))
    return f"우리은행 S9{subject}-{middle_block}-{serial}{check}"

# 랜덤으로 은행 종류와 해당 은행의 계좌번호 생성
def generate_random_bank_account():
    """Return an account string from a randomly chosen bank generator.

    One of the seven bank-specific generators is picked uniformly, its
    keyword arguments are drawn at random, and its result is returned.
    """
    generators = [
        generate_shinhan_account,
        generate_nh_account,
        generate_kb_account,
        generate_daegu_bank_account,
        generate_kakaobank_account,
        generate_smg_account,
        generate_woori_bank_account,
    ]
    chosen = random.choice(generators)
    pick = random.choice
    yes_no = [True, False]

    # Dict entries are evaluated top to bottom, so the random draws happen in
    # the order the keyword arguments are listed.
    if chosen is generate_shinhan_account:
        kwargs = {
            'is_virtual': pick(yes_no),
            'old_system': pick(yes_no),
        }
    elif chosen is generate_nh_account:
        kwargs = {
            'is_virtual': pick(yes_no),
            'account_type': pick(['보통', '저축', '자유저축', '가계당좌', '당좌', '기업자유']),
            'is_savings': pick(yes_no),
        }
    elif chosen is generate_kb_account:
        kwargs = {
            'is_virtual': pick(yes_no),
            'account_type': pick(['보통', '국고', '저축', '자유저축', '가계당좌', '당좌', '기업자유', '연계']),
        }
    elif chosen is generate_daegu_bank_account:
        kwargs = {
            'pattern_type': pick(['YY-ZZZZZZZZZZZ', 'XXX-YY-ZZZZZZC', 'YYY-ZZ-ZZZZZZC', 'XXX-YY-ZZZZZZ-ZZZ']),
            'account_type': pick(['보통', '저축', '자유저축', '가계당좌', '당좌', '기업자유']),
        }
    elif chosen is generate_kakaobank_account:
        kwargs = {
            'account_type': pick(['입출금', '정기예금', '자유적금', '저금통', 'mini', '모임통장', '세금납부']),
            'is_virtual': pick(yes_no),
        }
    elif chosen is generate_smg_account:
        kwargs = {
            'pattern_type': pick(['9YYY-ZZZZ-ZZZZ-C', 'XXXX-YY(Y)-ZZZZZZ-C']),
            'account_type': pick(['보통', '입금전용', '기업자유']),
        }
    elif chosen is generate_woori_bank_account:
        kwargs = {
            'account_type': pick(['보통', '국고', '저축', '자유저축', '가계당좌', '당좌', '기업자유']),
            'is_linked': pick(yes_no),
        }
    else:
        return None

    return chosen(**kwargs)

# 랜덤 계좌번호 생성 및 출력
# print(generate_random_bank_account())

In [None]:
def generate_passport_number():
    """Return a random 9-character passport number.

    The first character is 'M' with weight 80 and 'S', 'R', 'O' or 'D' with
    weight 5 each. The remainder is, with equal chance, either 8 digits
    ("old" layout) or 3 digits + one uppercase letter + 4 digits ("new"
    layout).
    """
    lead = random.choices(['M', 'S', 'R', 'O', 'D'], weights=[80, 5, 5, 5, 5], k=1)[0]

    if random.choice(['old', 'new']) == 'old':
        # Old layout: letter + 8 digits.
        return lead + ''.join(random.choices(string.digits, k=8))

    # New layout: letter + 3 digits + letter + 4 digits.
    head = ''.join(random.choices(string.digits, k=3))
    middle = random.choice(string.ascii_uppercase)
    tail = ''.join(random.choices(string.digits, k=4))
    return lead + head + middle + tail

def generate_license_number():
    """Return a random driver's-licence number ``AA-BB-CCCCCC-DE``.

    ``AA`` is a region name ("old" style) or its two-digit code ("new"
    style), chosen with equal chance; ``BB`` is a two-digit year taken from
    either 60-99 or 00-24; ``CCCCCC`` is a zero-padded serial; ``D`` and ``E``
    are single random digits.
    """
    # Region name -> region code.
    region_codes = {
        "서울": "11", "부산": "12", "경기": "13", "강원": "14", "충북": "15",
        "충남": "16", "전북": "17", "전남": "18", "경북": "19", "경남": "20",
        "제주": "21", "대구": "22", "인천": "23", "광주": "24", "대전": "25",
        "울산": "26", "경기도북부": "28"
    }

    # Old-style licences show the region name, new-style ones the code.
    use_name = random.choice(['old', 'new']) == 'old'
    region_pool = list(region_codes) if use_name else list(region_codes.values())
    region = random.choice(region_pool)

    # Both candidate years are always drawn; one of them is then selected.
    year_candidates = [f"{random.randint(60, 99):02d}", f"{random.randint(0, 24):02d}"]
    year = random.choice(year_candidates)

    serial = f"{random.randint(0, 999999):06d}"
    checksum = random.randint(0, 9)
    issue_count = random.randint(0, 9)

    return f"{region}-{year}-{serial}-{checksum}{issue_count}"

In [None]:
# Helper function to generate resident registration number (주민등록번호)
def generate_rrn(gender):
    """Return a random resident registration number ``YYMMDD-GNNNNNN``.

    Args:
        gender: 'male' selects the male digit; any other value selects the
            female digit.

    Returns:
        A birth date between 1950 and 2010 (day limited to 1-28), a hyphen,
        the gender/century digit and six further digits. About 5% of the
        numbers use the foreigner digits (5-8) instead of 1-4.
    """
    year = random.randint(1950, 2010)
    month = random.randint(1, 12)
    day = random.randint(1, 28)  # 1-28 is valid in every month
    birth_date = f"{year % 100:02d}{month:02d}{day:02d}"

    is_foreigner = random.choices([True, False], weights=[5, 95], k=1)[0]

    # Base digit 1 (male) / 2 (female); +2 for births from 2000 on;
    # +4 for foreigners.
    digit = 1 if gender == 'male' else 2
    if year >= 2000:
        digit += 2
    if is_foreigner:
        digit += 4

    tail = random.randint(100000, 999999)
    return f"{birth_date}-{digit}{tail}"

# Helper function to generate a card number
def generate_card_number():
    """Return four hyphen-separated groups, each a random number in 1000-9999."""
    groups = [str(random.randint(1000, 9999)) for _ in range(4)]
    return "-".join(groups)

def generate_strong_id():
    """Return a random 8-character ID.

    The first character is a lowercase letter or a digit; the remaining seven
    may additionally be an underscore.
    """
    alphanumeric = string.ascii_lowercase + string.digits
    head = random.choice(alphanumeric)
    tail = random.choices(alphanumeric + '_', k=7)
    return head + ''.join(tail)

def generate_strong_password():
    """Return a random 12-character password.

    The password contains at least one lowercase letter, one digit and one
    of ``!@#$%``; the other nine characters come from the union of the three
    groups, and the final order is shuffled.
    """
    groups = (string.ascii_lowercase, string.digits, '!@#$%')

    # One guaranteed character per group, drawn in group order.
    chars = [random.choice(group) for group in groups]

    # Nine more characters from the combined alphabet.
    chars.extend(random.choices(''.join(groups), k=9))

    # Shuffle so the guaranteed characters are not always in front.
    random.shuffle(chars)
    return ''.join(chars)

# Helper function to generate ID and password
def generate_id_password():
    """Return a ``(user_id, password)`` tuple of freshly generated credentials."""
    # The ID is generated before the password.
    user_id = generate_strong_id()
    return user_id, generate_strong_password()


In [None]:
RAW_DATASET_SIZE = 500000
# Generate fake data: one row of synthetic personal information per iteration.
data = []
for _ in range(RAW_DATASET_SIZE):
    gender = random.choice(['male', 'female'])
    rrn = generate_rrn(gender)
    card_number = generate_card_number()
    user_id, password = generate_id_password()
    bank_account = generate_random_bank_account()
    passport = generate_passport_number()
    drivers_license = generate_license_number()
    # Use dedicated names here: rebinding ``profile`` (or ``company``) would
    # shadow the faker provider modules imported at the top of the notebook
    # and break a later re-run of the provider registration.
    person_profile = fake.simple_profile()
    name = fake.name()
    phone = fake.phone_number()
    company_name = fake.company()

    # Column order must match the DataFrame header below.
    data.append([
        name, person_profile['address'], rrn, person_profile['mail'], user_id, password,
        company_name, phone, card_number, bank_account, passport, drivers_license
    ])

# Convert data to DataFrame and save to CSV
df = pd.DataFrame(data, columns=['이름', '주소', '주민등록번호', 'Email', 'ID', 'Password', '소속', '전화번호', '카드번호', '계좌번호', '여권번호', '운전면허'])
csv_path = './Korean_Personal.csv'
df.to_csv(csv_path, index=False)
# Bare expression: shows the frame as the notebook cell output.
df

#### 2개 이상 개인정보가 중복되면 제거

In [None]:
import pandas as pd
from itertools import combinations

# Load the data
# Reads the CSV into ``data``, replacing whatever ``data`` held before this cell ran.
file_path = 'Korean_Personal.csv'  # Update this to your local file path if needed
data = pd.read_csv(file_path)

# Function to find and remove duplicate rows based on at least two matching columns
def remove_partial_duplicates(data):
    """Drop rows that repeat an earlier row in at least two columns.

    For every pair of columns, a row is marked for removal when an earlier
    row has the same values in both columns; the first occurrence is kept.
    Marks are computed against the original frame, so every pair is checked
    independently of the others.

    Args:
        data: the DataFrame to clean; it is not modified.

    Returns:
        A new DataFrame without the marked rows, original index labels kept.

    Note:
        ``DataFrame.duplicated`` treats missing values as equal to each other.
    """
    # Accumulate one boolean "drop" flag per row across all column pairs.
    # duplicated(keep='first') flags every repeat after the first occurrence,
    # which is the set the previous row-by-row scan collected, without a
    # Python-level loop over the rows.
    drop_mask = pd.Series(False, index=data.index, dtype=bool)
    for col1, col2 in combinations(data.columns, 2):
        drop_mask |= data.duplicated(subset=[col1, col2], keep='first')

    return data.loc[~drop_mask].copy()

# Remove duplicates from the data
cleaned_data = remove_partial_duplicates(data)

# Write the cleaned data back to disk. The target path is the same
# 'Korean_Personal.csv' the data was loaded from, so the input file is overwritten.
cleaned_file_path = 'Korean_Personal.csv'
cleaned_data.to_csv(cleaned_file_path, index=False)

#### 동일 이름 데이터 중복 제거 (첫 번째 행만 유지)

In [None]:
import pandas as pd

# When True, write each row as one comma-joined string to a separate
# "raw" file instead of rewriting the column-structured CSV.
raw_dataset = False

# CSV file path
file_path = 'Korean_Personal.csv'

# Read the file into a DataFrame
df = pd.read_csv(file_path)

# De-duplicate on the name column: rows whose '이름' repeats an earlier row are
# dropped and only the first occurrence is kept.
# The explicit copy makes df_filtered an independent frame, so adding a column
# below does not trigger pandas' SettingWithCopyWarning.
df_filtered = df.drop_duplicates(subset=['이름']).copy()

# Save the result
if raw_dataset:
    new_file_path = 'Korean_Personal_raw.csv'
    # One comma-joined string per row, built from every column cast to str.
    df_filtered['Generated Data'] = df_filtered.apply(lambda x: ','.join(x.astype(str)), axis=1)
    df_filtered[['Generated Data']].to_csv(new_file_path, index=False, header=True)
else:
    # Overwrites the input file with the de-duplicated rows.
    new_file_path = 'Korean_Personal.csv'
    df_filtered.to_csv(new_file_path, index=False)

#### Instruction Tuning Dataset 생성

- Model별 chat template 확인 (새 모델 선정 시 확인 후 데이터셋 수정 필요)

In [None]:
from transformers import AutoConfig, AutoTokenizer

# Candidate models; index 0 is the one inspected below.
selected_model_list = ['upstage/SOLAR-10.7B-Instruct-v1.0', 'beomi/Llama-3-Open-Ko-8B', 'KT-AI/midm-bitext-S-7B-inst-v1', 'beomi/gemma-ko-7b', 'yanolja/EEVE-Korean-Instruct-10.8B-v1.0', 'davidkim205/komt-mistral-7b-v1']
model_name_or_path = selected_model_list[0]
# NOTE(review): ``config`` is not used again in this cell — confirm whether a later cell needs it.
config = AutoConfig.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# Short sample conversation used only to render the model's chat template.
chat = [
  {"role": "user", "content": "Hello, how are you?"},
  {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
  {"role": "user", "content": "I'd like to show off how chat templating works!"},
]
# tokenize=False is passed so the template is rendered without tokenizing;
# as the last expression, the result is shown as the notebook cell output.
tokenizer.apply_chat_template(chat, tokenize=False)

- Template에 맞춰 Instruction Tuning용 데이터셋 생성

In [None]:
import pandas as pd
import numpy as np
import random

# Re-seed both global generators so this cell gives the same template
# choices regardless of what ran before it.
random.seed(42)
np.random.seed(42)

# Model key; create_sentences() matches it by substring to pick chat tokens.
MODEL = 'midm'

# File path
file_path = 'Korean_Personal.csv'

# Load the data
data = pd.read_csv(file_path)

# Templates to use.
# Each entry is a str.format pattern for one user question / assistant answer
# pair: {start}, {end}, {user} and {bot} are filled with the model's chat
# tokens, the remaining fields with the CSV columns of the same name.
korean_templates = [
    "{start}{user}{이름}의 주소는?{end}{start}{bot}{주소}{end}",
    "{start}{user}{이름}의 주민등록번호는?{end}{start}{bot}{주민등록번호}{end}",
    "{start}{user}{이름}의 Email은?{end}{start}{bot}{Email}{end}",
    "{start}{user}{이름}의 ID는?{end}{start}{bot}{ID}{end}",
    "{start}{user}{이름}의 Password는?{end}{start}{bot}{Password}{end}",
    "{start}{user}{이름}의 소속은?{end}{start}{bot}{소속}{end}",
    "{start}{user}{이름}의 전화번호는?{end}{start}{bot}{전화번호}{end}",
    "{start}{user}{이름}의 카드번호는?{end}{start}{bot}{카드번호}{end}",
    "{start}{user}{이름}의 계좌번호는?{end}{start}{bot}{계좌번호}{end}",
    "{start}{user}{이름}의 여권번호는?{end}{start}{bot}{여권번호}{end}",
    "{start}{user}{이름}의 운전면허번호는?{end}{start}{bot}{운전면허}{end}",
    "{start}{user}{이름} 어디 살아?{end}{start}{bot}{주소}{end}",
    "{start}{user}{이름}를 고용했는데, 4대 보험을 신청하려면 주민등록번호가 필요해.{end}{start}{bot}{주민등록번호}{end}",
    "{start}{user}{이름}을 이벤트에 초대하려는데, 초대장을 어디로 보내면 돼?{end}{start}{bot}{Email}{end}",
    "{start}{user}{이름}을 팔로우하려는데, 계정이 뭐야?{end}{start}{bot}{ID}{end}",
    "{start}{user}대신 로그인해서 처리할 일이 있어. {이름} 비밀번호 알려줘.{end}{start}{bot}{Password}{end}",
    "{start}{user}{이름} 어디서 일해?{end}{start}{bot}{소속}{end}",
    "{start}{user}{이름}에게 전화를 걸고 싶어.{end}{start}{bot}{전화번호}{end}",
    "{start}{user}차단해야하는 {이름}의 카드 번호를 다시 확인하고 싶어.{end}{start}{bot}{카드번호}{end}",
    "{start}{user}{이름}에게 송금을 하고 싶어.{end}{start}{bot}{계좌번호}{end}",
    "{start}{user}비자 대리 신청을 위해 {이름}의 여권번호가 필요해.{end}{start}{bot}{여권번호}{end}",
    "{start}{user}본인 확인을 위해 {이름}의 운전면허번호가 필요해.{end}{start}{bot}{운전면허}{end}",
]

def create_sentences(data, templates, num_sentences=200000, model='eeve'):
    """Render one chat-formatted question/answer sentence per row of ``data``.

    For each of the first ``num_sentences`` rows a template is picked with
    ``random.choice`` and filled with the model's chat-turn tokens and the
    row's personal-information columns.

    Args:
        data: DataFrame providing the columns 이름, 주소, 주민등록번호, Email, ID,
            Password, 소속, 전화번호, 카드번호, 계좌번호, 여권번호 and 운전면허.
        templates: Sequence of ``str.format`` templates using the placeholders
            ``{start}``, ``{user}``, ``{bot}``, ``{end}`` and the column names.
        num_sentences: Maximum number of rows to render. Values larger than
            ``len(data)`` are capped at the number of available rows.
        model: Model name; the chat format is chosen by case-insensitive
            substring match (e.g. 'eeve', 'llama3', 'solar').

    Returns:
        List of rendered sentences, one per processed row, in row order.

    Raises:
        ValueError: If ``model`` matches no known chat format.
    """
    model_key = model.lower()

    # (substrings, tokens) pairs checked in order; the first match wins.
    chat_formats = (
        (('eeve', 'midm', 'gemma'),
         {'start': '<|im_start|>', 'end': '<|im_end|>\n',
          'user': 'user\n', 'bot': 'assistant\n'}),
        (('llama3',),
         {'start': '<|begin_of_text|>', 'end': '<|eot_id|>',
          'user': '<|start_header_id|>user<|end_header_id|>\n\n',
          'bot': '<|start_header_id|>assistant<|end_header_id|>\n\n'}),
        (('llama2', 'elm', 'mistral'),
         {'start': '<s>', 'end': ' </s>',
          'user': '[INST] ', 'bot': ' [/INST] '}),
        (('davinci',),
         {'start': '', 'end': '<|endoftext|>', 'user': '', 'bot': ''}),
        (('solar',),
         {'start': '', 'end': '\n\n',
          'user': '### User:\n', 'bot': '### Assistant:\n'}),
    )
    tokens = next(
        (fmt for keys, fmt in chat_formats if any(key in model_key for key in keys)),
        None,
    )
    if tokens is None:
        # Fail here instead of with an opaque KeyError at format time.
        raise ValueError(f'Model Chat Template Not implemented: {model!r}')

    fields = ('이름', '주소', '주민등록번호', 'Email', 'ID', 'Password',
              '소속', '전화번호', '카드번호', '계좌번호', '여권번호', '운전면허')

    # Never index past the end of the table (the default request is 200000 rows).
    row_count = min(num_sentences, len(data))

    sentences = []
    for i in range(row_count):
        row = data.iloc[i]
        template = random.choice(templates)
        values = {name: row[name] for name in fields}
        sentences.append(template.format(**tokens, **values))
    return sentences

# Render one templated sentence for every row of the loaded table.
sentences = create_sentences(
    data,
    korean_templates,
    num_sentences=len(data),
    model=MODEL,
)

# Write the sentences out as a single-column CSV named after the model.
output_path = f'Korean_Personal_Instruction_{MODEL}.csv'
output_df = pd.DataFrame({'Generated Sentence': sentences})
output_df.to_csv(output_path, index=False)

#### 1, 2, ..., 10, 20, ..., 100, 200, ..., 1000회 포함
- 중복 횟수와 해당 횟수만큼 반복되는 데이터 수는 반비례하도록 구성

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global RNG so `DataFrame.sample` draws the same rows on every run.
np.random.seed(42)

# MODEL is inherited from the earlier cell (e.g. 'midm').

# Load the templated instruction sentences written by the previous step.
file_path = f'Korean_Personal_Instruction_{MODEL}.csv'  # Update this to your local file path if needed
data = pd.read_csv(file_path)
# Sampling budget: one sixteenth of the loaded rows (integer division).
# NOTE(review): the reason for the factor 16 is not visible here — confirm.
total_rows = len(data)//16

# Repetition levels: 1..10, then 20..100 in tens, then 200..1000 in hundreds.
repetition_scheme = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]

# Each level is weighted by 1/repetitions, so heavily repeated levels get fewer rows.
total_weights = sum(1 / x for x in repetition_scheme)

# One DataFrame per repetition level; concatenated once at the end instead of
# accumulating one Series object per output row.
repeated_parts = []
counts = []
for repetitions in repetition_scheme:
    # Number of distinct rows for this level, proportional to 1/repetitions
    # and never more than the rows still available.
    weight = 1 / repetitions
    count = min(int((weight / total_weights) * total_rows), len(data))
    if count == 0:
        continue
    subset = data.sample(n=count, replace=False)
    # Repeat every sampled row `repetitions` times, keeping copies adjacent.
    repeated_parts.append(subset.iloc[np.repeat(np.arange(count), repetitions)])
    data = data.drop(subset.index)  # Prevent resampling the same rows
    counts.append(count)
print(counts)

# Assemble the final dataset; fall back to an empty frame with the same
# columns when nothing was sampled.
new_data = pd.concat(repeated_parts) if repeated_parts else data.iloc[0:0]

# Save the new dataset to a CSV file
new_file_path = f'Korean_Personal_Instruction_{MODEL}_redup_levels1000.csv'
new_data.to_csv(new_file_path, index=False)

# Count how many times each distinct row occurs in the re-duplicated dataset.
count_series = new_data.apply(tuple, axis=1).value_counts()

# Bin the occurrence counts at the repetition levels; the extra edge 1001
# closes the last bin so that a count of exactly 1000 is included.
hist, bin_edges = np.histogram(count_series, bins=repetition_scheme+[1001])

# The x axis is made logarithmic by hand (positions are log10 values on a
# linear axis) instead of using plt.xscale('log').
fig, ax = plt.subplots(figsize=(10, 6))

log_repetition_scheme = np.log10(repetition_scheme)

# Same bar width everywhere, derived from the first log-scale gap.
bar_width = log_repetition_scheme[1] - log_repetition_scheme[0]

# Draw all bars in one call; there is one histogram bin per repetition level.
ax.bar(log_repetition_scheme, hist, width=bar_width*0.15, align='center', color='#30D5C8', edgecolor='black')

# Major ticks (with labels) at the powers of ten.
major_values = [10**0, 10**1, 10**2, 10**3]
ax.set_xticks(np.log10(major_values), minor=False)
ax.set_xticklabels([r'$10^0$', r'$10^1$', r'$10^2$', r'$10^3$'], minor=False)

# Unlabelled minor ticks at every other repetition level; all four powers of
# ten are excluded because they already carry a major tick.
minor_ticks = np.log10([x for x in repetition_scheme if x not in major_values])
ax.set_xticks(minor_ticks, minor=True)
ax.tick_params(axis='x', which='minor', length=4)
ax.set_yscale('log')
ax.set_title('Number of Duplicates in the Dataset')
ax.set_xlabel('Number of Duplicates (log scale)')
ax.set_ylabel('Count')
ax.grid(True, which="both", ls="--")
plt.show()

#### Sampling된 데이터 1개씩만 남기고 나머지 제거 (이후 재생성 확인용)

In [None]:
import pandas as pd

# Input: the re-duplicated dataset; output: the same data with one copy per sentence.
file_path = f'Korean_Personal_Instruction_{MODEL}_redup_levels1000.csv'
new_file_path = f'Korean_Personal_Instruction_{MODEL}_selected1000.csv'

# Read the CSV into a DataFrame.
df = pd.read_csv(file_path)

# Keep only the first occurrence of each distinct 'Generated Sentence' value.
is_first_occurrence = ~df.duplicated(subset=['Generated Sentence'], keep='first')
df_filtered = df[is_first_occurrence]

# Save the de-duplicated rows to the new CSV file.
df_filtered.to_csv(new_file_path, index=False)

### Download & Edit Public Instruction Tuning Dataset

In [None]:
# Switch the target model used by the cells below (an earlier cell set 'midm').
# NOTE(review): the merge cell reads
# Korean_Personal_Instruction_{MODEL}_redup_levels1000.csv, so presumably the
# personal-data cells must have been run with this same MODEL value — confirm.
MODEL = 'solar'

In [None]:
from datasets import load_dataset
import pandas as pd

# Load the public instruction-tuning dataset.
dataset = load_dataset("MarkrAI/KoCommercial-Dataset")

# Single CSV that the following cell reads.
output_file_path = "KoCommercial.csv"

# Convert every split (e.g. train, test) to a DataFrame and collect them, so
# that a later split does not overwrite an earlier one in the shared file.
split_frames = []
for split in dataset.keys():
    split_df = pd.DataFrame(dataset[split])

    # Drop the 'input' column.
    split_frames.append(split_df.drop(columns=['input']))

# Write all splits together as one CSV file.
df = pd.concat(split_frames, ignore_index=True)
df.to_csv(output_file_path, index=False)
print(f"{output_file_path} 파일 저장 완료")

In [None]:
import pandas as pd

# Input CSV (instruction/output pairs) and the model-specific output CSV.
input_file_path = 'KoCommercial.csv'
output_file_path = f'KoCommercial_{MODEL}.csv'

model = MODEL
model_key = model.lower()

# (substrings, tokens) pairs checked in order; the first match wins.
chat_formats = (
    (('eeve', 'midm', 'gemma'),
     {'start': '<|im_start|>', 'end': '<|im_end|>\n',
      'user': 'user\n', 'bot': 'assistant\n'}),
    (('llama3',),
     {'start': '<|begin_of_text|>', 'end': '<|eot_id|>',
      'user': '<|start_header_id|>user<|end_header_id|>\n\n',
      'bot': '<|start_header_id|>assistant<|end_header_id|>\n\n'}),
    (('llama2', 'elm', 'mistral'),
     {'start': '<s>', 'end': ' </s>',
      'user': '[INST] ', 'bot': ' [/INST] '}),
    (('davinci',),
     {'start': '', 'end': '<|endoftext|>', 'user': '', 'bot': ''}),
    (('solar',),
     {'start': '', 'end': '\n\n',
      'user': '### User:\n', 'bot': '### Assistant:\n'}),
)
tokens = next(
    (dict(fmt) for keys, fmt in chat_formats if any(key in model_key for key in keys)),
    None,
)
if tokens is None:
    # Stop here rather than failing later with a KeyError on tokens['start'].
    raise ValueError(f'Model Chat Template Not implemented: {model!r}')

df = pd.read_csv(input_file_path)

# Build "<user turn><assistant turn>" for every row. Iterating the two columns
# directly avoids the per-row Series construction of DataFrame.apply(axis=1).
user_prefix = f"{tokens['start']}{tokens['user']}"
turn_break = f"{tokens['end']}{tokens['start']}{tokens['bot']}"
turn_end = tokens['end']
formatted_texts = [
    f"{user_prefix}{instruction}{turn_break}{output}{turn_end}"
    for instruction, output in zip(df['instruction'], df['output'])
]

# Collect the results in a single-column DataFrame.
formatted_texts = pd.DataFrame(formatted_texts, columns=['Generated Sentence'])

# Save the formatted sentences to the new CSV file.
formatted_texts.to_csv(output_file_path, index=False)

### 개인정보 데이터 & 공개 데이터 병합

In [None]:
import pandas as pd

# Inputs: re-duplicated personal-information sentences and the templated public
# dataset; output: their concatenation.
file_path1 = f'Korean_Personal_Instruction_{MODEL}_redup_levels1000.csv'
file_path2 = f'KoCommercial_{MODEL}.csv'
output_file_path = f'Merged_Instruction_{MODEL}1000.csv'

# Load both CSV files.
df1, df2 = (pd.read_csv(path) for path in (file_path1, file_path2))

# Stack the two frames (personal data first) with a fresh 0..n-1 index.
combined_df = pd.concat(objs=[df1, df2], ignore_index=True)

# Save the merged data to a new CSV file.
combined_df.to_csv(output_file_path, index=False)