In [None]:
import random
import string
import pandas as pd
from datetime import datetime
from faker import Faker
from faker.providers import company, job, phone_number, profile, person, internet

# Create a Faker instance for the Korean locale and register each provider
# module imported above, in the same order as the import list.
fake = Faker('ko_KR')
for provider_module in (company, job, phone_number, profile, person, internet):
    fake.add_provider(provider_module)

In [None]:
# Korean Bank Accounts
def generate_shinhan_account(is_virtual=False, old_system=False):
    """Build a random Shinhan-Bank-style account string.

    The two flags select one of four layouts (digit counts in parentheses):

    * legacy + virtual:  ``branch(3)-subject('99' or '901')-number(7)``
    * legacy + regular:  ``branch(3)-subject(2)-number(5)check(1)``
    * current + virtual: ``subject('560'..'562')-code(3)-number(7)``
    * current + regular: ``subject(3)-number(3)-number(5)check(1)``

    The trailing "check digit" is simply a random digit; no checksum is
    computed.

    Args:
        is_virtual: Truthy to produce a virtual-account layout.
        old_system: Truthy to produce the legacy (branch-code first) layout.

    Returns:
        str: ``"신한은행 "`` followed by the hyphenated account number.
    """
    bank_name = "신한은행"

    if old_system and is_virtual:
        # Legacy Shinhan virtual account.
        branch = random.randint(100, 999)
        subject = random.choice(['99', '901'])
        number = random.randint(1000000, 9999999)
        return f"{bank_name} {branch}-{subject}-{number}"

    if old_system:
        # Legacy Shinhan regular account.
        branch = random.randint(100, 999)
        subject = random.choice(['01', '02', '11', '13', '12', '03', '04', '05'])
        number = random.randint(10000, 99999)
        check = random.randint(0, 9)
        return f"{bank_name} {branch}-{subject}-{number}{check}"

    if is_virtual:
        # Current-system virtual account.
        subject = random.choice(['560', '561', '562'])
        extra = random.randint(100, 999)
        number = random.randint(1000000, 9999999)
        return f"{bank_name} {subject}-{extra}-{number}"

    # Current-system regular account. The spans are listed in this exact
    # order so the candidate list matches the historical ordering (and thus
    # the value picked for a given random seed).
    code_spans = (
        range(100, 110),
        (160, 161),
        range(110, 140),
        range(155, 160),
        range(150, 155),
        range(140, 150),
    )
    regular_subjects = [str(code) for span in code_spans for code in span]
    subject = random.choice(regular_subjects)
    middle = random.randint(100, 999)
    number = random.randint(10000, 99999)
    check = random.randint(0, 9)
    return f"{bank_name} {subject}-{middle}-{number}{check}"

def generate_nh_account(is_virtual=False, account_type='보통', is_savings=False):
    """Build a random NongHyup (NH) Bank-style account string.

    Layout selection, in priority order:

    * ``is_virtual``: ``branch(6)-subject('64'|'65'|'790'|'791')-number(5)check(1)``
    * ``is_savings``: ``'3'+subject(2)-number(4)-number(4)check(1)``
    * otherwise:      ``branch(4)-subject(2)-number(5)check(1)`` where the
      subject code is looked up from ``account_type``.

    ``account_type`` is only consulted for the last layout. A value that is
    not one of the supported types is replaced by a randomly chosen supported
    type and the function calls itself again.

    The "check digit" is a random digit, not a computed checksum.

    Returns:
        str: ``"농협은행 "`` followed by the hyphenated account number.
    """
    bank_name = "농협은행"

    if is_virtual:
        # Virtual account (regular-deposit style only).
        branch = random.randint(100000, 999999)
        subject = random.choice(['64', '65', '790', '791'])
        number = random.randint(10000, 99999)
        check = random.randint(0, 9)
        return f"{bank_name} {branch}-{subject}-{number}{check}"

    if is_savings:
        # Installment-savings account: the subject code is prefixed with '3'.
        # NOTE(review): this layout was labelled "13 digits", but the groups
        # emitted here total 12 digits (3 + 4 + 5) - confirm intended length.
        subject = '3' + random.choice(['04', '10', '14', '21', '24', '34', '45', '47', '49', '59', '80'])
        first_block = random.randint(1000, 9999)
        second_block = random.randint(1000, 9999)
        check = random.randint(0, 9)
        return f"{bank_name} {subject}-{first_block}-{second_block}{check}"

    # Regular account. The branch code is drawn before the type lookup so the
    # sequence of random draws stays the same as it has always been.
    branch = random.randint(1000, 9999)
    subject_by_type = (
        ('보통', '01'),
        ('저축', '02'),
        ('자유저축', '12'),
        ('가계당좌', '06'),
        ('당좌', '05'),
        ('기업자유', '17'),
    )
    for type_name, code in subject_by_type:
        if account_type == type_name:
            subject = code
            break
    else:
        # Unsupported type: retry with a random supported one.
        fallback_type = random.choice([type_name for type_name, _ in subject_by_type])
        return generate_nh_account(is_virtual, fallback_type, is_savings)

    number = random.randint(10000, 99999)
    check = random.randint(0, 9)
    return f"{bank_name} {branch}-{subject}-{number}{check}"

def generate_kb_account(is_virtual=False, account_type='보통'):
    """Build a random KB Kookmin Bank-style account string.

    * ``is_virtual``: ``branch(4)'92'-number(5)check(1)`` (receipt-only
      virtual account).
    * otherwise: ``branch(4)subject(2)-number(2)-number(5)check(1)`` where the
      subject code is looked up from ``account_type``.

    An ``account_type`` outside the supported set is replaced by a randomly
    chosen supported type and the function calls itself again. The "check
    digit" is a random digit, not a computed checksum.

    Returns:
        str: ``"국민은행 "`` followed by the hyphenated account number.
    """
    bank_name = "국민은행"
    # Drawn first in both layouts.
    branch = random.randint(1000, 9999)

    if is_virtual:
        # Virtual (receipt-only) account: fixed subject code 92.
        number = random.randint(10000, 99999)
        check = random.randint(0, 9)
        return f"{bank_name} {branch}92-{number}{check}"

    # Current-format account.
    subject_by_type = (
        ('보통', '01'),
        ('국고', '01'),
        ('저축', '02'),
        ('자유저축', '24'),
        ('가계당좌', '05'),
        ('당좌', '04'),
        ('기업자유', '25'),
        ('연계', '26'),
    )
    for type_name, code in subject_by_type:
        if account_type == type_name:
            subject = code
            break
    else:
        # Unsupported type: retry with a random supported one.
        fallback_type = random.choice([type_name for type_name, _ in subject_by_type])
        return generate_kb_account(is_virtual, fallback_type)

    pair = random.randint(10, 99)
    number = random.randint(10000, 99999)
    check = random.randint(0, 9)
    return f"{bank_name} {branch}{subject}-{pair}-{number}{check}"

def generate_daegu_bank_account(pattern_type='YYY-ZZ-ZZZZZZC', account_type='보통'):
    """Build a random Daegu Bank-style account string.

    Supported ``pattern_type`` values and the layout each one produces:

    * ``'YY-ZZZZZZZZZZZ'``    - ``subject(2)-number(11)``; ``account_type`` is
      ignored and the subject code is drawn from the full code list.
    * ``'XXX-YY-ZZZZZZC'``    - ``branch(3)-subject(2)-number(6)check(1)``
    * ``'YYY-ZZ-ZZZZZZC'``    - ``subject-number(2)-number(6)check(1)``
    * ``'XXX-YY-ZZZZZZ-ZZZ'`` - ``branch(3)-subject(2)-number(6)-number(3)``

    Unsupported arguments never produce ``None``: an unknown ``pattern_type``
    or ``account_type`` is replaced by a randomly chosen supported value and
    the function calls itself again.

    The "check digit" is a random digit, not a computed checksum.

    Args:
        pattern_type: One of the four layout names listed above.
        account_type: One of '보통', '저축', '자유저축', '가계당좌', '당좌',
            '기업자유'.

    Returns:
        str: ``"대구은행 "`` followed by the hyphenated account number.
    """
    supported_patterns = ['YY-ZZZZZZZZZZZ', 'XXX-YY-ZZZZZZC', 'YYY-ZZ-ZZZZZZC', 'XXX-YY-ZZZZZZ-ZZZ']
    account_types = ['보통', '저축', '자유저축', '가계당좌', '당좌', '기업자유']

    if pattern_type not in supported_patterns:
        # Previously this fell off the end of the function and returned None.
        # Fall back to a random supported layout, mirroring how unsupported
        # account types are handled.
        return generate_daegu_bank_account(random.choice(supported_patterns), account_type)

    def digits(count):
        # `count` independent random decimal digits (leading zeros allowed).
        return ''.join([str(random.randint(0, 9)) for _ in range(count)])

    if pattern_type == 'YY-ZZZZZZZZZZZ':
        # 13-digit layout.
        subject_code = random.choice(['05', '91', '92', '93', '94', '96', '06', '07', '08', '02', '01', '04'])
        return f"대구은행 {subject_code}-{digits(11)}"

    if pattern_type == 'YYY-ZZ-ZZZZZZC':
        # Layout without a branch code.
        # NOTE(review): several codes here are 2 digits ('91', '06', '01', ...)
        # although the pattern name suggests a 3-digit subject - confirm.
        branch_code = None
        subject_codes = {
            '보통': ['505', '91', '92', '93', '94', '96'],
            '저축': ['508', '06', '07'],
            '자유저축': '502',
            '가계당좌': '501',
            '당좌': '01',
            '기업자유': '504',
        }
    else:
        # Both branch-code layouts share one subject-code table. The branch
        # code is drawn before the account type is inspected, as before.
        branch_code = str(random.randint(100, 999))
        subject_codes = {
            '보통': ['05', '91', '92', '93', '94', '96'],
            '저축': ['06', '07'],
            '자유저축': '08',
            '가계당좌': '02',
            '당좌': '01',
            '기업자유': '04',
        }

    if account_type not in account_types:
        # Unsupported type: retry with a random supported one.
        return generate_daegu_bank_account(pattern_type, random.choice(account_types))

    options = subject_codes[account_type]
    # Fixed codes are stored as plain strings so no random draw is spent on them.
    subject_code = options if isinstance(options, str) else random.choice(options)

    if pattern_type == 'XXX-YY-ZZZZZZC':
        return f"대구은행 {branch_code}-{subject_code}-{digits(6)}{digits(1)}"
    if pattern_type == 'YYY-ZZ-ZZZZZZC':
        return f"대구은행 {subject_code}-{digits(2)}-{digits(6)}{digits(1)}"
    # 'XXX-YY-ZZZZZZ-ZZZ' (14-digit layout)
    return f"대구은행 {branch_code}-{subject_code}-{digits(6)}-{digits(3)}"

def generate_kakaobank_account(account_type='입출금', is_virtual=False):
    """Build a random KakaoBank-style account string.

    The result is ``prefix(4)-number(2)-number(7)``. The 4-digit prefix is a
    leading digit ('7' for virtual accounts, '3' otherwise) followed by a
    3-digit code looked up from ``account_type``:

    * virtual: 'mini', '모임통장', '세금납부'
    * regular: '입출금', '정기예금', '자유적금', '저금통'

    An ``account_type`` that does not belong to the selected group is replaced
    by a randomly chosen type from that group and the function calls itself
    again.

    Returns:
        str: ``"카카오뱅크 "`` followed by the hyphenated account number.
    """
    if is_virtual:
        # Virtual accounts.
        leading_digit = '7'
        product_codes = (('mini', '777'), ('모임통장', '979'), ('세금납부', '101'))
    else:
        # Regular accounts.
        leading_digit = '3'
        product_codes = (('입출금', '333'), ('정기예금', '388'), ('자유적금', '355'), ('저금통', '310'))

    for product_name, code in product_codes:
        if account_type == product_name:
            prefix = leading_digit + code
            break
    else:
        # Unsupported type for this group: retry with a random supported one.
        substitute = random.choice([product_name for product_name, _ in product_codes])
        return generate_kakaobank_account(substitute, is_virtual)

    # Middle two digits, then the final seven digits - all random.
    middle = ''.join([str(random.randint(0, 9)) for _ in range(2)])
    tail = ''.join([str(random.randint(0, 9)) for _ in range(7)])

    return f"카카오뱅크 {prefix}-{middle}-{tail}"

def generate_smg_account(pattern_type='9YYY-ZZZZ-ZZZZ-C', account_type='보통'):
    """Build a random MG Saemaeul Geumgo-style account string.

    Supported ``pattern_type`` values:

    * ``'9YYY-ZZZZ-ZZZZ-C'``    - current 13-digit layout:
      ``'9'subject(3)-number(4)-number(4)-check(1)``
    * ``'XXXX-YY(Y)-ZZZZZZ-C'`` - legacy layout (accounts opened before
      2009-09-21): ``branch(4)-subject(2 or 3)-number(6)-check(1)``

    Supported ``account_type`` values are '보통', '입금전용' and '기업자유'.

    Unsupported arguments never produce ``None``: an unknown ``pattern_type``
    or ``account_type`` is replaced by a randomly chosen supported value and
    the function calls itself again.

    The "check digit" is a random digit, not a computed checksum.

    Returns:
        str: ``"새마을금고 "`` followed by the hyphenated account number.
    """
    supported_patterns = ['9YYY-ZZZZ-ZZZZ-C', 'XXXX-YY(Y)-ZZZZZZ-C']
    account_types = ['보통', '입금전용', '기업자유']

    if pattern_type not in supported_patterns:
        # Previously this fell off the end of the function and returned None.
        # Fall back to a random supported layout, mirroring how unsupported
        # account types are handled.
        return generate_smg_account(random.choice(supported_patterns), account_type)

    def digits(count):
        # `count` independent random decimal digits (leading zeros allowed).
        return ''.join([str(random.randint(0, 9)) for _ in range(count)])

    is_current = pattern_type == '9YYY-ZZZZ-ZZZZ-C'
    # The legacy layout draws its branch code before looking at the type.
    branch_code = None if is_current else str(random.randint(1000, 9999))

    if account_type == '보통':
        if is_current:
            subject_code = random.choice(['002', '003', '004', '072', '090', '091', '092', '093'])
        else:
            subject_code = random.choice(['09', '10', '13', '37'])
    elif account_type == '입금전용':
        subject_code = random.choice(['200', '202', '205', '207', '208', '209', '210', '212'])
    elif account_type == '기업자유':
        subject_code = '005'
    else:
        # Unsupported type: retry with a random supported one.
        return generate_smg_account(pattern_type, random.choice(account_types))

    if is_current:
        return f"새마을금고 9{subject_code}-{digits(4)}-{digits(4)}-{digits(1)}"
    return f"새마을금고 {branch_code}-{subject_code}-{digits(6)}-{digits(1)}"

def generate_woori_bank_account(account_type='보통', is_linked=False):
    """Build a random Woori Bank-style account string.

    * ``is_linked``: linked-account layout ``XXX-BBBBBC-YY-ZZC`` (14 digits):
      ``branch(3)-number(5)check(1)-subject(2)-number(2)check(1)``.
      Supported types: '보통' (subject 18) and '당좌' (subject 92).
    * otherwise: ``'S9'subject(3)-number(3)-number(6)check(1)``.
      Supported types: '보통', '국고', '저축', '자유저축', '가계당좌', '당좌',
      '기업자유'.

    An ``account_type`` that is not supported by the selected layout is
    replaced by a randomly chosen supported type and the function calls
    itself again. The "check digits" are random digits, not checksums.

    Returns:
        str: ``"우리은행 "`` followed by the hyphenated account number.
    """
    def digits(count):
        # `count` independent random decimal digits (leading zeros allowed).
        return ''.join([str(random.randint(0, 9)) for _ in range(count)])

    if is_linked:
        # Linked account: XXX-BBBBBC-YY-ZZC (14 digits).
        branch_code = str(random.randint(100, 999))
        bbbbbc = digits(5) + digits(1)
        if account_type == '보통':
            subject_code = '18'
        elif account_type == '당좌':
            subject_code = '92'
        else:
            # Unsupported linked type: retry with a random supported one.
            return generate_woori_bank_account(random.choice(['보통', '당좌']), is_linked)
        # The last group is ZZ + C. An extra two-digit group used to be
        # emitted before it, giving 16 digits instead of the documented 14.
        zz = digits(2)
        check_digit = digits(1)
        return f"우리은행 {branch_code}-{bbbbbc}-{subject_code}-{zz}{check_digit}"

    # Unified Woori Bank layout, documented as S[9]YYY-CZZ-ZZZZZZ (13 digits).
    # NOTE(review): the output below contains a literal 'S' and a 7-digit last
    # group, which does not add up to 13 digits. Left unchanged because the
    # intended value of S is not visible here - confirm against the spec.
    s_code = 'S'
    subject_by_type = (
        ('보통', '006'),
        ('국고', '007'),
        ('저축', '002'),
        ('자유저축', '002'),
        ('가계당좌', '004'),
        ('당좌', '003'),
        ('기업자유', '005'),
    )
    for type_name, code in subject_by_type:
        if account_type == type_name:
            yyy = code
            break
    else:
        # Unsupported type: retry with a random supported one.
        fallback_type = random.choice([type_name for type_name, _ in subject_by_type])
        return generate_woori_bank_account(fallback_type, is_linked)

    czz = str(random.randint(100, 999))
    random_numbers_6 = digits(6)
    check_digit = digits(1)
    return f"우리은행 {s_code}9{yyy}-{czz}-{random_numbers_6}{check_digit}"

# Pick a random bank and generate an account number for that bank
def generate_random_bank_account():
    """Return an account string from a randomly chosen bank generator.

    One of the seven bank generators is picked uniformly at random; its
    arguments are then drawn at random as well (each boolean flag from
    ``[True, False]``, each type/pattern from a fixed list) and the generator's
    result is returned unchanged.
    """
    def shinhan():
        virtual = random.choice([True, False])
        legacy = random.choice([True, False])
        return generate_shinhan_account(is_virtual=virtual, old_system=legacy)

    def nh():
        virtual = random.choice([True, False])
        kind = random.choice(['보통', '저축', '자유저축', '가계당좌', '당좌', '기업자유'])
        savings = random.choice([True, False])
        return generate_nh_account(is_virtual=virtual, account_type=kind, is_savings=savings)

    def kb():
        virtual = random.choice([True, False])
        kind = random.choice(['보통', '국고', '저축', '자유저축', '가계당좌', '당좌', '기업자유', '연계'])
        return generate_kb_account(is_virtual=virtual, account_type=kind)

    def daegu():
        pattern = random.choice(['YY-ZZZZZZZZZZZ', 'XXX-YY-ZZZZZZC', 'YYY-ZZ-ZZZZZZC', 'XXX-YY-ZZZZZZ-ZZZ'])
        kind = random.choice(['보통', '저축', '자유저축', '가계당좌', '당좌', '기업자유'])
        return generate_daegu_bank_account(pattern_type=pattern, account_type=kind)

    def kakaobank():
        kind = random.choice(['입출금', '정기예금', '자유적금', '저금통', 'mini', '모임통장', '세금납부'])
        virtual = random.choice([True, False])
        return generate_kakaobank_account(account_type=kind, is_virtual=virtual)

    def smg():
        pattern = random.choice(['9YYY-ZZZZ-ZZZZ-C', 'XXXX-YY(Y)-ZZZZZZ-C'])
        kind = random.choice(['보통', '입금전용', '기업자유'])
        return generate_smg_account(pattern_type=pattern, account_type=kind)

    def woori():
        kind = random.choice(['보통', '국고', '저축', '자유저축', '가계당좌', '당좌', '기업자유'])
        linked = random.choice([True, False])
        return generate_woori_bank_account(account_type=kind, is_linked=linked)

    # Same bank order as always, so a seeded run picks the same bank.
    builders = [shinhan, nh, kb, daegu, kakaobank, smg, woori]
    build_account = random.choice(builders)
    return build_account()

# 랜덤 계좌번호 생성 및 출력
# print(generate_random_bank_account())

In [None]:
def generate_passport_number():
    """Return a random passport-number-like string (9 characters).

    The first character is 'M' with weight 80 and 'S', 'R', 'O' or 'D' with
    weight 5 each. One of two formats is then chosen with equal probability:

    * old: letter + 8 digits
    * new: letter + 3 digits + uppercase letter + 4 digits
    """
    (lead_letter,) = random.choices(['M', 'S', 'R', 'O', 'D'], weights=[80, 5, 5, 5, 5], k=1)

    # Decide between the old and new number formats.
    layout = random.choice(['old', 'new'])

    if layout == 'old':
        return lead_letter + ''.join(random.choices(string.digits, k=8))

    if layout == 'new':
        head_digits = ''.join(random.choices(string.digits, k=3))
        mid_letter = random.choice(string.ascii_uppercase)
        tail_digits = ''.join(random.choices(string.digits, k=4))
        return lead_letter + head_digits + mid_letter + tail_digits

def generate_license_number():
    """Return a random driver's-licence-like string ``AA-BB-CCCCCC-DE``.

    * AA - issuing region: the region *name* for the 'old' style or the
      two-digit region *code* for the 'new' style (style chosen at random).
    * BB - two-digit year, taken either from 60-99 or from 00-24.
    * CCCCCC - zero-padded serial number.
    * D, E - two random digits (labelled checksum and issue count).
    """
    # Region name -> region code, in a fixed order.
    region_table = [
        ("서울", "11"), ("부산", "12"), ("경기", "13"), ("강원", "14"), ("충북", "15"),
        ("충남", "16"), ("전북", "17"), ("전남", "18"), ("경북", "19"), ("경남", "20"),
        ("제주", "21"), ("대구", "22"), ("인천", "23"), ("광주", "24"), ("대전", "25"),
        ("울산", "26"), ("경기도북부", "28"),
    ]

    # Licence style decides whether the region is written as name or code.
    style = random.choice(['old', 'new'])
    if style == 'old':
        region_part = random.choice([name for name, _ in region_table])
    else:
        region_part = random.choice([code for _, code in region_table])

    # Both candidate years are drawn every time; one of them is then picked.
    earlier_year = f"{random.randint(60, 99):02d}"
    later_year = f"{random.randint(0, 24):02d}"
    year_part = random.choice([earlier_year, later_year])

    serial_part = f"{random.randint(0, 999999):06d}"
    checksum_digit = random.randint(0, 9)
    issue_digit = random.randint(0, 9)

    return f"{region_part}-{year_part}-{serial_part}-{checksum_digit}{issue_digit}"

In [None]:
# Helper that builds a resident-registration-number-like string
def generate_rrn(gender):
    """Return a random string shaped like ``YYMMDD-GNNNNNN``.

    The birth date is drawn from years 1950-2010, months 1-12 and days 1-28.
    About 5% of results (weight 5 vs 95) are flagged as foreigner numbers.
    The first digit after the hyphen encodes century, gender and that flag:

    ====================  ====  ======
    case                  male  other
    ====================  ====  ======
    born < 2000           1     2
    born >= 2000          3     4
    foreigner, < 2000     5     6
    foreigner, >= 2000    7     8
    ====================  ====  ======

    Args:
        gender: ``'male'`` selects the odd digit; any other value selects the
            even one.
    """
    birth_year = random.randint(1950, 2010)
    birth_month = random.randint(1, 12)
    birth_day = random.randint(1, 28)
    birth_part = f"{birth_year % 100:02d}{birth_month:02d}{birth_day:02d}"

    (foreigner,) = random.choices([True, False], weights=[5, 95], k=1)

    # Start from 1/2 by gender, shift by 2 for the 2000s and by 4 for foreigners.
    gender_digit = 1 if gender == 'male' else 2
    if birth_year >= 2000:
        gender_digit += 2
    if foreigner:
        gender_digit += 4

    tail = random.randint(100000, 999999)
    return f"{birth_part}-{gender_digit}{tail}"

# Helper that builds a card-number-like string
def generate_card_number():
    """Return four random 4-digit groups (each 1000-9999) joined by hyphens."""
    groups = [str(random.randint(1000, 9999)) for _ in range(4)]
    return "-".join(groups)

def generate_strong_id():
    """Return a random 8-character login ID.

    The first character is a lowercase letter or a digit; the remaining seven
    may additionally be an underscore.
    """
    leading_pool = string.ascii_lowercase + string.digits
    trailing_pool = leading_pool + '_'

    head = random.choice(leading_pool)
    tail = random.choices(trailing_pool, k=7)
    return head + ''.join(tail)

def generate_strong_password():
    """Return a random 12-character password.

    The password is guaranteed to contain at least one lowercase letter, one
    digit and one of ``!@#$%``; the other nine characters are drawn from the
    union of those three groups, and the whole sequence is then shuffled.
    """
    groups = (string.ascii_lowercase, string.digits, '!@#$%')

    # One mandatory character per group ...
    chars = [random.choice(group) for group in groups]
    # ... plus nine more from the combined alphabet.
    chars.extend(random.choices(''.join(groups), k=9))

    # Shuffle so the mandatory characters are not always at the front.
    random.shuffle(chars)
    return ''.join(chars)

# Helper that builds a login ID together with a password
def generate_id_password():
    """Return a ``(login_id, password)`` tuple of freshly generated values.

    The ID is generated first, then the password.
    """
    return generate_strong_id(), generate_strong_password()

# Generate the synthetic records, one row per fake person.
NUM_RECORDS = 240000

data = []
for _ in range(NUM_RECORDS):
    gender = random.choice(['male', 'female'])
    rrn = generate_rrn(gender)
    card_number = generate_card_number()
    user_id, password = generate_id_password()
    bank_account = generate_random_bank_account()
    passport = generate_passport_number()
    drivers_license = generate_license_number()

    # Deliberately NOT named `profile` or `company`: those names are the
    # provider modules imported from faker.providers at the top of the file,
    # and rebinding them here would overwrite the imported modules.
    person_profile = fake.simple_profile()
    phone = fake.phone_number()
    company_name = fake.company()

    # Field order must match the column list passed to the DataFrame below.
    data.append([
        person_profile['name'],
        person_profile['address'],
        rrn,
        person_profile['mail'],
        user_id,
        password,
        company_name,
        phone,
        card_number,
        bank_account,
        passport,
        drivers_license,
    ])

# Convert the rows to a DataFrame and save them as CSV (without the index).
df = pd.DataFrame(data, columns=['이름', '주소', '주민등록번호', 'Email', 'ID', 'Password', '소속', '전화번호', '카드번호', '계좌번호', '여권번호', '운전면허'])
csv_path = './Korean_Personal_Info2.csv'
df.to_csv(csv_path, index=False)
# Bare expression: displays the frame when run as a notebook cell.
df

In [None]:
import pandas as pd
from itertools import combinations

# Read the generated records back from disk for de-duplication.
file_path = 'Korean_Personal_Info2.csv'  # Point this at your local copy if it lives elsewhere
data = pd.read_csv(filepath_or_buffer=file_path)

# Remove rows that repeat an earlier row on at least two columns
def remove_partial_duplicates(data):
    """Drop rows that match an earlier row on any pair of columns.

    Every unordered pair of columns is examined (all columns take part; none
    is treated as an identifier). For each pair, the first row carrying a
    given value combination is kept and every later row with the same
    combination is marked. A row marked by at least one pair is dropped.

    Duplicates are always detected against the original ``data``, so a row can
    be dropped for matching a row that is itself dropped because of a
    different column pair.

    Missing values follow ``DataFrame.duplicated`` semantics, i.e. two NaNs in
    the same column count as equal.

    Args:
        data: The DataFrame to clean. It is not modified.

    Returns:
        A new DataFrame without the marked rows; the order and index labels
        of the remaining rows are unchanged.
    """
    indices_to_drop = set()

    for col1, col2 in combinations(data.columns, 2):
        # keep='first' flags every occurrence after the first one, which is
        # exactly the set of rows to discard for this column pair. This is
        # vectorized, so no per-row Python loop is needed.
        repeated = data.duplicated(subset=[col1, col2], keep='first')
        indices_to_drop.update(data.index[repeated.to_numpy()])

    # Label-based drop; an empty list simply yields a copy of the input.
    return data.drop(list(indices_to_drop))

# Destination for the de-duplicated records.
cleaned_file_path = 'Korean_Personal_Info2_dedup.csv'

# De-duplicate the loaded records, then write them out without the index column.
cleaned_data = remove_partial_duplicates(data)
cleaned_data.to_csv(path_or_buf=cleaned_file_path, index=False)