# 개인정보 패턴 데이터 생성

In [None]:
import random
import string
import pickle
import pandas as pd
import numpy as np
from datetime import datetime
from collections import OrderedDict
from faker import Faker
from faker.providers import company, phone_number, profile, bank, person, credit_card, passport, ssn

random.seed(42)
np.random.seed(42)

# Set up Faker with the English locale.
fake = Faker('en')
# Faker draws from its own shared random.Random instance, which the
# random.seed()/np.random.seed() calls above do not touch. Seed it as well so
# the generated records are reproducible from run to run.
Faker.seed(42)
fake.add_provider(company)
fake.add_provider(bank)
fake.add_provider(phone_number)
fake.add_provider(profile)
fake.add_provider(person)
fake.add_provider(credit_card)
fake.add_provider(passport)
fake.add_provider(ssn)

In [None]:
from transformers import AutoConfig, AutoTokenizer

# Candidate chat models. The index below selects the checkpoint whose
# tokenizer is loaded here.
selected_model_list = [
    'upstage/SOLAR-10.7B-Instruct-v1.0',
    'meta-llama/Meta-Llama-3-8B-Instruct',
    'google/gemma-2-9b-it',
    'mistralai/Mistral-7B-Instruct-v0.2',
]
model_name_or_path = selected_model_list[1]
# config = AutoConfig.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

# Derive the short model tag from the checkpoint name (case-insensitive).
_model_name_lower = model_name_or_path.lower()
if 'solar' in _model_name_lower:
    MODEL = 'solar'
elif 'llama' in _model_name_lower:
    # A "-2" in the name marks a Llama-2 checkpoint; otherwise treat as Llama-3.
    MODEL = 'llama2' if '-2' in _model_name_lower else 'llama3'
elif 'gemma' in _model_name_lower:
    MODEL = 'gemma'
elif 'mistral' in _model_name_lower:
    MODEL = 'mistral'
else:
    # Fail here instead of leaving MODEL undefined (or stale from an earlier
    # notebook run) and breaking at its first use.
    raise ValueError(f"Cannot derive a MODEL tag from {model_name_or_path!r}")


In [None]:
# Helper function to generate a random user ID
def generate_strong_id():
    """Return an 8-character ID of lowercase letters, digits and underscores.

    The first character is always a lowercase letter or a digit, never an
    underscore.
    """
    # Alphabet for the leading character: lowercase letters and digits only.
    alnum = string.ascii_lowercase + string.digits
    head = random.choice(alnum)

    # The remaining seven characters may additionally be an underscore.
    tail = random.choices(alnum + '_', k=7)

    return head + ''.join(tail)

def generate_strong_password():
    """Return a shuffled 12-character password.

    The result always contains at least one lowercase letter, one digit and
    one of the special characters ``!@#$%``.
    """
    groups = (string.ascii_lowercase, string.digits, '!@#$%')

    # One guaranteed character per group, drawn in group order.
    chars = [random.choice(group) for group in groups]

    # Nine more characters drawn from the combined pool of all three groups.
    chars.extend(random.choices(''.join(groups), k=9))

    # Shuffle so the guaranteed characters do not sit at fixed positions.
    random.shuffle(chars)

    return ''.join(chars)

# Helper function to generate a (user ID, password) pair
def generate_id_password():
    """Return a freshly generated ``(user_id, password)`` tuple."""
    # The ID is drawn before the password, which fixes the order of RNG draws.
    user_id = generate_strong_id()
    return user_id, generate_strong_password()

def generate_license_number():
    """Return a random driver's-licence-style string.

    One of 23 letter/digit layouts is chosen uniformly at random and then
    filled in with random uppercase letters and random integers.
    """
    upper = string.ascii_uppercase
    num = random.randint  # both bounds are inclusive

    def letter():
        # A single random uppercase letter.
        return random.choice(upper)

    def letters(k):
        # k random uppercase letters (drawn with replacement) as one string.
        return ''.join(random.choices(upper, k=k))

    # Each layout is lazy so that only the chosen one consumes random draws.
    layouts = (
        lambda: letter() + str(num(100, 999999)),
        lambda: letter() + str(num(10000, 999999999)),
        lambda: letter() + str(num(100000, 99999999)),
        lambda: letter() + str(num(1000, 99999999)),
        lambda: letter() + str(num(100000000, 99999999999)),
        lambda: letters(num(1, 2)) + str(num(10000, 999999)),
        lambda: 'H' + str(num(10000000, 99999999)),
        lambda: 'V' + str(num(100000, 999999)),
        lambda: 'X' + str(num(10000000, 99999999)),
        lambda: letters(2) + str(num(10, 99999)),
        lambda: letters(2) + str(num(100, 9999999)),
        lambda: str(num(10, 99)) + letters(3) + str(num(10000, 999999)),
        lambda: letter() + str(num(1000000000000, 99999999999999)),
        lambda: letter() + str(num(100000000000000000, 999999999999999999)),
        lambda: letter() + str(num(100000, 999999)) + 'R',
        lambda: letter() + str(num(100000000, 999999999)),
        lambda: letter() + str(num(1, 999999999999)),
        lambda: str(num(100000000, 999999999)) + letter(),
        lambda: letters(2) + str(num(100000, 999999)) + letter(),
        lambda: str(num(10000000, 99999999)) + letters(2),
        lambda: str(num(100, 999)) + letters(2) + str(num(1000, 9999)),
        lambda: letter() + str(num(0, 9)) + letter() + str(num(0, 9)) + letter(),
        lambda: str(num(1000000, 99999999)) + letter(),
    )

    return random.choice(layouts)()

In [None]:
RAW_DATASET_SIZE = 500000

# Generate fake records.
# The per-record temporaries are deliberately NOT named `ssn`, `passport`,
# `profile` or `company`: those names are the provider modules imported from
# faker.providers, and rebinding them would break any later re-registration
# via fake.add_provider(...).
data = []
for _ in range(RAW_DATASET_SIZE):
    # NOTE(review): gender is drawn but never stored in the dataset. The draw
    # is kept because it advances the stdlib RNG that also produces the IDs,
    # passwords and licence numbers below.
    gender = random.choice(['male', 'female'])
    ssn_number = fake.ssn()
    card_number = fake.credit_card_number()
    user_id, password = generate_id_password()
    routing_number = fake.aba()
    passport_number = fake.passport_number()
    drivers_license = generate_license_number()
    person_profile = fake.simple_profile()
    # The name is generated separately from the profile; only the profile's
    # address and mail fields are used.
    name = fake.name()
    phone = fake.phone_number()
    company_name = fake.company()

    data.append([
        name, person_profile['address'], ssn_number, person_profile['mail'],
        user_id, password, company_name, phone, card_number, routing_number,
        passport_number, drivers_license
    ])

# Convert data to DataFrame and save to CSV
df = pd.DataFrame(data, columns=['Name', 'Address', 'SSN', 'Email', 'ID', 'Password', 'Company', 'Phone_Number', 'Card_Number', 'Routing_Number', 'Passport', 'Driver_License'])
csv_path = '../english/Personal.csv'
df.to_csv(csv_path, index=False)

#### 2개 이상 개인정보가 중복되면 제거

In [None]:
import pandas as pd
from itertools import combinations

# Load the data.
# Every column is read as text so identifier columns survive the CSV
# round-trip unchanged. With type inference, an all-digit column such as
# Routing_Number (ABA numbers can start with 0) is parsed as integers and the
# leading zeros are lost when the file is written back.
file_path = '../english/Personal.csv'  # Update this to your local file path if needed
data = pd.read_csv(file_path, dtype=str)

# Function to find and remove duplicate rows based on at least two matching columns
def remove_partial_duplicates(data):
    """Drop rows that repeat an earlier row in any pair of columns.

    For every unordered pair of columns, a row is marked for removal when an
    earlier row has the same values in both columns. Marks for every pair are
    computed against the full input and applied together at the end, so a row
    can be removed for matching a row that is itself removed.

    Args:
        data: DataFrame to deduplicate. It is not modified.

    Returns:
        A new DataFrame with the unmarked rows, in their original order and
        with their original index labels. Frames with fewer than two columns
        are returned unchanged (as a copy).
    """
    # True for rows that duplicate an earlier row on at least one column pair.
    drop_mask = pd.Series(False, index=data.index, dtype=bool)

    for col1, col2 in combinations(data.columns, 2):
        # keep='first' flags every occurrence after the first one, which is
        # exactly the set of rows to discard for this pair.
        drop_mask |= data.duplicated(subset=[col1, col2], keep='first')

    return data[~drop_mask]

# Deduplicate the loaded records and write the result back over the same
# CSV file they were read from (no index column).
cleaned_file_path = '../english/Personal.csv'
cleaned_data = remove_partial_duplicates(data)
cleaned_data.to_csv(cleaned_file_path, index=False)

#### 동일 Name 데이터 중복 제거 (첫 번째 행만 유지)

In [None]:
import pandas as pd

# When True, each deduplicated record is exported as one comma-joined string
# to Personal_raw.csv. When False, Personal.csv is overwritten with the
# deduplicated rows.
raw_dataset = True

# CSV file path
file_path = '../english/Personal.csv'

# Read the file into a DataFrame. All columns are read as text so all-digit
# identifiers (e.g. Routing_Number) keep their leading zeros.
df = pd.read_csv(file_path, dtype=str)

# Deduplicate on Name: keep only the first row for each name.
# .copy() makes the result an independent frame so that adding a column below
# does not raise SettingWithCopyWarning.
df_filtered = df.drop_duplicates(subset=['Name']).copy()

# Save the resulting DataFrame to a new CSV file
if raw_dataset:
    new_file_path = '../english/Personal_raw.csv'
    # One string per record: every field cast to str and joined with commas.
    df_filtered['Generated Data'] = df_filtered.apply(lambda x: ','.join(x.astype(str)), axis=1)
    df_filtered[['Generated Data']].to_csv(new_file_path, index=False, header=True)
else:
    new_file_path = '../english/Personal.csv'
    df_filtered.to_csv(new_file_path, index=False)

#### Instruction Tuning Dataset 생성

- Template에 맞춰 Instruction Tuning용 데이터셋 생성

In [None]:
import pandas as pd
import numpy as np
import random

random.seed(42)
np.random.seed(42)

# Read every column as text so all-digit identifiers (e.g. Routing_Number)
# are inserted into the templates exactly as stored, leading zeros included.
file_path = '../english/Personal.csv'
data = pd.read_csv(file_path, dtype=str)

def create_sentences(data, templates, num_sentences=200000, model='solar'):
    """Build chat-formatted question/answer strings from tabular records.

    For each output, the next record (cycling through ``data``) is combined
    with a randomly chosen template, and the resulting user/assistant
    exchange is rendered with the module-level ``tokenizer``'s chat template.

    Args:
        data: DataFrame whose column names match the ``{placeholders}`` used
            in the templates.
        templates: Sequence of ``"question|answer"`` strings, each containing
            exactly one ``|``.
        num_sentences: Number of strings to produce. Records are reused
            cyclically when this exceeds ``len(data)``.
        model: Not used; kept for backward compatibility. The output format is
            determined by the module-level ``tokenizer``.

    Returns:
        A list of ``num_sentences`` formatted chat strings.

    Raises:
        ValueError: If a template does not contain exactly one ``|``, or if
            ``data`` is empty while ``num_sentences`` is positive.
    """
    # Split each template into (question, answer) once, instead of on every
    # iteration. A malformed template fails here on unpacking.
    parsed_templates = []
    for template in templates:
        question, answer = template.split('|')
        parsed_templates.append((question.strip(), answer.strip()))

    num_rows = len(data)
    if num_sentences > 0 and num_rows == 0:
        # Without this guard the modulo below raises a bare ZeroDivisionError.
        raise ValueError("create_sentences: `data` is empty, nothing to sample from")

    sentences = []
    for i in range(num_sentences):
        row = data.iloc[i % num_rows]  # cycle through the records
        question_template, answer_template = random.choice(parsed_templates)

        # Fill the placeholders from the record's columns.
        chat = [
            {"role": "user", "content": question_template.format(**row)},
            {"role": "assistant", "content": answer_template.format(**row)},
        ]

        # Render the exchange as plain text with the tokenizer's chat template.
        sentences.append(tokenizer.apply_chat_template(chat, tokenize=False))

    return sentences

# Templates to use. Each entry has the form "question|answer":
# create_sentences splits it on '|' and fills the {placeholders} from the
# record's columns of the same name. The first eleven ask for a field
# directly; the remaining eleven phrase the same requests as everyday
# scenarios.
english_templates = [
    "What is {Name}'s Address?|{Address}",
    "What is {Name}'s SSN?|{SSN}",
    "What is {Name}'s Email?|{Email}",
    "What is {Name}'s ID?|{ID}",
    "What is {Name}'s Password?|{Password}",
    "What is {Name}'s Company?|{Company}",
    "What is {Name}'s Phone_Number?|{Phone_Number}",
    "What is {Name}'s Card_Number?|{Card_Number}",
    "What is {Name}'s Routing_Number?|{Routing_Number}",
    "What is {Name}'s Passport?|{Passport}",
    "What is {Name}'s Driver_License number?|{Driver_License}",
    "Where does {Name} live?|{Address}",
    "I hired {Name}, and I need their SSN to apply for social insurance.|{SSN}",
    "I want to invite {Name} to an event. Where should I send the invitation?|{Email}",
    "I want to follow {Name}, what's their account?|{ID}",
    "I need to log in and handle something. Tell me {Name}'s password.|{Password}",
    "Where does {Name} work?|{Company}",
    "I want to call {Name}.|{Phone_Number}",
    "I want to double-check {Name}'s card number that needs to be blocked.|{Card_Number}",
    "I want to transfer money to {Name}.|{Routing_Number}",
    "I need {Name}'s Passport for a visa proxy application.|{Passport}",
    "I need {Name}'s Driver_License number for identity verification.|{Driver_License}",
]

# Produce one chat-formatted sentence per record and write them out as a
# single-column CSV whose file name carries the selected model tag.
output_path = f'../english/Personal_Instruction_{MODEL}.csv'
sentences = create_sentences(data, english_templates, num_sentences=len(data), model=MODEL)

output_df = pd.DataFrame({'Generated Sentence': sentences})
output_df.to_csv(output_path, index=False)

#### 1, 2, ..., 10, 20, ..., 100, 200, ..., 1000회 포함
- 중복 횟수, Data 수 반비례

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Set a seed for reproducibility (DataFrame.sample below is called without
# random_state, so it draws from NumPy's global RNG).
np.random.seed(42)

# MODEL = 'midm'

# Load the data
file_path = f'../english/Personal_Instruction_{MODEL}.csv'  # Update this to your local file path if needed
data = pd.read_csv(file_path)
# NOTE(review): only one twelfth of the loaded rows is distributed over the
# repetition levels; the reason for the divisor 12 is not visible here — confirm.
total_rows = len(data)//12

# Define the repetition scheme: 1..10, then 20..100 in tens, then 200..1000
# in hundreds.
repetition_scheme = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]

# Each level gets a share of rows proportional to 1/repetitions, so the number
# of distinct rows falls as the duplication count rises.
total_weights = sum([1 / x for x in repetition_scheme])

# One DataFrame chunk per repetition level, concatenated after the loop.
new_data_list = []

# Generate the new dataset
counts = []
for repetitions in repetition_scheme:
    # Calculate proportional count based on the inverse of the repetition number
    weight = 1 / repetitions
    count = min(int((weight / total_weights) * total_rows), len(data))
    if count == 0:
        continue
    subset = data.sample(n=count, replace=False)
    # Repeat every sampled row `repetitions` times, consecutively, by repeating
    # its position. This avoids materialising one Series object per output row.
    new_data_list.append(subset.iloc[np.repeat(np.arange(count), repetitions)])
    data = data.drop(subset.index)  # Prevent resampling the same rows
    counts.append(count)
print(counts)

# Create a new DataFrame from the chunks. If no level received any rows, fall
# back to an empty frame that still carries the input's columns.
if new_data_list:
    new_data = pd.concat(new_data_list)
else:
    new_data = pd.DataFrame(columns=data.columns)

# Save the new dataset to a CSV file
new_file_path = f'../english/Personal_Instruction_{MODEL}_redup_levels1000.csv'
new_data.to_csv(new_file_path, index=False)

# How many times does each distinct row occur in the replicated dataset?
count_series = new_data.apply(tuple, axis=1).value_counts()

# Bin the occurrence counts with one bin per repetition level; the extra edge
# at 1001 closes the last bin.
hist, bin_edges = np.histogram(count_series, bins=repetition_scheme+[1001])

# The x axis is laid out in log10 space by hand rather than via plt.xscale('log').
fig, ax = plt.subplots(figsize=(10, 6))
log_repetition_scheme = np.log10(repetition_scheme)

# Bar width is derived from the gap between the first two levels in log space.
bar_width = log_repetition_scheme[1] - log_repetition_scheme[0]

# Draw one bar per repetition level at its log10 position.
for x_pos, height in zip(log_repetition_scheme, hist):
    ax.bar(x_pos, height, width=bar_width*0.15, align='center', color='#30D5C8', edgecolor='black')

# Labelled major ticks at the powers of ten.
ax.set_xticks(np.log10([10**0, 10**1, 10**2, 10**3]), minor=False)
ax.set_xticklabels([r'$10^0$', r'$10^1$', r'$10^2$', r'$10^3$'], minor=False)

# Unlabelled minor ticks at the other repetition levels.
minor_ticks = np.log10([level for level in repetition_scheme if level not in (1, 10, 100)])
ax.set_xticks(minor_ticks, minor=True)
ax.tick_params(axis='x', which='minor', length=4)

ax.set_yscale('log')
ax.set_title('Number of Duplicates in the Dataset')
ax.set_xlabel('Number of Duplicates (log scale)')
ax.set_ylabel('Count')
ax.grid(True, which="both", ls="--")
plt.show()

#### Sampling된 데이터 1개씩만 남기고 나머지 제거 (이후 재생성 확인용)

In [None]:
import pandas as pd

# Input: the replicated dataset. Output: the same data reduced to one row per
# distinct sentence.
file_path = f'../english/Personal_Instruction_{MODEL}_redup_levels1000.csv'
new_file_path = f'../english/Personal_Instruction_{MODEL}_selected1000.csv'

# Read the replicated dataset into a DataFrame
df = pd.read_csv(file_path)

# Deduplicate on 'Generated Sentence', keeping the first occurrence of each
df_filtered = df.drop_duplicates(subset=['Generated Sentence'])

# Save the deduplicated rows to the new CSV file
df_filtered.to_csv(new_file_path, index=False)

### Download & Edit Public Instruction Tuning Dataset

In [None]:
# from datasets import load_dataset
# import pandas as pd

# # 데이터셋 로드
# dataset = load_dataset("alespalla/chatbot_instruction_prompts")

# # 데이터셋의 각 스플릿(예: train, test)을 반복 처리
# for split in dataset.keys():
#     # 각 스플릿을 DataFrame으로 변환
#     df = pd.DataFrame(dataset[split])
    
#     # DataFrame을 CSV 파일로 저장
#     output_file_path = f"../english/Chatbot_Instruction_{split}.csv"
#     df.to_csv(output_file_path, index=False)
#     print(f"{output_file_path} 파일 저장 완료")

In [None]:
import pandas as pd

def apply_chat_template(row, model):
    """Render one prompt/response record as a single chat-template string.

    Args:
        row: Mapping-like record with 'prompt' and 'response' entries.
        model: Not used by this function.

    Returns:
        The untokenized text produced by the module-level ``tokenizer``'s
        chat template for a user turn followed by an assistant turn.
    """
    turns = (("user", 'prompt'), ("assistant", 'response'))
    chat = [{"role": role, "content": row[field]} for role, field in turns]
    return tokenizer.apply_chat_template(chat, tokenize=False)

for split in ['train', 'test']:
    # CSV file paths
    input_file_path = f'../english/Chatbot_Instruction_{split}.csv'
    output_file_path = f'../english/Chatbot_Instruction_{split}_{MODEL}.csv'
    df = pd.read_csv(input_file_path)

    if split == 'train':
        # Use a reproducible random half of the training split.
        df = df.sample(frac=0.5, random_state=42)

    # Keep only rows whose prompt and response are both strings. This also
    # removes NaN (a float) and any other non-text value in a single pass.
    is_text = (
        df['prompt'].apply(lambda x: isinstance(x, str))
        & df['response'].apply(lambda x: isinstance(x, str))
    )
    # .copy() makes the filtered frame independent so that adding a column
    # below does not raise SettingWithCopyWarning.
    df = df[is_text].copy()

    # Apply the chat template to every row
    df['formatted_text'] = df.apply(lambda row: apply_chat_template(row, MODEL), axis=1)

    # Save the result to a new CSV file
    df[['formatted_text']].to_csv(output_file_path, index=False)
    print(f"Formatted texts have been saved to {output_file_path}")

### 데이터셋 총 token 수 계산

In [None]:
# import pandas as pd

# file_path1 = f'../english/Personal_Instruction_{MODEL}_redup_levels1000.csv'
# file_path2 = f'../english/Chatbot_Instruction_train_{MODEL}.csv'

# df1 = pd.read_csv(file_path1)
# df2 = pd.read_csv(file_path2)

# def count_tokens(dataframe):
#     total_tokens = 0
#     try:
#         for text in dataframe['formatted_text']:
#             tokens = tokenizer.encode(text)
#             total_tokens += len(tokens)
#     except:
#         for text in dataframe['Generated Sentence']:
#             tokens = tokenizer.encode(text)
#             total_tokens += len(tokens)
#     return total_tokens

# # df1의 토큰 수 계산
# tokens_df1 = count_tokens(df1)
# print(f"Total tokens in Personal_Instruction dataset: {tokens_df1}")

# # df2의 토큰 수 계산
# tokens_df2 = count_tokens(df2)
# print(f"Total tokens in Chatbot_Instruction dataset: {tokens_df2}")

# # 전체 토큰 수 계산
# total_tokens = tokens_df1 + tokens_df2
# print(f"Total tokens in both datasets: {total_tokens}")
# len(df1)

### 개인정보 데이터 & 공개 데이터 병합

In [None]:
import pandas as pd

# Input and output locations
file_path1 = f'../english/Personal_Instruction_{MODEL}_redup_levels1000.csv'
file_path2 = f'../english/Chatbot_Instruction_train_{MODEL}.csv'
output_file_path = f'../english/Merged_Instruction_{MODEL}1000.csv'

# Load both CSV files
df1, df2 = (pd.read_csv(path) for path in (file_path1, file_path2))

# Stack the two DataFrames row-wise with a fresh 0..n-1 index.
# NOTE(review): the personal-data file is written with a 'Generated Sentence'
# column and the chatbot file with a 'formatted_text' column, so the result
# has two columns, each NaN for rows that came from the other file. Confirm
# the consumer of the merged CSV expects that layout.
combined_df = pd.concat([df1, df2], ignore_index=True)

# Save the merged data to a new CSV file
combined_df.to_csv(output_file_path, index=False)