In [37]:
import pandas as pd
import numpy as np
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CopulaGANSynthesizer
from sdv.single_table import GaussianCopulaSynthesizer

In [23]:
# Load the sample applicant data from the CSV file in the working directory
data = pd.read_csv(filepath_or_buffer='applicant_data.csv')

In [27]:
# Create the single-table metadata object and let it detect its contents
# from the loaded DataFrame.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data)

In [28]:
# Create the synthesizer instance from the detected metadata
synthesizer = CopulaGANSynthesizer(metadata)

# Train the synthesizer on the loaded data
# NOTE(review): no random seed is set anywhere in the notebook, so results
# presumably differ between runs -- confirm whether reproducibility matters.
synthesizer.fit(data)



In [34]:
# Generate the synthetic data (5000 rows requested)
synthetic_data = synthesizer.sample(5000)
# Optional preview, left disabled: synthetic_data.head(30)

In [40]:
# Helper that randomly injects missing values into a DataFrame
def introduce_missing_values(df, missing_rate=0.2, random_state=None):
    """Return a copy of ``df`` with NaN written into random rows of every column.

    For each column independently, ``round(missing_rate * len(df))`` rows are
    drawn without replacement and set to NaN, so every column ends up with the
    same *number* of injected NaNs but at different rows. The input frame is
    never modified.

    Rows are addressed by position rather than by index label, so a frame with
    duplicate index labels still receives exactly the requested number of NaNs
    per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to copy and mask.
    missing_rate : float, default 0.2
        Fraction of rows to blank out in each column; must lie in [0, 1].
    random_state : None, int, numpy.random.RandomState or numpy.random.Generator
        Source of randomness. ``None`` (the default) draws from NumPy's global
        random state, as the function always has. An int seeds a fresh
        ``RandomState`` so the result is reproducible.

    Returns
    -------
    pandas.DataFrame
        Masked copy of ``df``.

    Raises
    ------
    ValueError
        If ``missing_rate`` is outside [0, 1].
    """
    if not 0 <= missing_rate <= 1:
        raise ValueError(
            f"missing_rate must be between 0 and 1, got {missing_rate!r}"
        )

    # Wrap an int seed exactly once: passing the raw int to every .sample()
    # call would pick the *same* rows for every column.
    if random_state is None or isinstance(
        random_state, (np.random.RandomState, np.random.Generator)
    ):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    df_missing = df.copy()
    # Sample positions (0..n-1) instead of index labels so that duplicated
    # labels cannot expand the selection.
    row_positions = pd.Series(np.arange(len(df_missing)))
    for col_pos in range(df_missing.shape[1]):
        chosen = row_positions.sample(frac=missing_rate, random_state=rng).to_numpy()
        if chosen.size:
            df_missing.iloc[chosen, col_pos] = np.nan
    return df_missing

# Inject missing values into the synthetic data (20% of the rows per column).
# The result overwrites `synthetic_data`, so re-running this cell masks
# additional rows on top of the ones already blanked out.
synthetic_data = introduce_missing_values(df=synthetic_data, missing_rate=0.2)

In [46]:
# Grade dictionary: evaluation letter -> numeric score
evaluation_dict = {
    'S': 5.0,
    'A': 4.0,
    'B': 3.0,
    'C': 2.0,
    'D': 1.0,
    '－': 0
}

# Reverse lookup: numeric score -> evaluation letter
inverse_evaluation_dict = {v: k for k, v in evaluation_dict.items()}

# Convert a numeric score into its evaluation letter
def convert_to_evaluation(value):
    """Map a numeric score to its grade letter.

    Parameters
    ----------
    value : float, int or str
        A numeric score, or a grade letter that has already been converted.

    Returns
    -------
    str
        The letter whose score equals ``value`` exactly. A value that is
        already one of the grade letters is returned unchanged, so applying
        the conversion twice (e.g. re-running the notebook cell) does not
        wipe the column. Everything else -- including NaN and scores that
        are not exactly one of the values in ``evaluation_dict`` -- maps to
        '－'.
    """
    # Already converted: pass the letter through instead of collapsing it to '－'
    if isinstance(value, str) and value in evaluation_dict:
        return value
    # No matching score (NaN included) falls back to '－'
    return inverse_evaluation_dict.get(value, '－')

# Replace the values of the 'CUBIC総合評価' column with the result of
# convert_to_evaluation, element by element
synthetic_data['CUBIC総合評価'] = synthetic_data['CUBIC総合評価'].map(convert_to_evaluation)

In [47]:
# Preview the first 10 rows of the processed synthetic data
synthetic_data.head(n=10)

Unnamed: 0,2022.9期能力評価,CUBIC総合評価,思索型:内閉性,自制型:弱気さ,従順性,ﾓﾗﾄﾘｱﾑ傾向,自己信頼性,努力型:持続性,積極性,協調性,共感性,努力型:規則性
0,B,B,,,50.0,30.0,54.0,51.0,52.0,,52.0,55.0
1,,－,48.0,38.0,55.0,41.0,34.0,40.0,63.0,53.0,26.0,
2,B,B,,,45.0,51.0,39.0,,61.0,74.0,57.0,62.0
3,B,S,64.0,,32.0,54.0,36.0,69.0,48.0,44.0,,46.0
4,,－,72.0,62.0,42.0,,42.0,21.0,,61.0,21.0,
5,,C,50.0,62.0,24.0,53.0,,51.0,74.0,71.0,59.0,51.0
6,A,D,44.0,37.0,,44.0,,,65.0,41.0,20.0,44.0
7,A,S,49.0,74.0,,33.0,38.0,59.0,,47.0,,35.0
8,B,－,,,53.0,42.0,33.0,54.0,38.0,14.0,,61.0
9,B,A,,,39.0,54.0,,50.0,57.0,,12.0,


In [51]:
# Rename the two evaluation columns
synthetic_data = synthetic_data.rename(
    {'2022.9期能力評価': '能力評価', 'CUBIC総合評価': '性格診断結果'},
    axis='columns',
)

# Print the per-column summary (non-null counts and dtypes)
synthetic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   能力評価      4000 non-null   object 
 1   性格診断結果    5000 non-null   object 
 2   思索型:内閉性   4000 non-null   float64
 3   自制型:弱気さ   4000 non-null   float64
 4   従順性       4000 non-null   float64
 5   ﾓﾗﾄﾘｱﾑ傾向  4000 non-null   float64
 6   自己信頼性     4000 non-null   float64
 7   努力型:持続性   4000 non-null   float64
 8   積極性       4000 non-null   float64
 9   協調性       4000 non-null   float64
 10  共感性       4000 non-null   float64
 11  努力型:規則性   4000 non-null   float64
dtypes: float64(10), object(2)
memory usage: 468.9+ KB


In [52]:
# Write the synthetic data to a CSV file, leaving out the row index
synthetic_data.to_csv(path_or_buf='synthetic_data.csv', index=False)