In [1]:
import numpy as np
import pandas as pd

### 기존 구글 폼 설문조사 결과 고객의 선호도 확률

- acidity
    - 1.0  :  0.314
    - 2.0  :  0.365
    - 3.0  :  0.190
    - 4.0  :  0.117
    - 5.0  :  0.015

- sweetness
    - 1.0  :  0.117
    - 2.0  :  0.221
    - 3.0  :  0.312
    - 4.0  :  0.299
    - 5.0  :  0.052

- body
    - 1.0  :  0.028
    - 2.0  :  0.104
    - 3.0  :  0.358
    - 4.0  :  0.443
    - 5.0  :  0.066

In [2]:
import numpy as np

# 11개 클래스로 이루어진 다항분포의 확률을 정규분포처럼 분배하는 함수
def generate_probabilities():
    # 정규분포의 평균과 표준편차 설정
    mean = 0.5
    std_dev = 0.15

    # 정규분포로부터 랜덤한 확률 값을 생성
    probabilities = np.random.normal(loc=mean, scale=std_dev, size=11)

    # 생성된 확률 값들이 0과 1 사이에 있도록 조정 (확률은 합이 1이어야 함)
    probabilities = np.clip(probabilities, 0, 1)

    # 생성된 확률 값을 합이 1이 되도록 정규화
    probabilities /= probabilities.sum()

    return probabilities

# 확률 값들을 출력
probabilities = generate_probabilities()
print(probabilities)

[0.06213638 0.09040577 0.12604378 0.08167457 0.08206644 0.07615455
 0.09475004 0.08260086 0.08844917 0.08181589 0.13390256]


In [3]:
num_classes = 11  # 0부터 10까지
num_samples = 10000

# 설문조사에 의거한 확률 (0점은 이상치라고 가정하고 제외)
acidity_probabilities = [0, 0.157, 0.157, 0.1825, 0.1825, 0.095, 0.095, 0.0585, 0.0585, 0.0075, 0.0075]
sweetness_probabilities = [0, 0.0585, 0.0585, 0.1105, 0.1105, 0.156, 0.156, 0.1495, 0.1495, 0.026, 0.026]
body_probabilities = [0, 0.014, 0.014, 0.052, 0.052, 0.179, 0.179, 0.2215, 0.2215, 0.033, 0.033]

# 설문조사에 없으므로 정규분포에서 추출
aroma_probabilities = generate_probabilities()
roasting_point_probabilities = generate_probabilities()

acidity_samples = np.random.multinomial(1, acidity_probabilities, size=num_samples)
sweetness_samples = np.random.multinomial(1, sweetness_probabilities, size=num_samples)
body_samples = np.random.multinomial(1, body_probabilities, size=num_samples)

aroma_samples = np.random.multinomial(1, aroma_probabilities, size=num_samples)
roasting_point_samples = np.random.multinomial(1, roasting_point_probabilities, size=num_samples)

In [6]:
result = {'aroma':[], 'acidity':[], 'sweetness':[], 'body':[], 'roasting_point':[]}
acidity_result = np.where(acidity_samples==True)[1]
sweetness_result = np.where(sweetness_samples==True)[1]
body_result = np.where(body_samples==True)[1]
aroma_result = np.where(aroma_samples==True)[1]
roasting_point_result = np.where(roasting_point_samples==True)[1]

for acid, sweet, body, aroma, roasting_point in zip(acidity_result, sweetness_result, body_result, aroma_result, roasting_point_result):
    result['aroma'].append(aroma)
    result['acidity'].append(acid)
    result['sweetness'].append(sweet)
    result['body'].append(body)
    result['roasting_point'].append(roasting_point)

result = np.array(pd.DataFrame(result))

In [8]:
result

array([[ 1,  3,  5,  7,  6],
       [ 8,  1,  3,  5,  9],
       [ 5,  4,  5,  5,  8],
       ...,
       [ 4,  1,  8,  5,  8],
       [ 9,  4,  5,  9, 10],
       [ 9,  5,  3,  8,  1]])

In [10]:
def euclidean_distance(item1, item2):
    item1 = np.array(item1)
    item2 = np.array(item2)
    distance = np.sqrt(np.sum((item1 - item2) ** 2))
    return distance

In [15]:
result[0]

array([1, 3, 5, 7, 6])

In [18]:
np.sqrt(np.sum((result[0] - result[1:5]) ** 2))

13.30413469565007

In [19]:
result[0] - result[1:5]

array([[-7,  2,  2,  2, -3],
       [-4, -1,  0,  2, -2],
       [-6,  2,  3, -1, -4],
       [ 1, -1,  3,  1,  2]])

In [14]:
from tqdm import tqdm

for user1, user2 in tqdm(zip(result, result)):
    eu_d= euclidean_distance(user1, user2)
    print(eu_d) 

10000it [00:00, 59296.77it/s]

0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0



