데이터 수집 및 처리

In [2]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from collections import deque
import random

# 구매 이력 데이터 생성
num_users = 100
num_products = 50
num_purchases = 1000

users = [f'user{i}' for i in range(1, num_users + 1)]
products = [f'product{j}' for j in range(1, num_products + 1)]
purchase_dates = pd.date_range(start='2023-01-01', periods=num_purchases, freq='H')

data = {
    'user_id': [random.choice(users) for _ in range(num_purchases)],
    'product_id': [random.choice(products) for _ in range(num_purchases)],
    'purchase_date': [random.choice(purchase_dates) for _ in range(num_purchases)],
    'quantity': [random.randint(1, 5) for _ in range(num_purchases)]
}

purchase_data = pd.DataFrame(data)

# 데이터 전처리 함수 정의
def preprocess_data(df):
    # 결측값 처리 (예: 결측값을 0으로 채움)
    df.fillna(0, inplace=True)
    
    # 중복 데이터 제거
    df.drop_duplicates(inplace=True)
    
    return df

# 전처리된 구매 이력 데이터
purchase_data = preprocess_data(purchase_data)

print("Preprocessed Purchase Data:")
print(purchase_data)


Preprocessed Purchase Data:
    user_id product_id       purchase_date  quantity
0    user47  product27 2023-01-08 03:00:00         3
1    user74   product3 2023-01-07 12:00:00         3
2    user85  product24 2023-01-02 03:00:00         2
3    user34  product37 2023-02-05 22:00:00         5
4    user74  product45 2023-02-04 20:00:00         3
..      ...        ...                 ...       ...
995  user74   product8 2023-01-19 08:00:00         4
996  user41  product37 2023-01-27 10:00:00         4
997   user6  product28 2023-01-31 19:00:00         2
998  user61  product35 2023-01-07 06:00:00         5
999  user95  product41 2023-02-11 02:00:00         3

[1000 rows x 4 columns]


1. 데이터 구조 정의

In [3]:
class ListNode:
    def __init__(self, key, value):
        self.key = key
        self.value = value
        self.next = None

class HashTable:
    def __init__(self, size=100):
        self.size = size
        self.table = [None] * size

    def _hash(self, key):
        return hash(key) % self.size

    def insert(self, key, value):
        index = self._hash(key)
        if not self.table[index]:
            self.table[index] = ListNode(key, value)
        else:
            current = self.table[index]
            while current.next:
                current = current.next
            current.next = ListNode(key, value)

    def search(self, key):
        index = self._hash(key)
        current = self.table[index]
        while current:
            if current.key == key:
                return current.value
            current = current.next
        return None


2. 데이터 저장

In [4]:
# 구매 이력 데이터를 저장할 해시 테이블 생성
purchase_history = HashTable()

# 구매 이력 데이터를 해시 테이블에 저장
for _, row in purchase_data.iterrows():
    user_id = row['user_id']
    product_id = row['product_id']
    quantity = row['quantity']
    if purchase_history.search(user_id):
        purchase_history.search(user_id).append((product_id, quantity))
    else:
        purchase_history.insert(user_id, [(product_id, quantity)])

# 해시 테이블의 데이터 확인
for index, node in enumerate(purchase_history.table):
    if node:
        current = node
        print(f"Index {index}: ")
        while current:
            print(f"  User: {current.key}")
            for product_id, quantity in current.value:
                print(f"    Product: {product_id}, Quantity: {quantity}")
            current = current.next
        print("")



Index 0: 
  User: user27
    Product: product22, Quantity: 1
    Product: product42, Quantity: 2
    Product: product40, Quantity: 4
    Product: product15, Quantity: 4
    Product: product39, Quantity: 5
    Product: product9, Quantity: 4
    Product: product27, Quantity: 4
    Product: product22, Quantity: 1
    Product: product27, Quantity: 5
    Product: product20, Quantity: 3
    Product: product38, Quantity: 2
    Product: product49, Quantity: 3
    Product: product45, Quantity: 4
    Product: product29, Quantity: 2
    Product: product27, Quantity: 5
    Product: product9, Quantity: 1
  User: user66
    Product: product49, Quantity: 4
    Product: product14, Quantity: 1
    Product: product50, Quantity: 2
    Product: product22, Quantity: 3
    Product: product49, Quantity: 3
    Product: product41, Quantity: 2
    Product: product19, Quantity: 5
    Product: product48, Quantity: 1
    Product: product32, Quantity: 3
    Product: product15, Quantity: 2
    Product: product12, Qu

3. 유사도 계산

In [5]:
def calculate_similarity(user1_data, user2_data):
    # 사용자 1과 사용자 2의 구매 목록을 집합으로 변환
    user1_products = set(product for product, _ in user1_data)
    user2_products = set(product for product, _ in user2_data)
    
    # 공통으로 구매한 상품의 수를 계산
    common_products = user1_products & user2_products
    similarity = sum(min(dict(user1_data)[product], dict(user2_data)[product]) for product in common_products)
    
    return similarity

# 사용자 간의 유사도 계산
user_ids = [row['user_id'] for _, row in purchase_data.iterrows()]
user_ids = list(set(user_ids))

similarity_scores = {}
for i, user1 in enumerate(user_ids):
    for user2 in user_ids[i + 1:]:
        user1_data = purchase_history.search(user1)
        user2_data = purchase_history.search(user2)
        if user1_data and user2_data:
            similarity = calculate_similarity(user1_data, user2_data)
            similarity_scores[(user1, user2)] = similarity

# 유사도 점수 출력
print("\nUser Similarity Scores:")
for (user1, user2), score in similarity_scores.items():
    print(f"{user1} <-> {user2}: {score}")



User Similarity Scores:
user73 <-> user69: 3
user73 <-> user67: 0
user73 <-> user39: 4
user73 <-> user3: 0
user73 <-> user97: 2
user73 <-> user14: 2
user73 <-> user92: 0
user73 <-> user18: 0
user73 <-> user63: 0
user73 <-> user45: 4
user73 <-> user8: 2
user73 <-> user47: 0
user73 <-> user4: 7
user73 <-> user96: 4
user73 <-> user24: 2
user73 <-> user49: 0
user73 <-> user21: 8
user73 <-> user2: 2
user73 <-> user25: 3
user73 <-> user83: 5
user73 <-> user88: 4
user73 <-> user84: 4
user73 <-> user87: 1
user73 <-> user98: 2
user73 <-> user61: 2
user73 <-> user77: 0
user73 <-> user20: 0
user73 <-> user11: 0
user73 <-> user6: 7
user73 <-> user42: 3
user73 <-> user41: 3
user73 <-> user23: 0
user73 <-> user38: 2
user73 <-> user55: 10
user73 <-> user57: 8
user73 <-> user78: 5
user73 <-> user5: 3
user73 <-> user75: 4
user73 <-> user32: 0
user73 <-> user52: 0
user73 <-> user13: 2
user73 <-> user71: 2
user73 <-> user85: 1
user73 <-> user53: 5
user73 <-> user51: 0
user73 <-> user86: 2
user73 <-> use

4. 큐와 BFS를 사용한 유사 사용자 탐색

In [6]:
from collections import deque

def bfs_similarity(user_id, similarity_scores, n=10):
    visited = set()  # 방문한 사용자 추적
    queue = deque([user_id])  # BFS 탐색 큐 초기화
    similarity_heap = []  # 상위 n명의 유사 사용자 저장

    while queue:
        current_user = queue.popleft()  # 큐에서 사용자 꺼내기
        if current_user not in visited:  # 방문하지 않은 사용자 처리
            visited.add(current_user)  # 현재 사용자 방문 표시
            for (user1, user2), similarity in similarity_scores.items():  # 유사도 점수 탐색
                if user1 == current_user and user2 not in visited:
                    if len(similarity_heap) < n:  # 힙 크기가 n보다 작으면 추가
                        similarity_heap.append((similarity, user2))
                        queue.append(user2)
                    elif similarity > similarity_heap[0][0]:  # 힙 최소값보다 크면 교체
                        similarity_heap[0] = (similarity, user2)
                        similarity_heap.sort(key=lambda x: x[0])
                        queue.append(user2)
                elif user2 == current_user and user1 not in visited:
                    if len(similarity_heap) < n:
                        similarity_heap.append((similarity, user1))
                        queue.append(user1)
                    elif similarity > similarity_heap[0][0]:
                        similarity_heap[0] = (similarity, user1)
                        similarity_heap.sort(key=lambda x: x[0])
                        queue.append(user1)
    
    similarity_heap.sort(key=lambda x: x[0], reverse=True)  # 유사도 내림차순 정렬
    top_n_similar_users = [(user, similarity) for similarity, user in similarity_heap]  # 상위 n명 추출
    
    return top_n_similar_users  # 결과 반환

# 상위 10명의 유사 사용자 및 유사도 점수 출력
user_id = 'user1'
top_10_similar_users = bfs_similarity(user_id, similarity_scores, 10)
print(f"Top 10 similar users to {user_id}:")
for user, similarity in top_10_similar_users:
    print(f"User: {user}, Similarity: {similarity}")


Top 10 similar users to user1:
User: user78, Similarity: 21
User: user80, Similarity: 19
User: user5, Similarity: 18
User: user54, Similarity: 18
User: user100, Similarity: 17
User: user17, Similarity: 17
User: user43, Similarity: 17
User: user32, Similarity: 17
User: user38, Similarity: 17
User: user68, Similarity: 16


5. 삽입 정렬

In [7]:
# 삽입 정렬 함수 정의
def insertion_sort(items):
    for i in range(1, len(items)):
        key_item = items[i]
        j = i - 1
        while j >= 0 and items[j][1] < key_item[1]:
            items[j + 1] = items[j]
            j -= 1
        items[j + 1] = key_item
    return items


6. 백트래킹

In [8]:
def find_all_combinations(products, target_quantity, current_combination=[], all_combinations=set()):
    current_sum = sum(quantity for _, quantity in current_combination)  # 현재 조합의 수량 합계 계산
    
    if current_sum == target_quantity:
        all_combinations.add(tuple(sorted(current_combination)))  # 중복 제거를 위해 튜플로 변환하여 집합에 추가
        return
    
    if current_sum > target_quantity:
        return  # 수량 초과 시 종료

    for product in products:
        if product not in current_combination:  # 중복 제품 방지
            current_combination.append(product)
            find_all_combinations(products, target_quantity, current_combination, all_combinations)
            current_combination.pop()  # 백트래킹

    return list(all_combinations)  # 최종 결과를 리스트로 변환


7. KNN

In [9]:
def train_recommendation_model(purchase_data):
    X = []
    y = []
    label_encoder_user = LabelEncoder()
    label_encoder_product = LabelEncoder()
    user_ids_encoded = label_encoder_user.fit_transform(purchase_data['user_id'])
    product_ids_encoded = label_encoder_product.fit_transform(purchase_data['product_id'])
    for i in range(len(user_ids_encoded)):
        X.append([user_ids_encoded[i], product_ids_encoded[i]])
        y.append(purchase_data['quantity'].iloc[i])
    model = KNeighborsClassifier(n_neighbors=3)
    model.fit(X, y)
    return model, label_encoder_user, label_encoder_product

recommendation_model, user_encoder, product_encoder = train_recommendation_model(purchase_data)


8. 최종 추천 시스템 구현

In [11]:
def score_combination(combination, recommendation_model, user_encoder, product_encoder, user_id):
    user_id_encoded = user_encoder.transform([user_id])[0]  # 사용자 ID 인코딩
    product_ids_encoded = [product_encoder.transform([product])[0] for product, _ in combination]  # 제품 ID 인코딩
    X_new = [[user_id_encoded, product_id] for product_id in product_ids_encoded]  # 새로운 입력 데이터 생성
    quantities = recommendation_model.predict(X_new)  # KNN 모델로 수량 예측
    score = sum(quantities)  # 예측된 수량의 합계 계산
    return score  # 점수 반환

def recommend_products(user_id, purchase_history, similarity_scores, recommendation_model, user_encoder, product_encoder, target_quantity):
    similar_users = bfs_similarity(user_id, similarity_scores, 10)  # 상위 10명의 유사 사용자 찾기
    recommendations = []

    for similar_user, _ in similar_users:
        user_data = purchase_history.search(similar_user)  # 유사 사용자 데이터 검색
        if user_data:
            recommendations.extend(user_data)  # 추천 목록에 추가

    sorted_recommendations = insertion_sort(recommendations)  # 추천 목록 정렬
    all_combinations = find_all_combinations(sorted_recommendations, target_quantity)  # 모든 조합 찾기

    scored_combinations = []
    for combination in all_combinations:
        knn_score = score_combination(combination, recommendation_model, user_encoder, product_encoder, user_id)  # 조합 점수 매기기
        scored_combinations.append((combination, knn_score))  # 점수와 조합 저장

    scored_combinations.sort(key=lambda x: x[1], reverse=True)  # 점수를 기준으로 정렬
    return scored_combinations[:]  # 상위 조합 반환

user_id = 'user1'
target_quantity = 5
scored_combinations = recommend_products(user_id, purchase_history, similarity_scores, recommendation_model, user_encoder, product_encoder, target_quantity)

top_n = 3  # 출력할 상위 조합의 개수
print(f"다음과 같은 상품을 추천합니다! (상위 {top_n}개의 조합)")
for i, (combination, knn_score) in enumerate(scored_combinations[:top_n], start=1):
    print(f"추천 조합 {i}: {combination}")

print("\n")
print(f"Number of combinations: {len(scored_combinations)}")
for combination, knn_score in scored_combinations:
    print(f"Combination: {combination}, KNN Score: {knn_score}")


다음과 같은 상품을 추천합니다! (상위 3개의 조합)
추천 조합 1: (('product1', 2), ('product10', 2), ('product4', 1))
추천 조합 2: (('product10', 2), ('product28', 1), ('product38', 1), ('product4', 1))
추천 조합 3: (('product1', 2), ('product28', 1), ('product38', 1), ('product4', 1))


Number of combinations: 14110
Combination: (('product1', 2), ('product10', 2), ('product4', 1)), KNN Score: 12
Combination: (('product10', 2), ('product28', 1), ('product38', 1), ('product4', 1)), KNN Score: 12
Combination: (('product1', 2), ('product28', 1), ('product38', 1), ('product4', 1)), KNN Score: 12
Combination: (('product1', 2), ('product4', 1), ('product5', 2)), KNN Score: 11
Combination: (('product10', 2), ('product17', 1), ('product38', 1), ('product4', 1)), KNN Score: 11
Combination: (('product10', 2), ('product37', 1), ('product38', 1), ('product4', 1)), KNN Score: 11
Combination: (('product1', 2), ('product18', 1), ('product28', 1), ('product4', 1)), KNN Score: 11
Combination: (('product10', 2), ('product4', 1), ('produ