In [2]:
import pandas as pd
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import preprocess.preprocess as cf
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import random
from surprise import Dataset, Reader, KNNBasic



In [None]:
data_path = os.path.abspath(os.path.join(os.getcwd(), 'data'))

df_traveller = pd.read_csv(os.path.join(data_path, 'traveller.csv'))
df_travel = pd.read_csv(os.path.join(data_path, 'travel.csv'))
df_area = pd.read_csv(os.path.join(data_path, 'area.csv'))

In [10]:
# features = [
#     'TRAVELER_ID', 'GENDER', 'AGE_GRP', 'INCOME', 'TRAVEL_STYL_1', 'TRAVEL_STYL_2', 'TRAVEL_STYL_3',
#     'TRAVEL_STYL_4', 'TRAVEL_STYL_5', 'TRAVEL_STYL_6', 'TRAVEL_STYL_7', 'TRAVEL_STYL_8',
#     'TRAVEL_MOTIVE_1', 'TRAVEL_NUM', 'TRAVEL_COMPANIONS_NUM',
#     'VISIT_AREA_NM', 'VISIT_AREA_TYPE_CD', 'SIDO', 'GUNGU', 'RESIDENCE_TIME_MIN', 'REVISIT_YN',
#     'DGSTFN'
# ]

df = cf.cf_preprocess(df_traveller, df_travel, df_area)

df

Unnamed: 0,TRAVELER_ID,GENDER,AGE_GRP,INCOME,TRAVEL_STYL_1,TRAVEL_STYL_2,TRAVEL_STYL_3,TRAVEL_STYL_4,TRAVEL_STYL_5,TRAVEL_STYL_6,...,TRAVEL_MOTIVE_1,TRAVEL_NUM,TRAVEL_COMPANIONS_NUM,VISIT_AREA_NM,VISIT_AREA_TYPE_CD,SIDO,GUNGU,RESIDENCE_TIME_MIN,REVISIT_YN,DGSTFN
0,a014262,0,40,7,4,5,3,5,2,6,...,3.0,2,1,DMZ 장단콩두부마을,7.0,경기,파주시,60.0,0,5.0
1,a014262,0,40,7,4,5,3,5,2,6,...,3.0,2,1,파주프리미엄아울렛,4.0,경기,파주시,180.0,1,4.0
2,a014262,0,40,7,4,5,3,5,2,6,...,3.0,2,1,한국근현대사박물관,2.0,경기,파주시,30.0,0,3.0
3,a006728,0,40,9,5,4,4,5,5,5,...,6.0,1,3,벨라시타,4.0,경기,고양시,120.0,1,4.0
4,a006728,0,40,9,5,4,4,5,5,5,...,6.0,1,3,이케아 고양점,4.0,경기,고양시,60.0,1,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28618,d003147,1,20,5,3,4,2,4,2,2,...,2.0,2,0,올레길 20코스(김녕-하도 올레),7.0,제주,제주시,60.0,0,4.0
28619,d003147,1,20,5,3,4,2,4,2,2,...,2.0,2,0,지미봉,1.0,제주,제주시,30.0,0,5.0
28620,d003147,1,20,5,3,4,2,4,2,2,...,2.0,2,0,월정리해수욕장,1.0,제주,제주시,60.0,1,4.0
28621,d003147,1,20,5,3,4,2,4,2,2,...,2.0,2,0,세화민속오일시장,4.0,제주,제주시,60.0,0,4.0


In [13]:
df = df[['TRAVELER_ID', 'VISIT_AREA_NM', 'DGSTFN']]

df['TRAVELER_ID_IDX'] = df['TRAVELER_ID'].astype('category').cat.codes.values
df['VISIT_AREA_NM_IDX'] = df['VISIT_AREA_NM'].astype('category').cat.codes.values

# 데이터 로딩 및 준비
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['TRAVELER_ID_IDX', 'VISIT_AREA_NM_IDX', 'DGSTFN']], reader)
trainset = data.build_full_trainset()

# KNN을 사용한 항목 기반 협업 필터링 모델 훈련
item_based_cf = KNNBasic(sim_options={'name': 'cosine', 'user_based': False})
item_based_cf.fit(trainset)

# KNN을 사용한 사용자 기반 협업 필터링 모델 훈련
user_based_cf = KNNBasic(sim_options={'name': 'cosine', 'user_based': True})
user_based_cf.fit(trainset)

# 사용자 또는 항목에 대한 가장 근접한 이웃을 찾는 함수
def get_closest_neighbors(model, id, k=1, user_based=True):
    if user_based:
        inner_id = trainset.to_inner_uid(id)
    else:
        inner_id = trainset.to_inner_iid(id)
    neighbors = model.get_neighbors(inner_id, k=k)
    
    if user_based:
        return [df[df['TRAVELER_ID_IDX'] == neighbor]['TRAVELER_ID'].unique().tolist() for neighbor in neighbors]
    else:
        return [df[df['VISIT_AREA_NM_IDX'] == neighbor]['VISIT_AREA_NM'].unique().tolist() for neighbor in neighbors]

# 사용자 ID를 입력으로 받기
input_user_id = 'a014262'  # 예시 ID, 실제 사용시 사용자로부터 입력받은 ID를 사용

# 항목 이름을 입력으로 받기
input_item_name = '창덕궁'  # 예시 이름, 실제 사용시 사용자로부터 입력받은 이름을 사용

# 원래의 'TRAVELER_ID'로부터 내부 ID 찾기
user_inner_id = df[df['TRAVELER_ID'] == input_user_id]['TRAVELER_ID_IDX'].iloc[0]

# 원래의 'VISIT_AREA_NM'로부터 내부 ID 찾기
item_inner_id = df[df['VISIT_AREA_NM'] == input_item_name]['VISIT_AREA_NM_IDX'].iloc[0]

# 가장 근접한 사용자 찾기
closest_users = get_closest_neighbors(user_based_cf, user_inner_id, k=3, user_based=True)
print(f"가장 근접한 사용자: {closest_users}")

# 가장 근접한 항목 찾기
closest_items = get_closest_neighbors(item_based_cf, item_inner_id, k=3, user_based=False)
print(f"가장 근접한 항목: {closest_items}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['TRAVELER_ID_IDX'] = df['TRAVELER_ID'].astype('category').cat.codes.values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['VISIT_AREA_NM_IDX'] = df['VISIT_AREA_NM'].astype('category').cat.codes.values


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
가장 근접한 사용자: [['a000056'], ['a000338'], ['a001926']]
가장 근접한 항목: [['63전망대'], ['9.81파크'], ['ABC마트 ST서귀포중앙점']]


In [16]:
import pickle

# 아이템 기반 협업 필터링 모델 저장
item_based_model_filename = 'item_based_cf_model.pkl'
with open(item_based_model_filename, 'wb') as file:
    pickle.dump(item_based_cf, file)

# 사용자 기반 협업 필터링 모델 저장
user_based_model_filename = 'user_based_cf_model.pkl'
with open(user_based_model_filename, 'wb') as file:
    pickle.dump(user_based_cf, file)

# 파일 경로 출력
print(f"Item-based CF model saved to {item_based_model_filename}")
print(f"User-based CF model saved to {user_based_model_filename}")

Item-based CF model saved to item_based_cf_model.pkl
User-based CF model saved to user_based_cf_model.pkl


In [18]:
from surprise import accuracy
from surprise.model_selection import train_test_split
from collections import defaultdict

# 데이터셋을 훈련 세트와 테스트 세트로 분할
trainset, testset = train_test_split(data, test_size=0.25)

# 모델을 훈련 세트로 훈련
user_based_cf.fit(trainset)

# 테스트 세트에 대한 예측 수행
predictions = user_based_cf.test(testset)

def get_metrics_at_k(predictions, k=10, threshold=4.5):
    # 사용자별로 예측값을 저장
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
    
    precisions = dict()
    recalls = dict()

    for uid, user_ratings in user_est_true.items():
        # 사용자별 예측값을 평점 예측값이 높은 순으로 정렬
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        # 상위 k개 아이템만 고려
        user_ratings_k = user_ratings[:k]

        # 예상 평점이 threshold 이상인 아이템
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings_k)
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold)) for (est, true_r) in user_ratings_k)

        # Precision@k과 Recall@k 계산
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    # 전체 평균 Precision@k과 Recall@k 계산
    precision = sum(prec for prec in precisions.values()) / len(precisions)
    recall = sum(rec for rec in recalls.values()) / len(recalls)

    # F1 Score@k 계산
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

    return precision, recall, f1


# 지표 계산
precision, recall, f1 = get_metrics_at_k(predictions)

print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")


Computing the cosine similarity matrix...
Done computing similarity matrix.
Precision: 0.8898, Recall: 0.5637, F1 Score: 0.6902


In [19]:
# 아이템 기반 협업 필터링 모델 훈련
item_based_cf = KNNBasic(sim_options={'name': 'cosine', 'user_based': False})
item_based_cf.fit(trainset)

# 테스트 세트에 대한 예측 수행
predictions = item_based_cf.test(testset)

# 평가 지표를 계산하는 함수는 동일하게 사용
precision, recall, f1 = get_metrics_at_k(predictions)

print(f"Precision (Item-based): {precision:.4f}, Recall (Item-based): {recall:.4f}, F1 Score (Item-based): {f1:.4f}")


Computing the cosine similarity matrix...
Done computing similarity matrix.
Precision (Item-based): 0.9236, Recall (Item-based): 0.5915, F1 Score (Item-based): 0.7212
