In [1]:
import os
import glob
import pandas as pd
from gensim.models import FastText
from gensim.utils import tokenize
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import multiprocessing

In [5]:
poster = pd.read_csv('./database_posters.csv')
dolls = pd.read_csv('./database_dolls.csv')

merge_df = pd.concat([poster, dolls], ignore_index=True)
merge_df.to_csv('database_md.csv', index=False)

In [9]:
files = ['./database_albums.csv', './database_photocards.csv', './database_md.csv']
accuracies = 0

In [8]:
for file in files:
  df = pd.read_csv(file)
  df = df.sample(frac=1).reset_index(drop=True)  # 셔플

  lower_df = df[['title', 'price']]
  lower_df = lower_df.apply(lambda x: x.str.lower() if x.dtype == 'object' else x)

  data = df['title']
  gensim_input = [text.rstrip().lower() for text in data]

    # 학습 데이터와 평가 데이터 분리
  train_data, test_data, train_labels, test_labels = train_test_split(df['title'], df['price'], test_size=0.2)

  train_data = list(train_data)
  test_data = list(test_data)
  test_labels = list(test_labels)

  # FastText 모델 학습
  ft_model = FastText(vector_size=100, min_count=1, window=5, workers=multiprocessing.cpu_count(), sg=1)
  ft_model.build_vocab(corpus_iterable=[list(tokenize(text)) for text in train_data])
  ft_model.train(corpus_iterable=[list(tokenize(text)) for text in train_data], total_examples=len(train_data), epochs=10)

  #################################### 예측값 및 정확도 측정 ####################################
  accurate_predictions = 0
  total_predictions = 0
  idx = 0

  for text in test_data:
      y_pred = []
      query = text.lower()
      query_vec = np.mean([ft_model.wv[word] for word in query.split()], axis=0)

      # 가장 유사한 문장을 찾습니다.
      similar_sentences = []
      for sentence in train_data:
          sentence_vec = np.mean([ft_model.wv[word] for word in sentence.split()], axis=0)
          similarity = cosine_similarity([query_vec], [sentence_vec])[0][0]
          similar_sentences.append((sentence, similarity))

      # 유사도가 높은 순으로 정렬한 후, 상위 2개 문장을 출력합니다.
      similar_sentences = sorted(similar_sentences, key=lambda x: x[1], reverse=True)[:2]
      for sentence_df in similar_sentences:
          sentence = sentence_df[0]
          price_df = lower_df[lower_df['title'] == sentence.lower()]
          if price_df.empty:
              continue
          price = price_df['price'].values
          for p in price:
              y_pred.append(p)

      maxPred = max(y_pred) if y_pred else float('-inf')
      minPred = min(y_pred) if y_pred else float('inf')

      true_label = test_labels[idx]

      if minPred <= true_label <= maxPred:
          accurate_predictions += 1

      total_predictions += 1
      idx += 1
      #print("acc: ", accurate_predictions, " total: ", total_predictions)

  accuracy = accurate_predictions / total_predictions
  accuracies += accuracy

totalAccuracy = accuracies / 3

In [4]:
print(f"Accuracy: {totalAccuracy:.4f}")

Accuracy: 0.4398
