In [None]:
!pip install konlpy

In [None]:
import os

import numpy as np
import pandas as pd

from datetime import datetime
import json
import re

from konlpy.tag import Okt # komoran, han, kkma

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence

from tqdm.notebook import tqdm

## 불용어 사전 만들기

In [None]:
# Load the all-quarter table; data_preprocess() reads its '0' column.
df_all_quarter = pd.read_csv('/content/all_quarter.csv', index_col = 0)

In [None]:
def data_preprocess(curat_path, word_path):
    """Load one quarter's inputs and return two word lists.

    Args:
        curat_path: Path of the quarter's curating CSV.
        word_path: Path of the quarter's word-count CSV; it must contain a
            '단어' column.

    Returns:
        A ``(words, words_list)`` tuple. ``words`` holds the non-NaN values of
        the global ``df_all_quarter['0']`` column; ``words_list`` holds the
        '단어' column of the word-count CSV.

    Note:
        The curating frame is read and tidied here, but it does not feed into
        either returned value.
    """
    # Curating CSV: fresh index, drop the unneeded 'Unnamed: 0.1' column and
    # rename the misspelled review column.
    curating = (
        pd.read_csv(curat_path, index_col=0)
        .reset_index(drop=True)
        .drop('Unnamed: 0.1', axis=1)
        .rename(columns={'total_reivew': 'total_review'})
    )

    # Keep every non-missing entry of the shared all-quarter '0' column.
    words = [word for word in df_all_quarter['0'].tolist() if not pd.isna(word)]

    word_counts = pd.read_csv(word_path, index_col=0)
    return words, word_counts['단어'].tolist()

In [None]:
# (curating CSV, word-count CSV) for each quarter of 2021, in order.
quarter_inputs = [
    ('/content/2021_1분기.csv', '/content/1분기단어.csv'),
    ('/content/2021_2분기.csv', '/content/2분기단어.csv'),
    ('/content/2021_3분기.csv', '/content/3분기단어.csv'),
    ('/content/2021_4분기.csv', '/content/4분기단어.csv'),
]

(
    (one_words, fir_words_list),
    (two_words, sec_words_list),
    (thr_words, thr_words_list),
    (fou_words, fou_words_list),
) = [data_preprocess(curat, word) for curat, word in quarter_inputs]

## 각 분기마다 단어를 비교하여 불용어 사전 생성

In [None]:


# Words that occur only in the Q1 list (absent from Q2, Q3 and Q4).
# One set union replaces three list scans per word; order and duplicates of
# fir_words_list are preserved.
_not_q1 = set(sec_words_list) | set(thr_words_list) | set(fou_words_list)
first_words = [x for x in fir_words_list if x not in _not_q1]
len(first_words)

In [None]:
# Words that occur only in the Q2 list (absent from Q1, Q3 and Q4).
# One set union replaces three list scans per word; order and duplicates of
# sec_words_list are preserved.
_not_q2 = set(fir_words_list) | set(thr_words_list) | set(fou_words_list)
second_words = [x for x in sec_words_list if x not in _not_q2]
len(second_words)

In [None]:
# Words that occur only in the Q3 list (absent from Q1, Q2 and Q4).
# One set union replaces three list scans per word; order and duplicates of
# thr_words_list are preserved.
_not_q3 = set(fir_words_list) | set(sec_words_list) | set(fou_words_list)
third_words = [x for x in thr_words_list if x not in _not_q3]
len(third_words)

In [None]:
# Words that occur only in the Q4 list (absent from Q1, Q2 and Q3).
# One set union replaces three list scans per word; order and duplicates of
# fou_words_list are preserved.
_not_q4 = set(fir_words_list) | set(sec_words_list) | set(thr_words_list)
fourth_words = [x for x in fou_words_list if x not in _not_q4]
len(fourth_words)

## 최종 불용어 사전 만들기

In [None]:
# Read each quarter's word-count table into its own frame (Q1..Q4).
df_word_count, df_word_count2, df_word_count3, df_word_count4 = (
    pd.read_csv(path, index_col=0)
    for path in (
        '/content/1분기단어.csv',
        '/content/2분기단어.csv',
        '/content/3분기단어.csv',
        '/content/4분기단어.csv',
    )
)

In [None]:
# A quarter's stopwords are the words in its '단어' column that are NOT in that
# quarter's unique-word list. Converting each unique-word list to a set makes
# every membership test a hash lookup instead of a list scan.
_first_only = set(first_words)
_second_only = set(second_words)
_third_only = set(third_words)
_fourth_only = set(fourth_words)

one_stopwords = [x for x in df_word_count['단어'] if x not in _first_only]
two_stopwords = [x for x in df_word_count2['단어'] if x not in _second_only]
thr_stopwords = [x for x in df_word_count3['단어'] if x not in _third_only]
fou_stopwords = [x for x in df_word_count4['단어'] if x not in _fourth_only]
print(len(one_stopwords), len(two_stopwords), len(thr_stopwords), len(fou_stopwords))

In [None]:
# Per-quarter stopword lists, indexed 0-3 for Q1-Q4.
stopwords = [one_stopwords, two_stopwords, thr_stopwords, fou_stopwords]

In [None]:
# Product descriptions: drop rows with missing values, renumber the index, then
# keep only the first row for each distinct '상품설명' text (renumbering again).
df_product = (
    pd.read_csv('/content/불용어 제거한 상세설명_.csv', index_col=0)
    .dropna()
    .reset_index(drop=True)
    .drop_duplicates(['상품설명'], ignore_index=True)
)
df_product

In [None]:
# Nouns that text_cleaning() removes from texts of at most 10,000 characters.
product_stopwords = [
    '원단', '사용', '착용', '실루엣', '디자인', '가공', '구성', '적용',
    '활용', '제품', '제공', '용도', '사이즈', '아이템', '완성', '공정',
    '포면', '구조', '위해', '가지', '포함', '연출', '형태', '일리',
    '컬러', '제작', '보장', '소재', '이용', '일반', '단계', '덕분',
    '부위', '사양', '방식', '통해', '추가',
]

## 큐레이팅과 상품 설명 합치기

In [None]:
def concat_data(df):
    """Prepend one merged "curating" document to the product descriptions.

    Args:
        df: Quarter curating frame with the review text in a 'total_review'
            column. The misspelled header 'total_reivew' is accepted as a
            fallback when 'total_review' is absent.

    Returns:
        DataFrame whose row 0 is ['큐레이팅', <all reviews concatenated>] under
        the columns '상품번호' / '상품설명', followed by every row of the global
        ``df_product``, with a fresh 0..n-1 index.

    Raises:
        KeyError: If neither review column spelling is present in ``df``.
    """
    # data_preprocess() renames 'total_reivew' -> 'total_review', so a raw CSV
    # may still carry the misspelled header; accept either spelling.
    review_col = 'total_review'
    if review_col not in df.columns and 'total_reivew' in df.columns:
        review_col = 'total_reivew'

    # NaN entries are floats and make str.join raise TypeError, so skip them
    # and coerce the remaining values to str.
    reviews = df[review_col].dropna().astype(str)

    # NOTE(review): reviews are joined with no separator, so the last word of
    # one review touches the first word of the next — confirm this is intended.
    curating = ''.join(reviews)

    df_curating = pd.DataFrame([['큐레이팅', curating]], columns=['상품번호', '상품설명'])
    return pd.concat([df_curating, df_product], ignore_index=True)


In [None]:
# Raw curating frames, one per quarter of 2021.
df_1, df_2, df_3, df_4 = (
    pd.read_csv(path, index_col=0)
    for path in (
        '/content/2021_1분기.csv',
        '/content/2021_2분기.csv',
        '/content/2021_3분기.csv',
        '/content/2021_4분기.csv',
    )
)

In [None]:
# Build the "curating document + product descriptions" frame for each quarter.
df_curating_11, df_curating_22, df_curating_33, df_curating_44 = map(
    concat_data, (df_1, df_2, df_3, df_4)
)

## TF-IDF를 이용하여 분기별 상품 추출 진행

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import requests
from bs4 import BeautifulSoup

import warnings
warnings.filterwarnings('ignore')

In [None]:
def get_title(num, timeout=10):
  """Scrape the product name for a Musinsa product number.

  Args:
    num: Product number; appended to the goods URL via str().
    timeout: Seconds to wait for the HTTP response before requests gives up.

  Returns:
    Text of the first element matched by the product-title CSS selector.

  Raises:
    IndexError: If the page contains no element matching the title selector.
    requests.exceptions.RequestException: On connection failure or timeout.
  """
  url = 'https://www.musinsa.com/app/goods/'+str(num)
  headers = {
      'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.57 Whale/3.14.133.23 Safari/537.36'
  }
  # Without a timeout a stalled connection would block the notebook forever.
  resp = requests.get(url, headers=headers, timeout=timeout)
  # Name the parser explicitly so the result does not depend on which optional
  # parsers happen to be installed.
  soup = BeautifulSoup(resp.text, 'html.parser')
  title_tags = soup.select('#page_product_detail > div.right_area.page_detail_product > div.right_contents.section_product_summary > span > em')

  if not title_tags:
    # Same exception type as indexing an empty list, but with context.
    raise IndexError(f'no product title found at {url} (HTTP {resp.status_code})')

  return title_tags[0].text

# Compiled once: matches every character that is not a space, a Hangul jamo
# (ㄱ-ㅣ) or a Hangul syllable (가-힣).
_NON_HANGUL = re.compile('[^ ㄱ-ㅣ 가-힣]')
_okt = None  # shared Okt tagger, created on first use


def _get_okt():
    """Return the shared Okt tagger, constructing it on the first call only."""
    global _okt
    if _okt is None:
        _okt = Okt()
    return _okt


def text_cleaning(text, stopwords):
    """Tokenize ``text`` into nouns and drop stopwords.

    Args:
        text: Raw text; every character matched by ``_NON_HANGUL`` is removed
            before noun extraction.
        stopwords: Words to exclude when ``text`` is longer than 10,000
            characters. Shorter texts are filtered with the global
            ``product_stopwords`` instead.

    Returns:
        List of nouns from Okt, in order, that are longer than one character
        and not in the applicable stopword collection.
    """
    hangul_only = _NON_HANGUL.sub('', text)

    # The length check is on the original text, as before. A set turns each
    # stopword test into a hash lookup.
    excluded = set(stopwords if len(text) > 10000 else product_stopwords)

    # Reuse one tagger instead of constructing a new Okt() for every document.
    return [
        noun
        for noun in _get_okt().nouns(hangul_only)
        if len(noun) > 1 and noun not in excluded
    ]

def vect_count(data1, num, top_n=10):
    """Rank product descriptions by TF-IDF cosine similarity to row 0.

    Row 0 of ``data1`` is treated as the query document and every other row as
    a candidate. The top matches are printed, plotted as a bar chart and
    returned.

    Args:
        data1: DataFrame with '상품설명' (text) and '상품번호' (id) columns.
        num: Index into the global ``stopwords`` list, selecting the stopwords
            passed to ``text_cleaning``.
        top_n: Maximum number of matches to report (default 10).

    Returns:
        Series of the matched '상품번호' values cast to str, ordered from most
        to least similar and indexed by their row labels in ``data1``.
    """
    tfidf = TfidfVectorizer(
        tokenizer=lambda x: text_cleaning(x, stopwords[num]),
        max_features=300,
        min_df=20,
        max_df=0.2,
    )
    tf_idf_vect = tfidf.fit_transform(data1['상품설명'])

    # Compare the query only against rows 1.., so it can never appear among its
    # own matches. Dropping the first sorted entry instead would discard the
    # wrong row when another document ties at 1.0 or the query vector is zero.
    similarities = cosine_similarity(tf_idf_vect[0], tf_idf_vect[1:]).ravel()
    order = similarities.argsort()[::-1][:top_n]  # best match first
    top_rows = order + 1  # shift back to row positions in data1
    top_scores = similarities[order]
    print(top_rows)

    curating_sim_df = pd.DataFrame(
        {'상품코드': data1.iloc[top_rows.tolist()]['상품번호'].astype(str)}
    )
    curating_sim_df['유사도'] = top_scores
    print(top_scores)

    sns.barplot(x = '유사도', y = '상품코드', data = curating_sim_df);
    plt.show()

    return curating_sim_df['상품코드']

In [None]:
# Run the similarity ranking for Q1..Q4; the position in the tuple doubles as
# the index into the per-quarter stopword list.
code1, code2, code3, code4 = (
    vect_count(frame, quarter_idx)
    for quarter_idx, frame in enumerate(
        (df_curating_11, df_curating_22, df_curating_33, df_curating_44)
    )
)

In [None]:
code_list = [code1, code2, code3, code4]

# Print the scraped product names, grouped by quarter (1-based label).
for quarter, codes in enumerate(code_list, start=1):
  print('')
  print(quarter, '분기')
  for product_code in codes.tolist():
    print(get_title(product_code))
  