In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import sys
sys.path.append('../')

from collections import defaultdict
import re
from tqdm import tqdm
import pandas as pd

import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

from ml.query import *
from ml.utils import *

In [2]:
# Download the NLTK stop-word corpus.
nltk.download('stopwords')

# Load the English stop words as a set.
# The corpus reader is reached through `nltk.corpus.stopwords` instead of the
# imported `stopwords` name: this assignment rebinds `stopwords` to a plain
# set, so calling `stopwords.words(...)` again on a cell re-run would fail.
stopwords = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jeongmoonwon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def clean_text(x):
    """Normalize one review string before keyword extraction.

    Steps, in order:
      1. replace every digit with a space;
      2. replace newlines with a space (so a tag split across lines still
         matches in the next step);
      3. replace ``<...>`` markup with a space;
      4. drop every word that contains "read" or "book", in any letter case
         (note: this also drops words such as "already" or "thread");
      5. collapse whitespace runs to a single space and strip both ends.

    Whitespace is normalized last so that the gaps left by the removed words
    do not survive in the result.

    Args:
        x: Review text (``str``).

    Returns:
        The cleaned text (``str``); may be empty.
    """
    x = re.sub(r'\d', ' ', x)
    x = re.sub(r'\n', ' ', x)
    x = re.sub(r'<.*?>', ' ', x)
    # Case-insensitive: a capitalized "Read"/"Book" would otherwise remain in
    # the text and match "read"/"book" again after lowercasing downstream.
    x = re.sub(r'\b\w*(?:read|book)\w*\b', ' ', x, flags=re.IGNORECASE)
    return re.sub(r'\s+', ' ', x).strip()

In [4]:
# Root directory that the result and database paths below are resolved against.
# Set the BOOK_RECSYS_DIR environment variable to run the notebook on another
# machine; when it is unset or empty, the original local path is used.
WORKING_DIRECTORY = (
    os.environ.get('BOOK_RECSYS_DIR')
    or '/Users/jeongmoonwon/Downloads/Courses/BKMS1/team_project/bkms1-team10/book_recsys'
)

In [5]:
# Directory for review-keyword results, located under the working directory
# (presumably where the extracted keywords get written — nothing in the
# visible cells writes to it yet).
review_keywords_dir_path = os.path.join(WORKING_DIRECTORY, 'results/review_keywords')
# Create the directory (and any missing parents); no error if it already exists.
os.makedirs(review_keywords_dir_path, exist_ok=True)

In [6]:
# Path of the project database file under the working directory.
db_path = os.path.join(WORKING_DIRECTORY, 'resources/project.db')
# NOTE(review): `connection` is not defined in this notebook; it presumably
# comes from the star imports of ml.query / ml.utils and opens a database
# connection for `db_path` — confirm, and check where the connection is closed.
con = connection(db_path)

In [7]:
# Load both source tables over the shared connection: ratings first, then reviews.
# `read_table`, `ratings_query` and `reviews_query` are provided by the star imports.
ratings, reviews = (
    read_table(con, ratings_query),
    read_table(con, reviews_query),
)

In [8]:
# Inner-join ratings and reviews on review_id, then keep only the two columns
# used by the rest of the notebook.
review_df = pd.merge(
    ratings,
    reviews,
    how='inner',
    on='review_id',
)[['book_id', 'review_text']]

In [9]:
# Show the merged (book_id, review_text) frame as the cell output.
review_df

Unnamed: 0,book_id,review_text
0,34426579,4.5 I'm-Obsessed Stars!! \n Jessica Hawkins ha...
1,32993133,"4.5 Stars!! \n ""Was sixteen too young to fall ..."
2,35162866,4.5 EPIC STARS!! \n OMG what do I read now?! M...
3,35121403,"RACER EFFING TATE!! \n Katy Evans is back, doi..."
4,33807229,"4.5 Addictive Stars!! \n ""I'm Easton Royal, su..."
...,...,...
3374595,12988016,"""You know you can do anything to me,"" she murm..."
3374596,14866,"Nineteen Minutes is the story of a teenager, P..."
3374597,8352056,"This is my second Dorothy McFalls story, after..."
3374598,42899,"A couple of months ago, I discovered the Kresl..."


In [10]:
# Number of leading reviews to process; only a subset is used for testing.
N_TEST_REVIEWS = 1000

# Take the sample first so that the fill/clean work runs only on the rows that
# are kept. `.assign` returns a new frame, which avoids writing a column into
# a slice of the merged frame (chained assignment).
review_df = (
    review_df
    .head(N_TEST_REVIEWS)
    .assign(
        review_text=lambda frame: frame['review_text'].fillna('').apply(clean_text)
    )
)

# book_id -> list of cleaned review texts for that book.
reviews_new = review_df.groupby('book_id')['review_text'].apply(list).to_dict()
book_list = list(reviews_new.keys())
# Filled by the TF-IDF loop: book_id -> one-element list holding the keyword list.
review_keywords = defaultdict(list)

In [11]:
# Number of highest-scoring terms kept per book.
TOP_K_KEYWORDS = 10
# Books for which no TF-IDF vocabulary could be built (kept for inspection
# instead of being dropped silently).
skipped_book_ids = []

for book_id in tqdm(book_list):
    corpus = reviews_new[book_id]
    # A fresh vectorizer per book: the vocabulary and IDF weights are fitted
    # on that book's reviews only.
    vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)

    try:
        vecs = vectorizer.fit_transform(corpus)
    except ValueError:
        # fit_transform raises ValueError when it cannot build a vocabulary
        # (e.g. the reviews contain only stop words or are empty).
        skipped_book_ids.append(book_id)
        continue

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on releases that predate get_feature_names_out().
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Rows are reviews, columns are terms; sum each term's score over all
    # reviews of the book and keep the top-scoring terms.
    tfidf_matrix = pd.DataFrame(vecs.toarray(), columns=feature_names)
    keywords = (
        tfidf_matrix.sum(axis=0)
        .sort_values(ascending=False)
        .index[:TOP_K_KEYWORDS]
        .tolist()
    )
    # Assign instead of append so re-running this cell does not stack a second
    # keyword list under the same book. The value stays a one-element list
    # because the keyword table is built with a single 'keywords' column.
    review_keywords[book_id] = [keywords]

100%|██████████| 1000/1000 [00:06<00:00, 163.34it/s]


In [12]:
# One row per book: the dict keys become the index, which is then named
# 'book_id' and moved into a regular column next to 'keywords'.
keyword_df = (
    pd.DataFrame.from_dict(review_keywords, orient='index', columns=['keywords'])
    .rename_axis('book_id')
    .reset_index()
)

In [13]:
# Show the (book_id, keywords) table as the cell output.
keyword_df

Unnamed: 0,book_id,keywords
0,10187224,"[smith, come, time, feel, way, experiences, ch..."
1,10438971,"[just, fey, years, heard, great, funny, fan, f..."
2,10480613,"[world, series, boatman, body, check, count, c..."
3,1093105,"[narrator, tension, distracting, ll, liked, ke..."
4,11080141,"[perspective, family, does, story, county, col..."
...,...,...
994,9634447,"[jack, old, year, felt, mother, idea, child, t..."
995,9814767,"[people, characters, violence, really, end, co..."
996,9815629,"[series, lackberg, really, second, sub, summer..."
997,9912994,"[really, palahniuk, idea, novel, like, don, de..."
