<a href="https://colab.research.google.com/github/Yyyyyjkim/review_analysis/blob/master/functions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!apt-get update
!apt install chromium-chromedriver
!pip install requests
!pip install bs4
!pip install selenium
!pip install konlpy
!pip install sklearn

In [0]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import re
import pickle
from konlpy.tag import Okt
import sklearn as sk
import numpy as np
import dill
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize

In [0]:
class crawling:
  """Headless-Chrome scraper that collects review texts from a product page.

  The instance owns a Selenium Chrome driver.  Call ``close()`` (or use the
  instance as a context manager) when finished so the browser process is
  released.
  """

  # Seconds to sleep after every click so the page can re-render.
  PAGE_LOAD_WAIT = 5
  # Upper bound on the number of review paragraphs taken from one page.
  REVIEWS_PER_PAGE = 20

  _REVIEW_TAB_XPATH = "//div[@class='detail_tab_floatable']//li[@class='item']/a[@class='link N=a:tab.review']"
  _PAGINATION_XPATH = "//nav[@class='module_pagination _review_list_page']//a[text()='{label}']"

  def __init__(self):
    self.options = webdriver.ChromeOptions()
    self.options.add_argument('--headless')
    self.options.add_argument('--no-sandbox')
    self.options.add_argument('--disable-dev-shm-usage')
    # Keyword-only construction: the positional driver-path argument is not
    # accepted by recent Selenium 4 releases, while older releases already
    # default the path to 'chromedriver', so this form works on both.
    self.driver = webdriver.Chrome(options=self.options)

  def __enter__(self):
    return self

  def __exit__(self, exc_type, exc_value, traceback):
    self.close()
    return False

  def close(self):
    """Quit the browser if it is still open; safe to call more than once."""
    driver = getattr(self, 'driver', None)
    if driver is not None:
      driver.quit()
      self.driver = None

  def _click(self, xpath):
    """Click the element matching *xpath*, then wait for the page to settle."""
    # find_element('xpath', ...) is available in Selenium 3 and 4, unlike the
    # find_element_by_xpath helper that Selenium 4 removed.
    self.driver.find_element('xpath', xpath).click()
    time.sleep(self.PAGE_LOAD_WAIT)

  def _current_reviews(self):
    """Return the stripped review texts found in the currently loaded page."""
    soup = BeautifulSoup(self.driver.page_source, 'html.parser')
    paragraphs = soup.find_all('p', class_='review_text _review_text')
    return [p.get_text().strip() for p in paragraphs[:self.REVIEWS_PER_PAGE]]

  def get_text(self,url,sorting,pagenum):
    """Collect review texts from pages 1..pagenum of the review tab.

    Args:
      url: address of the product page to open.
      sorting: visible label of the sort link to click in the review header.
      pagenum: last review page to visit (page 1 is always read).

    Returns:
      A flat list of review strings in the order they were encountered.

    Raises:
      Whatever Selenium raises when an expected element cannot be found.
    """
    # Open the review tab and apply the requested sort order.
    self.driver.get(url)
    self._click(self._REVIEW_TAB_XPATH)
    div_class = 'header_review _review_list_header'
    ul_class = 'sort_list _review_list_header_sort'
    self._click(f"//div[@class='{div_class}']//ul[@class='{ul_class}']//a[text()='{sorting}']")

    review = self._current_reviews()
    for page in range(2, pagenum + 1):
      # Pages 11, 21, ... are reached through the '다음' ("next") link rather
      # than a numbered link -- presumably the pager shows ten numbers at a
      # time; verify against the live site.
      label = '다음' if page % 10 == 1 else page
      self._click(self._PAGINATION_XPATH.format(label=label))
      review.extend(self._current_reviews())

    return review

In [0]:
class nlp:
  """Clean raw review strings and tokenise them with KoNLPy's Okt tagger.

  Attributes:
    twitter: the Okt tagger instance used by every ``get_*`` method.
    tidy: the cleaned sentences, one per input string, in input order.
  """

  # Characters and labels removed from every sentence: ASCII punctuation
  # (except '.' and '\\'), a handful of emoji code points, and the labels
  # 'BEST', '재구매' and '한달사용기' when followed by a newline.
  # The hyphen is escaped on purpose: an unescaped '~-✔' inside the class is a
  # *range* that would also strip accented Latin, Greek, Cyrillic, etc., and
  # would leave the literal '-' in place.
  _NOISE = re.compile(
      r'[\[\]!"#$%&\'()*+,/:;<=>?@^_`{|}~\-✔️🙏🏻👍🤗💕😘]'
      r'|BEST\n|재구매\n|한달사용기\n')
  # Part-of-speech tags kept by get_pos().
  _KEEP_TAGS = ('Noun', 'Verb', 'Adjective')

  def __init__(self,text):
    """Create the tagger and clean every string in *text* (an iterable of str)."""
    self.twitter = Okt()
    self.tidy = [self._clean(sent) for sent in text]

  @classmethod
  def _clean(cls, sent):
    """Strip noise characters/labels, then turn remaining newlines into spaces."""
    return cls._NOISE.sub('', sent).replace('\n', ' ')

  @staticmethod
  def _to_corpus(token_lists):
    """Join each token list with spaces and drop documents that end up empty."""
    joined = (" ".join(tokens) for tokens in token_lists)
    return [doc for doc in joined if doc]

  def get_morphs(self):
    """Return one space-joined morpheme string per non-empty cleaned sentence."""
    return self._to_corpus(self.twitter.morphs(sent) for sent in self.tidy)

  def get_pos(self):
    """Like get_morphs(), but keep only nouns, verbs and adjectives."""
    return self._to_corpus(
        [word for word, tag in self.twitter.pos(sent) if tag in self._KEEP_TAGS]
        for sent in self.tidy)

  def get_noun(self):
    """Like get_morphs(), but keep only nouns."""
    return self._to_corpus(self.twitter.nouns(sent) for sent in self.tidy)

In [0]:
class graph:
  """Build dense similarity matrices from a corpus of space-separated tokens."""

  def __init__(self):
    self.tfidf = TfidfVectorizer()
    self.counter = CountVectorizer()

  def graph_sent(self,corpus):
    """Return the document-by-document matrix of TF-IDF dot products.

    The vectoriser is re-fitted on *corpus* at every call.
    """
    doc_vectors = self.tfidf.fit_transform(corpus).toarray()
    return np.dot(doc_vectors, doc_vectors.T)

  def graph_word(self,corpus):
    """Return ``(similarity, index_to_word)`` for the words of *corpus*.

    ``similarity`` is the word-by-word matrix of dot products between the
    normalised count columns; ``index_to_word`` maps each column index of that
    matrix back to its word.
    """
    counts = self.counter.fit_transform(corpus).toarray().astype(float)
    # Normalising along axis 0 rescales every word column, so the products
    # below compare occurrence patterns rather than raw frequencies.
    word_columns = normalize(counts, axis=0)
    similarity = np.dot(word_columns.T, word_columns)
    # vocabulary_ maps word -> column index; invert it for lookups by index.
    index_to_word = {index: word for word, index in self.counter.vocabulary_.items()}
    return similarity, index_to_word
In [0]:
class textrank():
  """Crawl reviews, tokenise them, and rank sentences with a PageRank-style iteration.

  Attributes set by ``__init__``:
    text: raw review strings returned by the crawler.
    tidy: cleaned sentences (one per raw review).
    morphs / pos / noun: the three tokenised corpora produced by ``nlp``.
    graph_sent: sentence similarity matrix for the selected *unit*.
    graph_word, words: word similarity matrix and its index -> word mapping.
  """

  _UNITS = ('self', 'pos', 'noun')

  def __init__(self,url,sorting,pagenum,unit='self'):
    """Run the whole pipeline.

    Args:
      url, sorting, pagenum: forwarded to ``crawling.get_text``.
      unit: which corpus feeds the graphs -- 'self' (all morphemes),
        'pos' (nouns/verbs/adjectives) or 'noun'.

    Raises:
      ValueError: if *unit* is not one of the supported values.
    """
    # Fail before the slow crawl instead of leaving graph_sent undefined.
    if unit not in self._UNITS:
      raise ValueError(f"unit must be one of {self._UNITS}, got {unit!r}")

    self.crawling = crawling()
    try:
      self.text = self.crawling.get_text(url,sorting,pagenum)
    except BaseException:
      # The half-built object is never returned, so nobody else could
      # release the browser; quit it here and re-raise.
      self.crawling.driver.quit()
      raise

    self.nlp = nlp(self.text)
    self.tidy = self.nlp.tidy
    self.morphs = self.nlp.get_morphs()
    self.pos = self.nlp.get_pos()
    self.noun = self.nlp.get_noun()

    self.graph = graph()
    corpus = {'self': self.morphs, 'pos': self.pos, 'noun': self.noun}[unit]
    self.graph_sent = self.graph.graph_sent(corpus)
    self.graph_word, self.words = self.graph.graph_word(corpus)

  def get_rank(self,graph='self',corpus='self',d=0.85,tol=1e-5,len=10,max_iter=1000):
    """Return the *len* highest-ranked items of *corpus*.

    Args:
      graph: square similarity matrix, or 'self' to use ``self.graph_sent``.
      corpus: items indexed by graph row (list or dict), or 'self' to use
        ``self.tidy``.
      d: damping factor of the iteration.
      tol: stop once the summed absolute score change is at most this value.
      len: number of items to return.
      max_iter: hard cap on the number of iterations.

    Returns:
      A list of corpus items, best first; ties keep their original order.

    NOTE(review): the tokenised corpora drop sentences that tokenise to
    nothing, so ``graph_sent`` can have fewer rows than ``self.tidy``; when
    that happens the default *corpus* is shifted relative to the graph rows.
    """
    # isinstance guards: comparing an ndarray with 'self' is element-wise on
    # current NumPy and cannot be used as an ``if`` condition.
    if isinstance(graph, str) and graph == 'self':
      graph = self.graph_sent
    if isinstance(corpus, str) and corpus == 'self':
      corpus = self.tidy

    graph = np.asarray(graph, dtype=float)
    colsum = graph.sum(axis=0)
    # Divide every column by its sum; columns summing to zero stay all-zero.
    weight = np.divide(graph, colsum, out=np.zeros_like(graph), where=colsum != 0)

    scores = np.ones(graph.shape[0])
    for _ in range(max_iter):
      updated = (1 - d) + d * weight.dot(scores)
      diff = np.abs(updated - scores).sum()
      scores = updated
      if diff <= tol:
        break

    # Stable sort on the negated scores: descending order, ties by index.
    order = np.argsort(-scores, kind='stable')
    return [corpus[int(idx)] for idx in order[:len]]