### Import necessary packages

In [1]:
# for web scraping
import requests
from bs4 import BeautifulSoup
import re
from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# for cleaning the list of key words
def clean(sentence_ws, sentence_pos):
    """Reduce one segmented sentence to its keyword candidates.

    A word is kept when its POS tag starts with "N" or "V", the tag is not
    one of the excluded tags below, and the word is not a single character.

    Parameters
    ----------
    sentence_ws : iterable of str
        The words of one sentence.
    sentence_pos : iterable of str
        POS tags, paired positionally with ``sentence_ws``.

    Returns
    -------
    tuple of (str, str)
        The kept words joined by single spaces, and the same words written
        as ``word(POS)`` joined by single spaces.
    """
    # tags that are dropped even though they start with "N"
    excluded_tags = {'Nep', 'Nh', 'Nb', 'Neu', 'Nc', 'Ncd', 'Nd', 'Neqa'}

    kept = [
        (token, tag)
        for token, tag in zip(sentence_ws, sentence_pos)
        if tag.startswith(("V", "N"))    # nouns and verbs only
        and tag not in excluded_tags
        and len(token) != 1              # skip single-character words
    ]

    plain = " ".join(f"{token}" for token, _ in kept)
    annotated = " ".join(f"{token}({tag})" for token, tag in kept)
    return (plain, annotated)

### Set the target URL, scrape the text, and process it with CKIP

In [3]:
# Page to analyse and the request header sent with it.
url = "https://www.mj-hair.com/news_info/14/1/130"
headers = {"User-Agent": "Mozilla/5.0"}

# Bound the wait and stop on an HTTP error status instead of parsing an error page.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'lxml')
# find_all(string=True) replaces the deprecated findAll(text=True) spelling.
raw_texts = ' '.join(soup.find_all(string=True))

# Keep only runs of characters in U+4E00-U+9FFF plus the listed full-width
# punctuation marks; everything else (Latin text, digits, whitespace) is dropped.
pattern = re.compile(r"[\u4e00-\u9fff！？。，、]+")

# One-element list holding all matches concatenated. The matches cannot contain
# spaces, so joining with '' equals joining with ' ' and then removing spaces.
text = [''.join(pattern.findall(raw_texts))]

  raw_texts = ' '.join(soup.findAll(text=True))


In [4]:
# Display the one-element list of cleaned page text built by the scraping cell.
text

['植髮價格如何計算？個影響植髮費用關鍵因素、價錢差異大公開明錦生髮植髮診所自訂洽詢據點埋碼加諮詢埋碼繁體中文简体中文關於明錦品牌介紹醫療團隊服務據點明錦集團明錦醫美知識分享植髮自然生髮生髮知識與保養醫療新知診所動態新聞動態時事影音專區生髮與植髮自然生髮美學植髮生髮案例自然生髮美學植髮前後比對常見問題聯絡我們聯絡我們聯絡我們立即諮詢常見問題關於明錦品牌介紹醫療團隊服務據點明錦集團明錦醫美知識分享植髮自然生髮生髮知識與保養醫療新知診所動態新聞動態時事影音專區生髮與植髮自然生髮美學植髮生髮案例自然生髮美學植髮前後比對常見問題聯絡我們聯絡我們聯絡我們立即諮詢常見問題用用首頁植髮植髮價格如何計算？個影響植髮費用關鍵因素、價錢差異大公開植髮價格如何計算？個影響植髮費用關鍵因素、價錢差異大公開植髮植髮費用大約是多少？價格是怎麼計算的？坊間經常聽到頭髮是人的第二張臉此說法，可見頭髮對於人的形象之重要性。但遺傳、工作、家庭等各種壓力，讓許多人都出現掉髮問題，而如果改變生活作息與飲食後仍不見效果，有些人就會想選擇植髮，但卻又擔心無法負擔植髮價格。究竟各類型的植髮費用有何差異？跟著本文，帶你全面了解影響植髮價錢的重要因素！植髮費用如何計算？價格計算方式解析依據取髮、種髮等不同方式，每根頭髮價格約落在元不等，主要是手術方式的選擇、個人頭皮的鑽取難度不同、頭皮與原有頭髮條件不同，甚至是使用的手術耗材是否使用拋棄式的植髮筆等等因素，都會影響植髮手術費用的計算。而目前兩種主流的計費方式，分別是以根數收費，或是以株數計算。計價方式說明以根計價以單根頭髮為收費單位，依照實際需植入的髮根數量，計算植髮手術費用以株計價以單株毛囊為收費單位，每個毛囊單位不一定只會有根頭髮，有時會出現根甚至多到根的情況東方人每株毛囊平均含有根頭髮影響植髮價錢大因素一般來說，植髮價格高低與以下點有關植髮手術進行方式植髮手術方式又分為、及植髮機器人，其各自的差異比較如下植髮機器人取髮方式切取帶有毛囊的細長頭皮，分離出毛囊株以鑽取器取下後枕部毛囊株由機械手臂摘取毛囊時間成本較快耗時耗時價格比較低中高機械折舊、開機費用適合族群有預算考量後枕部不想剃髮可接受較長恢復期可接受取髮位置留下長條狀疤痕頭皮彈性不佳蟹足腫體質植眉手術植鬍手術喜歡剃短髮型頭皮彈性不佳蟹足腫體質植眉手術植鬍手術喜歡剃短髮型而除了取頭髮使用的工具不同以外，醫療人

In [5]:
# set CKIP drivers
# Single place to change the model name so the three drivers cannot drift apart.
CKIP_MODEL = "bert-base"

ws_driver = CkipWordSegmenter(model=CKIP_MODEL)
pos_driver = CkipPosTagger(model=CKIP_MODEL)
ner_driver = CkipNerChunker(model=CKIP_MODEL)

In [6]:
# apply CKIP
# `text` is the one-element list of cleaned page text. The three results are
# later zipped together with `text`, so each is presumably one entry per input
# string — TODO confirm against the CKIP documentation.
# The CkipWordSegmenter driver is called on the raw text.
ws  = ws_driver(text)
# The CkipPosTagger driver is fed the segmenter's output, not the raw text.
pos = pos_driver(ws)
# The CkipNerChunker driver is called on the raw text again.
ner = ner_driver(text)

Tokenization: 100%|██████████████████████████████| 1/1 [00:00<00:00, 406.35it/s]
Inference: 100%|██████████████████████████████████| 1/1 [00:01<00:00,  1.85s/it]
Tokenization: 100%|██████████████████████████████| 1/1 [00:00<00:00, 817.60it/s]
Inference: 100%|██████████████████████████████████| 1/1 [00:13<00:00, 13.43s/it]
Tokenization: 100%|█████████████████████████████| 1/1 [00:00<00:00, 1040.51it/s]
Inference: 100%|██████████████████████████████████| 1/1 [00:01<00:00,  1.37s/it]


In [7]:
# Flatten the kept words of every sentence into one keyword list.
keyword_list = []
for sentence_ws, sentence_pos in zip(ws, pos):
    # clean() returns (space-joined words, space-joined "word(POS)" items);
    # only the plain words are needed here.
    short, _ = clean(sentence_ws, sentence_pos)
    # split() with no argument returns [] for an empty string, whereas
    # split(' ') returns [''] and would add a bogus empty keyword.
    keyword_list.extend(short.split())

In [8]:
# count key words in the whole 'keyword_list'
TOP_N = 100  # number of most frequent words kept in sub_df

value_counts = pd.Series(keyword_list).value_counts()
df = pd.DataFrame({'Word': value_counts.index, 'Count': value_counts.values})

# Sort by count, highest first. A stable algorithm is requested so that words
# with equal counts keep the order they already have in `df`; the default
# sort kind gives no such guarantee.
sub_df = (
    df.sort_values(by=['Count'], ascending=False, kind='mergesort')
      .reset_index(drop=True)
      .head(TOP_N)
)

### Upload the DataFrame to Google Sheets

In [9]:
import gspread
from google.oauth2.service_account import Credentials
from gspread_dataframe import get_as_dataframe, set_with_dataframe

In [None]:
# google sheet authorization
scopes = [
    'https://www.googleapis.com/auth/spreadsheets',
    'https://www.googleapis.com/auth/drive',
]

# set your api key (json file) as the credential
credentials = Credentials.from_service_account_file('YourJsonFile.json', scopes=scopes)

gc = gspread.authorize(credentials)

# get the google sheet id from the url of the google sheet
spreadsheet_key = 'YourGoogleSheetID'

# use the key and id to open a sheet in the google sheet
sheet = gc.open_by_key(spreadsheet_key).worksheet("TheSheetName")

# Read what is already on the sheet, then drop rows and columns that are
# entirely empty.
# NOTE(review): `skipcolumns` is not an option I can confirm is honoured by
# get_as_dataframe — verify it has any effect.
existing_df = get_as_dataframe(sheet, skiprows=0, skipcolumns=0)
existing_df = existing_df.dropna(axis=0, how='all')
existing_df = existing_df.dropna(axis=1, how='all')

# `existing_df` is always a DataFrame at this point (a None would already have
# failed on .dropna above), so an `is not None` test can never be False.
# Test for actual content instead.
if existing_df.empty:
    new_df = sub_df
else:
    # new columns go to the left of what the sheet already holds
    new_df = pd.concat([sub_df, existing_df], axis=1)

# upload df to the google sheet
set_with_dataframe(sheet, new_df)