In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import requests
import json
import csv
import numpy as np
import pandas as pd


# Location of the preprocessed activity table, relative to the working directory.
ACTIVITY_CSV = 'data/preprocess/activity.csv'

# Read the table and use its `id` column as the row index.
activity_df = pd.read_csv(ACTIVITY_CSV, index_col='id')

# Show the loaded frame as the cell output.
activity_df

In [None]:
# Fill missing `content` with the activity's `title`.
# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on the selected column: that chained inplace form
# triggers a chained-assignment warning in recent pandas and, with
# Copy-on-Write enabled, leaves `activity_df` unchanged.
activity_df['content'] = activity_df['content'].fillna(activity_df['title'])

# Bind the text column only after the fill so later cells see the filled values.
activity_content = activity_df['content']

# Sanity check: rows whose content is still missing after the fill
# (these would be rows where `title` is missing as well).
activity_df[activity_df['content'].isna()]

In [None]:
from ckip_transformers import __version__
from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker

# Report the installed ckip-transformers version.
print(__version__)

# The three drivers below are all constructed with the same model name.
ckip_model_name = "albert-base"

# Word segmentation driver.
print("Initializing drivers ... WS")
ws_driver = CkipWordSegmenter(model=ckip_model_name)

# Part-of-speech tagging driver.
print("Initializing drivers ... POS")
pos_driver = CkipPosTagger(model=ckip_model_name)

# Named-entity recognition driver.
print("Initializing drivers ... NER")
ner_driver = CkipNerChunker(model=ckip_model_name)

print("Initializing drivers ... all done")
print()

In [None]:
# clean function
def clean(sentence_ws, sentence_pos):
  """Reduce one segmented sentence to its noun/verb keywords.

  `sentence_ws` (words) and `sentence_pos` (their POS tags) are walked in
  parallel with zip, so surplus items in the longer sequence are ignored.
  A word is kept only when all of these hold:
    * its tag starts with "N" or "V" (nouns and verbs only);
    * its tag is not "Nep", "Nh" or "Nb" (demonstrative determiners,
      pronouns and proper nouns are dropped);
    * the word is not exactly one character long.

  Returns:
    A 2-tuple of space-joined strings: the kept words, and the kept words
    annotated as "word(tag)".
  """
  excluded_tags = {'Nep', 'Nh', 'Nb'}

  def is_keyword(token, tag):
    # All three checks are evaluated up front, in the same order as before.
    noun_or_verb = tag.startswith(("N", "V"))
    excluded = tag in excluded_tags
    single_character = len(token) == 1
    return noun_or_verb and not excluded and not single_character

  kept = [
    (token, tag)
    for token, tag in zip(sentence_ws, sentence_pos)
    if is_keyword(token, tag)
  ]
  plain_text = " ".join(f"{token}" for token, _ in kept)
  tagged_text = " ".join(f"{token}({tag})" for token, tag in kept)
  return (plain_text, tagged_text)

In [None]:
# Segment every activity text into words, then tag each word's part of speech.
ws = ws_driver(activity_content)
pos = pos_driver(ws)
# NOTE(review): nothing in this cell consumes the NER output; the pass is kept
# so `ner` remains available for inspection. Consider dropping it if it is slow.
ner = ner_driver(activity_content)
print()
print('=====')

# One space-joined keyword string per activity. All four sequences are zipped
# together so the result is truncated to the shortest of them, but only the
# words and POS tags are used; the annotated string returned by clean() is
# discarded.
activity_content_text = [
    clean(sentence_ws, sentence_pos)[0]
    for _sentence, sentence_ws, sentence_pos, _entities in zip(activity_content, ws, pos, ner)
]

activity_content_text

## 設立使用者資料

In [None]:
# Row positions of the activities in the user's history.
# NOTE(review): these are used with `.iloc`, i.e. as positional offsets, while
# `activity_df` is indexed by `id` — confirm positions (not ids) are intended.
user_history = [5]

# A single-element list holding the contents of all history activities
# concatenated back to back (no separator), ready to pass to the CKIP drivers.
user_activity_content = [
    ''.join(activity_df.iloc[position]['content'] for position in user_history)
]

user_activity_content

## 做斷詞分析

In [None]:
# Run the same segmentation / POS / NER pipeline on the user's combined text.
ws = ws_driver(user_activity_content)
pos = pos_driver(ws)
ner = ner_driver(user_activity_content)
print()
print('=====')

# zip() pairs up the inputs and driver outputs element by element; only the
# words and POS tags feed clean(), and only its plain keyword string is kept.
user_content_text = [
    clean(sentence_ws, sentence_pos)[0]
    for _sentence, sentence_ws, sentence_pos, _entities in zip(user_activity_content, ws, pos, ner)
]

user_content_text

## 算出使用者和活動的 TF-IDF 特徵矩陣，並算出相似值

In [None]:
vectorizer = TfidfVectorizer()

# Learn the vocabulary and per-term weights from the activity texts and turn
# each activity into a TF-IDF row vector.
activity_vec = vectorizer.fit_transform(activity_content_text)

# Reuse the already-fitted vectorizer so the user text is expressed in the
# same feature space as the activities (transform only, no re-fitting).
user_vec = vectorizer.transform(user_content_text)

# Peek at the first 20 TF-IDF weights of the first activity. Only that slice
# is converted to a dense array instead of densifying the whole matrix.
print(activity_vec[:1, :20].toarray().ravel())

# Cosine similarity between each user vector and every activity vector.
similarity_scores = cosine_similarity(user_vec, activity_vec)

# One row per entry of `user_content_text`, one column per activity id.
# The row index is derived from the user texts themselves; the previous
# `user_pd.index` referred to a name that is not defined in this notebook.
user_activity_matrix = pd.DataFrame(
    similarity_scores,
    index=pd.RangeIndex(len(user_content_text), name='user'),
    columns=activity_df.index,
)

user_activity_matrix