In [12]:
import json
import re
import os

import numpy as np
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

from sklearn.metrics.pairwise import linear_kernel, pairwise_distances

import jieba.analyse

# Load Bookmarks

In [2]:
### load data 一次就好

catelog = list() # 目录

print(os.environ["LOCALAPPDATA"])
input_filename = os.environ["LOCALAPPDATA"] + r"\Google\Chrome\User Data\Default\Bookmarks"

with open(input_filename, 'r', encoding='utf-8') as f:
    contents = json.load(f)

C:\Users\adam8\AppData\Local


# Parsing
### contents 結構：

```python
{  
    'checksum': <str>,    # 一個檢查用的資訊  
    'roots': {  
        'bookmark_bar': {  
            'children': [<nested with dicts with structure same as this dict>],  
            'date_added': <str with numbers>,  
            'date_modified': <str with numbers>,  
            'guid': <str>,    # example: '00000000-0000-4000-a000-000000000002'  
            'id': <str with numbers>,  
            'name': <str>,  
            'type': <str>    # 有 folder, url  
        },  
        'other': {<same as bookmark_bar>},  
        'synced' {<same as bookmark_bar>}:  
    },  
    sync_metadata: <str>,    # 一大串不知道幹嘛的亂碼  
    version: <int>  
}
```

In [3]:
def get_nodes_info(root, parent_name):
    children = root.pop('children', None)
    root['parent'] = parent_name
    info_list = [root]
    
    if children:
        for child in children:
            info_list.extend(get_nodes_info(child, root['name']))
    
    return info_list

In [4]:
bookmark_bar_info = get_nodes_info(contents['roots']['bookmark_bar'], 'bookmark_bar')
other_info = get_nodes_info(contents['roots']['other'], 'other')
synced_info = get_nodes_info(contents['roots']['synced'], 'synced')

info = bookmark_bar_info + other_info + synced_info

df = pd.DataFrame(info)
print(df.shape)
df.head()

(488, 9)


Unnamed: 0,date_added,date_modified,guid,id,meta_info,name,parent,type,url
0,13202479594197706,1.3241369625133688e+16,00000000-0000-4000-a000-000000000002,1,,書籤列,bookmark_bar,folder,
1,13233003241034750,1.3233003241037912e+16,f0b52077-6793-498a-b8eb-462dfc400922,1884,,2019-nCoV,書籤列,folder,
2,13226238181043741,,7f8a21b9-b559-44ab-b711-c41800d188d1,1885,,Coronavirus_Taiwan,2019-nCoV,url,https://viator.maps.arcgis.com/apps/opsdashboa...
3,13226407607803828,,24a7d7be-2a87-4481-b157-3773db2f4eab,1886,,2019-nCoV | 武漢肺炎地圖,2019-nCoV,url,https://kiang.github.io/2019-nCoV/
4,13227360115873492,,a09ae3f0-3a5f-417c-ad10-423d35e155f3,1888,,程序員硬核勸告：現在還不是出門的時候 - 幫趣,2019-nCoV,url,https://bangqu.com/a7wXH5.html


# Keywords processing

In [5]:
jieba.analyse.set_stop_words('meaningless_words.txt')

df['keyword'] = df['name'].progress_apply(jieba.analyse.tfidf)

  0%|                                                                                          | 0/488 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\adam8\AppData\Local\Temp\jieba.cache
Loading model cost 1.535 seconds.
Prefix dict has been built successfully.
100%|███████████████████████████████████████████████████████████████████████████████| 488/488 [00:01<00:00, 294.40it/s]


In [6]:
"""
key: 列出不重複的tags，value: 每個tag分別有哪些書籤
record: {tag: [associated_url_index, ...]}
"""

keywords = [keyword for keywords in df['keyword'] for keyword in keywords]
keywords = list(set(keywords))

tag2urls = {k:[] for k in keywords}

for idx, row in tqdm(df.iterrows()):
    for item in row['keyword']:
        tag2urls[item].append(row['id'])

tag_df = pd.DataFrame(tag2urls.items(), columns=['tag', 'indices'])
tag_df['len'] = tag_df['indices'].apply(len)

tag_df = tag_df.sort_values('len', ascending=False)

display(tag_df.head(10), tag_df.tail(10))

488it [00:00, 2963.48it/s]


Unnamed: 0,tag,indices,len
1006,YouTube,"[1900, 2798, 2812, 2824, 1946, 1962, 2038, 203...",24
542,幫趣,"[1888, 2819, 2744, 2818, 2820, 2523, 1719, 252...",22
1785,C++,"[1981, 1983, 1985, 1986, 1987, 1990, 1992, 199...",21
1331,PopDaily,"[2755, 2753, 1852, 1856, 1860, 1877, 1878, 182...",19
1907,Python,"[2545, 2015, 2016, 2017, 2018, 2019, 2020, 274...",18
75,頭條,"[2740, 2817, 1717, 1985, 2069, 2070, 2071, 207...",16
1555,生活,"[2748, 2755, 2753, 1856, 1860, 1877, 1878, 182...",16
1216,每日,"[2740, 2817, 1717, 1985, 2069, 2070, 2071, 207...",16
128,學習,"[1939, 1976, 1998, 2053, 2067, 2068, 2075, 207...",15
141,發現,"[1719, 2755, 2753, 1856, 1860, 1877, 1878, 182...",14


Unnamed: 0,tag,indices,len
744,luminoth,[1798],1
743,Vector,[2525],1
741,短褲,[2803],1
740,Arts,[1839],1
738,萬人,[1872],1
737,映射,[1968],1
736,001,[1962],1
734,媽媽,[1872],1
733,試閱,[1962],1
1916,涼快,[1615],1


In [7]:
# keyword_indices
index2tag = dict(tag_df['tag'])
tag2index = {index2tag[k]: k for k in index2tag}

df['keyword_indices'] = df['keyword'].apply(lambda keywords: [tag2index[k] for k in keywords])


# onehot
def keyword_indices_to_vector(indices):
    vec = np.zeros([len(keywords)])
    for index in indices:
        vec[index] += 1
    return vec

onehot = df['keyword_indices'].apply(keyword_indices_to_vector)
onehot = np.stack(onehot.values)
df['onehot'] = onehot.tolist()

df.head()

Unnamed: 0,date_added,date_modified,guid,id,meta_info,name,parent,type,url,keyword,keyword_indices,onehot
0,13202479594197706,1.3241369625133688e+16,00000000-0000-4000-a000-000000000002,1,,書籤列,bookmark_bar,folder,,[],[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,13233003241034750,1.3233003241037912e+16,f0b52077-6793-498a-b8eb-462dfc400922,1884,,2019-nCoV,書籤列,folder,,"[2019, nCoV]","[1626, 308]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,13226238181043741,,7f8a21b9-b559-44ab-b711-c41800d188d1,1885,,Coronavirus_Taiwan,2019-nCoV,url,https://viator.maps.arcgis.com/apps/opsdashboa...,"[Coronavirus, Taiwan]","[661, 207]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,13226407607803828,,24a7d7be-2a87-4481-b157-3773db2f4eab,1886,,2019-nCoV | 武漢肺炎地圖,2019-nCoV,url,https://kiang.github.io/2019-nCoV/,"[2019, nCoV, 武漢, 地圖, 肺炎]","[1626, 308, 569, 1758, 1653]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,13227360115873492,,a09ae3f0-3a5f-417c-ad10-423d35e155f3,1888,,程序員硬核勸告：現在還不是出門的時候 - 幫趣,2019-nCoV,url,https://bangqu.com/a7wXH5.html,"[硬核, 勸告, 現在還, 出門, 時候, 幫趣, 程序]","[1673, 1624, 1189, 1041, 1378, 542, 1286]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


# Model

In [50]:
def return_name(find_similar_function):
    def wrapped(*args, **kwargs):
        result = find_similar_function(*args, **kwargs)
        return result.apply(lambda x: [df.loc[idx, 'name'] for idx in x])
    return wrapped

@return_name
def linear_kernel_highpass(vectors, threshold=1):
    """
    如果 vectors 是 onehot，threshold = 0，那相當於找出擁有同關鍵字的 url 們。
    """
    kernel = linear_kernel(onehot)
    s = pd.Series(kernel.tolist())
    similar_series = s.apply(lambda x: [idx for idx in np.where(np.array(x) >= threshold)[0]])
    return similar_series

@return_name
def l2norm_lowpass(vectors, threshold=1):
    kernel = pairwise_distances(onehot)
    s = pd.Series(kernel.tolist())
    similar_series = s.apply(lambda x: [idx for idx in np.where(np.array(x) <= threshold)[0]])
    return similar_series

In [51]:
%time df['linear_kernel'] = linear_kernel_highpass(onehot)
%time df['l2_norm'] = l2norm_lowpass(onehot)

Wall time: 190 ms
Wall time: 153 ms


In [52]:
with pd.option_context('display.max_colwidth', 500):
    display(df[['name', 'parent', 'linear_kernel', 'l2_norm']])

Unnamed: 0,name,parent,linear_kernel,l2_norm
0,書籤列,bookmark_bar,[],"[書籤列, 木刻思, 碩, 7/21, JupyterLab, (3) Facebook, 每小時, , , , , , , , , , , , MD, , GCP, IDE, R, JupyterLab, docker, NCTU, E3, 舊E3, 數學, 資工, C++, java, python, Qt, CMAKE, WINAPI, clGetPlatformIDs, KMEANS, git, linux, 考研, 物件相等性, gnuplot_i 2.x, 音樂, 琴譜, , d/dx, dx, dx-2, LaTeX, html2pdf, 吃穿, CV, 生活, 日曆, , , , html2pdf, linux, d/dx, dx, dx-2, LaTeX, 琴譜, anime, 其他書籤, 行動版書籤, Alice Y, 書籤]"
1,2019-nCoV,書籤列,"[2019-nCoV, 2019-nCoV | 武漢肺炎地圖, 2019新竹早午餐推薦 | 精選TOP 15間熱門店家- 愛食記]",[2019-nCoV]
2,Coronavirus_Taiwan,2019-nCoV,"[Coronavirus_Taiwan, 下午茶約會別再只想到咖啡廳！台北6間現代茶館，質感空間配上細緻品茶讓舒壓程度再加乘 | Vogue Taiwan, 中華民國考選部(Ministry of Examination,R.O.C(Taiwan))全球資訊網 - 考畢試題查詢(含測驗題答案)]",[Coronavirus_Taiwan]
3,2019-nCoV | 武漢肺炎地圖,2019-nCoV,"[2019-nCoV, 2019-nCoV | 武漢肺炎地圖, 300新竹市東區高翠路173巷4弄 - Google 地圖, 2019新竹早午餐推薦 | 精選TOP 15間熱門店家- 愛食記]",[2019-nCoV | 武漢肺炎地圖]
4,程序員硬核勸告：現在還不是出門的時候 - 幫趣,2019-nCoV,"[程序員硬核勸告：現在還不是出門的時候 - 幫趣, 超細節的BERT/Transformer知識點 - 幫趣, 超細節的BERT/Transformer知識點 - 幫趣, 全面改進Transformer類預訓練模型，自然語言任務超越BERT - 幫趣, 超細節的BERT/Transformer知識點 - 幫趣, 什麼是小樣本學習？這篇綜述文章用166篇參考文獻告訴你答案 - 幫趣, 華裔教授發現二次方程「極簡」解法：丟掉公式，全球教科書可能都要改了 - 幫趣, 斯坦福和伯克利都在用的線性代數教材，現在可以免費下載了 - 幫趣, 5種快速易用的Python Matplotlib數據可視化方法 - 幫趣, 標題黨太嚇人？這篇文章會告訴你DeepMind關係推理網絡的真實面貌 - 幫趣, 教程 | 初學者如何學習機器學習中的L1和L2正則化 - 幫趣, 騰訊AI Lab提出新型損失函數LMCL：可顯著增強人臉識別模型的判別能力 - 幫趣, 一文帶你讀懂深度學習：AI 認識世界的方式如同小孩 - 幫趣, 模擬世界的模型：谷歌大腦與Jürgen Schmidhuber提出「人工智能夢境」...",[程序員硬核勸告：現在還不是出門的時候 - 幫趣]
5,300新竹市東區高翠路173巷4弄 - Google 地圖,2019-nCoV,"[2019-nCoV | 武漢肺炎地圖, 300新竹市東區高翠路173巷4弄 - Google 地圖, 我的雲端硬碟 - Google 雲端硬碟, Linear Regression - Google 簡報, Python风格规范 — Google 开源项目风格指南, 研究所考古題及詳解 - Google 雲端硬碟, Top 10 新竹市最佳晚餐餐廳 - TripAdvisor, Functional Analysis for Probability and Stochastic Processes: An Introduction - Adam Bobrowski - Google 圖書, 異國迷宮的十字路口 - Google 雲端硬碟, Anime1 動畫編號 - Google 文件, Top 10 新竹市最佳晚餐餐廳 - TripAdvisor]",[300新竹市東區高翠路173巷4弄 - Google 地圖]
6,木刻思,書籤列,[木刻思],"[書籤列, 木刻思, 碩, , , , , , , , , , , , , R, , 吃穿, 日曆, , , , 其他書籤, 書籤]"
7,Hubstaff - My account: Billing,木刻思,[Hubstaff - My account: Billing],[Hubstaff - My account: Billing]
8,[HQ] New Member On Board,木刻思,"[[HQ] New Member On Board, iiNumbers HQ / Business Development Office / Department of Project Management / Project Team TSMC Time Signal Abnormality Detection 20202 / anomaly-signal-detecting-model · GitLab]",[[HQ] New Member On Board]
9,iiNumbers HQ / Business Development Office / Department of Project Management / Project Team TSMC Time Signal Abnormality Detection 20202 / anomaly-signal-detecting-model · GitLab,木刻思,"[[HQ] New Member On Board, iiNumbers HQ / Business Development Office / Department of Project Management / Project Team TSMC Time Signal Abnormality Detection 20202 / anomaly-signal-detecting-model · GitLab, Microsoft Office 首頁]",[iiNumbers HQ / Business Development Office / Department of Project Management / Project Team TSMC Time Signal Abnormality Detection 20202 / anomaly-signal-detecting-model · GitLab]


# TODO
1. [x] 加入'tag' by 斷詞
2. [x] 加入母資料夾
3. [ ] 類似的東東們，i.e. recommendation system

In [None]:
"""Adam
可以用 folder 當作 y，去 train 關鍵字的距離
"""

"""Adam
也許可以用 character-wise 的 embedding 來算 onehot 距離之類的
"""

# 現在要做出 recommendation system

In [None]:
### main do things and output to md

bookmark_bar = html_for_node(contents['roots']['bookmark_bar'])
other = html_for_node(contents['roots']['other'])
catelog_str = ''.join(a for a in catelog)

output_file_name = "output_markdown.md"
with open(output_file_name, 'w', encoding='utf-8') as f:
#     f.write(output_file_template.format(catelog=catelog_str, bookmark_bar=bookmark_bar, other=other))    # origin
    f.write(output_file_template.format(other=other))

# other

In [None]:
# def download_thumb_img(url):
#     load_html
#     load_all_img
#     find_max_img

# route = download_thumb_img(googlebookmarkinfo['url'][idx])    # ex: 'C:/example.png'

# 爬一個網址中的所有圖片