In [1]:
import requests
import urllib.request
import json
import re
import os

import numpy as np
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

from sklearn.metrics.pairwise import linear_kernel, pairwise_distances

import jieba.analyse

# Load Bookmarks

In [2]:
### Load data (only needs to run once)

catelog = []  # catalogue

# Chrome keeps the default profile's bookmarks as a JSON file under
# %LOCALAPPDATA% (Windows only; a KeyError is raised if the variable is unset).
local_app_data = os.environ["LOCALAPPDATA"]
print(local_app_data)
input_filename = local_app_data + r"\Google\Chrome\User Data\Default\Bookmarks"

with open(input_filename, encoding='utf-8') as f:
    contents = json.load(f)

C:\Users\adam8\AppData\Local


# Parsing
### contents 結構：

```python
{  
    'checksum': <str>,    # 一個檢查用的資訊  
    'roots': {  
        'bookmark_bar': {  
            'children': [<nested with dicts with structure same as this dict>],  
            'date_added': <str with numbers>,  
            'date_modified': <str with numbers>,  
            'guid': <str>,    # example: '00000000-0000-4000-a000-000000000002'  
            'id': <str with numbers>,  
            'name': <str>,  
            'type': <str>    # 有 folder, url  
        },  
        'other': {<same as bookmark_bar>},  
        'synced': {<same as bookmark_bar>}  
    },  
    'sync_metadata': <str>,    # 一大串不知道幹嘛的亂碼  
    'version': <int>  
}
```

In [3]:
def get_nodes_info(root, parent_name):
    """Flatten a nested bookmark tree into a list of node dicts (pre-order).

    Each returned dict is a shallow copy of a node without its 'children'
    key and with an added 'parent' key holding the name of the enclosing
    node. The input tree is left untouched, so the function can be called
    repeatedly on the same data (e.g. when a notebook cell is re-run) and
    always yields the same result.

    Parameters
    ----------
    root : dict
        A bookmark node. It may carry a 'children' list of nodes with the
        same structure; a node that has children must also have a 'name'.
    parent_name : str
        Value stored under 'parent' for ``root`` itself.

    Returns
    -------
    list of dict
        ``root`` first, followed by all of its descendants depth-first.

    Raises
    ------
    KeyError
        If a node with children has no 'name' key.
    """
    # Copy instead of pop()/assign on the caller's dict: mutating the input
    # would strip 'children' and make a second call return only the root.
    node = {key: value for key, value in root.items() if key != 'children'}
    node['parent'] = parent_name
    info_list = [node]

    # 'children' may be missing, None or empty; all mean "leaf".
    for child in root.get('children') or []:
        info_list.extend(get_nodes_info(child, root['name']))

    return info_list

In [4]:
# Flatten each of Chrome's three bookmark roots; the root's own key doubles
# as the 'parent' label of its top-level node.
roots = contents['roots']
bookmark_bar_info, other_info, synced_info = (
    get_nodes_info(roots[root_key], root_key)
    for root_key in ('bookmark_bar', 'other', 'synced')
)

info = [*bookmark_bar_info, *other_info, *synced_info]

# Nodes do not all share the same keys; fill the gaps with empty strings.
df = pd.DataFrame(info).fillna('')
print(df.shape)
df.head()

(521, 9)


Unnamed: 0,date_added,date_modified,guid,id,meta_info,name,parent,type,url
0,13202479594197706,1.3242831177794172e+16,00000000-0000-4000-a000-000000000002,1,,書籤列,bookmark_bar,folder,
1,13233003241034750,1.3233003241037912e+16,f0b52077-6793-498a-b8eb-462dfc400922,1884,,2019-nCoV,書籤列,folder,
2,13226238181043741,,7f8a21b9-b559-44ab-b711-c41800d188d1,1885,,Coronavirus_Taiwan,2019-nCoV,url,https://viator.maps.arcgis.com/apps/opsdashboa...
3,13226407607803828,,24a7d7be-2a87-4481-b157-3773db2f4eab,1886,,2019-nCoV | 武漢肺炎地圖,2019-nCoV,url,https://kiang.github.io/2019-nCoV/
4,13227360115873492,,a09ae3f0-3a5f-417c-ad10-423d35e155f3,1888,,程序員硬核勸告：現在還不是出門的時候 - 幫趣,2019-nCoV,url,https://bangqu.com/a7wXH5.html


# Keywords processing

In [5]:
# Register the stop-word list with jieba before extracting keywords.
jieba.analyse.set_stop_words('meaningless_words.txt')

# Run jieba's TF-IDF keyword extraction on every bookmark title,
# with a progress bar (progress_apply is provided by tqdm.pandas()).
name_keywords = df['name'].progress_apply(jieba.analyse.tfidf)
df['keyword'] = name_keywords

  0%|                                                                                                                                                                                                               | 0/521 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\adam8\AppData\Local\Temp\jieba.cache
Loading model cost 0.953 seconds.
Prefix dict has been built successfully.
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 521/521 [00:01<00:00, 479.42it/s]


In [6]:
# Preview only the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,date_added,date_modified,guid,id,meta_info,name,parent,type,url,keyword
0,13202479594197706,13242831177794171,00000000-0000-4000-a000-000000000002,1,,書籤列,bookmark_bar,folder,,[]
1,13233003241034750,13233003241037911,f0b52077-6793-498a-b8eb-462dfc400922,1884,,2019-nCoV,書籤列,folder,,"[2019, nCoV]"
2,13226238181043741,,7f8a21b9-b559-44ab-b711-c41800d188d1,1885,,Coronavirus_Taiwan,2019-nCoV,url,https://viator.maps.arcgis.com/apps/opsdashboa...,"[Coronavirus, Taiwan]"
3,13226407607803828,,24a7d7be-2a87-4481-b157-3773db2f4eab,1886,,2019-nCoV | 武漢肺炎地圖,2019-nCoV,url,https://kiang.github.io/2019-nCoV/,"[2019, nCoV, 武漢, 地圖, 肺炎]"
4,13227360115873492,,a09ae3f0-3a5f-417c-ad10-423d35e155f3,1888,,程序員硬核勸告：現在還不是出門的時候 - 幫趣,2019-nCoV,url,https://bangqu.com/a7wXH5.html,"[硬核, 勸告, 現在還, 出門, 時候, 幫趣, 程序]"
5,13232695304731928,,8c555a82-3c7b-4f14-8f19-9cfe663e5d19,1890,,300新竹市東區高翠路173巷4弄 - Google 地圖,2019-nCoV,url,https://www.google.com/maps/place/300%E6%96%B0...,"[300, 東區, 高翠路, 173, Google, 地圖, 新竹市]"
6,13233003241047126,13242239034086184,90b08b84-66d4-488d-8e18-20cdf3ba014a,1891,,木刻思,書籤列,folder,,[木刻]
7,13228386784256522,,8d719a89-0635-42d2-a750-52ab77bd8d8f,1892,,Hubstaff - My account: Billing,木刻思,url,https://app.hubstaff.com/users/796279/billing,"[Hubstaff, My, account, Billing]"
8,13228386905254476,,ab84fcbe-ce9f-4823-b200-9066e72d3896,1893,,[HQ] New Member On Board,木刻思,url,https://3.basecamp.com/3287947/projects/4147783,"[HQ, New, Member, Board]"
9,13230802944793614,,b66208e6-5322-4eac-9841-bb50844182ee,1894,,iiNumbers HQ / Business Development Office / D...,木刻思,url,https://gitlab.iinumbers.net/iinumbers-hq/bdo/...,"[Project, iiNumbers, HQ, Business, Development..."


"""
key: 列出不重複的tags，value: 每個tag分別有哪些書籤
record: {tag: [associated_url_index, ...]}
"""

# Invert "bookmark -> keywords" into "tag -> [bookmark id, ...]".
# Tags are registered in first-seen order (dicts preserve insertion order),
# so the row labels of tag_df are identical on every run. Going through
# set() would order the tags by string hash, which changes between kernel
# sessions.
tag2urls = {}
for bookmark_id, bookmark_keywords in zip(df['id'], df['keyword']):
    for tag in bookmark_keywords:
        tag2urls.setdefault(tag, []).append(bookmark_id)

# Unique tags, kept under the original name for any later cell that uses it.
keywords = list(tag2urls)

tag_df = pd.DataFrame(list(tag2urls.items()), columns=['tag', 'indices'])
tag_df['len'] = tag_df['indices'].apply(len)

# Most frequent tags first; a stable sort keeps ties in first-seen order.
tag_df = tag_df.sort_values('len', ascending=False, kind='mergesort')

display(tag_df.head(10), tag_df.tail(10))

# Model

# keyword_indices
# Map every tag to the row label it has in tag_df, then translate each
# bookmark's keyword list into the corresponding list of labels.
index2tag = dict(tag_df['tag'])
tag2index = {tag: label for label, tag in index2tag.items()}


def _to_tag_indices(tags):
    """Return the tag_df row labels for the given list of tags."""
    return [tag2index[tag] for tag in tags]


df['keyword_indices'] = df['keyword'].apply(_to_tag_indices)

# TODO
1. [x] 加入'tag' by 斷詞
2. [x] 加入母資料夾
3. [ ] 類似的東東們，i.e. recommendation system

In [7]:
"""Adam
可以用 folder 當作 y，去 train 關鍵字的距離
"""

"""Adam
也許可以用 character-wise 的 embedding 來算 onehot 距離之類的
"""

'Adam\n也許可以用 character-wise 的 embedding 來算 onehot 距離之類的\n'

# 現在要做出 recommendation system

# other

In [8]:
# def download_thumb_img(url):
#     load_html
#     load_all_img
#     find_max_img

# route = download_thumb_img(googlebookmarkinfo['url'][idx])    # ex: 'C:/example.png'

In [9]:
# Placeholder column for a thumbnail image URL per bookmark; nothing visible
# here populates it yet (presumably the commented-out download_thumb_img
# sketch was meant to -- TODO confirm).
df['img_url'] = None

In [10]:
# Persist the bookmark table to an HDF5 file one directory up, under key 'data'.
# NOTE: the recorded output shows PyTables warning that the object-dtype
# columns are pickled because it cannot map them directly to C types.
df.to_hdf('../metadata.h5', key='data')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->['date_added', 'date_modified', 'guid', 'id', 'meta_info', 'name', 'parent', 'type', 'url', 'keyword', 'img_url']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)
