In [None]:
import json
import re
import os

import numpy as np
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

from sklearn.metrics.pairwise import linear_kernel, pairwise_distances

import jieba.analyse

# Load Bookmarks

In [None]:
# Load the raw Chrome bookmarks file -- running this cell once is enough.

catelog = []  # table of contents

print(os.environ["LOCALAPPDATA"])
input_filename = os.environ["LOCALAPPDATA"] + r"\Google\Chrome\User Data\Default\Bookmarks"

# The bookmarks file is a single JSON document; read it whole and parse it.
with open(input_filename, mode='r', encoding='utf-8') as f:
    contents = json.loads(f.read())

# Parsing
### contents 結構：

```python
{  
    'checksum': <str>,    # 一個檢查用的資訊  
    'roots': {  
        'bookmark_bar': {  
            'children': [<nested with dicts with structure same as this dict>],  
            'date_added': <str with numbers>,  
            'date_modified': <str with numbers>,  
            'guid': <str>,    # example: '00000000-0000-4000-a000-000000000002'  
            'id': <str with numbers>,  
            'name': <str>,  
            'type': <str>    # 有 folder, url  
        },  
        'other': {<same as bookmark_bar>},  
        'synced': {<same as bookmark_bar>}  
    },  
    'sync_metadata': <str>,    # 一大串不知道幹嘛的亂碼  
    'version': <int>  
}
```

In [None]:
def get_nodes_info(root, parent_name):
    """Flatten a bookmark (sub)tree into a flat list of node records.

    Args:
        root: dict describing one node. An optional 'children' entry holds
            the child node dicts; a node with a non-empty 'children' list
            must also have a 'name' entry (it becomes the children's parent).
        parent_name: value stored under 'parent' in the record of ``root``.

    Returns:
        A list of dicts in depth-first pre-order (``root`` first, then each
        child followed by its own descendants). Every record is a shallow
        copy of the node without its 'children' entry and with an added
        'parent' entry.

    ``root`` itself is not modified, so the same tree can be flattened again
    or reused by later cells.
    """
    # Copy instead of pop()/assign on the caller's dict: mutating the input
    # would strip the tree of its children after the first call.
    node = {key: value for key, value in root.items() if key != 'children'}
    node['parent'] = parent_name
    info_list = [node]

    # A missing, None or empty 'children' entry all mean "leaf node".
    for child in root.get('children') or ():
        info_list.extend(get_nodes_info(child, root['name']))

    return info_list

In [None]:
# Flatten each of the three root folders; the root's key doubles as the
# 'parent' value recorded for the root node itself.
bookmark_bar_info, other_info, synced_info = [
    get_nodes_info(contents['roots'][root_key], root_key)
    for root_key in ('bookmark_bar', 'other', 'synced')
]

# One flat list of records -> one DataFrame row per node.
info = [*bookmark_bar_info, *other_info, *synced_info]

df = pd.DataFrame(info)
print(df.shape)
df.head()

# Keywords processing

In [None]:
# Register the stop-word list with jieba.analyse (the path is relative to the
# working directory).
jieba.analyse.set_stop_words('meaningless_words.txt')

# Apply jieba.analyse.tfidf to every bookmark name and store the result in a
# new 'keyword' column; later cells iterate over each entry as a collection of
# keyword strings. progress_apply is the progress-bar variant of apply
# (presumably registered by the tqdm.pandas() call in the imports cell -- confirm).
df['keyword'] = df['name'].progress_apply(jieba.analyse.tfidf)

In [None]:
"""
key: 列出不重複的tags，value: 每個tag分別有哪些書籤
record: {tag: [associated_url_index, ...]}
"""

# Build the inverted index "tag -> ids of the bookmarks carrying that tag".
#   keywords : the distinct tags, in first-seen order
#   tag2urls : {tag: [bookmark id, ...]}
#   tag_df   : one row per tag with its id list and the list length

# dict.fromkeys de-duplicates while keeping first-seen order, so the tag order
# (and the tag ids derived from it further down) is the same on every run.
# list(set(...)) would order the strings arbitrarily from one kernel to the next.
keywords = list(dict.fromkeys(
    keyword for row_keywords in df['keyword'] for keyword in row_keywords
))

tag2urls = {k: [] for k in keywords}

# iterrows() is a generator without a length; total= lets tqdm draw a real
# progress bar instead of a bare counter.
for idx, row in tqdm(df.iterrows(), total=len(df)):
    for item in row['keyword']:
        tag2urls[item].append(row['id'])

tag_df = pd.DataFrame(tag2urls.items(), columns=['tag', 'indices'])
tag_df['len'] = tag_df['indices'].apply(len)

# Most frequent tags first.
tag_df = tag_df.sort_values('len', ascending=False)

display(tag_df.head(10), tag_df.tail(10))

In [None]:
# Tag id <-> tag lookups. The ids are tag_df's index labels, not the row
# positions after sorting.
index2tag = dict(tag_df['tag'].items())
tag2index = {tag: index for index, tag in index2tag.items()}

# Translate every bookmark's keyword list into the matching list of tag ids.
df['keyword_indices'] = df['keyword'].apply(lambda tags: [tag2index[tag] for tag in tags])


# onehot
def keyword_indices_to_vector(indices):
    """Turn a collection of tag ids into a count vector.

    The vector has one float slot per entry of the module-level ``keywords``
    list; slot ``i`` holds how many times ``i`` occurs in ``indices``.
    """
    vec = np.zeros(len(keywords))
    hits = list(indices)
    if hits:
        # Unbuffered add: an id that appears several times is counted each time.
        np.add.at(vec, hits, 1)
    return vec

# Stack the per-bookmark keyword vectors into one 2-D matrix
# (one row per bookmark), and keep a plain-list copy of each row in df.
onehot = np.stack([
    keyword_indices_to_vector(tag_ids) for tag_ids in df['keyword_indices']
])
df['onehot'] = onehot.tolist()

df.head()

# Model

In [None]:
def return_name(find_similar_function):
    """Decorator: convert a similarity search's row indices into bookmark names.

    Args:
        find_similar_function: callable returning a pandas Series whose
            elements are iterables of row positions into the module-level ``df``.

    Returns:
        A wrapper with the same call signature that returns a Series of lists
        holding the ``df['name']`` values at those positions.
    """
    from functools import wraps

    # wraps() keeps the decorated function's __name__ and docstring, which a
    # bare closure would replace with "wrapped" and None.
    @wraps(find_similar_function)
    def wrapped(*args, **kwargs):
        result = find_similar_function(*args, **kwargs)
        # The indices are row positions, so look them up positionally in one
        # array instead of doing a label-based df.loc lookup per element.
        names = df['name'].values
        return result.apply(lambda indices: [names[idx] for idx in indices])
    return wrapped

@return_name
def linear_kernel_highpass(vectors, threshold=1):
    """
    For every row of ``vectors``, find the rows whose linear kernel (dot
    product) with it is at least ``threshold``.

    With 0/1 keyword vectors the kernel value is the number of shared
    keywords, so the default ``threshold=1`` means "shares at least one
    keyword". ``threshold=0`` matches every row, because such dot products
    are never negative.

    Args:
        vectors: 2-D array-like, one row per item.
        threshold: minimum kernel value (inclusive) for two rows to match.

    Returns:
        A pandas Series with one list per row. Because of ``@return_name``
        the lists hold the matching rows' names rather than their indices.
    """
    # Use the argument, not the module-level `onehot`, so the function works
    # on whatever matrix the caller passes in.
    kernel = linear_kernel(vectors)
    similar_series = pd.Series(
        [list(np.where(row >= threshold)[0]) for row in kernel],
        dtype=object,
    )
    return similar_series

@return_name
def l2norm_lowpass(vectors, threshold=1):
    """
    For every row of ``vectors``, find the rows whose pairwise distance to it
    is at most ``threshold``.

    The distance is whatever ``pairwise_distances`` computes with its default
    metric (Euclidean, per the scikit-learn documentation). A row is at
    distance 0 from itself, so it matches itself for any ``threshold >= 0``.

    Args:
        vectors: 2-D array-like, one row per item.
        threshold: maximum distance (inclusive) for two rows to match.

    Returns:
        A pandas Series with one list per row. Because of ``@return_name``
        the lists hold the matching rows' names rather than their indices.
    """
    # Use the argument, not the module-level `onehot`, so the function works
    # on whatever matrix the caller passes in.
    kernel = pairwise_distances(vectors)
    similar_series = pd.Series(
        [list(np.where(row <= threshold)[0]) for row in kernel],
        dtype=object,
    )
    return similar_series

In [None]:
# Run both similarity searches on the `onehot` matrix, timing each with %time,
# and store the per-bookmark lists of similar bookmark names as new df columns.
%time df['linear_kernel'] = linear_kernel_highpass(onehot)
%time df['l2_norm'] = l2norm_lowpass(onehot)

In [None]:
# Show each bookmark next to its parent and both similarity results; the
# column width is raised to 500 only inside this `with` block.
# NOTE(review): this renders every row of df, not a preview.
with pd.option_context('display.max_colwidth', 500):
    display(df[['name', 'parent', 'linear_kernel', 'l2_norm']])

# TODO
1. [x] 加入'tag' by 斷詞
2. [x] 加入母資料夾
3. [ ] 類似的東東們，i.e. recommendation system

In [None]:
"""Adam
可以用 folder 當作 y，去 train 關鍵字的距離
"""

"""Adam
也許可以用 character-wise 的 embedding 來算 onehot 距離之類的
"""

# 現在要做出 recommendation system

In [None]:
# Render the bookmark roots and write the result to a Markdown file.
# NOTE(review): html_for_node and output_file_template are not defined in the
# visible part of this notebook -- confirm they exist before running this cell.

bookmark_bar = html_for_node(contents['roots']['bookmark_bar'])
other = html_for_node(contents['roots']['other'])
catelog_str = ''.join(a for a in catelog)

output_file_name = "output_markdown.md"
with open(output_file_name, 'w', encoding='utf-8') as f:
    # The commented-out call below is the original one; it also passed
    # catelog and bookmark_bar to the template.
#     f.write(output_file_template.format(catelog=catelog_str, bookmark_bar=bookmark_bar, other=other))    # origin
    # Only `other` is passed to the template here; bookmark_bar and
    # catelog_str are computed above but not written.
    f.write(output_file_template.format(other=other))

# other

In [None]:
# def download_thumb_img(url):
#     load_html
#     load_all_img
#     find_max_img

# route = download_thumb_img(googlebookmarkinfo['url'][idx])    # ex: 'C:/example.png'

# 爬一個網址中的所有圖片