<a href="https://colab.research.google.com/github/alexlautw9527/for_notebook/blob/main/Fund2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
# Install the third-party packages imported in the next cell; %%capture hides
# pip's console output.
# NOTE(review): versions are unpinned, so the installed releases depend on
# whatever pip resolves at run time — consider pinning for reproducibility.
!pip install node2vec
!pip install glove-python-binary

In [55]:
import argparse
import pprint

import pandas as pd 
import numpy as np


import networkx as nx
from node2vec import Node2Vec
import gensim
from gensim.models.word2vec import Word2Vec

from glove import Glove
from glove import Corpus

In [2]:
# Read the CP950-encoded fund code table and keep only the product code,
# fund name and ISIN columns.
# NOTE(review): assumes Google Drive is already mounted at /content/drive.
fund_code_df = pd.read_csv(
    '/content/drive/MyDrive/05_OTHER_DATA/FUND_CODE_DATA.csv',
    encoding='CP950',
)[['產品代碼', '基金名稱', 'ISIN Code']]

In [42]:
# Read the CP950-encoded fund description table.
# NOTE(review): assumes Google Drive is already mounted at /content/drive.
fund_df = pd.read_csv('/content/drive/MyDrive/05_OTHER_DATA/fund_desc.csv', encoding='CP950')

# Drop rows whose RISK_LEVEL_ALL is 0 and keep the fund name plus the
# categorical attributes used later. `.copy()` makes the result an independent
# frame so the column assignment below does not write into a slice of the raw
# table (which triggers pandas' SettingWithCopyWarning).
fund_df = fund_df.loc[
    fund_df['RISK_LEVEL_ALL'] != 0,
    ['CNAME', 'FUNDCRYID', 'RISK_LEVEL_ALL', '配息', '股債分類', '基金類型'],
].copy()

# Give funds without a fund type an explicit 'NA' category.
fund_df['基金類型'] = fund_df['基金類型'].fillna('NA')

In [43]:
# Cast every column to str.
# NOTE(review): presumably so numeric values (e.g. risk levels) become string
# labels for the graph nodes / Word2Vec tokens built later — confirm.
fund_df = fund_df.astype(str)

## Node2vec

In [82]:
## Build the graph

# One-hot encode each categorical attribute (empty prefix, so the column names
# are the bare category values), index by fund name and stack, so that every
# (fund name, attribute value) pair becomes one row; keep only pairs flagged 1.
fund_edge = (
    pd.get_dummies(
        fund_df,
        columns=['FUNDCRYID', 'RISK_LEVEL_ALL', '配息', '股債分類', '基金類型'],
        prefix='',
        prefix_sep='',
    )
    .set_index('CNAME')
    .stack()
    .loc[lambda flags: flags == 1]
)

In [84]:
## Build the edge list
# The stacked Series is indexed by (fund name, attribute value); its index
# entries are the edge tuples, so only the index is kept as a plain list.
fund_edge = fund_edge.index.to_list()

In [85]:
# Build an undirected graph from the fund_edge edge list.
G = nx.Graph(fund_edge)

In [86]:

# Node2Vec constructor parameters:
#   graph: Input graph
#   dimensions: Embedding dimensions (default: 128)
#   walk_length: Number of nodes in each walk (default: 80)
#   num_walks: Number of walks per node (default: 10)
#   p: Return hyper parameter (default: 1)
#   q: Inout parameter (default: 1)
#   weight_key: On weighted graphs, this is the key for the weight attribute (default: 'weight')
#   workers: Number of workers for parallel execution (default: 1)
#   sampling_strategy: Node specific sampling strategies, supports setting node specific 'q', 'p', 'num_walks' and 'walk_length'.
#   seed: Seed for the random number generator.

# Precompute probabilities and generate walks - **ON WINDOWS ONLY WORKS WITH workers=1**
node2vec = Node2Vec(
    G,
    dimensions=64,
    walk_length=20,
    num_walks=10,
    workers=8,
)  # Use temp_folder for big graphs

Computing transition probabilities:   0%|          | 0/2581 [00:00<?, ?it/s]

In [89]:
# Inspect the first generated random walk (a list of node labels).
node2vec.walks[0]

['宏利中國離岸債券基金Ｂ（新臺幣）（季配息）',
 '2',
 'ＰＩＭＣＯ全球投資級別債券基金Ｅ（美元）（季配權）',
 '投資等級公司債',
 'ＰＩＭＣＯ全球投資級別債券基金Ｅ（美元）（季配息）',
 '投資等級公司債',
 'ＮＮ（Ｌ）投資級公司債基金Ｙ（南非幣對沖）（月配息）（本基金進行配息前未先扣除應負擔之相關費用）（基金之配息來源可能為本金）',
 '01_配息',
 '木星金融機會基金（美元）（半年配權）',
 'USD',
 '台新美國增益高收益債券基金（美元）（月配息）（本基金主要係投資於非投資等級之高風險債券且配息來源可能為本金）',
 '4',
 '瀚亞亞太不動產證券化基金Ａ（新臺幣）',
 '4',
 '霸菱亞洲增長',
 '亞太股市',
 '瀚亞亞太豐收平衡基金Ａ（美元）（本基金有相當比重投資於非投資等級之高風險債券且配息來源可能為本金）',
 '3',
 '鋒裕匯理策略收益債券Ａ（歐元）（本基金進行配息前未先扣除應負擔之相關費用）（本基金有相當比重投資於非投資等級之高風險債券且配息來源可能為本金）',
 '全球債']

### 語料庫訓練

In [90]:
# The random walks are used as the training corpus for Word2Vec below.
node_corpus = node2vec.walks

In [91]:
# Train skip-gram (sg=1) Word2Vec embeddings with negative sampling on the
# random-walk corpus; min_count=1 keeps every node in the vocabulary.
# gensim 4.0 renamed `size` -> `vector_size` and `iter` -> `epochs`, and the old
# names raise TypeError there, so pick the spelling matching the installed
# version to keep this cell runnable on both gensim 3.x and 4.x.
_gensim_major = int(gensim.__version__.split('.')[0])
if _gensim_major >= 4:
    _w2v_size_kwargs = {'vector_size': 64, 'epochs': 20}
else:
    _w2v_size_kwargs = {'size': 64, 'iter': 20}

n2v_model = Word2Vec(
    node_corpus,
    window=5,
    sg=1,
    negative=20,
    min_count=1,
    workers=16,
    **_w2v_size_kwargs,
)

In [92]:
# Keep only funds whose risk level is not 0, restricted to the name and the
# categorical attribute columns. fund_df was cast to str in an earlier cell, so
# a plain `!= 0` would compare strings with an int, be True for every row and
# filter nothing; comparing the numeric value works for both str and numeric
# columns. Values that cannot be parsed become NaN and are kept (NaN != 0).
fund_df = fund_df.loc[
    pd.to_numeric(fund_df['RISK_LEVEL_ALL'], errors='coerce') != 0,
    ['CNAME', 'FUNDCRYID', 'RISK_LEVEL_ALL', '配息', '股債分類', '基金類型'],
]


In [94]:
# Look up the trained vector of every fund name (in fund_df row order) and
# write the resulting matrix as tab-separated text.
np.savetxt(
    fname="n2v_fund_emb.tsv",
    X=n2v_model.wv[fund_df['CNAME']],
    delimiter="\t",
)

In [95]:
# Write the fund name and attribute columns as a header-less, index-less TSV,
# in fund_df row order.
fund_df[
    ['CNAME', 'FUNDCRYID', 'RISK_LEVEL_ALL', '配息', '股債分類', '基金類型']
].to_csv(
    'n2v_fund_emb_meta.tsv',
    sep='\t',
    index=False,
    header=False,
)

In [96]:
# Look up the trained vector of every distinct fund-type value (these values
# are graph nodes too) and write the matrix as tab-separated text.
np.savetxt(
    fname="n2v_mkt_emb.tsv",
    X=n2v_model.wv[fund_df['基金類型'].unique()],
    delimiter="\t",
)

In [97]:
# Write the distinct fund-type values, one per line, as a header-less TSV.
pd.DataFrame({'mkt': fund_df['基金類型'].unique()}).to_csv(
    'n2v_mkt_emb_meta.tsv',
    sep='\t',
    index=False,
    header=False,
)

## GLOVE

In [47]:

# Toy GloVe example on two hand-written tokenised sentences (it does not use
# the fund data): fit a Corpus, train a Glove model on its matrix, round-trip
# both through save/load, then query the neighbours of one word.

# Prepare the dataset
sentense = [['你','是','谁'],['我','是','台灣人']]
corpus_model = Corpus()
corpus_model.fit(sentense, window=10)


#corpus_model.save('corpus.model')
print('Dict size: %s' % len(corpus_model.dictionary))
print('Collocations: %s' % corpus_model.matrix.nnz)

# Train
glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus_model.matrix, epochs=10,
          no_threads=1, verbose=True)
# Attach the corpus dictionary to the trained model before saving / querying
glove.add_dictionary(corpus_model.dictionary)

# Save the model (and reload it from disk)
glove.save('glove.model')
glove = Glove.load('glove.model')
# Save the corpus (and reload it from disk)
corpus_model.save('corpus.model')
corpus_model = Corpus.load('corpus.model')

# Find similar words
glove.most_similar('我', number=10)

ModuleNotFoundError: ignored

In [None]:
def most_similar(w2v_model, words, topn=20):
    """Tabulate the nearest neighbours of several query words side by side.

    Args:
        w2v_model: Model exposing ``wv.most_similar(word, topn=...)`` that
            returns ``(neighbour, similarity)`` pairs.
        words: Iterable of query words.
        topn: Number of neighbours requested per word.

    Returns:
        A DataFrame with two columns per word found in the model: one named
        after the query word holding its neighbours and one named ``'cos'``
        holding the similarity scores. Words missing from the model are
        reported on stdout and skipped; if no word is found the frame is empty.
    """
    per_word_frames = []
    for word in words:
        try:
            neighbours = w2v_model.wv.most_similar(word, topn=topn)
        except KeyError:
            # Only an out-of-vocabulary lookup is expected here; any other
            # error is a real problem and must not be silently swallowed.
            print(word, "not found in Word2Vec model!")
            continue
        per_word_frames.append(pd.DataFrame(neighbours, columns=[word, 'cos']))

    if not per_word_frames:
        # pd.concat rejects an empty list; preserve the empty-frame result.
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(per_word_frames, axis=1)

In [None]:
from google.colab import files

# Download the fund embedding matrix and its metadata. These are the file names
# written by the np.savetxt / to_csv cells earlier in this notebook.
# NOTE(review): assumes the working directory is /content (Colab's default),
# since those cells write with relative paths.
files.download('/content/n2v_fund_emb_meta.tsv')
files.download('/content/n2v_fund_emb.tsv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Combining with pretrained word embeddings (FinText)

In [None]:
! wget https://www.rahimikia.com/FinText/FinText_FastText_Skip-gram.zip

--2021-10-26 15:11:10--  https://www.rahimikia.com/FinText/FinText_FastText_Skip-gram.zip
Resolving www.rahimikia.com (www.rahimikia.com)... 69.163.228.90
Connecting to www.rahimikia.com (www.rahimikia.com)|69.163.228.90|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11656495869 (11G) [application/zip]
Saving to: ‘FinText_FastText_Skip-gram.zip’


2021-10-26 15:19:24 (22.5 MB/s) - ‘FinText_FastText_Skip-gram.zip’ saved [11656495869/11656495869]



In [None]:
!unzip /content/FinText_FastText_Skip-gram.zip

Archive:  /content/FinText_FastText_Skip-gram.zip
   creating: FinText_FastText_Skip-gram/
  inflating: FinText_FastText_Skip-gram/Word_Embedding_2000_2015  
  inflating: FinText_FastText_Skip-gram/README.txt  
  inflating: FinText_FastText_Skip-gram/Word_Embedding_2000_2015.trainables.vectors_ngrams_lockf.npy  
  inflating: FinText_FastText_Skip-gram/Word_Embedding_2000_2015.pkl  
  inflating: FinText_FastText_Skip-gram/Word_Embedding_2000_2015.wv.vectors_ngrams.npy  
  inflating: FinText_FastText_Skip-gram/Word_Embedding_2000_2015.wv.vectors.npy  
  inflating: FinText_FastText_Skip-gram/Word_Embedding_2000_2015.trainables.vectors_vocab_lockf.npy  
  inflating: FinText_FastText_Skip-gram/Word_Embedding_2000_2015.wv.vectors_vocab.npy  
  inflating: FinText_FastText_Skip-gram/Word_Embedding_2000_2015.trainables.syn1neg.npy  


In [None]:
from gensim.models import Word2Vec
from gensim.models import FastText
import gensim.downloader as api 

In [None]:
# Load the saved gensim model from the extracted FinText archive.
# NOTE(review): the archive is named "FastText_Skip-gram" and ships n-gram
# vector files, while this cell uses Word2Vec.load and a Word2Vec_* variable
# name — confirm whether FastText.load is the intended loader.
FinText_Word2Vec_skipgram = Word2Vec.load('/content/FinText_FastText_Skip-gram/Word_Embedding_2000_2015')

In [None]:

# Financial product / service terms whose pretrained embeddings are looked up
# in the following cells.
prod = [
    'fund',
    'etf',
    'stock',
    'preferred_stock',
    'auto_loan',
    'trust',
    'life_insurance',
    'insurance',
    'personal_loan',
    'mortgage',
    'loan',
    'treasury_bond',
    'foreign_exchange',
    'online_banking',
    'fintech',
    'blockchain',
    'bitcoin',
    'robo_advisor',
]

In [None]:
# Index the model's word vectors with the whole list of product terms; the
# recorded output shows one row per term (shape (18, 300)).
product_embedding = FinText_Word2Vec_skipgram.wv[prod]

In [None]:
# Display the embedding matrix.
product_embedding

array([[ 0.14604235,  0.1704098 ,  0.01929267, ..., -0.13392453,
        -0.3286863 ,  0.01605617],
       [ 0.40752319,  0.3141233 , -0.22489968, ..., -0.28437993,
         0.02246348,  0.17138132],
       [ 0.09119733,  0.13523722,  0.01482047, ..., -0.09712761,
        -0.23065072, -0.22710954],
       ...,
       [-0.00303637,  0.145203  , -0.41418603, ..., -0.11681274,
        -0.14459747, -0.02398938],
       [-0.07647095,  0.24916057, -0.12600106, ...,  0.3001527 ,
        -0.21294053, -0.04931631],
       [ 0.29539302, -0.00322265,  0.12641694, ...,  0.13217312,
        -0.4011273 , -0.07066225]], dtype=float32)

In [None]:
# Check the matrix dimensions: (number of product terms, embedding size).
product_embedding.shape

(18, 300)

In [None]:
# Write the product embedding matrix as tab-separated text, plus a header-less
# metadata file listing the product terms one per line in the same order.
np.savetxt(
    fname="product_embedd.tsv",
    X=product_embedding,
    delimiter="\t",
)
pd.DataFrame({'product': prod}).to_csv(
    'prod_embedd_metadata.tsv',
    sep='\t',
    index=False,
    header=False,
)

In [None]:
# Write the product terms, one per line, as a header-less TSV.
# NOTE(review): this repeats the metadata write from the previous cell and
# rewrites the same file with the same content — the cell looks redundant.
pd.DataFrame({'product':prod}).to_csv('prod_embedd_metadata.tsv', sep = '\t',index=False,header=False)

In [None]:
from google.colab import files

# Trigger a browser download for the product metadata file, then for the
# product embedding matrix.
for tsv_path in ('/content/prod_embedd_metadata.tsv', '/content/product_embedd.tsv'):
    files.download(tsv_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>