# Network Analysis

## Word Similarities from Embeddings

If necessary, install `spacy` and the Chinese language model `zh_core_web_lg` (glove embeddings).

In [None]:
# !pip install spacy==2.3
# !spacy download zh_core_web_lg

Load the packages.

In [None]:
import numpy as np
import spacy
# Load the large Chinese spaCy model; its vocabulary supplies the word vectors.
nlp_zh = spacy.load('zh_core_web_lg')

# Seed list of near-synonyms analysed in the rest of the notebook.
near_syns = [
    '覺得', '認為', '宣稱', '表示', '強調',
    '顯示', '說明', '指出', '提出', '主張',
]


Inspect the word vectors matrix from the spacy model.

In [None]:
# Vector table attached to the model's vocabulary.
glove_word_vectors = nlp_zh.vocab.vectors
vectors_shape = glove_word_vectors.shape
print('Spacy GloVe word vectors Shape: (vocab_size, embedding_dim)', vectors_shape)
# Last expression of the cell: length of the same vector table.
len(glove_word_vectors)

Check the similarity of 認為 to each of the other words in `near_syns`.

In [None]:
# Reference word: every other near-synonym is compared against it.
w1 = nlp_zh.vocab['認為']

# w1 similarities with others
for w_text in near_syns:
    # Compare on the text form. `w1` is a vocabulary entry, not a string, so
    # testing the string against `w1` itself never matches and the reference
    # word would be printed alongside the others.
    if w_text != w1.text:
        print(w_text, ':', w1.similarity(nlp_zh.vocab[w_text]))

To reduce the computation cost, extract the vocabulary of the Chinese model by excluding:
- ascii characters
- digits
- punctuations

And also, consider only two-character words.

In [5]:
# Every string stored in the model's vocabulary; no filtering is applied here.
vocab = [entry for entry in nlp_zh.vocab.strings]
print(len(vocab))
# Peek at an arbitrary slice to see what kinds of strings are in there.
print(vocab[20000:20200])

544337
['2022', '022', '乌拉特后旗', '特后旗', '温差', '湘潭', 'play', '留學', '索取', '透明度', '孤立', '伊始', '安全法', '故居', '中医院', '番茄', '07月', '历任', '預算', '十字', '手柄', '利润率', '133', '涛', 'Office', '宝博', '企稳', '加�', '代辦', '紧缺', '重现', '冲着', '大利', '播种', '随手', '克什克腾旗', '克腾旗', "'s", "'x", '换来', '受灾', '亮眼', '峦�', '峦', '计数', '操穴', '100米', '00米', '展品', '帶動', '前任', 'a站', '表率', '社科', '供奉', '安检', '吉泽明', '学生会', '三线', '清凉', '取暖', '隐蔽', '无所谓', '不在乎', '粗大', '串联', '切尔西', '時光', '增殖', '宜宾市', '溫暖', '燕子', '燕', '后天', '冒出', '權力', '倫敦', '波司登', '胜地', '值当', '康健', '协和', '朴素', '胸口', '樱花', '樱', '孔明', '少许', '嵌入', '镍', '掘金', '掘', '推�', '项链', '包赢', '制作人', '增产', '交流区', '妆品', '妆', '温恒', '未婚', '非金属', '事前', '台账', '强强', '银行家', '大树', '小哥', '纱', '肤色', '肤', '陡然', '陡', '打水', '電源', '项目部', '樂團', '兩位', '来不及', '邻家', '外星人', '黄网站', '南充', '市直', '带入', '電影院', '摔倒', '礼服', '建造师', '５', '自拍区', '贯通', '沿岸', '透玩', 'LOGO', 'logo', 'OGO', '他家', '领空', '稀少', '13%', '山林', '频w', '算单', '田野', '猜想', '這裏', '增強', '文山', '不俗', '收費', '配电', '利害', '萌', '捡', '开播', '依规', '深知', '

In [6]:
target_word = '覺得'
target_word_vocab = nlp_zh.vocab[target_word]

# Pair each vocabulary string with its vocabulary entry, then score every entry
# that has a non-zero vector against the target word.
string_lexeme_pairs = ((s, nlp_zh.vocab[s]) for s in vocab)
word_sim = [
    (s, target_word_vocab.similarity(lexeme))
    for s, lexeme in string_lexeme_pairs
    if lexeme.vector is not None and np.count_nonzero(lexeme.vector)
]

In [7]:
# Rank all (word, similarity) pairs from most to least similar; show the top ten.
ranked_word_sim = sorted(word_sim, key=lambda pair: pair[1], reverse=True)
ranked_word_sim[:10]

[('覺得', 1.0),
 ('覺', 0.8478886),
 ('其實', 0.7956978),
 ('會覺', 0.7882689),
 ('以為', 0.78638536),
 ('感覺', 0.78400886),
 ('看來', 0.77983254),
 ('畢竟', 0.7633344),
 ('看起來', 0.76294947),
 ('因為', 0.7625315)]

Each entry in the model's `vocab` has several properties that are useful for filtering out irrelevant words before computing the word similarities.

In [8]:
# Candidate filter condition kept as a note (not executed):
#w.is_lower == word.is_lower and w.prob >= -15

w1 = nlp_zh.vocab['覺得']
w2 = nlp_zh.vocab['ship']

# Flags available on a vocabulary entry. Only the last bare expression of a
# cell is displayed, so just `w2.is_punct` appears in the output.
w2.is_ascii
w2.is_currency
w2.is_punct

False

Define functions to extract top-N similar words

- Functions taken from [this SO discussion thread](https://stackoverflow.com/questions/57697374/list-most-similar-words-in-spacy-in-pretrained-model)

In [9]:
from numba import jit

@jit(nopython=True)
def cosine_similarity_numba(u:np.ndarray, v:np.ndarray):
    """Cosine similarity of two equal-length vectors, compiled with numba.

    The dot product and both squared norms are accumulated in one pass over
    the elements. If either squared norm is zero the function returns 1
    instead of dividing by zero.

    Args:
        u: first vector (indexed along its first axis).
        v: second vector; must have the same first-axis length as ``u``.

    Returns:
        ``dot(u, v) / sqrt(dot(u, u) * dot(v, v))``, or 1 for a zero vector.
    """
    n = u.shape[0]
    assert(n == v.shape[0])
    dot = 0
    sq_u = 0
    sq_v = 0
    for k in range(n):
        a = u[k]
        b = v[k]
        dot += a*b
        sq_u += a*a
        sq_v += b*b
    if sq_u == 0 or sq_v == 0:
        # Degenerate case: keep the fallback value of 1.
        cos_theta = 1
    else:
        cos_theta = dot/np.sqrt(sq_u*sq_v)
    return cos_theta


In [10]:
def most_similar_v1(word, topn=5):
    """Return the `topn` two-character words most similar to `word`.

    Candidates are the entries of ``nlp_zh.vocab`` that have a non-zero
    vector, are not ASCII, are not punctuation and are exactly two characters
    long. They are ranked with ``cosine_similarity_numba``.

    The query word is removed from the candidates *before* the list is cut to
    `topn`. Slicing to ``topn + 1`` first and filtering afterwards returns one
    item too many whenever the query word is not among the candidates (for
    example when it is not two characters long).

    Args:
        word: query word; converted with ``str()`` before the vocab lookup.
        topn: maximum number of neighbours to return.

    Returns:
        A list of ``(text, similarity)`` tuples, best match first, with at
        most `topn` entries. The reported similarity is
        ``candidate.similarity(query)``.
    """
    if topn <= 0:
        return []

    word = nlp_zh.vocab[str(word)]
    # Read the query vector once instead of once per comparison.
    target_vector = word.vector

    # Cheap text checks come first so the vector is only inspected when needed.
    candidates = [
        w for w in nlp_zh.vocab
        if len(w.text) == 2
        and w.text != word.text
        and not w.is_ascii
        and not w.is_punct
        and np.count_nonzero(w.vector)
    ]

    candidates.sort(
        key=lambda w: cosine_similarity_numba(w.vector, target_vector),
        reverse=True,
    )

    return [(w.text, w.similarity(word)) for w in candidates[:topn]]


In [11]:
def most_similar_v2(word, topn=5):
    """Return the `topn` two-character words most similar to `word`.

    Same candidate filter as the numba-based variant (non-zero vector, not
    ASCII, not punctuation, exactly two characters), but the ranking uses the
    vocabulary entry's own ``similarity`` method.

    The query word is removed from the candidates *before* the list is cut to
    `topn`. Slicing to ``topn + 1`` first and filtering afterwards returns one
    item too many whenever the query word is not among the candidates (for
    example when it is not two characters long).

    Args:
        word: query word; converted with ``str()`` before the vocab lookup.
        topn: maximum number of neighbours to return.

    Returns:
        A list of ``(text, similarity)`` tuples, best match first, with at
        most `topn` entries.
    """
    if topn <= 0:
        return []

    word = nlp_zh.vocab[str(word)]

    # Cheap text checks come first so the vector is only inspected when needed.
    candidates = [
        w for w in nlp_zh.vocab
        if len(w.text) == 2
        and w.text != word.text
        and not w.is_ascii
        and not w.is_punct
        and np.count_nonzero(w.vector)
    ]

    candidates.sort(key=lambda w: word.similarity(w), reverse=True)

    return [(w.text, w.similarity(word)) for w in candidates[:topn]]



In [12]:
%%time
# Time the numba-ranked variant on a single query word.
most_similar_v1("覺得", topn=3)

CPU times: user 7.19 s, sys: 319 ms, total: 7.51 s
Wall time: 28.6 s


[('其實', 0.7956978), ('會覺', 0.7882689), ('以為', 0.78638536)]

In [None]:
%%time
# Time the variant that ranks with the vocabulary entry's own `similarity`.
most_similar_v2("覺得", topn=3)

Extract top N similar words for all near-syns:

In [None]:
%%time
# Map each near-synonym to its list of (neighbour, similarity) pairs.
near_syn_topn = {seed: most_similar_v1(seed, topn=500) for seed in near_syns}

Top 10 similar words for `near_syns[0]`

In [None]:
# First ten entries stored for the first word in `near_syns`.
near_syn_topn[near_syns[0]][:10]

In [None]:
# Flatten the per-word neighbour lists into (seed, neighbour, similarity) triples.
near_syn_topn_list = [
    (seed, neighbour, score)
    for seed, neighbours in near_syn_topn.items()
    for neighbour, score in neighbours
]

In [None]:
# Sanity check: show the first ten flattened triples and the total count.
print(near_syn_topn_list[:10])
print(len(near_syn_topn_list))

In [None]:
import pandas as pd
# One row per triple: source word, neighbouring word, similarity score.
df = pd.DataFrame(data=near_syn_topn_list, columns=['w1','w2','sim'])
# Display only the pairs with a similarity above 0.7.
high_sim_mask = df['sim'] > 0.7
df[high_sim_mask]

## Creating a Network

In [None]:
import networkx as nx
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial.distance import cosine
# Start with an empty undirected graph; `G` is reassigned later from the edge list.
G = nx.Graph()

In [None]:
# Keep only pairs with a similarity above 0.5.
df2 = df[df['sim'] > 0.5]
# Every word that appears on either side of a kept pair. Sorting makes the
# node order identical on every run; the iteration order of a set of strings
# is not stable between interpreter sessions.
nodes_id = sorted(set(df2['w1']) | set(df2['w2']))
print(len(nodes_id))

- Word vectors of all nodes from spaCy
- Pairwise cosine similarity of all nodes

In [None]:
# Pairwise cosine similarity of all nodes.
#
# The word vectors are stacked into one (m, dim) matrix, each row is scaled to
# unit length, and a single matrix product yields all m*m cosine similarities.
# This replaces m*m Python-level similarity calls, each with its own vocab
# lookups. Values can differ from the per-pair calls in the last few floating
# point digits.
#
# NOTE: despite its name, `distances` holds similarities (higher = closer).
m = len(nodes_id)
distances = np.zeros((m, m))

if m:
    node_vectors = np.array(
        [nlp_zh.vocab[node].vector for node in nodes_id], dtype=np.float64
    )
    norms = np.linalg.norm(node_vectors, axis=1, keepdims=True)
    # Leave all-zero rows as zeros so their similarity is 0 rather than NaN.
    norms[norms == 0] = 1.0
    unit_vectors = node_vectors / norms
    distances = unit_vectors @ unit_vectors.T

In [None]:
# Flatten the similarity matrix into an edge list.
#
# CUTOFF is the single place to tune the threshold; the comparison now uses it
# instead of a second hard-coded 0.8. Both (i, j) and (j, i) are emitted for a
# qualifying pair, and self-pairs are skipped.
CUTOFF = 0.8
distances_flat = [
    (nodes_id[i], nodes_id[j], distances[i, j])
    for i in range(m)
    for j in range(m)
    if i != j and distances[i, j] > CUTOFF
]

edges_df = pd.DataFrame(distances_flat, columns=['w1','w2','sim'])
# Preview a slice of the edge table.
edges_df.loc[100:140,:]

In [None]:
# (rows, columns) of the edge table built from the thresholded similarities.
print(edges_df.shape)

In [None]:
# Add the near-synonym -> neighbour pairs to the node-to-node edges.
# `DataFrame.append` no longer exists in pandas 2.x, so use `pd.concat`.
# Dropping exact duplicate rows keeps the cell safe to re-run: running it
# twice does not stack `df2` onto the table a second time.
edges_df = (
    pd.concat([edges_df, df2], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Build the graph from the edge table: `w1`/`w2` are the endpoints and `sim`
# is stored as an edge attribute.
G = nx.from_pandas_edgelist(edges_df, source='w1', target='w2', edge_attr='sim')

In [None]:
def myRescaler(x, out_min=5, out_max=20):
    """Linearly rescale `x` so that its minimum maps to `out_min` and its maximum to `out_max`.

    Args:
        x: sequence of numbers.
        out_min: value assigned to the smallest input (default 5).
        out_max: value assigned to the largest input (default 20).

    Returns:
        A list with one rescaled value per input element. An empty input
        yields an empty list instead of raising on ``min()`` of an empty array.
    """
    values = np.asarray(x, dtype=float)
    if values.size == 0:
        return []
    rescaled = np.interp(values, (values.min(), values.max()), (out_min, out_max))
    return list(rescaled)

In [None]:
# Node table for the visualisation: one row per graph node, in `G.nodes` order.
node_ids = list(G.nodes)
betweenness = nx.betweenness_centrality(G)
eigenvector = nx.eigenvector_centrality(G)

# Look each centrality score up by node id so the values line up with the
# `id` column regardless of how the score containers are ordered.
nodes_df = pd.DataFrame({
    'id': node_ids,
    'betweenness': myRescaler([betweenness[node] for node in node_ids]),
    'eigenvector': myRescaler([eigenvector[node] for node in node_ids]),
})

# Decide "is this node a near-synonym?" from the row's own id. Iterating over
# `nodes_id` pairs sizes with the wrong rows when its order differs from
# `G.nodes`, and testing the float scores against the list of words never
# matches, so the near-synonym override would never apply.
is_near_syn = nodes_df['id'].isin(near_syns)
nodes_df['size'] = np.where(is_near_syn, 10, 5)
nodes_df['size2'] = nodes_df['eigenvector'].where(~is_near_syn, 30)

In [None]:
from pyvis.network import Network

In [None]:
# Interactive pyvis canvas, 1024px square.
Gvis = Network(height="1024px", width="1024px", notebook=False)

# Copy nodes and edges over from the networkx graph; each node's `value` is
# taken from the `size2` column of the node table.
graph_nodes = list(G.nodes)
graph_edges = list(G.edges)
Gvis.add_nodes(graph_nodes, value=nodes_df['size2'])
Gvis.add_edges(graph_edges)

# Show only the physics controls in the rendered page.
Gvis.show_buttons(filter_=['physics'])

In [None]:
# Render the network to the file 'Gvis.html'.
Gvis.show('Gvis.html')

## Visualizing a Network