## Modeling of Complex Network Project

## Data Processing

In [None]:
import os
import json
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import umap
from SDCN.evaluation import eva

# Directory holding the Cora input files (cora.cites, cora.content); the
# JSON/txt exports produced by the cells below are written here as well.
data_dir = os.path.expanduser("./data/cora")

In [None]:
# Load the citation list: a tab-separated, header-less file whose two columns
# are named "target" and "source" in that order
# (presumably cited paper / citing paper -- confirm against the dataset README).
cites_path = os.path.join(data_dir, "cora.cites")
edgelist = pd.read_csv(cites_path, sep='\t', header=None, names=["target", "source"])

# Tag every edge with the same relation label so it is carried over as an edge attribute.
edgelist["label"] = "cites"

# Build an undirected graph from the edge table and mark every node as a "paper".
G = nx.from_pandas_edgelist(edgelist, source="source", target="target", edge_attr="label")
nx.set_node_attributes(G, values="paper", name="label")

In [None]:
# Column layout of cora.content as named here: 1433 word-indicator columns
# (w_0 .. w_1432) followed by the subject (class) column.
feature_names = [f"w_{ii}" for ii in range(1433)]
column_names = [*feature_names, "subject"]

# Tab-separated, header-less file.
content_path = os.path.join(data_dir, "cora.content")
node_data = pd.read_csv(content_path, sep='\t', header=None, names=column_names)

In [None]:
# Move the current row index into a regular column named "index" and renumber
# the rows 0..n-1. The cells below read that column as the paper id
# (presumably read_csv used the file's first, unnamed column as the index
# because the file has one more column than `column_names` -- confirm).
node_data = node_data.reset_index()
node_data.head()

### 1. data2json (for js)

In [None]:
# Distinct subject names, sorted so that the subject -> id mapping is the same
# on every run. Iterating a bare set of strings has no stable order across
# interpreter runs, which made the category ids in the exported JSON and the
# label file change from run to run.
subject = sorted(set(node_data['subject']))

# Map each subject name to its position in `subject`; `subject[id]` is the inverse lookup.
subject2id = {name: idx for idx, name in enumerate(subject)}
subject2id

In [None]:
# Show the distinct subject names (np.unique returns them sorted).
np.unique(subject)

In [None]:
# Nodes: one record per paper with its id (as a string), a display name and
# the integer id of its subject.
nodes = [
    {"id": str(paper_id), "name": f"paper_{paper_id}", "category": int(subject2id[subject_name])}
    for paper_id, subject_name in zip(node_data["index"].to_numpy(), node_data["subject"].to_numpy())
]

# Links: one record per row of the citation table, endpoints as string paper ids.
links = [
    {"source": str(src), "target": str(dst)}
    for src, dst in zip(edgelist["source"].to_numpy(), edgelist["target"].to_numpy())
]

# Categories: one entry per subject, listed in the order that defines the category ids.
categories = [{"name": sub} for sub in subject]

# Bundle everything and write it next to the input data.
cora_data = {"nodes": nodes, "links": links, "categories": categories}
with open(data_dir+"/cora_data.json", "w") as json_file:
    json.dump(cora_data, json_file)

In [None]:
# Rebuild the export payload from the current nodes/links/categories and
# overwrite cora_data.json with it (same target file as the previous cell).
cora_data = dict(nodes=nodes, links=links, categories=categories)
with open(data_dir+"/cora_data.json", "w") as json_file:
    json_file.write(json.dumps(cora_data))

### 2. data2txt (for GCN model)

In [None]:
# First, renumber the papers: map each original paper id to its row position in node_data.
idx_map = {paper_id: row for row, paper_id in enumerate(node_data["index"].to_numpy())}

# Build the graph: every citation row contributes both directions of the edge,
# with the endpoints expressed as renumbered ids in string form.
graph = []
for src_id, dst_id in zip(edgelist["source"].to_numpy(), edgelist["target"].to_numpy()):
    source = str(idx_map[src_id])
    target = str(idx_map[dst_id])
    graph += [[source, target], [target, source]]

# Stable sort on the first endpoint only. The ids are strings at this point,
# so the ordering is lexicographic ("10" sorts before "2").
sorted_graph = sorted(graph, key=lambda edge: edge[0])

In [None]:
# Build the feature rows and the label vector.
#
# `features` is a list with one entry per row of node_data; each entry is the
# list of that row's 1433 word columns rendered as strings (ready to be
# space-joined when written to disk). The conversion is done column-wise in a
# single pass instead of one scalar `.loc` lookup per cell, which needed
# roughly rows x 1433 Python-level lookups.
word_columns = [f"w_{ii}" for ii in range(1433)]
features = node_data[word_columns].astype(str).to_numpy().tolist()

# `labels` holds the integer subject id of every row, in row order.
labels = [subject2id[name] for name in node_data['subject']]

### 3. statistics

In [None]:
# Count how many papers fall into each subject. Every subject starts at None
# (in subject2id key order) and is bumped once per matching label.
stat = dict.fromkeys(subject2id)
for lb in labels:
    subject_name = subject[lb]
    stat[subject_name] = (stat[subject_name] or 0) + 1
stat

In [None]:
# Print the subject names in the order used by the counts in `stat`.
print(stat.keys())

In [None]:
# Prepare the data. The dict views are materialised as lists so matplotlib
# receives plain sequences for the wedge sizes and their labels.
lbs = list(stat.keys())
sizes = list(stat.values())

# One colour per wedge; the palette and `explode` below are written for seven classes.
colors = ['#ff9999','#66b3ff','#99ff99','#ffcc99', '#c2c2f0','#ffb3e6', '#c4e17f']

# Radial offset of each wedge, used to pull selected slices slightly outwards.
explode = (0.02, 0, 0.01, 0, 0.08, 0.02, 0)

# Draw the pie chart. Size and dpi are passed to the figure that is actually
# drawn on; calling plt.figure(...) and then plt.subplots() opened a second,
# default-sized figure and left the configured one empty.
fig1, ax1 = plt.subplots(figsize=(12, 8), dpi=300)
ax1.pie(sizes, explode=explode, labels=lbs, colors=colors, autopct='%1.1f%%',
        shadow=True, startangle=140, pctdistance=0.85)

# Put a white disc over the centre to turn the pie into a donut chart.
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
ax1.add_artist(centre_circle)

# Equal aspect ratio keeps the pie circular; then show the figure.
ax1.axis('equal')
plt.tight_layout()
plt.show()

### 4. Deep Walk & clustering

In [None]:
# Inspect the mapping from original paper id to row position built in the data2txt section.
idx_map

In [None]:
from node2vec import Node2Vec

# Initialise the Node2Vec walker on the citation graph
# (p=1 and q=1 -- presumably chosen to match the "Deep Walk" section title; confirm).
node2vec = Node2Vec(G, walk_length=10, num_walks=100, p=1, q=1, workers=48)

# Train the embedding model on the generated walks.
print("training...")
model = node2vec.fit(window=10, min_count=1, batch_words=4)
print("finished training")

# Collect the embedding of every graph node; the model is queried with the
# node id in string form.
embeddings = {node: model.wv[str(node)] for node in G.nodes()}

In [None]:
from sklearn.cluster import SpectralClustering, KMeans

# Arrange the embeddings in node_data row order so that they line up with `labels`.
x = [0] * len(labels)
for node_id, vector in embeddings.items():
    x[idx_map[node_id]] = vector

# Cluster the embeddings into 7 groups and score the assignment against the true labels.
kmeans = KMeans(n_clusters=7, n_init=20).fit(np.array(x))
eva(labels, kmeans.labels_, "dw_kmeans")

In [None]:
from matplotlib.pyplot import MultipleLocator

# Project the node embeddings with UMAP, using a fixed seed.
umap_model = umap.UMAP(random_state=42)
embedding = umap_model.fit_transform(np.array(x))

# Assign one colour per class id, taken from seaborn's "hsv" palette.
unique_labels = np.unique(labels)
palette = sns.color_palette("hsv", n_colors=len(unique_labels))
color_map = dict(zip(unique_labels, palette))

In [None]:
# Scatter plot of the projected embeddings, one colour per class.
plt.figure(figsize=(12, 8), dpi=300)

# `labels` is a plain Python list; convert it once so the per-class selection
# is an ordinary boolean mask, rather than relying on list-vs-NumPy-scalar
# comparison and on indexing with the tuple returned by np.where.
labels_arr = np.asarray(labels)
for label in unique_labels:
    mask = labels_arr == label
    plt.scatter(embedding[mask, 0], embedding[mask, 1], c=[color_map[label]], label=subject[label], alpha=0.6, edgecolor='w', s=30)

# The legend is left disabled for this figure
# (presumably what the "_wo" suffix of the output file name refers to -- confirm).
# plt.legend(loc='upper right', ncol=2)

# Place major ticks every 2 units on both axes.
x_major_locator = MultipleLocator(2)
y_major_locator = MultipleLocator(2)
ax = plt.gca()  # current axes, which own both axis objects
ax.xaxis.set_major_locator(x_major_locator)
ax.yaxis.set_major_locator(y_major_locator)
plt.savefig('./dw_umap_wo.png')

In [None]:
# Display any figure that is still open.
# NOTE(review): with an inline notebook backend the figure built in the previous
# cell has likely been rendered already -- confirm whether this cell is needed.
plt.show()

### 5. UMAP Algorithm

In [None]:
# Integer copy of the feature rows for the dimensionality-reduction models.
# A new nested list is built on purpose: `new_features = features` only
# aliased the same list, so converting in place also turned the rows of
# `features` into ints and made the later ' '.join(row) export raise TypeError.
new_features = [[int(value) for value in row] for row in features]

In [None]:
from sklearn.random_projection import SparseRandomProjection

In [None]:
# Step 1: FastRP-style step -- sparse random projection of the features down to
# 128 components, with a fixed seed.
# Author's note: reducing the dimensionality first did not help; the results were poor.
fastrp_model = SparseRandomProjection(n_components=128, random_state=42)
features_reduced_fastrp = fastrp_model.fit_transform(new_features)

In [None]:
# Inspect the integer subject ids, one per row of node_data.
labels

In [None]:
# Then create a UMAP object (fixed seed) and reduce the dimensionality further.
# NOTE(review): the model is fitted on `new_features`, not on the randomly
# projected `features_reduced_fastrp` -- confirm this is intended.
umap_model = umap.UMAP(random_state=42)
embedding = umap_model.fit_transform(new_features)

# Distinct class ids present in `labels`; printed for a quick sanity check.
unique_labels = np.unique(labels)
print(unique_labels)

In [None]:
# Assign one colour per class id, taken from seaborn's "hsv" palette.
from matplotlib.pyplot import MultipleLocator
palette = sns.color_palette("hsv", n_colors=len(unique_labels))
color_map = {label: palette[i] for i, label in enumerate(unique_labels)}

# Scatter plot of the projected features, one colour per class.
plt.figure(figsize=(12, 8), dpi=300)

# `labels` is a plain Python list; convert it once so the per-class selection
# is an ordinary boolean mask, rather than relying on list-vs-NumPy-scalar
# comparison and on indexing with the tuple returned by np.where.
labels_arr = np.asarray(labels)
for label in unique_labels:
    mask = labels_arr == label
    plt.scatter(embedding[mask, 0], embedding[mask, 1], c=[color_map[label]], label=subject[label], alpha=0.6, edgecolor='w', s=30)
plt.legend(loc='upper right', ncol=2)

# Place major ticks every 2 units on both axes.
x_major_locator = MultipleLocator(2)
y_major_locator = MultipleLocator(2)
ax = plt.gca()  # current axes, which own both axis objects
ax.xaxis.set_major_locator(x_major_locator)
ax.yaxis.set_major_locator(y_major_locator)
plt.show()

In [None]:
# Tabular form of the 2-D projection for interactive plotting: the subject
# name of every paper plus the first two coordinates of its embedded point.
subject_names = [subject[label] for label in labels]
x_coords = [point[0] for point in embedding]
y_coords = [point[1] for point in embedding]

tsne_df = pd.DataFrame(data={"subject": subject_names, "x": x_coords, "y": y_coords})
tsne_df.head()

In [None]:
import altair as alt

# Interactive scatter plot of the projection: points coloured by subject, with
# the subject shown as a tooltip.
points = alt.Chart(tsne_df).mark_circle(size=60)
encoded = points.encode(x='x', y='y', color='subject', tooltip=['subject'])
encoded.properties(width=700, height=600)

### 6. save as .txt file

In [None]:
# Write the three plain-text files next to the input data.
# Paths are built with os.path.join, like the loading cells at the top.

# Edge list: one "source target" pair of renumbered ids per line.
with open(os.path.join(data_dir, "cora_graph.txt"), 'w') as f:
    for row in sorted_graph:
        f.write(' '.join(row) + '\n')

# Feature rows: space-separated values, one paper per line. Each value is
# passed through str() because the rows of `features` may already hold ints
# if the UMAP section converted them in place; ' '.join would then raise
# TypeError. For string rows the output is unchanged.
with open(os.path.join(data_dir, "cora.txt"), 'w') as f:
    for row in features:
        f.write(' '.join(map(str, row)) + '\n')

# Labels: one integer subject id per line, in the same row order as the features.
with open(os.path.join(data_dir, "cora_label.txt"), 'w') as f:
    for row in labels:
        f.write(str(row) + '\n')