In [1]:
import pandas as pd
import numpy as np
import csv

import torch
import torch.nn as nn
import torch.optim as optim

from tuto_dataset import *
from utils import *
from model_GCN import *

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import pickle

import itertools
from collections import namedtuple
import scipy.sparse as sp

import random
from sklearn.preprocessing import MinMaxScaler, normalize

In [2]:
# define hyper-parameter ======================================
learning_rate = 0.01   # step size handed to the Adam optimizer
weight_decay = 5e-4    # weight_decay handed to the Adam optimizer
epochs = 20            # number of full-graph training iterations

# Seed every random number generator the notebook relies on so that the
# shuffled train/test split and the model initialisation are reproducible
# after "Restart Kernel -> Run All".
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
# Read the raw node-feature table; every field is still a string here.
# newline='' is what the csv module asks for so that line endings embedded
# in quoted fields are interpreted by the reader rather than by open().
with open('./data/state_q.csv', newline='') as f:
    reader = csv.reader(f)
    state = list(reader)

In [4]:
# Convert every CSV field to float32 in a NEW nested list.
# The previous version aliased `data_x = state` and overwrote the raw rows in
# place, so `state` silently stopped holding the original strings; it also
# used the length of the first row as the column count for every row.
data_x = [[np.float32(value) for value in row] for row in state]

In [5]:
# Read the stance label of the first 1563 users, one label per line, with the
# trailing newline removed. The `with` block closes the file on exit, so no
# explicit close() is needed. A file shorter than 1563 lines yields a shorter
# list instead of being padded with empty strings.
with open('./data/all_users_stands.txt', 'r') as f:
    data_y = [line.replace('\n', '') for line in itertools.islice(f, 1563)]

In [6]:
# binary label encoding (this is a remap, not a one-hot encoding)
# opponent label: -1 -> 0, proponent label: 1 stays 1
# Iterates over whatever data_y holds instead of a hard-coded count of 1563.
data_y = [0 if label == -1 else label for label in map(int, data_y)]

In [7]:
# read adj_dict
# The context manager guarantees the file handle is closed after loading.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load adj_dict.pkl when it comes from a trusted source.
with open("adj_dict.pkl", "rb") as a_file:
    adj_dict = pickle.load(a_file)

In [8]:
def build_adjacency(adj_dict):
    """Build a symmetric sparse adjacency matrix from an adjacency dict.

    Parameters
    ----------
    adj_dict : dict
        Maps a node index to an iterable of neighbouring node indices.
        The matrix gets one row/column per key, so every index that appears
        (as key or neighbour) must be smaller than ``len(adj_dict)``.

    Returns
    -------
    scipy.sparse.coo_matrix
        ``(len(adj_dict), len(adj_dict))`` float32 matrix holding 1 for every
        edge in both directions. Repeated edges are stored only once.
    """
    num_nodes = len(adj_dict)

    # Collect each edge in both directions; the set drops replicated edges.
    edges = set()
    for src, neighbours in adj_dict.items():
        for dst in neighbours:
            edges.add((src, dst))
            edges.add((dst, src))

    # Without any edge the index array would be 1-D and the column slicing
    # below would raise, so return an all-zero matrix directly.
    if not edges:
        return sp.coo_matrix((num_nodes, num_nodes), dtype="float32")

    # Sorting keeps the entries in the same (row, col) order as before.
    edge_index = np.asarray(sorted(edges))

    # A sparse matrix in COOrdinate format.
    adjacency = sp.coo_matrix(
        (np.ones(len(edge_index)), (edge_index[:, 0], edge_index[:, 1])),
        shape=(num_nodes, num_nodes),
        dtype="float32",
    )

    return adjacency

In [9]:
# build a sparse matrix based on the adjacency dict
# (build_adjacency returns a scipy COO matrix with one row/column per key of adj_dict)
adj = build_adjacency(adj_dict)

In [10]:
# Inspect the column indices of the stored adjacency entries (sanity check only).
adj.tocoo().col

array([1439,    2, 1340, ..., 1546, 1546, 1546], dtype=int32)

In [11]:
# Split the nodes into train / test / prediction subsets via boolean masks.
num_label = 180   # nodes that carry a usable label (the first 180 rows)
num_total = 1563  # all nodes
num_nodes = 1563
num_size = 512    # the length of a word vector

# There are 180 labelled nodes: 160 of them train the model and the remaining
# 20 are held out to test it. Shuffling the flags picks the 160 training nodes
# at random.
train_test_mask = [position < num_label - 20 for position in range(num_label)]
random.shuffle(train_test_mask)

# Every mask spans all nodes; the unlabelled tail is False for train/test and
# True for the prediction mask.
train_mask = train_test_mask + [False for _ in range(num_total - num_label)]
test_mask = [not is_train for is_train in train_test_mask] + [False for _ in range(num_total - num_label)]
pred_mask = [position >= num_label for position in range(num_total)]





In [12]:
# loading data =============================================
# convert all list objects to ndarray type
data_x = np.array(data_x)
data_y = np.array(data_y)
train_mask = np.array(train_mask)
test_mask = np.array(test_mask)
pred_mask = np.array(pred_mask)

# Scale every feature column to [0, 1], then L1-normalise each node's row.
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(data_x)
x = normalize(scaled_X, norm='l1', axis=1, copy=True)

# Cast to float32 BEFORE moving to the device. The previous
# torch.tensor(np.float32(tensor_x)) round-trip ran after .to(device), which
# cannot work for a CUDA tensor and otherwise left the features on the CPU
# while the adjacency matrix lived on `device`.
tensor_x = torch.from_numpy(x.astype(np.float32)).to(device)

tensor_y = torch.from_numpy(data_y).to(device)

tensor_train_mask = torch.from_numpy(train_mask).to(device)
tensor_test_mask = torch.from_numpy(test_mask).to(device)
tensor_pred_mask = torch.from_numpy(pred_mask).to(device)

# `normalization` comes from a wildcard import; its result is read through
# .row/.col/.data below, so it presumably is a scipy COO matrix - TODO confirm.
normalize_adjacency = normalization(adj)

indices = torch.from_numpy(np.asarray([normalize_adjacency.row, normalize_adjacency.col]).astype('int64')).long()
values = torch.from_numpy(normalize_adjacency.data.astype(np.float32))
# torch.sparse_coo_tensor replaces the deprecated torch.sparse.FloatTensor constructor.
tensor_adjacency = torch.sparse_coo_tensor(indices, values, (num_nodes, num_nodes)).to(device)

# NOTE: the summary labels below call the held-out labelled nodes (test_mask)
# "validation nodes" and the unlabelled nodes (pred_mask) "test nodes".
print('Analyzing data ...')
print("------------------------------------------------------")
print("Node's feature shape: ", data_x.shape)
print("Node's label shape: ", data_y.shape)
print("Adjacency's shape: ", adj.shape)
print("Number of train nodes: ", train_mask.sum())
print("Number of validation nodes: ", test_mask.sum())
print("Number of test nodes: ", pred_mask.sum())
print("------------------------------------------------------")



Analyzing data ...
------------------------------------------------------
Node's feature shape:  (1563, 512)
Node's label shape:  (1563,)
Adjacency's shape:  (1563, 1563)
Number of train nodes:  160
Number of validation nodes:  20
Number of test nodes:  1383
------------------------------------------------------


In [13]:
def test(mask):

    model.eval()
    with torch.no_grad():
        logits = model(tensor_adjacency, tensor_x)
        test_mask_logits = logits[mask]
        predict_y = test_mask_logits.max(1)[1]
        accuracy = torch.eq(predict_y, tensor_y[mask]).float().mean()

    return accuracy, predict_y

In [14]:
# build model =================================================
# NOTE(review): model_GCN is already wildcard-imported in the first cell; the
# import is kept here so the names it provides are unchanged.
from model_GCN import *

model = GCN_Network(num_size).to(device)

# Binary cross-entropy on the model outputs; BCELoss expects values in [0, 1].
criterion = nn.BCELoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

loss_history = []      # training loss per epoch
val_acc_history = []   # accuracy on the held-out labelled nodes per epoch

# BCELoss needs float targets. `.float()` converts on whatever device the
# labels already live on (the previous NumPy round-trip failed for CUDA
# tensors and moved the targets back to the CPU).
train_y = tensor_y[tensor_train_mask].float()

for epoch in range(epochs):
    # test() leaves the model in eval mode, so training mode has to be
    # re-enabled at the start of every epoch, not just once before the loop.
    model.train()
    logits = model(tensor_adjacency, tensor_x)
    train_mask_logits = logits[tensor_train_mask]
    # Drop the trailing singleton dimension so the shape matches train_y.
    train_mask_logits = train_mask_logits.squeeze(1)
    loss = criterion(train_mask_logits, train_y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    train_acc, _ = test(tensor_train_mask)
    val_acc, _ = test(tensor_test_mask)

    loss_history.append(loss.item())
    val_acc_history.append(val_acc.item())

    print("Epoch {:03d}: Loss {:04f}, Train Acc {:.04f}, Val Acc {:.04f}".format(epoch, loss.item(), train_acc.item(), val_acc.item()))

Epoch 000: Loss 0.665601, Train Acc 0.8938, Val Acc 0.9000
Epoch 001: Loss 0.444660, Train Acc 0.8938, Val Acc 0.9000
Epoch 002: Loss 0.353434, Train Acc 0.8938, Val Acc 0.9000
Epoch 003: Loss 0.356554, Train Acc 0.8938, Val Acc 0.9000
Epoch 004: Loss 0.383135, Train Acc 0.8938, Val Acc 0.9000
Epoch 005: Loss 0.393604, Train Acc 0.8938, Val Acc 0.9000
Epoch 006: Loss 0.385746, Train Acc 0.8938, Val Acc 0.9000
Epoch 007: Loss 0.367693, Train Acc 0.8938, Val Acc 0.9000
Epoch 008: Loss 0.348777, Train Acc 0.8938, Val Acc 0.9000
Epoch 009: Loss 0.337199, Train Acc 0.8938, Val Acc 0.9000
Epoch 010: Loss 0.337453, Train Acc 0.8938, Val Acc 0.9000
Epoch 011: Loss 0.345650, Train Acc 0.8938, Val Acc 0.9000
Epoch 012: Loss 0.350761, Train Acc 0.8938, Val Acc 0.9000
Epoch 013: Loss 0.347586, Train Acc 0.8938, Val Acc 0.9000
Epoch 014: Loss 0.339284, Train Acc 0.8938, Val Acc 0.9000
Epoch 015: Loss 0.331394, Train Acc 0.8938, Val Acc 0.9000
Epoch 016: Loss 0.327184, Train Acc 0.8938, Val Acc 0.90

In [15]:
# Display the training-node outputs left over from the last executed epoch
# (the values that were passed to the BCELoss criterion).
train_mask_logits

tensor([0.0911, 0.0552, 0.0472, 0.0923, 0.0929, 0.0776, 0.0917, 0.0729, 0.0911,
        0.0249, 0.0839, 0.0905, 0.0871, 0.0476, 0.0905, 0.0511, 0.0849, 0.0954,
        0.0919, 0.0541, 0.0851, 0.0923, 0.0910, 0.0731, 0.0909, 0.0924, 0.0690,
        0.0746, 0.0778, 0.0852, 0.0802, 0.0874, 0.0901, 0.0909, 0.0699, 0.0738,
        0.0482, 0.0911, 0.0928, 0.0786, 0.0911, 0.0403, 0.0913, 0.0911, 0.0928,
        0.0906, 0.0946, 0.0767, 0.0635, 0.0938, 0.0295, 0.0910, 0.0635, 0.0904,
        0.0339, 0.0775, 0.0903, 0.0166, 0.0839, 0.0800, 0.0268, 0.0912, 0.0926,
        0.0688, 0.0832, 0.0697, 0.0930, 0.0717, 0.0928, 0.0918, 0.0913, 0.0483,
        0.0929, 0.0054, 0.0910, 0.0903, 0.0240, 0.0776, 0.0902, 0.0907, 0.0768,
        0.0956, 0.0930, 0.0644, 0.0907, 0.0841, 0.0989, 0.0735, 0.0006, 0.0742,
        0.0366, 0.0916, 0.0795, 0.0336, 0.0681, 0.0910, 0.0656, 0.0959, 0.0947,
        0.0925, 0.0847, 0.0894, 0.0843, 0.0924, 0.0911, 0.0912, 0.0849, 0.0904,
        0.0785, 0.0848, 0.0669, 0.0902, 

In [16]:
# ==========================================================
# PREDICTION
# ==========================================================
# Run the model on the nodes selected by tensor_pred_mask; the accuracy that
# test() also returns is discarded here.
_, prediction = test(tensor_pred_mask)
# print("Testing Acc {:.4}".format(accuracy))
# Sum of the predicted labels; with 0/1 labels this is the number of nodes
# predicted as class 1.
prediction.sum()

tensor(0)