In [78]:
import dgl as d
import numpy as np
import torch as t
from dgl.nn.pytorch import SAGEConv
from torch.utils.data import DataLoader
from tqdm import *
import pandas as pd

t.set_default_tensor_type(t.FloatTensor)

class DataTransformer():

    def __init__(self):
        self.lamd = None
        self.min = None
        self.max = None

    def fit(self,X):
        from scipy import stats
        X, self.lamd = stats.boxcox(X)
        self.min = X.min()
        self.max = X.max()
        X = (X-self.min) / (self.max - self.min)
        return X

    def inverse(self,X):
        from scipy.special import inv_boxcox
        X = X * (self.max - self.min) + self.min
        X = inv_boxcox(X, self.lamd)
        return X


def getIPClass(ip):
    try:
        ips = ip.split('.')
        ips = [int(ip) for ip in ips]
        if 1 <= ips[0] <= 127:
            return 1
        if 128 <= ips[0] <= 191:
            return 2
        if 192 <= ips[0] <= 223:
            return 3
        if 224 <= ips[0] <= 239:
            return 4
        if 240 <= ips[0] <= 255:
            return 5
    except:
        pass
    return 0

class FeatureLookup():

    def __init__(self):
        self.__inner_id_counter = 0
        self.__inner_bag = {}
        self.__category = set()
        self.__category_bags = {}
        self.__inverse_map = {}

    def register(self, category, value):
        # 添加进入类别
        self.__category.add(category)
        # 如果类别不存在若无则，则新增一个类别子树
        if category not in self.__category_bags:
            self.__category_bags[category] = {}

        # 如果值不在全局索引中，则创建之，id += 1
        if value not in self.__inner_bag:
            self.__inner_bag[value] = self.__inner_id_counter
            self.__inverse_map[self.__inner_id_counter] = value
            # 如果值不存在与类别子树，则创建之
            if value not in self.__category_bags[category]:
                self.__category_bags[category][value] = self.__inner_id_counter
            self.__inner_id_counter += 1

    def query_id(self, value):
        # 返回索引id
        return self.__inner_bag[value]

    def query_value(self, id):
        # 返回值
        return self.__inverse_map[id]

    def __len__(self):
        return len(self.__inner_bag)

def Metrics(realVec, estiVec):
    realVec = np.array(realVec)
    estiVec = np.array(estiVec)
    absError = np.abs(estiVec - realVec)
    mae = np.mean(absError)
    nmae = mae / (np.sum(realVec) / absError.shape[0])
    rmse = np.linalg.norm(absError) / np.sqrt(absError.shape[0])
    relativeError = absError / realVec
    mre = np.percentile(relativeError, 50)
    npre = np.percentile(relativeError, 90)
    return np.array([mae, nmae, rmse, mre, npre])

class QoSFMDataset(t.utils.data.Dataset):
    def __init__(self,lookup):
        import time
        self.lookup = lookup
        # Load Dataset
        print('Loading Dataset')
        start = time.time()
        self.wslist = pd.read_csv('./datasets/data/WSDREAM/原始数据/wslist.csv').to_numpy()
        self.userlist = pd.read_csv('./datasets/data/WSDREAM/原始数据/userlist.csv').to_numpy()
        self.rtMatrix = np.loadtxt('./datasets/data/WSDREAM/原始数据/rtMatrix.txt')
        self.tpMatrix = np.loadtxt('./datasets/data/WSDREAM/原始数据/tpMatrix.txt')
        self.rtTransformer = DataTransformer()
        self.tpTransformer = DataTransformer()
        self.recs = []
        for i in tqdm(range(339)):
            for j in range(5825):
                if self.rtMatrix[i][j] <= 0 or self.rtMatrix[i][j] >= 19.9:
                    continue
                if self.tpMatrix[i][j] <= 0 or self.tpMatrix[i][j] >= 1000.0:
                    continue
                self.recs.append([i,j])
        self.recs = np.array(self.recs)

        self.rtMatrix[self.rtMatrix > 0] = self.rtTransformer.fit(self.rtMatrix[self.rtMatrix>0])
        self.tpMatrix[self.tpMatrix > 0] = self.tpTransformer.fit(self.tpMatrix[self.tpMatrix>0])

        p = np.random.permutation(len(self.recs))
        self.recs = self.recs[p]
        end = time.time()
        print('Dataset Ready! Time=%.3fs'%(end-start))

    def __len__(self):
        return len(self.recs)

    def __getitem__(self, idx):
        i,j = self.recs[idx]
        uid = f'User{i}'
        sid = f'Serv{j}'
        ure, uas = self.userlist[i][2],self.userlist[i][4]
        sre, sas, spr = self.wslist[j][5],self.wslist[j][7],self.wslist[j][3]

        ure = f'URE_{ure}'
        uas = f'UAS_{uas}'

        sre = f'SRE_{sre}'
        sas = f'SAS_{sas}'
        spr = f'SPR_{spr}'

        uid = self.lookup.query_id(uid)
        sid = self.lookup.query_id(sid)
        ure = self.lookup.query_id(ure)
        uas = self.lookup.query_id(uas)
        sre = self.lookup.query_id(sre)
        sas = self.lookup.query_id(sas)
        spr = self.lookup.query_id(spr)
        rt = self.rtMatrix[i][j]
        tp = self.tpMatrix[i][j]

        return [uid,sid,ure,uas,sre,sas,spr], rt ,tp

In [79]:
# ====================================================#
# Create Graph
# ====================================================#

graph = d.graph([])
lookup = FeatureLookup()

ulines = pd.read_csv('./datasets/data/WSDREAM/原始数据/userlist.csv').to_numpy()
slines = pd.read_csv('./datasets/data/WSDREAM/原始数据/wslist.csv').to_numpy()


for uid in range(339):
    lookup.register('User', f'User{uid}')

for sid in range(5825):
    lookup.register('Serv', f'Serv{sid}')

for ure in ulines[:, 2]:
    lookup.register('URE', f'URE_{ure}')

for uas in ulines[:, 4]:
    lookup.register('UAS', f'UAS_{uas}')

for spr in slines[:, 3]:
    lookup.register('SPR', f'SPR_{spr}')

for sre in slines[:, 5]:
    lookup.register('SRE', f'SRE_{sre}')

for sas in slines[:, 7]:
    lookup.register('SAS', f'SAS_{sas}')

graph.add_nodes(len(lookup))

for line in ulines:
    uid = line[0]
    ure = line[2]
    uas = line[4]
    uid = f'User{uid}'
    ure = f'URE_{ure}'
    uas = f'UAS_{uas}'

    uid = lookup.query_id(uid)
    ure = lookup.query_id(ure)
    if not graph.has_edges_between(uid,ure):
        graph.add_edges(uid, ure)

    uas = lookup.query_id(uas)
    if not graph.has_edges_between(uid,uas):
        graph.add_edges(uid, uas)

    if not graph.has_edges_between(ure,uas):
        graph.add_edges(ure, uas)

for line in slines:
    sid = line[0]
    spr = line[3]
    sre = line[5]
    sas = line[7]

    sid = f'Serv{sid}'
    sre = f'SRE_{sre}'
    spr = f'SPR_{spr}'
    sas = f'SAS_{sas}'

    sid = lookup.query_id(sid)
    sre = lookup.query_id(sre)
    if not graph.has_edges_between(sid,sre):
        graph.add_edges(sid, sre)

    sas = lookup.query_id(sas)
    if not graph.has_edges_between(sid,sas):
        graph.add_edges(sid, sas)

    spr = lookup.query_id(spr)
    if not graph.has_edges_between(sid,spr):
        graph.add_edges(sid, spr)

    if not graph.has_edges_between(sre,spr):
        graph.add_edges(sre, spr)

    if not graph.has_edges_between(sre,sas):
        graph.add_edges(sre, sas)

    if not graph.has_edges_between(spr,sas):
        graph.add_edges(spr, sas)

graph = d.add_self_loop(graph)
graph = d.to_bidirected(graph)

In [80]:
dataset = QoSFMDataset(lookup)

Loading Dataset


100%|██████████| 339/339 [00:01<00:00, 222.07it/s]


Dataset Ready! Time=3.974s


In [81]:
# ====================================================#
# Create DataLoader
# ====================================================#
from torch.utils.data import random_split

def get_splited_datasets(dataset,density,testsize=100000):
    trainsize  = int(density * len(dataset))
    trainset,testset = random_split(dataset,[trainsize, len(dataset)-trainsize])
    testset,_ = random_split(testset,[testsize, len(testset)-testsize])
    return trainset, testset

In [82]:
def collate_fn(batch):

    idxs = []
    rts = []
    tps = []
    for rec in batch:
        idx,rt ,tp = rec
        idxs +=[idx]
        rts += [rt]
        tps += [tp]

    return t.tensor(idxs),t.tensor(rts),t.tensor(tps)


class GraphSAGEConv(t.nn.Module):

    def __init__(self,graph, dim, order=3):
        super(GraphSAGEConv, self).__init__()
        self.order = order
        self.graph = graph
        self.embedding = t.nn.Parameter(t.Tensor(self.graph.number_of_nodes(), dim)).cuda()
        t.nn.init.kaiming_normal_(self.embedding, nonlinearity='relu')
        self.graph.ndata['L0'] = self.embedding
        self.layers = t.nn.ModuleList([SAGEConv(dim, dim, 'gcn') for _ in range(order)])
        self.norms = t.nn.ModuleList([t.nn.BatchNorm1d(dim) for _ in range(order)])
        self.acts = t.nn.ModuleList([t.nn.ReLU() for _ in range(order)])

    def forward(self):
        feats = self.graph.ndata['L0']
        res = feats
        last = feats
        for i, (layer, norm, act) in enumerate(zip(self.layers, self.norms, self.acts)):
            feats = layer(self.graph, feats).squeeze()
            feats = norm(feats)
            feats = act(feats)
            last = feats
            res = t.cat((feats, res), -1)
        return res, feats


class FM(t.nn.Module):
    """Factorization Machine models pairwise (order-2) feature interactions
     without linear term and bias.
      Input shape
        - 3D tensor with shape: ``(batch_size,field_size,embedding_size)``.
      Output shape
        - 2D tensor with shape: ``(batch_size, 1)``.
      References
        - [Factorization Machines](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf)
    """

    def __init__(self):
        super(FM, self).__init__()

    def forward(self, inputs):
        fm_input = inputs
        square_of_sum = t.pow(t.sum(fm_input, dim=1, keepdim=True), 2)
        sum_of_square = t.sum(fm_input * fm_input, dim=1, keepdim=True)
        cross_term = square_of_sum - sum_of_square
        cross_term = 0.5 * t.sum(cross_term, dim=-1, keepdim=False)
        return cross_term


class DeepFM(t.nn.Module):
    def __init__(self, graph, latent_dim, order=3):
        super(DeepFM, self).__init__()
        self.latent_dim = latent_dim
        self.GraphEmbedding = GraphSAGEConv(graph, latent_dim, order=order)

        self.FeatureWeights = t.nn.Embedding(graph.number_of_nodes(), 1)
        t.nn.init.kaiming_normal_(self.FeatureWeights.weight)

        self.Deep = t.nn.Sequential(
            t.nn.Linear((order + 1) * 7 * latent_dim, 4 * latent_dim),
            # t.nn.Linear(7*latent_dim,4*latent_dim),
            t.nn.LayerNorm(4 * latent_dim),
            t.nn.ReLU(),
            t.nn.Linear(4 * latent_dim, 2 * latent_dim),
            t.nn.LayerNorm(2 * latent_dim),
            t.nn.ReLU(),
            t.nn.Linear(2 * latent_dim, 1)
        )
        self.FM = FM()

    def forward(self, Idx):

        if t.cuda.is_available():
            Idx = Idx.cuda()
        bs = Idx.shape[0]

        # Graph Spreading Embedding
        embedding, feats = self.GraphEmbedding()

        # One
        one = self.FeatureWeights(Idx)
        one = t.sum(one, 1)

        # Two
        feat_embeds = feats[Idx]
        two = self.FM(feat_embeds)

        # FM Output(LR)
        yfm = (one + two).sigmoid().squeeze()

        # Deep Part
        din = t.reshape(embedding[Idx], (bs, -1))
        ydnn = self.Deep(din).squeeze()

        # Return
        return (yfm + ydnn).sigmoid()

In [83]:
def EvaluationOnce(graph,trainset,testset,order,dimension):
    trainLoader = DataLoader(trainset, 256, shuffle=True, num_workers=6, pin_memory=True,collate_fn=collate_fn)
    testLoader = DataLoader(testset, 256, shuffle=True,num_workers=6, pin_memory=True,collate_fn=collate_fn)
    graph = graph.to(t.device('cuda'))
    model = DeepFM(graph,dimension, order=order).cuda()
    loss = t.nn.L1Loss().cuda()
    lr = 8e-3
    optimizer = t.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-3)

    bestMAE = 2e9
    for epoch in trange(30):
        total_loss = 0
        if epoch % 10 == 0 and epoch > 0:
            lr /= 2
            optimizer = t.optim.AdamW(model.parameters(),lr=lr, weight_decay=1e-3)

        model.train()

        for batch in trainLoader:
            optimizer.zero_grad()
            idx, label, _ = batch
            val = model(idx)
            val = val.reshape(label.shape)
            trainloss = loss(val, label.cuda())
            trainloss.backward()
            optimizer.step()
            total_loss += trainloss

        arr0 = np.zeros((5,))
        model.eval()
        for batch in testLoader:
            idx, label, _ = batch
            val = model(idx)
            val = val.reshape(label.shape)
            val, label = val, label.cuda()
            val = val.cpu().detach().numpy()
            label = label.cpu().detach().numpy()
            val = dataset.rtTransformer.inverse(val)
            label = dataset.rtTransformer.inverse(label)
            arr0 += Metrics(val, label)

        arr0 /= len(testLoader)

        if arr0[0] <= bestMAE:
            bestMAE = arr0[0]

        # print(f'Epochs {epoch}: Loss={total_loss / len(trainLoader)}')
        # print('Epochs %d: MAE:%.4f NMAE:%.4f RMSE:%.4f MRE:%.4f NPRE:%.4f' % (epoch, *arr0))

    return bestMAE

In [84]:
class A:
    def __init__(self, ):
        self.dataset = 'rt'
        self.path = './datasets/data/WSDREAM/'
        self.density = 0.10
        self.processed = 0
        self.part_type = 1
        self.slices = 1
        self.epochs = 30
        self.retrain = 1
        self.batch_size = 128

In [85]:
from datasets.dataloader import get_dataloaders
from datasets.dataset import load_data, create_graph, ShardedTensorDataset


def Evaluation():
    for density in [0.025,0.05,0.075,0.10]:
        trainset, testset = get_splited_datasets(dataset,density,testsize=300000)
        for dimension in [64, 128, 256]:
            for order in [1, 2]:
                bestMAE = EvaluationOnce(graph,trainset,testset,order,dimension)
                print(f'Density={density}, Dim={dimension}, Order={order}, MAE = {bestMAE:.3f}')

In [86]:
t.cuda.is_available()

True

In [87]:
Evaluation() # order = 2, dimension = 128
# order = 1 , density = 2.5, dimension = 64, MAE = 0.471
# order = 2 , density = 2.5, dimension = 64, MAE = 0.430
# order = 1 , density = 2.5, dimension = 128, MAE = 0.421
# order = 2 , density = 2.5, dimension = 128, MAE = 0.421

100%|██████████| 30/30 [01:02<00:00,  2.09s/it]


Density=0.025, Dim=64, Order=1, MAE = 0.475


100%|██████████| 30/30 [01:19<00:00,  2.66s/it]


Density=0.025, Dim=64, Order=2, MAE = 0.439


100%|██████████| 30/30 [01:05<00:00,  2.17s/it]


Density=0.025, Dim=128, Order=1, MAE = 0.420


100%|██████████| 30/30 [01:21<00:00,  2.73s/it]


Density=0.025, Dim=128, Order=2, MAE = 0.404


100%|██████████| 30/30 [01:07<00:00,  2.25s/it]


Density=0.025, Dim=256, Order=1, MAE = 0.911


100%|██████████| 30/30 [01:25<00:00,  2.84s/it]


Density=0.025, Dim=256, Order=2, MAE = 19.087


100%|██████████| 30/30 [01:35<00:00,  3.18s/it]


Density=0.05, Dim=64, Order=1, MAE = 0.416


100%|██████████| 30/30 [01:53<00:00,  3.79s/it]


Density=0.05, Dim=64, Order=2, MAE = 0.386


100%|██████████| 30/30 [01:29<00:00,  3.00s/it]


Density=0.05, Dim=128, Order=1, MAE = 0.375


100%|██████████| 30/30 [01:57<00:00,  3.92s/it]


Density=0.05, Dim=128, Order=2, MAE = 0.358


100%|██████████| 30/30 [01:45<00:00,  3.50s/it]


Density=0.05, Dim=256, Order=1, MAE = 19.087


100%|██████████| 30/30 [02:09<00:00,  4.32s/it]


Density=0.05, Dim=256, Order=2, MAE = 0.352


100%|██████████| 30/30 [01:44<00:00,  3.48s/it]


Density=0.075, Dim=64, Order=1, MAE = 0.381


100%|██████████| 30/30 [02:21<00:00,  4.72s/it]


Density=0.075, Dim=64, Order=2, MAE = 0.364


100%|██████████| 30/30 [01:56<00:00,  3.87s/it]


Density=0.075, Dim=128, Order=1, MAE = 0.359


100%|██████████| 30/30 [02:25<00:00,  4.86s/it]


Density=0.075, Dim=128, Order=2, MAE = 0.346


100%|██████████| 30/30 [02:04<00:00,  4.15s/it]


Density=0.075, Dim=256, Order=1, MAE = 0.341


100%|██████████| 30/30 [02:35<00:00,  5.20s/it]


Density=0.075, Dim=256, Order=2, MAE = 19.088


100%|██████████| 30/30 [02:03<00:00,  4.12s/it]


Density=0.1, Dim=64, Order=1, MAE = 0.361


100%|██████████| 30/30 [02:33<00:00,  5.12s/it]


Density=0.1, Dim=64, Order=2, MAE = 0.343


100%|██████████| 30/30 [02:13<00:00,  4.45s/it]


Density=0.1, Dim=128, Order=1, MAE = 0.340


100%|██████████| 30/30 [02:43<00:00,  5.46s/it]


Density=0.1, Dim=128, Order=2, MAE = 0.327


100%|██████████| 30/30 [02:32<00:00,  5.07s/it]


Density=0.1, Dim=256, Order=1, MAE = 0.323


100%|██████████| 30/30 [03:08<00:00,  6.30s/it]

Density=0.1, Dim=256, Order=2, MAE = 0.899



