In [3]:
import pickle
import torch
from torch_geometric.data import Data
import networkx as nx
from sklearn.metrics import roc_auc_score
import torch_geometric.transforms as T
from torch_geometric.nn import GATConv
from torch_geometric.datasets import Planetoid
from torch_geometric.utils import negative_sampling


In [4]:
follower = {}
friend = {}
e_tweet = {}
m_tweet = {}
r_tweet = {}

def open_files():
    with open('./Unfollower/15weeks_follower_dict.pkl', 'rb') as f:
        follower = pickle.load(f)

    with open('./Unfollower/15weeks_friend_dict.pkl', 'rb') as f:
        friend = pickle.load(f)

    with open('./Unfollower/e_tweet_dict.pkl', 'rb') as f:
        e_tweet = pickle.load(f)

    with open('./Unfollower/m_tweet_dict.pkl', 'rb') as f:
        m_tweet = pickle.load(f)

    with open('./Unfollower/r_tweet_dict.pkl', 'rb') as f:
        r_tweet = pickle.load(f)
        
    return (follower,friend,e_tweet,m_tweet,r_tweet)

In [5]:
train_range = (0,9)
test_range = (10,14)


# make a networkx graph in order to find the clustering coefficient of the nodes
def getTrainClustering(follower,friend,edge_attr,edge_to_in):
    G = nx.Graph()
    for key in follower:
        if len(follower[key][train_range[1]]) == 2:
            for f in follower[key][train_range[1]][1]:
                G.add_edge(key,f)

    for key in friend:
        if len(friend[key][train_range[1]]) == 2:
            for f in friend[key][train_range[1]][1]:
                G.add_edge(f,key)
              
    print("Getting train common neighbors")
    #i = 0
    for key in edge_to_in:
        if edge_attr[edge_to_in[key]][0] == 0.0 and key[0] in G and key[1] in G:
            neighbors = sum(1 for _ in nx.common_neighbors(G,key[0],key[1]))
            edge_attr[edge_to_in[key]][0] = neighbors
            # if i < 100:
            #     print(f'{ edge_attr[edge_to_in[key]][0]} {neighbors} {edge_to_in[key]}')
            # i += 1
            if (key[1],key[0]) in edge_to_in:
                edge_attr[edge_to_in[(key[1],key[0])]][0] = neighbors
    print("Getting train clustering")  
    cluster_coeffs = nx.clustering(G)
    return (cluster_coeffs,edge_attr)


    # make another for the test set
def getTestClustering(follower,friend,edge_attr,edge_to_in):
    G2 = nx.Graph()
    for key in follower:
        if len(follower[key][test_range[1]]) == 2:
            for f in follower[key][test_range[1]][1]:
                G2.add_edge(key,f)

    for key in friend:
        if len(friend[key][test_range[1]]) == 2:
            for f in friend[key][test_range[1]][1]:
                G2.add_edge(f,key)
    
    print("Getting test common neighbors")
    for key in edge_to_in:
        if edge_attr[edge_to_in[key]][0] == 0.0 and key[0] in G2 and key[1] in G2:
            neighbors = sum(1 for _ in nx.common_neighbors(G2,key[0],key[1]))
            edge_attr[edge_to_in[key]][0] = neighbors
            if (key[1],key[0]) in edge_to_in:
                edge_attr[edge_to_in[(key[1],key[0])]][0] = neighbors       
    print("Getting test clustering")     
    cluster_coeffs = nx.clustering(G2)
    return (cluster_coeffs,edge_attr)


In [6]:
def get_train_edges(follower,friend,id_to_in):
    nodes1 = []
    nodes2 = []

    # mapping of edge tuple to index in edge_index tensor
    edge_to_in = {}

    # Create edge lists for the train Data object
    for key in follower:
        if len(follower[key][train_range[1]]) == 2:
            for f in follower[key][train_range[1]][1]:
                edge_to_in[(key,f)] = len(nodes1)
                nodes1.append(id_to_in[key])
                nodes2.append(id_to_in[f])

    for key in friend:
        if len(friend[key][train_range[1]]) == 2:
            for f in friend[key][train_range[1]][1]:
                edge_to_in[(f,key)] = len(nodes1)
                nodes1.append(id_to_in[f])
                nodes2.append(id_to_in[key])
                
    edge_label = [0.0]*len(nodes1)
    
    # find edges that have been removed
    for key in follower:
        if len(follower[key][train_range[0]]) == 2:
            for f in follower[key][train_range[0]][1]:
                if (key,f) not in edge_to_in:
                    edge_label.append(1.0)
                    edge_to_in[(key,f)] = len(nodes1)
                    nodes1.append(id_to_in[key])
                    nodes2.append(id_to_in[f])
                    
    for key in friend:
        if len(friend[key][train_range[0]]) == 2:
            for f in friend[key][train_range[0]][1]:
                if (f,key) not in edge_to_in:
                    edge_label.append(1.0)
                    edge_to_in[(f,key)] = len(nodes1)
                    nodes1.append(id_to_in[f])
                    nodes2.append(id_to_in[key])
                    
    return (nodes1,nodes2,edge_label,edge_to_in)

def get_test_edges(follower,friend,id_to_in):
    nodes3 = []
    nodes4 = []
    edge_to_in2 = {}


    # Create edge lists for the test Data object
    for key in follower:
        if len(follower[key][test_range[1]]) == 2:
            for f in follower[key][test_range[1]][1]:
                edge_to_in2[(key,f)] = len(nodes3)
                nodes3.append(id_to_in[key])
                nodes4.append(id_to_in[f])

    for key in friend:
        if len(friend[key][test_range[1]]) == 2:
            for f in friend[key][test_range[1]][1]:
                edge_to_in2[(f,key)] = len(nodes3)
                nodes3.append(id_to_in[f])
                nodes4.append(id_to_in[key])
                
    edge_label = [0.0]*len(nodes3)
    
    # find edges that have been removed
    for key in follower:
        if len(follower[key][test_range[0]]) == 2:
            for f in follower[key][test_range[0]][1]:
                if (key,f) not in edge_to_in2:
                    edge_label.append(1.0)
                    edge_to_in2[(key,f)] = len(nodes3)
                    nodes3.append(id_to_in[key])
                    nodes4.append(id_to_in[f])
                    
    for key in friend:
        if len(friend[key][test_range[0]]) == 2:
            for f in friend[key][test_range[0]][1]:
                if (f,key) not in edge_to_in2:
                    edge_label.append(1.0)
                    edge_to_in2[(f,key)] = len(nodes3)
                    nodes3.append(id_to_in[f])
                    nodes4.append(id_to_in[key])
                
    return (nodes3,nodes4,edge_label,edge_to_in2)

In [7]:

# put clustering coefficients in feature array

def add_clustering(x,n,in_to_id,cluster_coeffs):
    for i in range(n):
        key = str(in_to_id[i])
        if key in cluster_coeffs:
            x[i][0] = cluster_coeffs[key]
        


In [8]:
# add number of tweets to feature list x

def add_tweets_train(x,n,in_to_id,e_tweet):
    for i in range(n):
        key = str(in_to_id[i])
        x[i][1] = 0
        for j in range(train_range[1]):
            if key in e_tweet['train'][j+1]:
                x[i][1] += len(e_tweet['train'][j+1][key])
                
def add_tweets_test(x2,n,in_to_id,e_tweet):
    for i in range(n):
        key = str(in_to_id[i])
        x2[i][1] = 0
        for j in range(test_range[0],test_range[1]):
            if key in e_tweet['test'][j+1]:
                x2[i][1] += len(e_tweet['test'][j+1][key])

In [9]:
def add_edge_tweets_train(edge_attr,n,edge_to_in,nodes1,nodes2,m_tweet,r_tweet):
    for key in edge_to_in:
        for j in range(train_range[1]):
            if key[0] in m_tweet['train'][j+1] and key[1] in m_tweet['train'][j+1][key[0]]:
                edge_attr[edge_to_in[key]][1] += len(m_tweet['train'][j+1][key[0]][key[1]])
            if key[0] in r_tweet['train'][j+1] and key[1] in r_tweet['train'][j+1][key[0]]:
                edge_attr[edge_to_in[key]][2] += len(r_tweet['train'][j+1][key[0]][key[1]])
                
def add_edge_tweets_test(edge_attr,n,edge_to_in,nodes1,nodes2,m_tweet,r_tweet):
    for key in edge_to_in:
        for j in range(test_range[0],test_range[1]):
            if key[0] in m_tweet['test'][j+1] and key[1] in m_tweet['test'][j+1][key[0]]:
                edge_attr[edge_to_in[key]][1] += len(m_tweet['test'][j+1][key[0]][key[1]])
            if key[0] in r_tweet['test'][j+1] and key[1] in r_tweet['test'][j+1][key[0]]:
                edge_attr[edge_to_in[key]][2] += len(r_tweet['test'][j+1][key[0]][key[1]])

In [10]:
def make_data():
    print("loading files")
    follower,friend,e_tweet,m_tweet,r_tweet = open_files()
    # make a mapping of the person's id to their index in the node list
    # The index will be the index of their x values when making the Data object
    id_to_in = {}
    in_to_id = list(follower.keys())
    i = 0
    for key in follower:
        id_to_in[key] = i
        i += 1
        
    print("getting edges")
    # get edges
    nodes1,nodes2,edge_label_train,edge_to_in_train = get_train_edges(follower,friend,id_to_in)
    nodes3,nodes4,edge_label_test,edge_to_in_test = get_test_edges(follower,friend,id_to_in)
    
    # make edge feature arrays
    num_edges_train = len(nodes1)
    num_edges_test = len(nodes3)
    edge_attr1 = [[0 for i in range(3)] for j in range(num_edges_train)]
    edge_attr2 = [[0 for i in range(3)] for j in range(num_edges_test)]
    
    # calculate the clustering coefficients and common neighbors
    cluster_coeffs,edge_attr1 = getTrainClustering(follower,friend,edge_attr1,edge_to_in_train)
    cluster_coeffs2,edge_attr2 = getTestClustering(follower,friend,edge_attr2,edge_to_in_test)
    print("exited nx graph calculations")
    
    # add retweets and mentions
    add_edge_tweets_train(edge_attr1,num_edges_train,edge_to_in_train,nodes1,nodes2,m_tweet,r_tweet)
    add_edge_tweets_train(edge_attr2,num_edges_test,edge_to_in_test,nodes3,nodes4,m_tweet,r_tweet)
    
    
    # make node feature arrays
    n = len(list(id_to_in.keys()))
    x = [[0 for i in range(2)] for j in range(n)]
    x2 = [[0 for i in range(2)] for j in range(n)]
    
    
    print("adding clustering to feature array")
    # add clustering coefficients to the feature arrays
    add_clustering(x,n,in_to_id,cluster_coeffs)
    add_clustering(x2,n,in_to_id,cluster_coeffs2)
    
    
    print("adding number of tweets")
    # add number of tweets per user
    add_tweets_train(x,n,in_to_id,e_tweet)
    add_tweets_test(x2,n,in_to_id,e_tweet)
    
    x = torch.tensor(x, dtype=torch.float)
    x2 = torch.tensor(x2, dtype=torch.float)
    edge_index = torch.tensor([nodes1,nodes2], dtype=torch.long)
    edge_index2 = torch.tensor([nodes3,nodes4], dtype=torch.long)
    n_edges = len(nodes1)
    n_edges2 = len(nodes3)
    edge_label = torch.tensor(edge_label_train, dtype=torch.float)
    edge_label2 = torch.tensor(edge_label_test, dtype=torch.float)
    edge_attr1 = torch.tensor(edge_attr1, dtype=torch.float)
    edge_attr2 = torch.tensor(edge_attr2, dtype=torch.float)
    
    train_data = Data(x=x, edge_index=edge_index, edge_label_index=edge_index, edge_label=edge_label,edge_attr=edge_attr1)
    test_data = Data(x=x2, edge_index=edge_index2, edge_label_index=edge_index2, edge_label=edge_label2,edge_attr=edge_attr2)
    
    print("Done")
    return (train_data,test_data)
    

In [11]:
train_data,test_data = make_data()

loading files
getting edges
Getting train common neighbors
Getting train clustering
Getting test common neighbors
Getting test clustering
exited nx graph calculations
adding clustering to feature array
adding number of tweets
Done


In [15]:
l = train_data.edge_label.tolist()
i = -1
print(l[-1])
print(len(l))
for j in range(len(l)):
    if i == -1 and l[j] == 1:
        i = j
print(i)

1.0
5715947
5682979


In [None]:
print(train_data.x.tolist()[:20])

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_data = train_data.to(device)


class Net(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        # heads = 8
        # droupout?, concat?
        self.conv1 = GATConv(in_channels, hidden_channels)
        self.conv2 = GATConv(hidden_channels, out_channels)

    def encode(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        return self.conv2(x, edge_index)

    def decode(self, z, edge_label_index):
        return (z[edge_label_index[0]] * z[edge_label_index[1]]).sum(dim=-1)

    def decode_all(self, z):
        prob_adj = z @ z.t()
        return (prob_adj > 0).nonzero(as_tuple=False).t()

num_features = 2
hidden_features = 8
model = Net(num_features, hidden_features, num_features).to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=.005)
criterion = torch.nn.BCEWithLogitsLoss()


def train():
    model.train()
    optimizer.zero_grad()
    z = model.encode(train_data.x, train_data.edge_index)

    # We perform a new round of negative sampling for every training epoch:
    neg_edge_index = negative_sampling(
        edge_index=train_data.edge_index, num_nodes=train_data.num_nodes,
        num_neg_samples=train_data.edge_label_index.size(1), method='sparse')

    edge_label_index = torch.cat(
        [train_data.edge_label_index, neg_edge_index],
        dim=-1,
    )
    edge_label = torch.cat([
        train_data.edge_label,
        train_data.edge_label.new_zeros(neg_edge_index.size(1))
    ], dim=0)

    out = model.decode(z, edge_label_index).view(-1)
    loss = criterion(out, edge_label)
    loss.backward()
    optimizer.step()
    return loss


@torch.no_grad()
def test(data):
    model.eval()
    z = model.encode(data.x, data.edge_index)
    out = model.decode(z, data.edge_label_index).view(-1).sigmoid()
    return roc_auc_score(data.edge_label.cpu().numpy(), out.cpu().numpy())

losses = []
best_test_auc = 0
for epoch in range(1, 501):
    loss = train()
    #val_auc = test(val_data)
    train_auc = test(train_data)
    test_auc = test(test_data)
    if test_auc > best_test_auc:
        best_test_auc = test_auc
    # print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Val: {val_auc:.4f}, '
    #       f'Test: {test_auc:.4f}')
    losses.append(loss)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_auc:.4f}, Test: {test_auc:.4f}')

print(f'Best Test: {best_test_auc:.4f}')

z = model.encode(test_data.x, test_data.edge_index)
final_edge_index = model.decode_all(z)

[[0.10000000149011612, 0.0], [1.0, 0.0], [0.4346197247505188, 5.0], [0.11166881769895554, 9.0], [0.0, 0.0], [0.0, 0.0], [0.2554112672805786, 7.0], [0.0, 0.0], [0.20346319675445557, 0.0], [0.4346764385700226, 0.0], [0.0, 14.0], [0.0679602101445198, 21.0], [0.20705881714820862, 0.0], [0.08176100999116898, 4.0], [1.0, 0.0], [0.4000000059604645, 0.0], [0.1428571492433548, 0.0], [0.12946158647537231, 1.0], [0.0, 0.0], [0.07621333748102188, 1.0]]
Epoch: 001, Loss: 179.9043, Train: 0.5449, Test: 0.5687
Epoch: 002, Loss: 163.1438, Train: 0.5492, Test: 0.5694
Epoch: 003, Loss: 148.5969, Train: 0.5507, Test: 0.5709
Epoch: 004, Loss: 135.9909, Train: 0.5525, Test: 0.5743
Epoch: 005, Loss: 124.5348, Train: 0.5534, Test: 0.5773
Epoch: 006, Loss: 114.4591, Train: 0.5544, Test: 0.5800
Epoch: 007, Loss: 105.4352, Train: 0.5554, Test: 0.5812
Epoch: 008, Loss: 97.2333, Train: 0.5558, Test: 0.5827
Epoch: 009, Loss: 89.8420, Train: 0.5561, Test: 0.5840
Epoch: 010, Loss: 83.0518, Train: 0.5560, Test: 0.585

In [None]:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_data = train_data.to(device)


class Net(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels,edge_dim):
        super().__init__()
        # heads = 8
        # droupout?, concat?
        dropout = .2
        self.conv1 = GATConv(in_channels, hidden_channels,edge_dim=edge_dim,dropout=dropo)
        self.conv2 = GATConv(hidden_channels, out_channels,edge_dim=edge_dim)

    def encode(self, x, edge_index, edge_attr):
        x = self.conv1(x, edge_index, edge_attr).relu()
        return self.conv2(x, edge_index, edge_attr)

    def decode(self, z, edge_label_index):
        return (z[edge_label_index[0]] * z[edge_label_index[1]]).sum(dim=-1)

    def decode_all(self, z):
        prob_adj = z @ z.t()
        return (prob_adj > 0).nonzero(as_tuple=False).t()

num_features = 2
edge_dim = 3
hidden_features = 8
model = Net(num_features, hidden_features, num_features,edge_dim).to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=.001)
criterion = torch.nn.BCEWithLogitsLoss()


def train():
    model.train()
    optimizer.zero_grad()
    z = model.encode(train_data.x, train_data.edge_index, train_data.edge_attr)

    # We perform a new round of negative sampling for every training epoch:
    neg_edge_index = negative_sampling(
        edge_index=train_data.edge_index, num_nodes=train_data.num_nodes,
        num_neg_samples=train_data.edge_label_index.size(1), method='sparse')

    edge_label_index = torch.cat(
        [train_data.edge_label_index, neg_edge_index],
        dim=-1,
    )
    edge_label = torch.cat([
        train_data.edge_label,
        train_data.edge_label.new_zeros(neg_edge_index.size(1))
    ], dim=0)

    out = model.decode(z, edge_label_index).view(-1)
    loss = criterion(out, edge_label)
    loss.backward()
    optimizer.step()
    return loss


@torch.no_grad()
def test(data):
    model.eval()
    z = model.encode(data.x, data.edge_index, data.edge_attr)
    out = model.decode(z, data.edge_label_index).view(-1).sigmoid()
    return roc_auc_score(data.edge_label.cpu().numpy(), out.cpu().numpy())

losses = []
best_test_auc = 0
for epoch in range(1, 501):
    loss = train()
    #val_auc = test(val_data)
    train_auc = test(train_data)
    test_auc = test(test_data)
    if test_auc > best_test_auc:
        best_test_auc = test_auc
    # print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Val: {val_auc:.4f}, '
    #       f'Test: {test_auc:.4f}')
    losses.append(loss)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_auc:.4f}, Test: {test_auc:.4f}')

print(f'Best Test: {best_test_auc:.4f}')

z = model.encode(test_data.x, test_data.edge_index, test_data.edge_attr)
final_edge_index = model.decode_all(z)

Epoch: 001, Loss: 427.9030, Train: 0.4990, Test: 0.5312
Epoch: 002, Loss: 390.4045, Train: 0.5002, Test: 0.5333
Epoch: 003, Loss: 361.9710, Train: 0.5008, Test: 0.5345
Epoch: 004, Loss: 331.7034, Train: 0.5014, Test: 0.5365
Epoch: 005, Loss: 309.8726, Train: 0.5025, Test: 0.5385
Epoch: 006, Loss: 283.4200, Train: 0.5032, Test: 0.5405
Epoch: 007, Loss: 239.7411, Train: 0.5019, Test: 0.5412
Epoch: 008, Loss: 204.8929, Train: 0.5023, Test: 0.5426
Epoch: 009, Loss: 184.6708, Train: 0.5029, Test: 0.5442
Epoch: 010, Loss: 164.5259, Train: 0.5017, Test: 0.5450
Epoch: 011, Loss: 154.6784, Train: 0.5028, Test: 0.5448
Epoch: 012, Loss: 145.3862, Train: 0.5011, Test: 0.5450
Epoch: 013, Loss: 136.8086, Train: 0.5009, Test: 0.5449
Epoch: 014, Loss: 129.5165, Train: 0.5013, Test: 0.5453
Epoch: 015, Loss: 121.9577, Train: 0.5003, Test: 0.5470
Epoch: 016, Loss: 104.2763, Train: 0.5006, Test: 0.5484
Epoch: 017, Loss: 74.0545, Train: 0.5001, Test: 0.5491
Epoch: 018, Loss: 68.0525, Train: 0.4995, Test: 0