In [1]:
import pickle
import torch
from torch_geometric.data import Data
import networkx as nx
from sklearn.metrics import roc_auc_score
import torch_geometric.transforms as T
from torch_geometric.nn import GATConv
from torch_geometric.datasets import Planetoid
from torch_geometric.utils import negative_sampling
import random


In [2]:

def open_files():
    with open('./Unfollower/15weeks_follower_dict.pkl', 'rb') as f:
        follower = pickle.load(f)

    with open('./Unfollower/15weeks_friend_dict.pkl', 'rb') as f:
        friend = pickle.load(f)

    with open('./Unfollower/e_tweet_dict.pkl', 'rb') as f:
        e_tweet = pickle.load(f)

    with open('./Unfollower/m_tweet_dict.pkl', 'rb') as f:
        m_tweet = pickle.load(f)

    with open('./Unfollower/r_tweet_dict.pkl', 'rb') as f:
        r_tweet = pickle.load(f)
        
    return (follower,friend,e_tweet,m_tweet,r_tweet)

In [3]:
train_range = (0,9)
test_range = (10,14)


# make a networkx graph in order to find the clustering coefficient of the nodes
def getTrainClustering(follower,friend,edge_attr,edge_to_in,node_set):
    G = nx.Graph()
    for key in follower:
        if len(follower[key][train_range[1]]) == 2 and key in node_set:
            for f in follower[key][train_range[1]][1]:
                if f in node_set:
                    G.add_edge(key,f)

    for key in friend:
        if len(friend[key][train_range[1]]) == 2 and key in node_set:
            for f in friend[key][train_range[1]][1]:
                if f in node_set:
                    G.add_edge(f,key)
              
    print("Getting train common neighbors")
    #i = 0
    for key in edge_to_in:
        if edge_attr[edge_to_in[key]][0] == 0.0 and key[0] in G and key[1] in G:
            neighbors = sum(1 for _ in nx.common_neighbors(G,key[0],key[1]))
            edge_attr[edge_to_in[key]][0] = neighbors
            if (key[1],key[0]) in edge_to_in:
                edge_attr[edge_to_in[(key[1],key[0])]][0] = neighbors
    print("Getting train clustering")  
    cluster_coeffs = nx.clustering(G)
    return (cluster_coeffs,edge_attr)


    # make another for the test set
def getTestClustering(follower,friend,edge_attr,edge_to_in,node_set):
    G2 = nx.Graph()
    for key in follower:
        if len(follower[key][test_range[1]]) == 2 and key in node_set:
            for f in follower[key][test_range[1]][1]:
                if f in node_set:
                    G2.add_edge(key,f)

    for key in friend:
        if len(friend[key][test_range[1]]) == 2 and key in node_set:
            for f in friend[key][test_range[1]][1]:
                if f in node_set:
                    G2.add_edge(f,key)
    
    print("Getting test common neighbors")
    for key in edge_to_in:
        if edge_attr[edge_to_in[key]][0] == 0.0 and key[0] in G2 and key[1] in G2:
            neighbors = sum(1 for _ in nx.common_neighbors(G2,key[0],key[1]))
            edge_attr[edge_to_in[key]][0] = neighbors
            if (key[1],key[0]) in edge_to_in:
                edge_attr[edge_to_in[(key[1],key[0])]][0] = neighbors       
    print("Getting test clustering")     
    cluster_coeffs = nx.clustering(G2)
    return (cluster_coeffs,edge_attr)


In [4]:
def get_train_edges(follower,friend,id_to_in,node_set):
    nodes1 = []
    nodes2 = []

    # mapping of edge tuple to index in edge_index tensor
    edge_to_in = {}

    # Create edge lists for the train Data object
    for key in follower:
        if len(follower[key][train_range[1]]) == 2 and key in node_set:
            for f in follower[key][train_range[1]][1]:
                if f in node_set:
                    edge_to_in[(key,f)] = len(nodes1)
                    nodes1.append(id_to_in[key])
                    nodes2.append(id_to_in[f])

    for key in friend:
        if len(friend[key][train_range[1]]) == 2 and key in node_set:
            for f in friend[key][train_range[1]][1]:
                if f in node_set:
                    edge_to_in[(f,key)] = len(nodes1)
                    nodes1.append(id_to_in[f])
                    nodes2.append(id_to_in[key])
                
    edge_label = [0.0]*len(nodes1)
    
    # find edges that have been removed
    for key in follower:
        if len(follower[key][train_range[0]]) == 2 and key in node_set:
            for f in follower[key][train_range[0]][1]:
                if (key,f) not in edge_to_in and f in node_set:
                    edge_label.append(1.0)
                    edge_to_in[(key,f)] = len(nodes1)
                    nodes1.append(id_to_in[key])
                    nodes2.append(id_to_in[f])
                    
    for key in friend:
        if len(friend[key][train_range[0]]) == 2 and key in node_set:
            for f in friend[key][train_range[0]][1]:
                if (f,key) not in edge_to_in and f in node_set:
                    edge_label.append(1.0)
                    edge_to_in[(f,key)] = len(nodes1)
                    nodes1.append(id_to_in[f])
                    nodes2.append(id_to_in[key])
                    
    return (nodes1,nodes2,edge_label,edge_to_in)

def get_test_edges(follower,friend,id_to_in,node_set):
    nodes3 = []
    nodes4 = []
    edge_to_in2 = {}


    # Create edge lists for the test Data object
    for key in follower:
        if len(follower[key][test_range[1]]) == 2 and key in node_set:
            for f in follower[key][test_range[1]][1]:
                if f in node_set:
                    edge_to_in2[(key,f)] = len(nodes3)
                    nodes3.append(id_to_in[key])
                    nodes4.append(id_to_in[f])

    for key in friend:
        if len(friend[key][test_range[1]]) == 2 and key in node_set:
            for f in friend[key][test_range[1]][1]:
                if f in node_set:
                    edge_to_in2[(f,key)] = len(nodes3)
                    nodes3.append(id_to_in[f])
                    nodes4.append(id_to_in[key])
                
    edge_label = [0.0]*len(nodes3)
    
    # find edges that have been removed
    for key in follower:
        if len(follower[key][test_range[0]]) == 2 and key in node_set:
            for f in follower[key][test_range[0]][1]:
                if (key,f) not in edge_to_in2 and f in node_set:
                    edge_label.append(1.0)
                    edge_to_in2[(key,f)] = len(nodes3)
                    nodes3.append(id_to_in[key])
                    nodes4.append(id_to_in[f])
                    
    for key in friend:
        if len(friend[key][test_range[0]]) == 2 and key in node_set:
            for f in friend[key][test_range[0]][1]:
                if (f,key) not in edge_to_in2 and f in node_set:
                    edge_label.append(1.0)
                    edge_to_in2[(f,key)] = len(nodes3)
                    nodes3.append(id_to_in[f])
                    nodes4.append(id_to_in[key])
                
    return (nodes3,nodes4,edge_label,edge_to_in2)

In [5]:

# put clustering coefficients in feature array

def add_clustering(x,n,in_to_id,cluster_coeffs):
    for i in range(n):
        key = str(in_to_id[i])
        if key in cluster_coeffs:
            x[i][0] = cluster_coeffs[key]
        


In [6]:
# add number of tweets to feature list x

def add_tweets_train(x,n,in_to_id,e_tweet,node_set):
    for i in range(n):
        key = in_to_id[i]
        x[i][1] = 0
        for j in range(train_range[1]):
            if key in e_tweet['train'][j+1]:
                x[i][1] += len(e_tweet['train'][j+1][key])
                
def add_tweets_test(x2,n,in_to_id,e_tweet,node_set):
    for i in range(n):
        key = in_to_id[i]
        x2[i][1] = 0
        for j in range(test_range[0],test_range[1]):
            if key in e_tweet['test'][j+1]:
                x2[i][1] += len(e_tweet['test'][j+1][key])

In [7]:
def add_edge_tweets_train(edge_attr,n,edge_to_in,nodes1,nodes2,m_tweet,r_tweet):
    for key in edge_to_in:
        for j in range(train_range[1]):
            if key[0] in m_tweet['train'][j+1] and key[1] in m_tweet['train'][j+1][key[0]]:
                edge_attr[edge_to_in[key]][1] += len(m_tweet['train'][j+1][key[0]][key[1]])
            if key[0] in r_tweet['train'][j+1] and key[1] in r_tweet['train'][j+1][key[0]]:
                edge_attr[edge_to_in[key]][1] += len(r_tweet['train'][j+1][key[0]][key[1]])
                
def add_edge_tweets_test(edge_attr,n,edge_to_in,nodes1,nodes2,m_tweet,r_tweet):
    for key in edge_to_in:
        for j in range(test_range[0],test_range[1]):
            if key[0] in m_tweet['test'][j+1] and key[1] in m_tweet['test'][j+1][key[0]]:
                edge_attr[edge_to_in[key]][1] += len(m_tweet['test'][j+1][key[0]][key[1]])
            if key[0] in r_tweet['test'][j+1] and key[1] in r_tweet['test'][j+1][key[0]]:
                edge_attr[edge_to_in[key]][1] += len(r_tweet['test'][j+1][key[0]][key[1]])

In [8]:
def make_data():
    print("loading files")
    follower,friend,e_tweet,m_tweet,r_tweet = open_files()
    # make a mapping of the person's id to their index in the node list
    # The index will be the index of their x values when making the Data object
    id_to_in1 = {}
    in_to_id1 = []
    id_to_in2 = {}
    in_to_id2 = []
    i1 = 0
    i2 = 0
    # minimum followers needed to be added to the graph
    min_count = 1000
    train_nodes = set()
    test_nodes = set()
    for key in follower:
        if len(follower[key][train_range[1]]) == 2 and len(follower[key][train_range[1]][1]) > min_count:
            id_to_in1[key] = i1
            i1 += 1
            train_nodes.add(key)
            in_to_id1.append(key)
        if len(follower[key][test_range[1]]) == 2 and len(follower[key][test_range[1]][1]) > min_count:
            id_to_in2[key] = i2
            i2 += 1
            test_nodes.add(key)
            in_to_id2.append(key)
    
    print("getting edges")
    # get edges
    nodes1,nodes2,edge_label_train,edge_to_in_train = get_train_edges(follower,friend,id_to_in1,train_nodes)
    nodes3,nodes4,edge_label_test,edge_to_in_test = get_test_edges(follower,friend,id_to_in2,test_nodes)
    
    # make edge feature arrays
    num_edges_train = len(nodes1)
    num_edges_test = len(nodes3)
    edge_attr1 = [[0 for i in range(2)] for j in range(num_edges_train)]
    edge_attr2 = [[0 for i in range(2)] for j in range(num_edges_test)]
    
    # calculate the clustering coefficients and common neighbors
    cluster_coeffs,edge_attr1 = getTrainClustering(follower,friend,edge_attr1,edge_to_in_train,train_nodes)
    cluster_coeffs2,edge_attr2 = getTestClustering(follower,friend,edge_attr2,edge_to_in_test,test_nodes)
    print("exited nx graph calculations")
    
    # add retweets and mentions
    add_edge_tweets_train(edge_attr1,num_edges_train,edge_to_in_train,nodes1,nodes2,m_tweet,r_tweet)
    add_edge_tweets_train(edge_attr2,num_edges_test,edge_to_in_test,nodes3,nodes4,m_tweet,r_tweet)
    
    
    # make node feature arrays
    n_train = len(in_to_id1)
    n_test = len(in_to_id2)
    x = [[0 for i in range(2)] for j in range(n_train)]
    x2 = [[0 for i in range(2)] for j in range(n_test)]
    
    
    print("adding clustering to feature array")
    # add clustering coefficients to the feature arrays
    add_clustering(x,n_train,in_to_id1,cluster_coeffs)
    add_clustering(x2,n_test,in_to_id2,cluster_coeffs2)
    
    
    print("adding number of tweets")
    # add number of tweets per user
    add_tweets_train(x,n_train,in_to_id1,e_tweet,train_nodes)
    add_tweets_test(x2,n_test,in_to_id2,e_tweet,test_nodes)
    
    x = torch.tensor(x, dtype=torch.float)
    x2 = torch.tensor(x2, dtype=torch.float)
    edge_index = torch.tensor([nodes1,nodes2], dtype=torch.long)
    edge_index2 = torch.tensor([nodes3,nodes4], dtype=torch.long)
    edge_label = torch.tensor(edge_label_train, dtype=torch.float)
    edge_label2 = torch.tensor(edge_label_test, dtype=torch.float)
    edge_attr1 = torch.tensor(edge_attr1, dtype=torch.float)
    edge_attr2 = torch.tensor(edge_attr2, dtype=torch.float)
    
    train_data = Data(x=x, edge_index=edge_index, edge_label_index=edge_index, edge_label=edge_label,edge_attr=edge_attr1)
    test_data = Data(x=x2, edge_index=edge_index2, edge_label_index=edge_index2, edge_label=edge_label2,edge_attr=edge_attr2)
    
    print("Done")
    return (train_data,test_data)
    

In [9]:
train_data,test_data = make_data()

loading files
getting edges
Getting train common neighbors
Getting train clustering
Getting test common neighbors
Getting test clustering
exited nx graph calculations
adding clustering to feature array
adding number of tweets
Done


In [10]:
l = train_data.edge_label.tolist()
pos_edges = -1
print(train_data.edge_label_index.size(1))
print(l[-1])
for j in range(len(l)):
    if pos_edges == -1 and l[j] == 1:
        pos_edges = j
print(pos_edges)
print(len(l))


9232
1.0
9214
9232


In [None]:
print(train_data.x.tolist()[:20])

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_data = train_data.to(device)


class Net(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        # heads = 8
        dropout = .2
        self.conv1 = GATConv(in_channels, hidden_channels, dropout=dropout)
        self.conv2 = GATConv(hidden_channels, out_channels, dropout=dropout)

    def encode(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        return self.conv2(x, edge_index)

    def decode(self, z, edge_label_index):
        return (z[edge_label_index[0]] * z[edge_label_index[1]]).sum(dim=-1)

    def decode_all(self, z):
        prob_adj = z @ z.t()
        return (prob_adj > 0).nonzero(as_tuple=False).t()

num_features = 2
hidden_features = 8
model = Net(num_features, hidden_features, num_features).to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=.0001)
criterion = torch.nn.BCEWithLogitsLoss()


def train():
    model.train()
    optimizer.zero_grad()
    z = model.encode(train_data.x, train_data.edge_index)

    length = len(train_data.edge_index[0])-pos_edges
    neg_edge_index = torch.tensor([[0 for _ in range(length)] for _ in range(2)],dtype=torch.long)
    indices = [i for i in range(pos_edges)]
    sample = random.sample(indices,length)
    for i in range(length):
        neg_edge_index[0][i] = train_data.edge_index[0][sample[i]]
        neg_edge_index[1][i] = train_data.edge_index[1][sample[i]]
    
    edge_label_index = torch.cat(
        [train_data.edge_label_index[:,pos_edges:], neg_edge_index],
        dim=-1,
    )

    edge_label = torch.tensor([1.0]*length+[0.0]*length,dtype=torch.float)
    out = model.decode(z, edge_label_index).view(-1)
    loss = criterion(out, edge_label)
    loss.backward()
    optimizer.step()
    return loss


@torch.no_grad()
def test(data):
    model.eval()
    z = model.encode(data.x, data.edge_index)
    out = model.decode(z, data.edge_label_index).view(-1).sigmoid()
    return roc_auc_score(data.edge_label.cpu().numpy(), out.cpu().numpy())

losses = []
best_test_auc = 0
for epoch in range(1, 501):
    loss = train()
    #val_auc = test(val_data)
    train_auc = test(train_data)
    test_auc = test(test_data)
    if test_auc > best_test_auc:
        best_test_auc = test_auc
    # print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Val: {val_auc:.4f}, '
    #       f'Test: {test_auc:.4f}')
    losses.append(loss)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_auc:.4f}, Test: {test_auc:.4f}')

print(f'Best Test: {best_test_auc:.4f}')

z = model.encode(test_data.x, test_data.edge_index)
final_edge_index = model.decode_all(z)

[[0.10000000149011612, 0.0], [1.0, 0.0], [0.4346197247505188, 5.0], [0.11166881769895554, 9.0], [0.0, 0.0], [0.0, 0.0], [0.2554112672805786, 7.0], [0.0, 0.0], [0.20346319675445557, 0.0], [0.4346764385700226, 0.0], [0.0, 14.0], [0.0679602101445198, 21.0], [0.20705881714820862, 0.0], [0.08176100999116898, 4.0], [1.0, 0.0], [0.4000000059604645, 0.0], [0.1428571492433548, 0.0], [0.12946158647537231, 1.0], [0.0, 0.0], [0.07621333748102188, 1.0]]
Epoch: 001, Loss: 9.3419, Train: 0.5345, Test: 0.5784
Epoch: 002, Loss: 13.5071, Train: 0.5345, Test: 0.5783
Epoch: 003, Loss: 7.9284, Train: 0.5344, Test: 0.5783
Epoch: 004, Loss: 10.2939, Train: 0.5344, Test: 0.5783
Epoch: 005, Loss: 9.8050, Train: 0.5343, Test: 0.5783
Epoch: 006, Loss: 8.4763, Train: 0.5343, Test: 0.5783
Epoch: 007, Loss: 10.4235, Train: 0.5342, Test: 0.5782
Epoch: 008, Loss: 8.0204, Train: 0.5342, Test: 0.5782
Epoch: 009, Loss: 9.6446, Train: 0.5342, Test: 0.5782
Epoch: 010, Loss: 8.1146, Train: 0.5341, Test: 0.5782
Epoch: 011, 

In [None]:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_data = train_data.to(device)


class Net(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels,edge_dim):
        super().__init__()
        # heads = 8
        # droupout?, concat?
        dropout = 0.4
        self.conv1 = GATConv(in_channels, hidden_channels,edge_dim=edge_dim,dropout=dropout)
        self.conv2 = GATConv(hidden_channels, out_channels,edge_dim=edge_dim,dropout=dropout)

    def encode(self, x, edge_index, edge_attr):
        x = self.conv1(x, edge_index, edge_attr=edge_attr).relu()
        return self.conv2(x, edge_index, edge_attr=edge_attr)

    def decode(self, z, edge_label_index):
        return (z[edge_label_index[0]] * z[edge_label_index[1]]).sum(dim=-1)

    def decode_all(self, z):
        prob_adj = z @ z.t()
        return (prob_adj > 0).nonzero(as_tuple=False).t()

num_features = 2
edge_dim = 2
hidden_features = 128
out_features = 64
model = Net(num_features, hidden_features, out_features,edge_dim).to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=.00001)
criterion = torch.nn.BCEWithLogitsLoss()


def train():
    model.train()
    optimizer.zero_grad()
    z = model.encode(train_data.x, train_data.edge_index, train_data.edge_attr)

    length = len(train_data.edge_index[0])-pos_edges
    neg_edge_index = torch.tensor([[0 for _ in range(length)] for _ in range(2)],dtype=torch.long)
    indices = [i for i in range(pos_edges)]
    sample = random.sample(indices,length)
    for i in range(length):
        neg_edge_index[0][i] = train_data.edge_index[0][sample[i]]
        neg_edge_index[1][i] = train_data.edge_index[1][sample[i]]
    
    edge_label_index = torch.cat(
        [train_data.edge_label_index[:,pos_edges:], neg_edge_index],
        dim=-1,
    )

    edge_label = torch.tensor([1.0]*length+[0.0]*length,dtype=torch.float)
    out = model.decode(z, edge_label_index).view(-1)
    loss = criterion(out, edge_label)
    loss.backward()
    optimizer.step()
    return loss

@torch.no_grad()
def test(data):
    model.eval()
    z = model.encode(data.x, data.edge_index, data.edge_attr)
    out = model.decode(z, data.edge_label_index).view(-1).sigmoid()
    return roc_auc_score(data.edge_label.cpu().numpy(), out.cpu().numpy())


best_test_auc = 0
for epoch in range(1, 5001):
    if epoch == 100:
        optimizer = torch.optim.Adam(params=model.parameters(), lr=.00001)
    loss = train()
    if (epoch-1) % 100 == 0:
        train_auc = test(train_data)
        test_auc = test(test_data)
        if test_auc > best_test_auc:
            best_test_auc = test_auc
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_auc:.4f}, Test: {test_auc:.4f}')
    # print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

print(f'Best Test: {best_test_auc:.4f}')

z = model.encode(test_data.x, test_data.edge_index, test_data.edge_attr)
final_edge_index = model.decode_all(z)

Epoch: 001, Loss: 329.2676, Train: 0.5098, Test: 0.5097
