In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import csv

<h3> Setup </h3>

Setup the graph and add the attributes in for each node

In [2]:
%%time
network_df = pd.read_csv('C:/Users/Chris/Desktop/SI671Data/network.tsv', sep='\t', names=['node', 'edge'])
G = nx.from_pandas_edgelist(network_df, source='node', target='edge')

Wall time: 3min 57s


In [3]:
%%time
node_attr_dict = {}
with open("C:/Users/Chris/Desktop/SI671Data/labeled-vertices.train.tsv") as tsvfile:
    tsvreader = csv.reader(tsvfile, delimiter="\t")
    #line[0] is the node
    #line[1] is the attributes
    for line in tsvreader:
        attr_dict = {}
        for attr in line[1].split(" "):
            attr_split = attr.split(':')
            attr_dict[attr_split[0]] = attr_split[1]
            
        #create dict of form
        #{node_id: {attr1:val1, attr2:val2}, node_id2..}
        node_attr_dict[int(line[0])] = attr_dict
len(node_attr_dict)

Wall time: 23 s


In [4]:
%%time
nx.set_node_attributes(G, node_attr_dict)

Wall time: 47.3 s


<h3> Attr Prediction </h3>

Get the nodes we want to predict on

In [143]:
test_nodes = []

with open ('C:/Users/Chris/Desktop/SI671Data/unlabeled-vertices.test.txt') as file:
    reader = csv.reader(file)
    for line in reader:
        test_nodes.append(int(line[0]))

len(test_nodes)

662675

Goal is to build a nearest-neighbor-esque algorithm.  If a node has lots of friends (direct links) we will let their direct links vote on the top attributes.  If a node has few friends, we will look at second degree links and let them vote.  

In [18]:
def GetAttributeVotes(node):
    from collections import Counter
    attrs = []
    
    #if the node has at least 3 friends, let them vote on the attributes
    if len([n for n in G[node]]) > 3: 
        for n in G[node]:
            for a in G.node[n].items():
                attrs.append(a)
    else: #since they had less than 3 friends, let's go one step out and let second degree connections vote
        for n in G[node]:
            #for neighbors of neighbors
            for n_2 in G[n]:
                for a in G.node[n].items():
                    attrs.append(a)
                
    #for the results we got back - sort them
    counts = sorted([(count,) + item for item, count in Counter(attrs).items()], key=lambda tup: tup[0], reverse = True)
    #and now lets take only the top results for each attribute
    attr_dict = {}
    for tup in counts:
        if tup[1] not in attr_dict:
            attr_dict[tup[1]] = tup[2]
        else:
            #attr already in dict - lets count if this value has higher votes
            if int(attr_dict[tup[1]]) > int(tup[0]):
                pass
            else:
                #reset the value of the attribute to the new value
                attr_dict[tup[1]] = tup[2]
    return attr_dict

def PredictAttributes(G_whole, test_nodes, max_iter):
    i = 0
    attr_preds = {}
    for node in test_nodes:
        if i<max_iter:          
            attributes = GetAttributeVotes(node)

            attr_preds[node] = attributes
            i+=1
    return attr_preds

In [None]:
%%time
vals = PredictAttributes(G, test_nodes, len(test_nodes))

Lets clean this up to be able to submit it to Kaggle.  Need it to be in a space delimited list, with 2 columns 'id, attr'

In [87]:
with open("attr_pred_submission.csv", 'w') as out:
    out.write("id,attr")
    out.write('\r')
    for key, val in vals.items():
        out.write(str(key) + "," + " ".join(k+":"+v for k,v in val.items()))
        out.write('\r')

<h3> Homophily </h3>

Here we want to determine a measure of how well do similar nodes group together across the network.  I am taking a simple approach of just looking at nearby neighbors and seeing how similar are:
 - their attributes
 - the values associated with the attributes
 
I do this by simply creating ratios if the nearby nodes have the same attr/value pairs as the node we are examining and then averaging across a subset of the network.

In [132]:
%%time
#take a sample of the network since it's big!
import random

sample_size = int(len(G.node)*.01)

sampled_nodes = random.sample(G.nodes, sample_size)

Wall time: 427 ms


In [133]:
%%time
class_sim_scores = []
#we want to identify:
    #for a given node, what % of it's neighbors have the same attributes
    #for a given node, for the neighbors that share the same attributes, what % of those are the same?

for node in sampled_nodes:
    similar_neighbors = .00001 #to offset divide by 0 issues
    similar_values = 0
    #we will use sets to compare if the nodes have the same attribute values
    node_attr_set = set(G.node[node].values())
    for n in G.neighbors(node):
        try: 
            #if the node has the same keys (attrs) as it's neighbor, log it
            if G.node[node].keys() == G.node[n].keys():
                similar_neighbors += 1
                
                #since we know the two nodes have the same attributes,
                #check if the attributes are the same
                n_attr_set = set(G.node[n].values())
                
                if node_attr_set == n_attr_set:
                    similar_values += 1            
        except:
            pass
    
    #for easier understanding - we are storing the class_type (what attributes the node had)
    #the class ratio - neighbors with similar attr/all neighbors
    #value ratio - similar neighbors with similar values/similar neighbors
    class_type = ''.join(str(elem) for elem in list(G.node[node]))
    similar_class_ratio = similar_neighbors/len(list(G.neighbors(node)))
    similar_val_ratio = similar_values/similar_neighbors
        
    class_sim_scores.append((class_type, similar_class_ratio, similar_val_ratio))


Wall time: 58.8 s


In [134]:
class_sim_df = pd.DataFrame.from_records(class_sim_scores, columns=['Class', '%SimilarNeighbors', '%SimilarValues'])

In [135]:
class_sim_df.head()

Unnamed: 0,Class,%SimilarNeighbors,%SimilarValues
0,T0T1,0.66667,0.0
1,T0T1,0.333337,0.99999
2,T0T1,0.836364,0.007246
3,T0T1,1.000001,0.285714
4,T0T1,0.96875,0.064516


In [136]:
class_sim_df.groupby('Class').agg({'%SimilarNeighbors' : 'mean', '%SimilarValues' : 'mean', 'Class' : 'count'})

Unnamed: 0_level_0,%SimilarNeighbors,%SimilarValues,Class
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,0.104552,0.140255,13240
T0T1,0.915477,0.485813,51602
T0T1T8,0.089678,0.12603,1425


<h3> Development Dataset - Testing Predictions </h3>

In [6]:
%%time
dev_df = pd.read_csv('C:/Users/Chris/Desktop/SI671Data/labeled-vertices.dev.tsv', sep='\t', names=['node', 'attr'])

Wall time: 629 ms


In [7]:
dev_df_s = dev_df.sample(frac=.1)

In [8]:
nodes_to_test = list(dev_df_s['node'])

60% of the nodes in the dev sample only have 1 connection.  This is at odds with the actual network.  Need to branch out probably.

In [75]:
test3 = pd.DataFrame(list(G.degree(nodes_to_test)), columns = ['node', 'degree'])
test3 = test3.groupby('degree').count()
test3['percents'] = test3['node']/np.sum(test3['node'])
test3.head()

Unnamed: 0_level_0,node,percents
degree,Unnamed: 1_level_1,Unnamed: 2_level_1
1,422573,0.637678
2,147581,0.222705
3,56159,0.084746
4,21504,0.03245
5,8716,0.013153


Predicting attributes with this approach might not work very well because 30% of the nodes in the network only have 1 connection.  Maybe go out to second-level connections for them?

In [61]:
test = pd.DataFrame(list(G.degree()), columns = ['node', 'degree'])

In [67]:
test2 = test.groupby('degree').count()
test2['percents'] = test2['node']/np.sum(test2['node'])
test2.head()

Unnamed: 0_level_0,node,percents
degree,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2014881,0.304053
2,903945,0.136408
3,570686,0.086118
4,406171,0.061293
5,312170,0.047108


In [10]:
%%time
preds = PredictAttributes(G, test_nodes=nodes_to_test, max_iter = len(nodes_to_test))

Wall time: 1min 57s


Now that we have predictions, we need labeled data in the right format to compare it to.  Convert labeled data to a dict

In [12]:
%%time
def ConvertAttrsToDict(node, attrs):
    labeled_dict = {}
    #node = "662674"
    #test = "T0:0 T1:629"

    vals = dict(item.split(":") for item in attrs.split(" "))

    labeled_dict[node] = vals
    
    return labeled_dict

test = dev_df_s.apply(lambda row : ConvertAttrsToDict(row['node'], row['attr']), axis=1)

#convert to dictionary where node is the key for comparision/scoring
labeled_dict = {}
for val in test.values:
    for k, v in val.items():
        labeled_dict[k] = v
    
labeled_dict

Wall time: 2.16 s


In [53]:
results = [Score(dic, labeled_dict[node]) for node, dic in preds.items()]


In [68]:
result_list = []
for result in results:
    for r in result:
        result_list.append(r)
results_df = pd.DataFrame(result_list, columns=['result', 'attr'])
results_grouped = results_df.groupby(['result', 'attr'])[['attr']].count()
results_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,attr
result,attr,Unnamed: 2_level_1
1,T0,38525
1,T1,38525
1,T8,64
2,T0,874
2,T1,17215
2,T8,32
3,T8,5005


In [70]:
results_df.groupby('attr').count()

Unnamed: 0_level_0,result
attr,Unnamed: 1_level_1
T0,39399
T1,55740
T8,5101


In [52]:
def Score(dict1, dict2):
    '''
    returns: a list of tuples that indicates what the error was and what attrs the error was on
    list((1,correct attrs)) - got the prediction right!
    list((2,wrong values)) - got the attrs right, but values wrong
    list((3,wrong attrs)) - got the attrs wrong
    '''
    if dict1.keys() == dict2.keys():
        #the attributes are the same!
        if set(dict1.values()) == set(dict2.values()):
            #the values are the same! correct prediction
            correct_attrs = [(1,attr) for attr in list(dict1.keys())]
            return correct_attrs
        else:
            wrong_vals = []
            for attr in dict1.keys():
                #if the values arent equal, record what attr we messed up
                if dict1[attr] != dict2[attr]:
                    wrong_vals.append((2,attr))
            return(wrong_vals)
    else:
        #where were the keys different?
        wrong_attrs = [(3,attr) for attr in list(set(dict1.keys())-set(dict2.keys()))]
        return wrong_attrs 