In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from ipywidgets import IntProgress
from IPython.display import display
import time
from subsampling import get_samples_from_relation_with_chunks
import os

In [2]:
# For each relation 1..7: build a weighted directed graph, compute its PageRank and
# attach the score to `usersdata` as column `pagerank_<i>`.
# The per-relation node table is cached in graphs/pagerank_rel_<i>.csv; when that file
# already exists the graph is not rebuilt and the scores are read back from the CSV.
usersdata = pd.read_csv('data/usersdata.csv', delimiter = '\t', names = ['userId', 'sex', 'timePassedValidation', 'ageGroup', 'label'])
GraphPath = 'graphs/'
ratio = 0.2
# Relations handled through get_samples_from_relation_with_chunks instead of a full load.
sampled_relations = [3, 4, 5, 6]
for i in range(1,8):
    print('for relation {}...'.format(i))
    is_sampled = i in sampled_relations
    pagerank_col = 'pagerank_' + str(i)
    relation_path = 'data/relation_' + str(i) + '.csv'

    if is_sampled:
        output = GraphPath + 'ratio_02_rel_' + str(i) + '.gexf'
    else:
        output = GraphPath + 'full_rel_' + str(i) + '.gexf'

    output_nodes = GraphPath + 'pagerank_rel_'+str(i)+'.csv'

    existence = os.path.isfile(output_nodes)
    if existence:
        print('retrieve existing nodes with pagerank...')
        nodes = pd.read_csv(output_nodes)
    else:
        if is_sampled:
            # The helper is expected to return a node table (with `node_idx`, `userId`, ...)
            # and an edge list (with `node_idx`, `node_idx_dst`, `weight`) -- TODO confirm
            # against subsampling.get_samples_from_relation_with_chunks.
            nodes, edgelist = get_samples_from_relation_with_chunks(relation_path, num_of_nodes = int(np.floor(ratio*len(usersdata))), chunksize = 200000)
        else:
            # Collapse repeated (src, dst) pairs by summing their time_ms.
            relation_df = pd.read_csv(relation_path, delimiter = ',').groupby(['src','dst']).agg({'time_ms':'sum'}).reset_index()
            print('relation retrieved successfully')
            relation_df['time_s'] = relation_df['time_ms'] / 1000.
            rel_users = list(set(relation_df['src'].unique()).union(set(relation_df['dst'].unique())))
            edges = relation_df[['src', 'dst', 'time_s']].rename(columns = {'time_s':'weight'})

            # Keep only the users that appear in this relation and number them 0..n-1
            # in a leading `node_idx` column. Built without in-place mutation so the
            # filtered slice of `usersdata` is never modified.
            nodes = (
                usersdata[usersdata['userId'].isin(rel_users)]
                .reset_index(drop=True)
                .reset_index()
                .rename(columns = {'index':'node_idx'})
            )

            # Translate userId endpoints into node_idx endpoints.
            # NOTE(review): an endpoint whose userId is absent from usersdata gets a NaN
            # node_idx here -- verify that usersdata covers every src/dst.
            uid2idx = nodes[['node_idx', 'userId']].set_index('userId')
            edges_renumbered = edges.join(uid2idx, on = 'src').join(uid2idx, on = 'dst', rsuffix = '_dst').drop(columns = ['src', 'dst'])

            edgelist = edges_renumbered[['node_idx','node_idx_dst','weight']]

        print("edgelist created, creation of the graph...")
        graph = nx.from_pandas_edgelist(edgelist, 'node_idx', 'node_idx_dst', 'weight', create_using=nx.DiGraph())

        # Store ageGroup / timePassedValidation / spammer (= label) as node attributes.
        attributes = (
            nodes[['node_idx','ageGroup', 'timePassedValidation', 'label']]
            .set_index('node_idx')
            .rename(columns = {'label':'spammer'})
        )
        node_props = attributes.to_dict()
        for key in node_props:
            nx.set_node_attributes(graph, node_props[key], key)

        print("computing the pagerank of the relation {}".format(i))
        pr = nx.pagerank(graph)
        # Direct lookup per node instead of a row-wise DataFrame.apply; still raises
        # KeyError if a node of the table is missing from the graph.
        nodes[pagerank_col] = [float(pr[node_idx]) for node_idx in nodes['node_idx']]
        nodes.to_csv(output_nodes)
        nx.write_gexf(graph, output)
        print('graph successfully saved')

    usersdata = usersdata.merge(nodes[['userId', pagerank_col]], on = 'userId', how = 'left')
    # Users absent from this relation get a PageRank of 0. Only the new column is filled,
    # so missing values in the other columns are not silently turned into 0.
    usersdata[pagerank_col] = usersdata[pagerank_col].fillna(0)
    print("Successfull process for relation {}".format(i))

for relation 1...
retrieve existing nodes with pagerank...
Successfull process for relation 1
for relation 2...
retrieve existing nodes with pagerank...
Successfull process for relation 2
for relation 3...
retrieve existing nodes with pagerank...
Successfull process for relation 3
for relation 4...
retrieve existing nodes with pagerank...
Successfull process for relation 4
for relation 5...
retrieve existing nodes with pagerank...
Successfull process for relation 5
for relation 6...
retrieve existing nodes with pagerank...
Successfull process for relation 6
for relation 7...
relation retrieved successfully
edgelist created, creation of the graph...
computing the pagerank of the relation 7
graph successfully saved
Successfull process for relation 7


In [3]:
# Persist the user table, now carrying the pagerank_* columns, to disk.
usersdata.to_csv(path_or_buf='data/usersdata_with_pr.csv')

In [8]:
# Preview the first 20 users together with their per-relation pagerank columns.
usersdata.head(n=20)

Unnamed: 0,userId,sex,timePassedValidation,ageGroup,label,pagerank_1,pagerank_2,pagerank_3,pagerank_4,pagerank_5,pagerank_6,pagerank_7
0,1,M,0.9,30,0,0.0,0.0,5.218214e-07,3.841856e-07,7.47545e-07,7.674321e-07,0.0
1,2,F,1.0,20,0,1e-06,0.0,0.0,0.0,0.0,5.368759e-07,0.0
2,3,M,0.1375,30,0,0.0,4.663933e-07,4.241208e-07,0.0,0.0,0.0,0.0
3,4,M,0.3875,20,0,0.0,0.0,0.0,0.0,0.0,5.047452e-07,0.0
4,5,M,0.0125,20,0,0.0,0.0,3.326827e-07,0.0,0.0,0.0,0.0
5,6,M,0.7125,20,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,7,M,0.925,20,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,8,M,0.7875,20,0,0.0,0.0,4.299013e-07,0.0,0.0,0.0,0.0
8,9,M,0.075,20,0,0.0,0.0,3.326827e-07,0.0,0.0,0.0,0.0
9,10,M,0.2875,20,0,0.0,0.0,4.468528e-07,1.209221e-06,0.0,0.0,0.0


In [9]:
def check_null(x):
    """Tell whether a row has at least one non-zero pagerank value.

    Despite its name, this returns a truthy value when any of
    ``x['pagerank_1']`` .. ``x['pagerank_7']`` differs from 0, and a falsy
    value when all seven are equal to 0. Columns are inspected in order and
    the scan stops at the first non-zero one.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must be indexable by the keys ``'pagerank_1'`` .. ``'pagerank_7'``.
    """
    column_names = ('pagerank_' + str(k) for k in range(1, 8))
    return any(x[name] != 0 for name in column_names)

# Boolean mask with one entry per user: True when at least one of
# pagerank_1 .. pagerank_7 is non-zero. Evaluated column-wise on the seven
# pagerank columns instead of calling a Python function once per row.
pagerank_cols = ['pagerank_' + str(k) for k in range(1, 8)]
indices = usersdata[pagerank_cols].ne(0).any(axis = 1)

In [10]:
# Keep only the rows of usersdata selected by the boolean mask `indices`.
sub_users = usersdata.loc[indices]

In [13]:
# Share of rows with label == 1 inside sub_users, floored to an integer percent.
print(" {} % of users who have at least one non-zero pagerank value are spammers".format(
    100 * int((sub_users['label'] == 1).sum()) // len(sub_users)
))

 6 % of users who have at least one non-zero pagerank value are spammers


In [14]:
# Share of rows with label == 1 over the whole user table, floored to an integer percent.
print(" {} % of all users are spammers ".format(
    100 * int((usersdata['label'] == 1).sum()) // len(usersdata)
))

 6 % of all users are spammers 
