In [1]:
import pandas as pd
import numpy as np 

import sys 
sys.path.append('../')

## Read datasets 

In [54]:
# Columns of digg_friends.csv (names supplied below):
#   mutual      - indicates whether the link represents a mutual friend relation (1) or not (0)
#   friend_date - Unix time stamp of when the friendship link was created
#   user_id     - anonymized unique id of a user
#   friend_id   - anonymized unique id of a user
#
# Columns of digg_votes1.csv (names supplied below):
#   vote_date   - Unix time stamp of the vote
#   voter_id    - anonymized unique id of the voter
#   story_id    - anonymized unique id of the story

friends = pd.read_csv(
    '../../digg_friends.csv',
    names=['mutual', 'friend_date', 'user_id', 'friend_id'],
)
votes = pd.read_csv(
    '../../digg_votes1.csv',
    names=['vote_date', 'voter_id', 'story_id'],
)

## Friendship in both directions

# Split the links by their mutual flag.
mutual_0 = friends[friends['mutual'] == 0]
mutual_1 = friends[friends['mutual'] == 1]

# Reverse copy of every mutual link: user_id and friend_id trade places,
# all other columns are left as they are.
mutual_2 = mutual_1.assign(
    user_id=mutual_1['friend_id'],
    friend_id=mutual_1['user_id'],
)

# From here on `friends` is a plain two-column edge list: the one-way links,
# the mutual links, and the reversed mutual links stacked together.
friends = pd.concat([mutual_0, mutual_1, mutual_2])[['user_id', 'friend_id']]

In [55]:
# (rows, columns) of the friends and votes frames.
friends.shape, votes.shape

((2617993, 2), (3018197, 3))

## Filter users who voted on at least 20 stories

In [66]:
# Number of rows in `friends` per user_id / per friend_id, as plain dicts.
users_friends = dict(friends.groupby(['user_id']).size())
friends_users = dict(friends.groupby(['friend_id']).size())

# Users that appear as user_id in more than 20 and fewer than 400 rows.
top_users = [
    user
    for user, n_links in users_friends.items()
    if 20 < n_links < 400
]

# Earlier experiments, kept for reference:
# top_users = set(top_users.keys())

# filtered_friends = friends[friends['user_id'].isin(top_users) & friends['friend_id'].isin(top_users)]
# print(len(top_users))

# filtered_friends[ [ 'user_id', 'friend_id']  ]

len(top_users)

15762

In [30]:
# # total_users = top_followers
# # total_users = top_users
# total_users = list(set(list(top_users.keys()) + list(top_followers.keys())))

# filtered_friends = friends[(friends['user_id'].isin(total_users)) | (friends['friend_id'].isin(total_users))]

In [63]:
# Friendship rows whose user_id is one of top_users.
# NOTE(review): this renders the whole selection; .head() or .shape would give a lighter output.
friends[friends['user_id'].isin(top_users)]

Unnamed: 0,user_id,friend_id
53,336200,333248
58,336200,330595
60,336200,330208
62,336200,328387
63,336200,327579
...,...,...
1731583,186376,60
1731584,172882,60
1731592,66044,60
1731609,88708,40


In [64]:
# Scratch arithmetic: 860 squared.
# NOTE(review): presumably the number of ordered pairs among ~860 users — confirm or remove.
860 ** 2

739600

In [44]:
# Inspect the links whose mutual flag is 1.
mutual_1

Unnamed: 0,mutual,friend_date,user_id,friend_id
28,1,1209447167,221629,336215
41,1,1205461239,336191,336200
42,1,1202837780,336169,336200
43,1,1201143840,335438,336200
44,1,1199907894,335381,336200
...,...,...,...,...
1731592,1,1227115009,66044,60
1731609,1,1246298286,88708,40
1731616,1,1238812906,227703,33
1731642,1,1197470051,70780,10


## Save files 

In [80]:
# network_path: No header. SourceId	DestinationId
# NOTE(review): the call below DOES write a header row (user_id, friend_id),
# which disagrees with the "No header" note above — confirm what the consumer expects.
# NOTE(review): no active assignment to `filtered_friends` is visible before this
# cell (only commented-out ones) — confirm where it is defined.
filtered_friends[['user_id', 'friend_id']].to_csv(
    './output/digg_filtered_friends.txt',
    sep='\t',
    index=False,
    header=['user_id', 'friend_id'],
)

In [136]:
# Step 1: keep only the votes cast by the currently selected users.
filtered_votes = votes[votes['voter_id'].isin(top_users)]

# Step 2: keep stories that received more than 20 of those votes.
stories_count = filtered_votes.groupby(['story_id']).size()
top_stories = stories_count[stories_count > 20].index.tolist()
# The mask is built from filtered_votes itself. Building it from the full
# `votes` frame (as before) hands pandas a boolean Series with a different
# index, which it has to reindex and warns about.
filtered_votes = filtered_votes[filtered_votes['story_id'].isin(top_stories)]

# Step 3: among the remaining votes, keep voters with more than 20 votes.
# Note that this rebinds `top_users` to the narrower, vote-based selection.
users_counts = filtered_votes.groupby(['voter_id']).size()
top_users = users_counts[users_counts > 20].index.tolist()
filtered_votes = filtered_votes[filtered_votes['voter_id'].isin(top_users)]

filtered_votes.shape

  """
  # Remove the CWD from sys.path while we load stuff.


(197752, 3)

In [137]:
# Number of distinct voters left in filtered_votes.
filtered_votes['voter_id'].unique().shape 

(878,)

## Filter by stories having >100 votes

In [81]:
# stories = votes['story_id'].unique()
# Number of rows in the full votes table per story_id, as a {story_id: count} dict.
story_votes = dict(votes.groupby(['story_id']).size())

In [82]:

# Narrow story_votes down to the stories with more than 100 votes ...
story_votes = {
    story: n_votes
    for story, n_votes in story_votes.items()
    if n_votes > 100
}

# ... and keep their ids as a list.
top_stories = list(story_votes)

In [83]:
# Keep the votes whose story is in top_stories AND whose voter is in top_users.
filtered_votes = votes[votes['story_id'].isin(top_stories) & votes['voter_id'].isin(top_users)]

In [84]:
# Sizes of the current user and story selections.
len(top_users), len(top_stories)

(912, 3553)

In [85]:
# Write the filtered votes as a tab-separated file with a header row and no index column.
filtered_votes[['voter_id', 'story_id', 'vote_date']].to_csv(
    './output/digg_filtered_votes.txt',
    sep='\t',
    index=False,
    header=['voter_id', 'story_id', 'vote_date'],
)

In [86]:
# Show the filtered votes in the same column order that is written to disk.
filtered_votes [['voter_id', 'story_id', 'vote_date']]

Unnamed: 0,voter_id,story_id,vote_date
46,24916,1,1246564734
53,27241,1,1246625992
59,33217,1,1246605018
116,63501,1,1246572238
133,70227,1,1246565268
...,...,...,...
3017930,266031,3553,1243950185
3017942,269430,3553,1243820935
3018047,298705,3553,1243945457
3018099,309541,3553,1243955049



## Create network 



In [8]:
import networkx

# Directed friendship graph: every user in top_users becomes a node, and each
# row of filtered_friends whose user_id is such a user adds an edge
# user_id -> friend_id.
# NOTE(review): no active assignment to `filtered_friends` is visible before
# this cell (only commented-out ones) — confirm where it is defined.
G = networkx.DiGraph()

# Group the edge list once instead of re-scanning the whole frame per user.
friend_ids_by_user = {
    user: ids
    for user, ids in filtered_friends.groupby('user_id')['friend_id']
}

for u in top_users:
    # Explicit add_node keeps users without any outgoing link in the graph.
    G.add_node(u)
    # Deliberately NOT named `friends`: that name holds the friendship
    # DataFrame loaded at the top of the notebook and must not be overwritten.
    friend_ids = friend_ids_by_user.get(u, ())
    G.add_edges_from((u, f) for f in friend_ids)


# Load weights 

In [15]:
# filtered_friends[['user_id', 'friend_id']]

# Rows of filtered_friends whose user_id equals 1230024.
# NOTE(review): the recorded output under this cell lists user_id 95593 (row labels
# around 1230024), so the output looks stale relative to this filter — re-run to confirm.
filtered_friends[filtered_friends['user_id'] == 1230024]

Unnamed: 0,mutual,friend_date,user_id,friend_id
1229797,1,1238655881,95593,333418
1229798,0,1238325756,95593,332433
1229799,0,1239490392,95593,331224
1229800,0,1238325453,95593,330902
1229801,0,1238328061,95593,329292
...,...,...,...,...
1230109,0,1238328030,95593,3317
1230110,0,1238325396,95593,2550
1230111,1,1238324143,95593,1541
1230112,1,1238608436,95593,285


In [5]:
def prepare_weights(weight_file='./output/digg_with_weights.txt_InfProbs(10)', topic_col='Topic1', n_topics=10):
    """Load one topic's influence weights keyed by (influencer, influenced) pair.

    The file is read as tab-separated text. Its first line is skipped and the
    columns are named ``Influencer``, ``Influenced``, ``Topic1`` ...
    ``Topic<n_topics>`` in that order.

    Parameters
    ----------
    weight_file : str
        Path of the tab-separated weights file.
    topic_col : str
        Name of the topic column to extract, e.g. ``'Topic2'``.
    n_topics : int
        Number of topic columns in the file. Defaults to 10, the value that
        was previously hard-coded.

    Returns
    -------
    dict
        ``{(influencer, influenced): weight}`` taken from ``topic_col``. If a
        pair occurs on several rows, the last row wins.

    Raises
    ------
    KeyError
        If ``topic_col`` is not one of the column names.
    """
    topic_names = [f'Topic{i + 1}' for i in range(n_topics)]

    # index_col=False stops pandas from promoting the first column to the index.
    weights = pd.read_csv(weight_file, sep='\t',
                          names=['Influencer', 'Influenced'] + topic_names,
                          skiprows=1, index_col=False)

    pairs = zip(weights['Influencer'], weights['Influenced'])
    return dict(zip(pairs, weights[topic_col]))

# weight_dict = dict(zip(influenced, influenced), weights_)

In [9]:
# dict([(i, j) for i, j in zip(influencers, influenced)], weights_
# Weights from the 'Topic2' column of the default weights file, keyed by (influencer, influenced).
weights_2 = prepare_weights(topic_col='Topic2')

In [65]:
# NOTE(review): presumably this project-local import resolves through the
# sys.path.append('../') in the first cell — confirm.
from k_submodular.influence_maximization import weighted_network

# NOTE(review): the meaning of 'rn' is not visible in this notebook — presumably
# the name of a weighting scheme; confirm against the weighted_network module.
weighted_G = weighted_network.weighted_network(G, 'rn')

In [68]:
# weighted_G.edges()

In [54]:
# pd.unique(filtered_friends[ [ 'user_id', 'friend_id']  ].values.ravel('K')).shape