In [None]:
# General libraries
import pandas as pd
import numpy as np
from collections import defaultdict
import random
from random import randrange
import math
from queue import PriorityQueue

In [None]:
# dictionaries with the structure source -> array of followings
dictionary = defaultdict(list)
# simple datafram to store the sources and the count of source followings
data = pd.DataFrame([])

file = open('data/train.txt', 'r') 
lines = file.readlines() 
count = 0
for line in lines: 
    count = count + 1
    split_string = list(map(int,line.strip().split("\t")))
    dictionary[split_string[0]] = []
    if (len(split_string) >1):
        dictionary[split_string[0]] = sorted(split_string[1:len(split_string)])
        
    data = data.append(pd.DataFrame({'Source': split_string[0], 'Source_Followings': len(dictionary[split_string[0]]) }, index=[0]), ignore_index=True)


In [None]:
data.tail(10)

In [None]:
# function to calculate followers
def calculateFollowers(data,node):
    followers = 0
    for j in range(0,19999):
        if node in dictionary[data.at[j,'Source']]:
            followers = followers + 1
    
    return followers

# Second version: similar performance
# function to calculate followers
def calculateFollowers2(data,node):
    followers = 0
    for j in range(0,19999):
        common_elements = set(dictionary[data.at[j,'Source']]).intersection([node]) 
        followers = followers + len(common_elements)
    
    return followers

In [None]:
calculateFollowers(data,4778890)

In [None]:
# First version
# Random selection of 2000 real edges
real_edges = pd.DataFrame([])
i = 0
while i < 2000:
    if i % 100 == 0:
        print(i)
    x = randrange(0, 19999)
    source = data.at[x,'Source']
    source_followings = data.at[x,'Source_Followings']
    if (source_followings > 0):
        i += 1
        y = random.randint(0, source_followings-1)
        sink = dictionary[source][y]
        source_followers = calculateFollowers(data,source)
        sink_followers = calculateFollowers(data,sink)
        real_edges = real_edges.append(pd.DataFrame({'Source': source,'Sink': sink,
                                                     'Source_Followings': source_followings,
                                                     'Source_Followers': source_followers,
                                                     'Sink_Followers': sink_followers},
                                                    index=[0]), ignore_index=True)

In [None]:
real_edges.head(10)

In [None]:
# Saving real edges
real_edges.to_csv("model_data/real_edges.csv", sep='\t', index=False)

In [None]:
# First version
# random creation of 4000 fake edges

fake_edges = pd.DataFrame([])
i = 0
while i < 4000:
    if i % 100 == 0:
        print(i)
    
    x = randrange(0, 19999)
    source = data.at[x,'Source']
    source_followings = data.at[x,'Source_Followings']
    
    y = randrange(0, 19999)
    source_gen = data.at[y,'Source']
    source_gen_followings = data.at[y,'Source_Followings']
    
    if (source_gen_followings > 0):
        z = random.randint(0, source_gen_followings-1)
        sink = dictionary[source_gen][z]

        # It is not real, is added
        if sink not in dictionary[source]:
            i += 1
            source_followers = calculateFollowers(data,source)
            sink_followers = calculateFollowers(data,sink)
            fake_edges = fake_edges.append(pd.DataFrame({'Source': source,'Sink': sink,
                                                         'Source_Followings': source_followings,
                                                         'Source_Followers': source_followers,
                                                         'Sink_Followers': sink_followers},
                                                        index=[0]), ignore_index=True)

In [None]:
fake_edges.head()

In [None]:
fake_edges.to_csv("data_generated/fake_edges3.csv", sep='\t', index=False)

In [None]:
# Reading test-public data
test_public = pd.read_csv('data/test-public.txt', sep='\t')

# Just for exploration: number of sinks that are source in train file.
count = 0
for index, row in test_public.iterrows():
    if ( (data['Source'] == row['Sink']).any() ):
        count = count + 1

print(count)   

In [None]:
# calculation of features for test data
test_data = pd.DataFrame([])
for index, row in test_public.iterrows():
    if index % 100 == 0:
        print(index)
    source = row['Source']
    source_followings = data.loc[data['Source'] == source, 'Source_Followings'].iloc[0]
    sink = row['Sink']

    source_followers = calculateFollowers(data,source)
    sink_followers = calculateFollowers(data,sink)




    test_data = test_data.append(pd.DataFrame({'Id': row['Id'], 'Source': source,'Sink': sink,
                                                 'Source_Followings': source_followings,
                                                 'Source_Followers': source_followers,
                                                 'Sink_Followers': sink_followers},
                                                index=[0]), ignore_index=True) 

In [None]:
test_data.head(10)

In [None]:
for i in range (0,10):
    jump = np.random.rand() < 0.1
    if jump:
        print(jump)

        

In [None]:
# Second version
# Random walk with jump (probability 0.1) for creating 4000 real edges
jump_probability = 0.1
real_edges = pd.DataFrame(columns=['Source','Sink','Source_Followings','Source_Followers','Sink_Followers'])
#seed
source = 540762
changeSource = False

i = 0
while i < 4000:
    if i % 100 == 0:
        print(i)
        
    if (data.loc[(data['Source'] == source)].any().values[0]):
        source_followings = data.loc[data['Source'] == source, 'Source_Followings'].values[0]
        if (source_followings > 0):
            changeSource = False
            y = random.randint(0, source_followings-1)
            sink = dictionary[source][y]
            # it is checked if the pair is not already added
            if not(real_edges.loc[(real_edges['Source'] == source) & (real_edges['Sink'] == sink)].any().values[0]):
                i += 1
                source_followers = calculateFollowers(data,source)
                sink_followers = calculateFollowers(data,sink)
                real_edges = real_edges.append(pd.DataFrame({'Source': source,'Sink': sink,
                                                             'Source_Followings': source_followings,
                                                             'Source_Followers': source_followers,
                                                             'Sink_Followers': sink_followers},
                                                           index=[0]), ignore_index=True)
                source = sink
    else:
        changeSource = True
      
    jump = np.random.rand() < jump_probability
    if jump or changeSource:
        x = randrange(0, 19999)
        source = data.at[x,'Source'] 
            

In [None]:
real_edges.head(20)

In [None]:
real_edges.shape

In [None]:
real_edges.to_csv("data_generated/real_edges3.csv", sep='\t', index=False)

In [27]:
# Second version 
# random creation of 4000 fake edges
jump_probability = 0.5
nextNode = True
jump = True

fake_edges = pd.DataFrame([])
i = 0
while i < 4000:
    
    if i % 100 == 0:
        print(i)
        
    changeSource = False
    jump = True
    x = randrange(0, 19999)
    source = data.at[x,'Source']
    source_followings = data.loc[data['Source'] == source, 'Source_Followings'].values[0]

    if (source_followings > 0): 
        y = random.randint(0, source_followings-1)
        next_source = dictionary[source][y]
 
        while jump and not changeSource:
            
            if (data.loc[(data['Source'] == next_source)].any().values[0]):
                next_source_followings = data.loc[data['Source'] == next_source, 'Source_Followings'].values[0]
            else:
                next_source_followings = 0
                changeSource = True
                
            if (next_source_followings > 0):
  
                z = random.randint(0, next_source_followings-1)
                sink = dictionary[next_source][z]
                jump = np.random.rand() < jump_probability

                if jump:
                    next_source = sink 
                else:
                    changeSource = True
                    # It is not real, it is added
                    if sink not in dictionary[source]:
                        i += 1
                        source_followers = calculateFollowers(data,source)
                        sink_followers = calculateFollowers(data,sink)
                        fake_edges = fake_edges.append(pd.DataFrame({'Source': source,'Sink': sink,
                                                                     'Source_Followings': source_followings,
                                                                     'Source_Followers': source_followers,
                                                                     'Sink_Followers': sink_followers},
                                                                    index=[0]), ignore_index=True) 

            else:
                changeSource = True
                    
        

0
0
0
0
0
0
0
100
100
100
100
200
200
200
200
200
200
300
300
300
300
300
300
300
300
400
400
400
400
500
500
500
500
500
600
600
700
800
800
800
800
800
800
800
800
800
900
900
900
900
900
900
900
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1000
1100
1100
1200
1300
1300
1300
1300
1300
1300
1400
1400
1400
1400
1500
1500
1500
1500
1500
1500
1600
1600
1600
1600
1600
1600
1600
1700
1700
1800
1800
1800
1800
1800
1800
1800
1800
1800
1900
1900
1900
1900
1900
1900
1900
1900
1900
1900
1900
1900
1900
1900
2000
2000
2100
2100
2100
2200
2200
2200
2200
2200
2200
2200
2200
2200
2200
2200
2300
2400
2400
2400
2500
2600
2600
2600
2600
2600
2600
2600
2700
2700
2700
2800
2800
2900
2900
2900
2900
2900
2900
3000
3000
3000
3000
3000
3000
3000
3000
3000
3000
3100
3100
3100
3100
3100
3100
3100
3100
3100
3100
3100
3200
3300
3300
3300
3300
3300
3300
3300
3300
3300
3300
3300
3400
3400
3400
3400
3400
3500
3500
3500
3500
3500
3500
3500
3500
3500
3500
3500
3500
3500
3500
3500
3500
3500
3500
35

In [28]:
fake_edges.tail(10)

Unnamed: 0,Source,Sink,Source_Followings,Source_Followers,Sink_Followers
3990,4462656,431517,618,115,9
3991,3502031,1828651,31,11,296
3992,1178327,2891499,106,30,56
3993,2818687,4261974,337,30,200
3994,4496464,533396,193,105,5
3995,1261156,4634938,1944,153,18
3996,2233163,3513422,69,21,1
3997,3292844,4525782,3,4,33
3998,1321472,3334486,1770,68,2
3999,4087122,4657888,6,8,1789


In [29]:
fake_edges.to_csv("data_generated/fake_edges4.csv", sep='\t', index=False)

In [None]:
# This was part of the first exploration, maybe it is useful for the report
data.nlargest(50,'followers')
data.hist(column='Source_Followings',bins=100, range=(1,500))

# 