In [80]:
# General libraries
import pandas as pd
import numpy as np
from collections import defaultdict
import random
from random import randrange
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.naive_bayes import CategoricalNB

In [81]:
# dictionaries with the structure source -> array of followings
dictionary = defaultdict(list)
# simple datafram to store the sources and the count of source followings
data = pd.DataFrame([])

file = open('data/train.txt', 'r') 
lines = file.readlines() 
count = 0
for line in lines: 
    count = count + 1
    split_string = list(map(int,line.strip().split("\t")))
    dictionary[split_string[0]] = []
    if (len(split_string) > 1):
        dictionary[split_string[0]] = sorted(split_string[1:len(split_string)])
        
    data = data.append(pd.DataFrame({'Source': split_string[0], 'Source_Followings': len(dictionary[split_string[0]]) }, index=[0]), ignore_index=True)

In [82]:
def followingInCommon(node1,node2):  
    list1 = dictionary[node1]
    list2 = dictionary[node2]  
    common_elements = set(list1).intersection(list2) 
    return len(common_elements)
    
# Test    
followingInCommon(3849054,161276) 

1137

In [118]:
real_edges = pd.read_csv("model_data/real_edges.csv", sep='\t')
fake_edges = pd.read_csv("model_data/fake_edges.csv", sep='\t')

In [119]:
real_edges['Common_Followings'] = real_edges.apply(lambda x: followingInCommon(x['Source'], x['Sink']), axis=1)

In [120]:
fake_edges['Common_Followings'] = fake_edges.apply(lambda x: followingInCommon(x['Source'], x['Sink']), axis=1)

In [121]:
real_edges['Com_Followings_Ratio'] = real_edges['Common_Followings']/real_edges['Source_Followings']
fake_edges['Com_Followings_Ratio'] = fake_edges['Common_Followings']/fake_edges['Source_Followings']
real_edges['Followers_Ratio'] = real_edges['Sink_Followers']/real_edges['Source_Followers']
fake_edges['Followers_Ratio'] = fake_edges['Sink_Followers']/fake_edges['Source_Followers']

In [122]:
real_edges.head()

Unnamed: 0,Source,Sink,Source_Followings,Source_Followers,Sink_Followers,Common_Followings,Com_Followings_Ratio,Followers_Ratio
0,4156257,4504242,113,63,274,39,0.345133,4.349206
1,2960143,1466981,192,8,5,0,0.0,0.625
2,1719606,527267,278,101,327,20,0.071942,3.237624
3,531474,439041,89,57,79,23,0.258427,1.385965
4,786311,1663869,18,32,117,5,0.277778,3.65625


In [123]:
real_edges['Real'] = 1
fake_edges['Real'] = 0

In [15]:
real_edges.shape
fake_edges.shape

(2000, 6)

In [124]:
frames = [real_edges, fake_edges]
dataset = pd.concat(frames, ignore_index=True)

Unnamed: 0,Source,Sink,Source_Followings,Source_Followers,Sink_Followers,Common_Followings,Com_Followings_Ratio,Followers_Ratio,Real
0,3633734,1158765,43,9,1,0,0.0,0.111111,0
1,1360946,3628627,2556,224,323,0,0.0,1.441964,1
2,4389781,2052544,166,33,32,0,0.0,0.969697,1
3,4224077,2794818,1,4,17,0,0.0,4.250000,1
4,4755228,3997049,80,31,142,8,0.1,4.580645,1
...,...,...,...,...,...,...,...,...,...
3950,1359393,3731034,96,104,1,0,0.0,0.009615,0
3951,4726840,1960462,69,7,1,0,0.0,0.142857,0
3952,441999,3491941,16,62,1,0,0.0,0.016129,0
3953,2088561,3871055,908,20,2,0,0.0,0.100000,0


In [153]:
# Shuffle the data set
dataset = dataset.sample(frac=1).reset_index(drop=True)
dataset.tail()
dataset['Com_Followings_Ratio'] = dataset['Com_Followings_Ratio'].fillna(0)

Unnamed: 0,Source,Sink,Source_Followings,Source_Followers,Sink_Followers,Common_Followings,Com_Followings_Ratio,Followers_Ratio,Real


In [152]:
dataset.head()

Unnamed: 0,Source,Sink,Source_Followings,Source_Followers,Sink_Followers,Common_Followings,Com_Followings_Ratio,Followers_Ratio,Real
0,3246001,4748530,51,1,203,9,0.176471,203.0,1
1,3749479,1023607,552,163,115,0,0.0,0.705521,1
2,3142164,3986905,1180,51,64,0,0.0,1.254902,1
3,2015545,4002385,59,46,1,0,0.0,0.021739,0
4,168607,3367289,71,84,553,19,0.267606,6.583333,1


In [155]:
msk = np.random.rand(len(dataset)) < 0.8

Y = dataset['Real']
X = dataset.drop(columns=['Real'])

# Normalization
#X = (X-X.min())/(X.max()-X.min())

X_train = X[msk]
X_test = X[~msk]
Y_train = Y[msk]
Y_test = Y[~msk]

In [156]:
model = CategoricalNB()
model.fit(X_train, Y_train)
predict = model.predict(X_test)

In [157]:
# accuracy 
print (classification_report(Y_test, predict,digits = 6))
print (confusion_matrix(Y_test, predict))
print (accuracy_score(Y_test, predict))

              precision    recall  f1-score   support

           0   0.701431  0.888601  0.784000       386
           1   0.858553  0.641278  0.734177       407

    accuracy                       0.761665       793
   macro avg   0.779992  0.764939  0.759089       793
weighted avg   0.782072  0.761665  0.758429       793

[[343  43]
 [146 261]]
0.7616645649432535


In [158]:
predict_proba = model.predict_proba(X_test)
print(predict)

[0 0 0 0 1 0 0 1 0 1 1 1 1 0 1 0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0
 1 1 0 1 0 0 1 0 1 0 0 1 0 0 0 0 0 1 0 1 0 1 1 0 0 1 0 1 0 0 0 1 0 1 0 1 1
 0 1 0 1 0 1 1 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 1 0 1 1 1 1 1 0 0 1 0 1 1 1 0
 0 0 1 0 1 1 0 1 0 0 1 0 1 0 1 1 0 0 0 0 0 1 0 0 1 0 1 1 1 0 1 0 0 0 1 0 0
 1 0 0 1 1 0 1 0 0 0 1 1 1 0 0 1 0 0 0 0 0 1 1 0 1 1 1 0 0 0 0 0 1 0 0 0 1
 1 1 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 1 1 0 0 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0
 0 0 0 0 0 1 0 1 0 0 0 0 1 1 0 1 1 0 0 0 1 1 0 0 0 1 1 1 1 1 0 0 0 1 0 0 0
 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 0 0 0
 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 0 1
 1 0 1 1 1 1 1 1 0 0 1 1 0 1 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 1 0 0 1 0 1 1 0 0 0 1 1 0 0 0 1 0 0 0 1 0 1 0 1 0 0 0 1 1 0 0 0 0 1 0 1 0
 0 0 1 0 1 1 0 0 1 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 1 0 0 0 0
 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 0 0 0 1 1 1 1 0 0 0 0 1
 0 1 0 1 0 0 1 0 0 1 1 0 

In [105]:
print(predict_proba)

[[0.3144835  0.6855165 ]
 [0.81189087 0.18810913]
 [0.80468286 0.19531714]
 ...
 [0.02593366 0.97406634]
 [0.89618076 0.10381924]
 [0.22969683 0.77030317]]
