In [10]:
# General libraries
import pandas as pd
import numpy as np
from collections import defaultdict
import random
from random import randrange
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.naive_bayes import CategoricalNB

In [2]:
# dictionaries with the structure source -> array of followings
dictionary = defaultdict(list)
# simple datafram to store the sources and the count of source followings
data = pd.DataFrame([])

file = open('data/train.txt', 'r') 
lines = file.readlines() 
count = 0
for line in lines: 
    count = count + 1
    split_string = list(map(int,line.strip().split("\t")))
    dictionary[split_string[0]] = []
    if (len(split_string) > 1):
        dictionary[split_string[0]] = sorted(split_string[1:len(split_string)])
        
    data = data.append(pd.DataFrame({'Source': split_string[0], 'Source_Followings': len(dictionary[split_string[0]]) }, index=[0]), ignore_index=True)

In [57]:
def followingInCommon(node1,node2):  
    list1 = dictionary[node1]
    list2 = dictionary[node2]  
    common_elements = set(dictionary[node1]).intersection(dictionary[node2]) 
    return len(common_elements)
    
# Test    
followingInCommon(3849054,161276) 

1137

In [59]:
real_edges = pd.read_csv("model_data/real_edges.csv", sep='\t')
fake_edges = pd.read_csv("model_data/fake_edges.csv", sep='\t')

In [61]:
real_edges['Common_Followings'] = real_edges.apply(lambda x: followingInCommon(x['Source'], x['Sink']), axis=1)

In [63]:
fake_edges['Common_Followings'] = fake_edges.apply(lambda x: followingInCommon(x['Source'], x['Sink']), axis=1)

In [64]:
real_edges['Real'] = 1
fake_edges['Real'] = 0

In [15]:
real_edges.shape
fake_edges.shape

(2000, 6)

In [65]:
frames = [real_edges, fake_edges]
dataset = pd.concat(frames, ignore_index=True)

In [67]:
dataset.head(20)

Unnamed: 0,Source,Sink,Source_Followings,Source_Followers,Sink_Followers,Common_Followings,Real
0,4156257,4504242,113,63,274,39,1
1,2960143,1466981,192,8,5,0,1
2,1719606,527267,278,101,327,20,1
3,531474,439041,89,57,79,23,1
4,786311,1663869,18,32,117,5,1
5,1158858,2136428,146,34,5,0,1
6,2670101,4037770,209,37,3,0,1
7,1222505,3020366,186,17,1,0,1
8,401947,4571446,7890,367,26,0,1
9,842427,1337853,44,6,1795,4,1


In [76]:
# Shuffle the data set
dataset = dataset.sample(frac=1).reset_index(drop=True)
dataset.tail()

Unnamed: 0,Source,Sink,Source_Followings,Source_Followers,Sink_Followers,Common_Followings,Real
3950,1648492,2956553,24,19,610,10,1
3951,4390232,3001354,1949,113,77,0,1
3952,128346,3623947,62,21,1,0,0
3953,2314405,4562736,13,3,1,0,0
3954,4456233,4570442,105,7,2,0,1


In [77]:
msk = np.random.rand(len(dataset)) < 0.8

Y = dataset['Real']
X = dataset.drop(columns=['Real'])

# Normalization
#X = (X-X.min())/(X.max()-X.min())

X_train = X[msk]
X_test = X[~msk]
Y_train = Y[msk]
Y_test = Y[~msk]

In [78]:
model = CategoricalNB()
model.fit(X_train, Y_train)
predict = model.predict(X_test)

In [79]:
# accuracy 
print (classification_report(Y_test, predict,digits = 6))
print (confusion_matrix(Y_test, predict))
print (accuracy_score(Y_test, predict))

              precision    recall  f1-score   support

           0   0.740891  0.897059  0.811530       408
           1   0.860465  0.669251  0.752907       387

    accuracy                       0.786164       795
   macro avg   0.800678  0.783155  0.782218       795
weighted avg   0.799099  0.786164  0.782993       795

[[366  42]
 [128 259]]
0.7861635220125787
