In [80]:
# General libraries
import pandas as pd
import numpy as np
from collections import defaultdict
import random
from random import randrange
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.naive_bayes import CategoricalNB

In [81]:
# dictionaries with the structure source -> array of followings
dictionary = defaultdict(list)
# simple datafram to store the sources and the count of source followings
data = pd.DataFrame([])

file = open('data/train.txt', 'r') 
lines = file.readlines() 
count = 0
for line in lines: 
    count = count + 1
    split_string = list(map(int,line.strip().split("\t")))
    dictionary[split_string[0]] = []
    if (len(split_string) > 1):
        dictionary[split_string[0]] = sorted(split_string[1:len(split_string)])
        
    data = data.append(pd.DataFrame({'Source': split_string[0], 'Source_Followings': len(dictionary[split_string[0]]) }, index=[0]), ignore_index=True)

In [82]:
def followingInCommon(node1,node2):  
    list1 = dictionary[node1]
    list2 = dictionary[node2]  
    common_elements = set(list1).intersection(list2) 
    return len(common_elements)
    
# Test    
followingInCommon(3849054,161276) 

1137

In [118]:
real_edges = pd.read_csv("model_data/real_edges.csv", sep='\t')
fake_edges = pd.read_csv("model_data/fake_edges.csv", sep='\t')

In [119]:
real_edges['Common_Followings'] = real_edges.apply(lambda x: followingInCommon(x['Source'], x['Sink']), axis=1)

In [120]:
fake_edges['Common_Followings'] = fake_edges.apply(lambda x: followingInCommon(x['Source'], x['Sink']), axis=1)

In [121]:
real_edges['Com_Followings_Ratio'] = real_edges['Common_Followings']/real_edges['Source_Followings']
fake_edges['Com_Followings_Ratio'] = fake_edges['Common_Followings']/fake_edges['Source_Followings']
real_edges['Followers_Ratio'] = real_edges['Sink_Followers']/real_edges['Source_Followers']
fake_edges['Followers_Ratio'] = fake_edges['Sink_Followers']/fake_edges['Source_Followers']

In [122]:
real_edges.head()

Unnamed: 0,Source,Sink,Source_Followings,Source_Followers,Sink_Followers,Common_Followings,Com_Followings_Ratio,Followers_Ratio
0,4156257,4504242,113,63,274,39,0.345133,4.349206
1,2960143,1466981,192,8,5,0,0.0,0.625
2,1719606,527267,278,101,327,20,0.071942,3.237624
3,531474,439041,89,57,79,23,0.258427,1.385965
4,786311,1663869,18,32,117,5,0.277778,3.65625


In [123]:
real_edges['Real'] = 1
fake_edges['Real'] = 0

In [15]:
real_edges.shape
fake_edges.shape

(2000, 6)

In [124]:
frames = [real_edges, fake_edges]
dataset = pd.concat(frames, ignore_index=True)

In [153]:
# Shuffle the data set
dataset = dataset.sample(frac=1).reset_index(drop=True)
dataset.tail()
dataset['Com_Followings_Ratio'] = dataset['Com_Followings_Ratio'].fillna(0)

Unnamed: 0,Source,Sink,Source_Followings,Source_Followers,Sink_Followers,Common_Followings,Com_Followings_Ratio,Followers_Ratio,Real


In [152]:
dataset.head()

Unnamed: 0,Source,Sink,Source_Followings,Source_Followers,Sink_Followers,Common_Followings,Com_Followings_Ratio,Followers_Ratio,Real
0,3246001,4748530,51,1,203,9,0.176471,203.0,1
1,3749479,1023607,552,163,115,0,0.0,0.705521,1
2,3142164,3986905,1180,51,64,0,0.0,1.254902,1
3,2015545,4002385,59,46,1,0,0.0,0.021739,0
4,168607,3367289,71,84,553,19,0.267606,6.583333,1


In [155]:
msk = np.random.rand(len(dataset)) < 0.8

Y = dataset['Real']
X = dataset.drop(columns=['Real'])

# Normalization
#X = (X-X.min())/(X.max()-X.min())

X_train = X[msk]
X_test = X[~msk]
Y_train = Y[msk]
Y_test = Y[~msk]

In [156]:
model = CategoricalNB()
model.fit(X_train, Y_train)
predict = model.predict(X_test)

In [157]:
# accuracy 
print (classification_report(Y_test, predict,digits = 6))
print (confusion_matrix(Y_test, predict))
print (accuracy_score(Y_test, predict))

              precision    recall  f1-score   support

           0   0.701431  0.888601  0.784000       386
           1   0.858553  0.641278  0.734177       407

    accuracy                       0.761665       793
   macro avg   0.779992  0.764939  0.759089       793
weighted avg   0.782072  0.761665  0.758429       793

[[343  43]
 [146 261]]
0.7616645649432535


In [158]:
predict_proba = model.predict_proba(X_test)
print(predict)

[0 0 0 0 1 0 0 1 0 1 1 1 1 0 1 0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0
 1 1 0 1 0 0 1 0 1 0 0 1 0 0 0 0 0 1 0 1 0 1 1 0 0 1 0 1 0 0 0 1 0 1 0 1 1
 0 1 0 1 0 1 1 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 1 0 1 1 1 1 1 0 0 1 0 1 1 1 0
 0 0 1 0 1 1 0 1 0 0 1 0 1 0 1 1 0 0 0 0 0 1 0 0 1 0 1 1 1 0 1 0 0 0 1 0 0
 1 0 0 1 1 0 1 0 0 0 1 1 1 0 0 1 0 0 0 0 0 1 1 0 1 1 1 0 0 0 0 0 1 0 0 0 1
 1 1 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 1 1 0 0 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0
 0 0 0 0 0 1 0 1 0 0 0 0 1 1 0 1 1 0 0 0 1 1 0 0 0 1 1 1 1 1 0 0 0 1 0 0 0
 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 0 0 0
 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 0 1
 1 0 1 1 1 1 1 1 0 0 1 1 0 1 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 1 0 0 1 0 1 1 0 0 0 1 1 0 0 0 1 0 0 0 1 0 1 0 1 0 0 0 1 1 0 0 0 0 1 0 1 0
 0 0 1 0 1 1 0 0 1 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 1 0 0 0 0
 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 0 0 0 1 1 1 1 0 0 0 0 1
 0 1 0 1 0 0 1 0 0 1 1 0 

In [105]:
print(predict_proba)

[[0.3144835  0.6855165 ]
 [0.81189087 0.18810913]
 [0.80468286 0.19531714]
 ...
 [0.02593366 0.97406634]
 [0.89618076 0.10381924]
 [0.22969683 0.77030317]]


In [163]:
# Libraries for developing a Neural Network
import tensorflow as tf 
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.utils import to_categorical 


In [172]:
msk = np.random.rand(len(dataset)) < 0.8

Y = dataset['Real']
X = dataset.drop(columns=['Real'])

# Normalization
X = (X-X.min())/(X.max()-X.min())

X_train = X[msk]
X_test = X[~msk]
Y_train = Y[msk]
Y_test = Y[~msk]

In [175]:
# Model arguments
args = dict(x = X_train,
            y = Y_train,
            epochs=40,
            validation_split=0.2,
            verbose=2,
            shuffle=True)

# Layer definition
input_layer = Input(shape=(8,))
hidden_layer_1 = Dense(8, activation='relu',activity_regularizer=regularizers.l1(10e-5))(input_layer)
#hidden_layer_1 = Dropout(0.3)(hidden_layer_1)
#hidden_layer_2 = Dense(8, activation='sigmoid')(hidden_layer_1)
hidden_layer_2 = Dense(8, activation='relu')(hidden_layer_1)
#hidden_layer_2 = Dropout(0.3)(hidden_layer_2)
output_layer = Dense(2, activation='softmax')(hidden_layer_2)
model = Model(inputs=input_layer, outputs=output_layer)

# Model set up
model.compile(tf.keras.optimizers.RMSprop(learning_rate=0.01),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

Model: "functional_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         [(None, 8)]               0         
_________________________________________________________________
dense_21 (Dense)             (None, 8)                 72        
_________________________________________________________________
dense_22 (Dense)             (None, 8)                 72        
_________________________________________________________________
dense_23 (Dense)             (None, 2)                 18        
Total params: 162
Trainable params: 162
Non-trainable params: 0
_________________________________________________________________


In [176]:
# Execute training
model.fit(**args)

Epoch 1/100
81/81 - 0s - loss: 0.6444 - accuracy: 0.6113 - val_loss: 0.7013 - val_accuracy: 0.5319
Epoch 2/100
81/81 - 0s - loss: 0.5836 - accuracy: 0.6661 - val_loss: 0.6478 - val_accuracy: 0.5319
Epoch 3/100
81/81 - 0s - loss: 0.5574 - accuracy: 0.6918 - val_loss: 0.5655 - val_accuracy: 0.5319
Epoch 4/100
81/81 - 0s - loss: 0.5480 - accuracy: 0.6992 - val_loss: 0.5208 - val_accuracy: 0.7621
Epoch 5/100
81/81 - 0s - loss: 0.5352 - accuracy: 0.7144 - val_loss: 0.5507 - val_accuracy: 0.7014
Epoch 6/100
81/81 - 0s - loss: 0.5328 - accuracy: 0.7222 - val_loss: 0.5026 - val_accuracy: 0.7434
Epoch 7/100
81/81 - 0s - loss: 0.5283 - accuracy: 0.7253 - val_loss: 0.5462 - val_accuracy: 0.6470
Epoch 8/100
81/81 - 0s - loss: 0.5189 - accuracy: 0.7288 - val_loss: 0.5057 - val_accuracy: 0.7589
Epoch 9/100
81/81 - 0s - loss: 0.5246 - accuracy: 0.7397 - val_loss: 0.5144 - val_accuracy: 0.7372
Epoch 10/100
81/81 - 0s - loss: 0.5101 - accuracy: 0.7370 - val_loss: 0.4689 - val_accuracy: 0.7714
Epoch 11/

81/81 - 0s - loss: 0.4438 - accuracy: 0.7965 - val_loss: 0.4007 - val_accuracy: 0.8336
Epoch 84/100
81/81 - 0s - loss: 0.4388 - accuracy: 0.8016 - val_loss: 0.3891 - val_accuracy: 0.8383
Epoch 85/100
81/81 - 0s - loss: 0.4351 - accuracy: 0.8039 - val_loss: 0.3938 - val_accuracy: 0.8320
Epoch 86/100
81/81 - 0s - loss: 0.4394 - accuracy: 0.8027 - val_loss: 0.3908 - val_accuracy: 0.8414
Epoch 87/100
81/81 - 0s - loss: 0.4401 - accuracy: 0.8062 - val_loss: 0.4328 - val_accuracy: 0.8149
Epoch 88/100
81/81 - 0s - loss: 0.4392 - accuracy: 0.8023 - val_loss: 0.4261 - val_accuracy: 0.8134
Epoch 89/100
81/81 - 0s - loss: 0.4390 - accuracy: 0.8051 - val_loss: 0.4201 - val_accuracy: 0.8351
Epoch 90/100
81/81 - 0s - loss: 0.4375 - accuracy: 0.8051 - val_loss: 0.3980 - val_accuracy: 0.8336
Epoch 91/100
81/81 - 0s - loss: 0.4393 - accuracy: 0.7965 - val_loss: 0.3980 - val_accuracy: 0.8274
Epoch 92/100
81/81 - 0s - loss: 0.4390 - accuracy: 0.8019 - val_loss: 0.3953 - val_accuracy: 0.8305
Epoch 93/100


<tensorflow.python.keras.callbacks.History at 0x16cf78b80>

In [179]:
predict = model.predict(X_test)
classes = predict.argmax(axis=-1)

In [180]:
print(classes)

[0 0 1 1 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 1 0 1 0 1 0 1 0 1 0 0 1 1 0 1 0 0 0
 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0
 1 0 0 1 0 0 1 1 0 1 1 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 1 0 1 1 1
 0 0 1 0 1 0 1 1 0 1 1 1 0 0 0 1 0 0 0 0 1 1 0 0 1 0 0 1 0 0 0 1 0 1 0 1 1
 1 1 1 1 0 0 0 0 0 1 1 0 1 0 0 1 0 0 0 1 0 0 1 0 0 1 1 0 1 0 1 0 0 1 1 0 1
 0 0 0 0 0 1 0 0 0 1 1 0 1 1 1 0 0 1 1 0 0 1 1 0 0 1 0 0 1 1 0 1 1 0 0 0 1
 1 0 1 0 1 1 1 0 0 1 0 0 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 1
 1 1 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0 0 1 1 1 0 0 0 0 0 0 0
 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 0 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0
 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 0 1 0 0 1 1 0 0 0 1 1 1
 0 0 0 1 1 0 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1
 0 1 0 1 1 1 0 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 1 0 1
 1 1 0 1 0 1 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 1 0 0 0 0 1 1 0 0 

In [181]:
print('Accuracy: ' + "{:.4f}".format(accuracy_score(Y_test, classes)))


Accuracy: 0.8288


In [195]:
test_data = pd.read_csv("model_data/test_data.csv", sep='\t')

In [183]:
test_data.head()

Unnamed: 0,Source,Sink,Source_Followings,Source_Followers,Sink_Followers
0,3563811,3600160,21,3,29
1,2052043,1401960,71,13,9
2,4517994,1690636,205,80,17
3,1660006,4349447,506,32,36
4,581111,1882617,18,5,46


In [196]:
test_data['Common_Followings'] = test_data.apply(lambda x: followingInCommon(x['Source'], x['Sink']), axis=1)

In [197]:
test_data['Com_Followings_Ratio'] = test_data['Common_Followings']/test_data['Source_Followings']
test_data['Followers_Ratio'] = test_data['Sink_Followers']/test_data['Source_Followers']
test_data['Com_Followings_Ratio'] = test_data['Com_Followings_Ratio'].fillna(0)

In [198]:
test_data.tail()

Unnamed: 0,Source,Sink,Source_Followings,Source_Followers,Sink_Followers,Common_Followings,Com_Followings_Ratio,Followers_Ratio
1995,1461386,2341683,53,16,2,0,0.0,0.125
1996,4057755,1871227,95,53,41,0,0.0,0.773585
1997,4242514,1413468,27,6,2,0,0.0,0.333333
1998,555531,1290080,56,7,3,0,0.0,0.428571
1999,1707829,2373045,244,10,2,0,0.0,0.2


In [191]:

# Normalization
test_data = (test_data-test_data.min())/(test_data.max()-test_data.min())
test_data.head()

Unnamed: 0,Source,Sink,Source_Followings,Source_Followers,Sink_Followers,Common_Followings,Com_Followings_Ratio,Followers_Ratio
0,0.732302,0.740418,5.2e-05,0.00099,0.005785,0.0,0.0,0.003628
1,0.421583,0.287932,0.000182,0.005938,0.001653,0.0,0.0,0.00026
2,0.928418,0.347355,0.000532,0.03909,0.003306,0.0,0.0,8e-05
3,0.341006,0.894654,0.001316,0.015339,0.007231,0.0,0.0,0.000422
4,0.119257,0.386873,4.4e-05,0.001979,0.009298,0.0,0.0,0.003453


In [192]:
predict_test = model.predict(test_data)
classes = predict_test.argmax(axis=-1)

In [193]:
predict_test

array([[0.2443884 , 0.7556116 ],
       [0.5160492 , 0.48395076],
       [0.24332331, 0.7566766 ],
       ...,
       [0.6824082 , 0.31759185],
       [0.74405366, 0.25594643],
       [0.7659686 , 0.23403138]], dtype=float32)

In [194]:
classes

array([1, 0, 1, ..., 0, 0, 0])

In [201]:
prediction = pd.DataFrame([])
i = 0
for row in predict_test:
    i += 1
    prediction = prediction.append(pd.DataFrame({'Id': i, 'Predicted': row[1]},
                                                index=[0]), ignore_index=True) 

In [202]:
prediction.head()

Unnamed: 0,Id,Predicted
0,1,0.755612
1,2,0.483951
2,3,0.756677
3,4,0.818148
4,5,0.796943


In [205]:
prediction.to_csv("predictions/prediction_2020-09-10.csv", sep=',', index=False)