In [255]:
# General libraries
import pandas as pd
import numpy as np
from collections import defaultdict
import random
from random import randrange
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Libraries for developing a Neural Network
import tensorflow as tf 
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.utils import to_categorical 

from sklearn.metrics import roc_auc_score


In [299]:
dataset = pd.read_csv("../data_models/dataset4.csv", sep='\t')

In [300]:
# Shuffle the data set
dataset = dataset.sample(frac=1).reset_index(drop=True)
# Filling NaN values with zeros. Not sure if it is correct.
dataset['Com_Followings_Ratio'] = dataset['Com_Followings_Ratio'].fillna(0)
dataset = dataset.replace([np.inf],1000)

In [301]:
dataset.isna().any()

Source                     False
Sink                       False
Source_Followings          False
Source_Followers           False
Sink_Followers             False
Distance                   False
Real                       False
Common_Followings          False
Com_Followings_Ratio       False
Followers_Ratio            False
Indirect_Followings        False
Inv_Indirect_Followings    False
dtype: bool

In [303]:
dataset.isin([np.inf]).any()

Source                     False
Sink                       False
Source_Followings          False
Source_Followers           False
Sink_Followers             False
Distance                   False
Real                       False
Common_Followings          False
Com_Followings_Ratio       False
Followers_Ratio            False
Indirect_Followings        False
Inv_Indirect_Followings    False
dtype: bool

In [358]:
msk = np.random.rand(len(dataset)) < 0.7

Y = dataset['Real']
X = dataset.drop(columns=['Real','Sink','Source'])

# Normalization
X = (X-X.min())/(X.max()-X.min())

X_train = X[msk]
X_test = X[~msk]
Y_train = Y[msk]
Y_test = Y[~msk]

In [110]:
def auroc(y_true, y_pred):
    return tf.py_function(roc_auc_score, (y_true, y_pred), tf.double)

In [359]:
# Model arguments
args = dict(x = X_train,
            y = Y_train,
            epochs=15,
            validation_split=0.3,
            verbose=2,
            shuffle=True)

# Layer definition
input_layer = Input(shape=(9,))
hidden_layer_1 = Dense(100, activation='relu',activity_regularizer=regularizers.l1(10e-5))(input_layer)

#hidden_layer_1 = Dropout(0.3)(hidden_layer_1)
#hidden_layer_2 = Dense(8, activation='sigmoid')(hidden_layer_1)
hidden_layer_2 = Dense(20, activation='relu')(hidden_layer_1)
#hidden_layer_2 = Dropout(0.3)(hidden_layer_2)
output_layer = Dense(1, activation='sigmoid')(hidden_layer_2)
model = Model(inputs=input_layer, outputs=output_layer)

# Model set up
model.compile(tf.keras.optimizers.RMSprop(learning_rate=0.01),
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()
#model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy', auroc])

Model: "functional_135"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_69 (InputLayer)        [(None, 9)]               0         
_________________________________________________________________
dense_204 (Dense)            (None, 100)               1000      
_________________________________________________________________
dense_205 (Dense)            (None, 20)                2020      
_________________________________________________________________
dense_206 (Dense)            (None, 1)                 21        
Total params: 3,041
Trainable params: 3,041
Non-trainable params: 0
_________________________________________________________________


In [360]:
# Execute training
model.fit(**args)

Epoch 1/15
122/122 - 0s - loss: 0.6290 - accuracy: 0.6132 - val_loss: 0.5773 - val_accuracy: 0.6621
Epoch 2/15
122/122 - 0s - loss: 0.5637 - accuracy: 0.6971 - val_loss: 0.4966 - val_accuracy: 0.8643
Epoch 3/15
122/122 - 0s - loss: 0.4945 - accuracy: 0.7584 - val_loss: 0.6273 - val_accuracy: 0.6801
Epoch 4/15
122/122 - 0s - loss: 0.4394 - accuracy: 0.7936 - val_loss: 0.5931 - val_accuracy: 0.6939
Epoch 5/15
122/122 - 0s - loss: 0.3749 - accuracy: 0.8425 - val_loss: 0.6750 - val_accuracy: 0.6933
Epoch 6/15
122/122 - 0s - loss: 0.3304 - accuracy: 0.8484 - val_loss: 0.2509 - val_accuracy: 0.9022
Epoch 7/15
122/122 - 0s - loss: 0.3073 - accuracy: 0.8654 - val_loss: 0.3210 - val_accuracy: 0.8962
Epoch 8/15
122/122 - 0s - loss: 0.2688 - accuracy: 0.8827 - val_loss: 0.1844 - val_accuracy: 0.9262
Epoch 9/15
122/122 - 0s - loss: 0.2509 - accuracy: 0.8904 - val_loss: 0.1702 - val_accuracy: 0.9448
Epoch 10/15
122/122 - 0s - loss: 0.2355 - accuracy: 0.9048 - val_loss: 0.1528 - val_accuracy: 0.9484

<tensorflow.python.keras.callbacks.History at 0x14475ad90>

In [361]:
# Prediction for test
predict = model.predict(X_test)
print(predict)

[[9.9723387e-01]
 [9.9762261e-01]
 [9.9810576e-01]
 ...
 [9.9933612e-01]
 [9.9783373e-01]
 [3.1158328e-04]]


In [362]:
classes = predict
#classes = predict.argmax(-1)
classes[classes <= 0.5] = 0
classes[classes > 0.5] = 1
print(classes)

[[1.]
 [1.]
 [1.]
 ...
 [1.]
 [1.]
 [0.]]


In [363]:
print('Accuracy: ' + "{:.4f}".format(accuracy_score(Y_test, classes)))
print('AUC: ' + "{:.4f}".format(roc_auc_score(Y_test, classes)))

Accuracy: 0.9759
AUC: 0.9756


In [364]:
# accuracy 
print (classification_report(Y_test, predict,digits = 6))
print (confusion_matrix(Y_test, predict))
print (accuracy_score(Y_test, predict))

              precision    recall  f1-score   support

           0   0.999117  0.952020  0.975000      1188
           1   0.955153  0.999177  0.976669      1215

    accuracy                       0.975864      2403
   macro avg   0.977135  0.975599  0.975835      2403
weighted avg   0.976888  0.975864  0.975844      2403

[[1131   57]
 [   1 1214]]
0.9758635039533916


In [365]:
# Reading test data
test_data = pd.read_csv("../data_generated/test_data.csv", sep='\t')
test_data['Com_Followings_Ratio'] = test_data['Com_Followings_Ratio'].fillna(0)
test_data = test_data.replace([np.inf],1000)
# Normalization
test_data = (test_data-test_data.min())/(test_data.max()-test_data.min())
test_data = test_data.drop(columns=['Sink','Source'])

In [366]:
predict_test = model.predict(test_data)
predict_test 

array([[4.8768520e-04],
       [4.2834640e-01],
       [9.9612075e-01],
       ...,
       [6.3708025e-01],
       [6.0647166e-01],
       [5.7148761e-01]], dtype=float32)

In [367]:
prediction = pd.DataFrame([])
i = 0
for row in predict_test:
    i += 1
    prediction = prediction.append(pd.DataFrame({'Id': i, 'Predicted': row[0]},
                                                index=[0]), ignore_index=True) 
prediction.to_csv("../predictions/prediction_2020-09-14-2.csv", sep=',', index=False,float_format='%.8f')