In [425]:
# General libraries
import pandas as pd
import numpy as np
from collections import defaultdict
import random
from random import randrange
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Libraries for developing a Neural Network
import tensorflow as tf 
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.utils import to_categorical 

from sklearn.metrics import roc_auc_score


In [455]:
dataset = pd.read_csv("../data_models/dataset5.csv", sep='\t')

In [456]:
# Shuffle the data set
dataset = dataset.sample(frac=1).reset_index(drop=True)
# Filling NaN values with zeros. Not sure if it is correct.
dataset['Com_Followings_Ratio'] = dataset['Com_Followings_Ratio'].fillna(0)
dataset = dataset.replace([np.inf],1000)

In [457]:
dataset.isna().any()

Source                     False
Sink                       False
Source_Followings          False
Source_Followers           False
Sink_Followers             False
Distance                   False
Real                       False
Common_Followings          False
Com_Followings_Ratio       False
Followers_Ratio            False
Indirect_Followings        False
Inv_Indirect_Followings    False
dtype: bool

In [458]:
dataset.isin([np.inf]).any()

Source                     False
Sink                       False
Source_Followings          False
Source_Followers           False
Sink_Followers             False
Distance                   False
Real                       False
Common_Followings          False
Com_Followings_Ratio       False
Followers_Ratio            False
Indirect_Followings        False
Inv_Indirect_Followings    False
dtype: bool

In [459]:
msk = np.random.rand(len(dataset)) < 0.8

Y = dataset['Real']
X = dataset.drop(columns=['Real','Sink','Source'])

# Normalization
X = (X-X.min())/(X.max()-X.min())

X_train = X[msk]
X_test = X[~msk]
Y_train = Y[msk]
Y_test = Y[~msk]

In [374]:
def auroc(y_true, y_pred):
    return tf.py_function(roc_auc_score, (y_true, y_pred), tf.double)

In [460]:
# Model arguments
args = dict(x = X_train,
            y = Y_train,
            epochs=50,
            validation_split=0.2,
            verbose=2,
            shuffle=True)

# Layer definition
input_layer = Input(shape=(9,))
hidden_layer_1 = Dense(50, activation='relu',activity_regularizer=regularizers.l1(10e-5))(input_layer)

#hidden_layer_1 = Dropout(0.3)(hidden_layer_1)
#hidden_layer_2 = Dense(8, activation='sigmoid')(hidden_layer_1)
hidden_layer_2 = Dense(20, activation='relu')(hidden_layer_1)
#hidden_layer_2 = Dropout(0.3)(hidden_layer_2)
output_layer = Dense(1, activation='sigmoid')(hidden_layer_2)
model = Model(inputs=input_layer, outputs=output_layer)

# Model set up
model.compile(tf.keras.optimizers.RMSprop(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()
#model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy', auroc])

Model: "functional_159"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_81 (InputLayer)        [(None, 9)]               0         
_________________________________________________________________
dense_240 (Dense)            (None, 50)                500       
_________________________________________________________________
dense_241 (Dense)            (None, 20)                1020      
_________________________________________________________________
dense_242 (Dense)            (None, 1)                 21        
Total params: 1,541
Trainable params: 1,541
Non-trainable params: 0
_________________________________________________________________


In [461]:
# Execute training
model.fit(**args)

Epoch 1/50
159/159 - 0s - loss: 0.6682 - accuracy: 0.6359 - val_loss: 0.6390 - val_accuracy: 0.6454
Epoch 2/50
159/159 - 0s - loss: 0.6120 - accuracy: 0.6622 - val_loss: 0.5959 - val_accuracy: 0.6745
Epoch 3/50
159/159 - 0s - loss: 0.5853 - accuracy: 0.6760 - val_loss: 0.5803 - val_accuracy: 0.6596
Epoch 4/50
159/159 - 0s - loss: 0.5715 - accuracy: 0.6864 - val_loss: 0.5654 - val_accuracy: 0.6832
Epoch 5/50
159/159 - 0s - loss: 0.5591 - accuracy: 0.7011 - val_loss: 0.5540 - val_accuracy: 0.6887
Epoch 6/50
159/159 - 0s - loss: 0.5446 - accuracy: 0.7149 - val_loss: 0.5399 - val_accuracy: 0.6942
Epoch 7/50
159/159 - 0s - loss: 0.5306 - accuracy: 0.7279 - val_loss: 0.5174 - val_accuracy: 0.7657
Epoch 8/50
159/159 - 0s - loss: 0.5142 - accuracy: 0.7475 - val_loss: 0.4982 - val_accuracy: 0.8608
Epoch 9/50
159/159 - 0s - loss: 0.4957 - accuracy: 0.7759 - val_loss: 0.4759 - val_accuracy: 0.7822
Epoch 10/50
159/159 - 0s - loss: 0.4755 - accuracy: 0.8052 - val_loss: 0.4667 - val_accuracy: 0.7366

<tensorflow.python.keras.callbacks.History at 0x144c8d0a0>

In [462]:
# Prediction for test
predict = model.predict(X_test)
print(predict)

[[9.9866152e-01]
 [9.9843132e-01]
 [9.9814856e-01]
 ...
 [3.1789068e-07]
 [9.9984193e-01]
 [2.5170743e-03]]


In [463]:
classes = predict
#classes = predict.argmax(-1)
classes[classes <= 0.5] = 0
classes[classes > 0.5] = 1
print(classes)

[[1.]
 [1.]
 [1.]
 ...
 [0.]
 [1.]
 [0.]]


In [464]:
print('Accuracy: ' + "{:.4f}".format(accuracy_score(Y_test, classes)))
print('AUC: ' + "{:.4f}".format(roc_auc_score(Y_test, classes)))

Accuracy: 0.9988
AUC: 0.9988


In [465]:
# accuracy 
print (classification_report(Y_test, predict,digits = 6))
print (confusion_matrix(Y_test, predict))
print (accuracy_score(Y_test, predict))

              precision    recall  f1-score   support

           0   0.998753  0.998753  0.998753       802
           1   0.998810  0.998810  0.998810       840

    accuracy                       0.998782      1642
   macro avg   0.998781  0.998781  0.998781      1642
weighted avg   0.998782  0.998782  0.998782      1642

[[801   1]
 [  1 839]]
0.9987819732034104


In [466]:
# Reading test data
test_data = pd.read_csv("../data_generated/test_data.csv", sep='\t')
test_data['Com_Followings_Ratio'] = test_data['Com_Followings_Ratio'].fillna(0)
test_data = test_data.replace([np.inf],1000)
# Normalization
test_data = (test_data-test_data.min())/(test_data.max()-test_data.min())
test_data = test_data.drop(columns=['Sink','Source'])

In [467]:
predict_test = model.predict(test_data)
predict_test 

array([[4.3252138e-07],
       [1.4806986e-03],
       [9.9955702e-01],
       ...,
       [2.1935701e-03],
       [2.0491481e-03],
       [1.7515421e-03]], dtype=float32)

In [468]:
prediction = pd.DataFrame([])
i = 0
for row in predict_test:
    i += 1
    prediction = prediction.append(pd.DataFrame({'Id': i, 'Predicted': row[0]},
                                                index=[0]), ignore_index=True) 
prediction.to_csv("../predictions/prediction_2020-09-15-3.csv", sep=',', index=False,float_format='%.8f')