In [386]:
# General libraries
import pandas as pd
import numpy as np
from collections import defaultdict
import random
from random import randrange
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Libraries for developing a Neural Network
import tensorflow as tf 
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.utils import to_categorical 

from sklearn.metrics import roc_auc_score


In [387]:
dataset = pd.read_csv("../data_models/dataset4.csv", sep='\t')

In [388]:
# Shuffle the data set
dataset = dataset.sample(frac=1).reset_index(drop=True)
# Filling NaN values with zeros. Not sure if it is correct.
dataset['Com_Followings_Ratio'] = dataset['Com_Followings_Ratio'].fillna(0)
dataset = dataset.replace([np.inf],1000)

In [389]:
dataset.isna().any()

Source                     False
Sink                       False
Source_Followings          False
Source_Followers           False
Sink_Followers             False
Distance                   False
Real                       False
Common_Followings          False
Com_Followings_Ratio       False
Followers_Ratio            False
Indirect_Followings        False
Inv_Indirect_Followings    False
dtype: bool

In [390]:
dataset.isin([np.inf]).any()

Source                     False
Sink                       False
Source_Followings          False
Source_Followers           False
Sink_Followers             False
Distance                   False
Real                       False
Common_Followings          False
Com_Followings_Ratio       False
Followers_Ratio            False
Indirect_Followings        False
Inv_Indirect_Followings    False
dtype: bool

In [414]:
msk = np.random.rand(len(dataset)) < 0.8

Y = dataset['Real']
X = dataset.drop(columns=['Real','Sink','Source'])

# Normalization
X = (X-X.min())/(X.max()-X.min())

X_train = X[msk]
X_test = X[~msk]
Y_train = Y[msk]
Y_test = Y[~msk]

In [374]:
def auroc(y_true, y_pred):
    return tf.py_function(roc_auc_score, (y_true, y_pred), tf.double)

In [417]:
# Model arguments
args = dict(x = X_train,
            y = Y_train,
            epochs=16,
            validation_split=0.2,
            verbose=2,
            shuffle=True)

# Layer definition
input_layer = Input(shape=(9,))
hidden_layer_1 = Dense(50, activation='relu',activity_regularizer=regularizers.l1(10e-5))(input_layer)

#hidden_layer_1 = Dropout(0.3)(hidden_layer_1)
#hidden_layer_2 = Dense(8, activation='sigmoid')(hidden_layer_1)
hidden_layer_2 = Dense(20, activation='relu')(hidden_layer_1)
#hidden_layer_2 = Dropout(0.3)(hidden_layer_2)
output_layer = Dense(1, activation='sigmoid')(hidden_layer_2)
model = Model(inputs=input_layer, outputs=output_layer)

# Model set up
model.compile(tf.keras.optimizers.RMSprop(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()
#model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy', auroc])

Model: "functional_149"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_76 (InputLayer)        [(None, 9)]               0         
_________________________________________________________________
dense_225 (Dense)            (None, 50)                500       
_________________________________________________________________
dense_226 (Dense)            (None, 20)                1020      
_________________________________________________________________
dense_227 (Dense)            (None, 1)                 21        
Total params: 1,541
Trainable params: 1,541
Non-trainable params: 0
_________________________________________________________________


In [418]:
# Execute training
model.fit(**args)

Epoch 1/16
161/161 - 1s - loss: 0.6720 - accuracy: 0.6185 - val_loss: 0.6560 - val_accuracy: 0.6278
Epoch 2/16
161/161 - 0s - loss: 0.6339 - accuracy: 0.6405 - val_loss: 0.6295 - val_accuracy: 0.6286
Epoch 3/16
161/161 - 0s - loss: 0.6110 - accuracy: 0.6522 - val_loss: 0.6180 - val_accuracy: 0.6527
Epoch 4/16
161/161 - 0s - loss: 0.5968 - accuracy: 0.6801 - val_loss: 0.6094 - val_accuracy: 0.6294
Epoch 5/16
161/161 - 0s - loss: 0.5838 - accuracy: 0.6807 - val_loss: 0.5944 - val_accuracy: 0.6566
Epoch 6/16
161/161 - 0s - loss: 0.5677 - accuracy: 0.7184 - val_loss: 0.5879 - val_accuracy: 0.8190
Epoch 7/16
161/161 - 0s - loss: 0.5527 - accuracy: 0.7328 - val_loss: 0.5651 - val_accuracy: 0.7125
Epoch 8/16
161/161 - 0s - loss: 0.5351 - accuracy: 0.7618 - val_loss: 0.5524 - val_accuracy: 0.8322
Epoch 9/16
161/161 - 0s - loss: 0.5155 - accuracy: 0.7866 - val_loss: 0.5258 - val_accuracy: 0.7420
Epoch 10/16
161/161 - 0s - loss: 0.4919 - accuracy: 0.8088 - val_loss: 0.5021 - val_accuracy: 0.8368

<tensorflow.python.keras.callbacks.History at 0x1452cbd60>

In [419]:
# Prediction for test
predict = model.predict(X_test)
print(predict)

[[0.36083418]
 [0.44239467]
 [0.3001486 ]
 ...
 [0.68271655]
 [0.3162176 ]
 [0.40478402]]


In [420]:
classes = predict
#classes = predict.argmax(-1)
classes[classes <= 0.5] = 0
classes[classes > 0.5] = 1
print(classes)

[[0.]
 [0.]
 [0.]
 ...
 [1.]
 [0.]
 [0.]]


In [421]:
print('Accuracy: ' + "{:.4f}".format(accuracy_score(Y_test, classes)))
print('AUC: ' + "{:.4f}".format(roc_auc_score(Y_test, classes)))

Accuracy: 0.9054
AUC: 0.9043


In [422]:
# accuracy 
print (classification_report(Y_test, predict,digits = 6))
print (confusion_matrix(Y_test, predict))
print (accuracy_score(Y_test, predict))

              precision    recall  f1-score   support

           0   0.950525  0.851007  0.898017       745
           1   0.870175  0.957529  0.911765       777

    accuracy                       0.905388      1522
   macro avg   0.910350  0.904268  0.904891      1522
weighted avg   0.909505  0.905388  0.905035      1522

[[634 111]
 [ 33 744]]
0.9053876478318003


In [423]:
# Reading test data
test_data = pd.read_csv("../data_generated/test_data.csv", sep='\t')
test_data['Com_Followings_Ratio'] = test_data['Com_Followings_Ratio'].fillna(0)
test_data = test_data.replace([np.inf],1000)
# Normalization
test_data = (test_data-test_data.min())/(test_data.max()-test_data.min())
test_data = test_data.drop(columns=['Sink','Source'])

In [412]:
predict_test = model.predict(test_data)
predict_test 

array([[0.1400662 ],
       [0.19174588],
       [0.88244605],
       ...,
       [0.25105727],
       [0.24114308],
       [0.23710889]], dtype=float32)

In [424]:
prediction = pd.DataFrame([])
i = 0
for row in predict_test:
    i += 1
    prediction = prediction.append(pd.DataFrame({'Id': i, 'Predicted': row[0]},
                                                index=[0]), ignore_index=True) 
prediction.to_csv("../predictions/prediction_2020-09-14-6.csv", sep=',', index=False,float_format='%.8f')