In [1]:
# General libraries
import pandas as pd
import numpy as np
from collections import defaultdict
import random
from random import randrange
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Libraries for developing a Neural Network
import tensorflow as tf 
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.utils import to_categorical 

from sklearn.metrics import roc_auc_score


In [2]:
dataset = pd.read_csv("../data_models/dataset6.csv", sep='\t')

In [3]:
# Shuffle the data set
dataset = dataset.sample(frac=1).reset_index(drop=True)
# Filling NaN values with zeros. Not sure if it is correct.
dataset['Com_Followings_Ratio'] = dataset['Com_Followings_Ratio'].fillna(0)
dataset = dataset.replace([np.inf],1000)

In [8]:
dataset.isna().any()

Source                     False
Sink                       False
Source_Followings          False
Source_Followers           False
Sink_Followers             False
Distance                    True
Real                       False
Common_Followings          False
Com_Followings_Ratio       False
Followers_Ratio            False
Indirect_Followings        False
Inv_Indirect_Followings    False
dtype: bool

In [9]:
dataset.isin([np.inf]).any()

Source                     False
Sink                       False
Source_Followings          False
Source_Followers           False
Sink_Followers             False
Distance                   False
Real                       False
Common_Followings          False
Com_Followings_Ratio       False
Followers_Ratio            False
Indirect_Followings        False
Inv_Indirect_Followings    False
dtype: bool

In [10]:
msk = np.random.rand(len(dataset)) < 0.8

Y = dataset['Real']
X = dataset.drop(columns=['Real','Sink','Source','Distance'])

# Normalization
X = (X-X.min())/(X.max()-X.min())

X_train = X[msk]
X_test = X[~msk]
Y_train = Y[msk]
Y_test = Y[~msk]

In [6]:
def auroc(y_true, y_pred):
    return tf.py_function(roc_auc_score, (y_true, y_pred), tf.double)

In [13]:
# Model arguments
args = dict(x = X_train,
            y = Y_train,
            epochs=50,
            validation_split=0.2,
            verbose=2,
            shuffle=True)

# Layer definition
input_layer = Input(shape=(8,))
hidden_layer_1 = Dense(50, activation='relu',activity_regularizer=regularizers.l1(10e-5))(input_layer)

#hidden_layer_1 = Dropout(0.3)(hidden_layer_1)
#hidden_layer_2 = Dense(8, activation='sigmoid')(hidden_layer_1)
hidden_layer_2 = Dense(20, activation='relu')(hidden_layer_1)
#hidden_layer_2 = Dropout(0.3)(hidden_layer_2)
output_layer = Dense(1, activation='sigmoid')(hidden_layer_2)
model = Model(inputs=input_layer, outputs=output_layer)

# Model set up
model.compile(tf.keras.optimizers.RMSprop(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()
#model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy', auroc])

Model: "functional_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 8)]               0         
_________________________________________________________________
dense_6 (Dense)              (None, 50)                450       
_________________________________________________________________
dense_7 (Dense)              (None, 20)                1020      
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 21        
Total params: 1,491
Trainable params: 1,491
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Execute training
model.fit(**args)

Epoch 1/50
161/161 - 1s - loss: 0.6837 - accuracy: 0.5591 - val_loss: 0.6741 - val_accuracy: 0.5779
Epoch 2/50
161/161 - 0s - loss: 0.6664 - accuracy: 0.5913 - val_loss: 0.6642 - val_accuracy: 0.5740
Epoch 3/50
161/161 - 0s - loss: 0.6580 - accuracy: 0.5948 - val_loss: 0.6618 - val_accuracy: 0.5872
Epoch 4/50
161/161 - 0s - loss: 0.6549 - accuracy: 0.6039 - val_loss: 0.6641 - val_accuracy: 0.5857
Epoch 5/50
161/161 - 0s - loss: 0.6532 - accuracy: 0.6032 - val_loss: 0.6698 - val_accuracy: 0.5810
Epoch 6/50
161/161 - 0s - loss: 0.6527 - accuracy: 0.6006 - val_loss: 0.6620 - val_accuracy: 0.5896
Epoch 7/50
161/161 - 0s - loss: 0.6514 - accuracy: 0.6051 - val_loss: 0.6621 - val_accuracy: 0.5927
Epoch 8/50
161/161 - 0s - loss: 0.6503 - accuracy: 0.6034 - val_loss: 0.6666 - val_accuracy: 0.5872
Epoch 9/50
161/161 - 0s - loss: 0.6506 - accuracy: 0.6049 - val_loss: 0.6608 - val_accuracy: 0.5935
Epoch 10/50
161/161 - 0s - loss: 0.6498 - accuracy: 0.6067 - val_loss: 0.6603 - val_accuracy: 0.5974

<tensorflow.python.keras.callbacks.History at 0x1374187f0>

In [15]:
# Prediction for test
predict = model.predict(X_test)
print(predict)

[[0.84328866]
 [0.48744127]
 [0.3625833 ]
 ...
 [0.50259155]
 [0.36241066]
 [0.525343  ]]


In [16]:
classes = predict
#classes = predict.argmax(-1)
classes[classes <= 0.5] = 0
classes[classes > 0.5] = 1
print(classes)

[[1.]
 [0.]
 [0.]
 ...
 [1.]
 [0.]
 [1.]]


In [17]:
print('Accuracy: ' + "{:.4f}".format(accuracy_score(Y_test, classes)))
print('AUC: ' + "{:.4f}".format(roc_auc_score(Y_test, classes)))

Accuracy: 0.6109
AUC: 0.6097


In [18]:
# accuracy 
print (classification_report(Y_test, predict,digits = 6))
print (confusion_matrix(Y_test, predict))
print (accuracy_score(Y_test, predict))

              precision    recall  f1-score   support

           0   0.637349  0.626777  0.632019       844
           1   0.581673  0.592693  0.587131       739

    accuracy                       0.610865      1583
   macro avg   0.609511  0.609735  0.609575      1583
weighted avg   0.611358  0.610865  0.611064      1583

[[529 315]
 [301 438]]
0.6108654453569172


In [24]:
# Reading test data
test_data = pd.read_csv("../data_generated/test_data.csv", sep='\t')
test_data['Com_Followings_Ratio'] = test_data['Com_Followings_Ratio'].fillna(0)
test_data = test_data.replace([np.inf],1000)
# Normalization
test_data = (test_data-test_data.min())/(test_data.max()-test_data.min())
test_data = test_data.drop(columns=['Sink','Source','Distance'])

In [25]:
predict_test = model.predict(test_data)
predict_test 

array([[0.38563478],
       [0.47393203],
       [0.59878665],
       ...,
       [0.49051303],
       [0.48971367],
       [0.50681055]], dtype=float32)

In [26]:
prediction = pd.DataFrame([])
i = 0
for row in predict_test:
    i += 1
    prediction = prediction.append(pd.DataFrame({'Id': i, 'Predicted': row[0]},
                                                index=[0]), ignore_index=True) 
prediction.to_csv("../predictions/prediction_2020-09-17-1.csv", sep=',', index=False,float_format='%.8f')