In [156]:
# General libraries
import pandas as pd
import numpy as np
from collections import defaultdict
import random
from random import randrange
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Libraries for developing a Neural Network
import tensorflow as tf 
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.utils import to_categorical 

from sklearn.metrics import roc_auc_score


In [157]:
dataset = pd.read_csv("../data_models/dataset2.csv", sep='\t')

In [158]:
# Shuffle the data set
dataset = dataset.sample(frac=1).reset_index(drop=True)
# Filling NaN values with zeros. Not sure if it is correct.
dataset['Com_Followings_Ratio'] = dataset['Com_Followings_Ratio'].fillna(0)

In [164]:
msk = np.random.rand(len(dataset)) < 0.9

Y = dataset['Real']
X = dataset.drop(columns=['Real','Sink','Source'])

# Normalization
X = (X-X.min())/(X.max()-X.min())

X_train = X[msk]
X_test = X[~msk]
Y_train = Y[msk]
Y_test = Y[~msk]

In [110]:
def auroc(y_true, y_pred):
    return tf.py_function(roc_auc_score, (y_true, y_pred), tf.double)

In [167]:
# Model arguments
args = dict(x = X_train,
            y = Y_train,
            epochs=100,
            validation_split=0.2,
            verbose=2,
            shuffle=True)

# Layer definition
input_layer = Input(shape=(6,))
hidden_layer_1 = Dense(100, activation='relu',activity_regularizer=regularizers.l1(10e-5))(input_layer)

#hidden_layer_1 = Dropout(0.3)(hidden_layer_1)
#hidden_layer_2 = Dense(8, activation='sigmoid')(hidden_layer_1)
hidden_layer_2 = Dense(50, activation='relu')(hidden_layer_1)
hidden_layer_2 = Dropout(0.3)(hidden_layer_2)
output_layer = Dense(2, activation='softmax')(hidden_layer_2)
model = Model(inputs=input_layer, outputs=output_layer)

# Model set up
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy', auroc])
model.compile(tf.keras.optimizers.RMSprop(learning_rate=0.01),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.summary()
#model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy', auroc])

Model: "functional_95"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_48 (InputLayer)        [(None, 6)]               0         
_________________________________________________________________
dense_141 (Dense)            (None, 100)               700       
_________________________________________________________________
dense_142 (Dense)            (None, 50)                5050      
_________________________________________________________________
dropout (Dropout)            (None, 50)                0         
_________________________________________________________________
dense_143 (Dense)            (None, 2)                 102       
Total params: 5,852
Trainable params: 5,852
Non-trainable params: 0
_________________________________________________________________


In [168]:
# Execute training
model.fit(**args)

Epoch 1/100
179/179 - 1s - loss: 0.6608 - accuracy: 0.5646 - val_loss: 0.6513 - val_accuracy: 0.5775
Epoch 2/100
179/179 - 0s - loss: 0.6537 - accuracy: 0.5838 - val_loss: 0.6570 - val_accuracy: 0.5768
Epoch 3/100
179/179 - 0s - loss: 0.6521 - accuracy: 0.5884 - val_loss: 0.6472 - val_accuracy: 0.5810
Epoch 4/100
179/179 - 0s - loss: 0.6494 - accuracy: 0.5950 - val_loss: 0.6470 - val_accuracy: 0.5761
Epoch 5/100
179/179 - 0s - loss: 0.6490 - accuracy: 0.5910 - val_loss: 0.6489 - val_accuracy: 0.5789
Epoch 6/100
179/179 - 0s - loss: 0.6497 - accuracy: 0.5941 - val_loss: 0.6461 - val_accuracy: 0.5761
Epoch 7/100
179/179 - 0s - loss: 0.6476 - accuracy: 0.5933 - val_loss: 0.6453 - val_accuracy: 0.6027
Epoch 8/100
179/179 - 0s - loss: 0.6485 - accuracy: 0.5913 - val_loss: 0.6453 - val_accuracy: 0.5789
Epoch 9/100
179/179 - 0s - loss: 0.6464 - accuracy: 0.5941 - val_loss: 0.6452 - val_accuracy: 0.5803
Epoch 10/100
179/179 - 0s - loss: 0.6458 - accuracy: 0.5947 - val_loss: 0.6420 - val_accura

Epoch 82/100
179/179 - 0s - loss: 0.6498 - accuracy: 0.6008 - val_loss: 0.6396 - val_accuracy: 0.5859
Epoch 83/100
179/179 - 0s - loss: 0.6428 - accuracy: 0.5994 - val_loss: 0.6362 - val_accuracy: 0.6159
Epoch 84/100
179/179 - 0s - loss: 0.6438 - accuracy: 0.5948 - val_loss: 0.6507 - val_accuracy: 0.5852
Epoch 85/100
179/179 - 0s - loss: 0.6444 - accuracy: 0.5933 - val_loss: 0.6390 - val_accuracy: 0.5866
Epoch 86/100
179/179 - 0s - loss: 0.6435 - accuracy: 0.5938 - val_loss: 0.6476 - val_accuracy: 0.5852
Epoch 87/100
179/179 - 0s - loss: 0.6435 - accuracy: 0.5997 - val_loss: 0.6488 - val_accuracy: 0.5943
Epoch 88/100
179/179 - 0s - loss: 0.6465 - accuracy: 0.5997 - val_loss: 0.6668 - val_accuracy: 0.5915
Epoch 89/100
179/179 - 0s - loss: 0.6436 - accuracy: 0.5976 - val_loss: 0.6624 - val_accuracy: 0.5845
Epoch 90/100
179/179 - 0s - loss: 0.6442 - accuracy: 0.5954 - val_loss: 0.6584 - val_accuracy: 0.5845
Epoch 91/100
179/179 - 0s - loss: 0.6452 - accuracy: 0.5957 - val_loss: 0.6480 - v

<tensorflow.python.keras.callbacks.History at 0x143653d60>

In [169]:
X_test.shape

(797, 6)

In [170]:
# Prediction for test
predict = model.predict(X_test)
classes = []
classes = predict.argmax(axis=-1)

In [171]:
print('Accuracy: ' + "{:.4f}".format(accuracy_score(Y_test, classes)))
print(classes)

Accuracy: 0.5935
[0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 1 1 0 0 0 0 0 1 0 0 0 0 1 1 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0

In [172]:
print(predict)

[[0.6923984  0.3076015 ]
 [0.52955765 0.47044238]
 [0.5561888  0.4438112 ]
 ...
 [0.54439664 0.45560336]
 [0.61030614 0.38969386]
 [0.56427205 0.4357279 ]]


In [173]:
Y_test.shape

(797,)

In [174]:
classes.shape

(797,)

In [175]:
auc = roc_auc_score(Y_test, classes)
print(auc)

0.5968165501605894


In [151]:
# accuracy 
#print (classification_report(Y_test, predict,digits = 6))
#print (confusion_matrix(Y_test, predict))
#print (accuracy_score(Y_test, predict))

In [176]:
# Reading test data
test_data = pd.read_csv("../data_models/test_data.csv", sep='\t')
test_data['Com_Followings_Ratio'] = test_data['Com_Followings_Ratio'].fillna(0)
# Normalization
test_data = (test_data-test_data.min())/(test_data.max()-test_data.min())
test_data = test_data.drop(columns=['Sink','Source'])

In [177]:
predict_test = model.predict(test_data)
predict_test 

array([[0.70154375, 0.29845625],
       [0.5504828 , 0.4495172 ],
       [0.5242651 , 0.47573483],
       ...,
       [0.549556  , 0.45044395],
       [0.54973847, 0.4502615 ],
       [0.53751034, 0.46248963]], dtype=float32)

In [178]:
classes = []
classes = predict_test.argmax(axis=-1)


In [179]:
prediction = pd.DataFrame([])
i = 0
for row in predict_test:
    i += 1
    prediction = prediction.append(pd.DataFrame({'Id': i, 'Predicted': row[1]},
                                                index=[0]), ignore_index=True) 
prediction.to_csv("../predictions/prediction_2020-09-13.csv", sep=',', index=False)