In [181]:
# General libraries
import pandas as pd
import numpy as np
from collections import defaultdict
import random
from random import randrange
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Libraries for developing a Neural Network
import tensorflow as tf 
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.utils import to_categorical 

from sklearn.metrics import roc_auc_score


In [182]:
dataset = pd.read_csv("../data_models/dataset3.csv", sep='\t')

In [183]:
# Shuffle the data set
dataset = dataset.sample(frac=1).reset_index(drop=True)
# Filling NaN values with zeros. Not sure if it is correct.
dataset['Com_Followings_Ratio'] = dataset['Com_Followings_Ratio'].fillna(0)

In [202]:
dataset.isna().any()

Source                     False
Sink                       False
Source_Followings          False
Source_Followers           False
Sink_Followers             False
Distance                    True
Real                       False
Common_Followings          False
Com_Followings_Ratio       False
Followers_Ratio            False
Indirect_Followings        False
Inv_Indirect_Followings    False
dtype: bool

In [203]:
msk = np.random.rand(len(dataset)) < 0.8

Y = dataset['Real']
X = dataset.drop(columns=['Real','Sink','Source','Distance'])

# Normalization
X = (X-X.min())/(X.max()-X.min())

X_train = X[msk]
X_test = X[~msk]
Y_train = Y[msk]
Y_test = Y[~msk]

In [110]:
def auroc(y_true, y_pred):
    return tf.py_function(roc_auc_score, (y_true, y_pred), tf.double)

In [241]:
# Model arguments
args = dict(x = X_train,
            y = Y_train,
            epochs=40,
            validation_split=0.2,
            verbose=2,
            shuffle=True)

# Layer definition
input_layer = Input(shape=(8,))
hidden_layer_1 = Dense(100, activation='relu',activity_regularizer=regularizers.l1(10e-5))(input_layer)

#hidden_layer_1 = Dropout(0.3)(hidden_layer_1)
#hidden_layer_2 = Dense(8, activation='sigmoid')(hidden_layer_1)
hidden_layer_2 = Dense(20, activation='relu')(hidden_layer_1)
#hidden_layer_2 = Dropout(0.3)(hidden_layer_2)
output_layer = Dense(1, activation='sigmoid')(hidden_layer_2)
model = Model(inputs=input_layer, outputs=output_layer)

# Model set up
model.compile(tf.keras.optimizers.RMSprop(learning_rate=0.01),
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()
#model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy', auroc])

Model: "functional_125"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_64 (InputLayer)        [(None, 8)]               0         
_________________________________________________________________
dense_189 (Dense)            (None, 100)               900       
_________________________________________________________________
dense_190 (Dense)            (None, 20)                2020      
_________________________________________________________________
dense_191 (Dense)            (None, 1)                 21        
Total params: 2,941
Trainable params: 2,941
Non-trainable params: 0
_________________________________________________________________


In [242]:
# Execute training
model.fit(**args)

Epoch 1/40
161/161 - 0s - loss: 0.6317 - accuracy: 0.6243 - val_loss: 0.7800 - val_accuracy: 0.5086
Epoch 2/40
161/161 - 0s - loss: 0.5981 - accuracy: 0.6647 - val_loss: 0.6373 - val_accuracy: 0.7301
Epoch 3/40
161/161 - 0s - loss: 0.5812 - accuracy: 0.6858 - val_loss: 0.6526 - val_accuracy: 0.6482
Epoch 4/40
161/161 - 0s - loss: 0.5724 - accuracy: 0.6924 - val_loss: 0.6195 - val_accuracy: 0.6778
Epoch 5/40
161/161 - 0s - loss: 0.5639 - accuracy: 0.6946 - val_loss: 0.6889 - val_accuracy: 0.5866
Epoch 6/40
161/161 - 0s - loss: 0.5565 - accuracy: 0.6987 - val_loss: 0.6232 - val_accuracy: 0.6615
Epoch 7/40
161/161 - 0s - loss: 0.5488 - accuracy: 0.7063 - val_loss: 0.8092 - val_accuracy: 0.6544
Epoch 8/40
161/161 - 0s - loss: 0.5519 - accuracy: 0.7147 - val_loss: 0.6127 - val_accuracy: 0.6880
Epoch 9/40
161/161 - 0s - loss: 0.5453 - accuracy: 0.7151 - val_loss: 0.6241 - val_accuracy: 0.7192
Epoch 10/40
161/161 - 0s - loss: 0.5399 - accuracy: 0.7178 - val_loss: 0.6106 - val_accuracy: 0.6903

<tensorflow.python.keras.callbacks.History at 0x144194250>

In [243]:
X_test.shape

(1549, 8)

In [244]:
# Prediction for test
predict = model.predict(X_test)
classes = []
classes = predict.argmax(axis=-1)

In [245]:
print('Accuracy: ' + "{:.4f}".format(accuracy_score(Y_test, classes)))
print(classes)

Accuracy: 0.5087
[0 0 0 ... 0 0 0]


In [246]:
print(predict)

[[0.4469065 ]
 [0.15855709]
 [0.45662808]
 ...
 [0.8802182 ]
 [0.9999995 ]
 [0.42072427]]


In [247]:
Y_test.shape

(1549,)

In [248]:
classes.shape

(1549,)

In [249]:
auc = roc_auc_score(Y_test, classes)
print(auc)

0.5


In [151]:
# accuracy 
#print (classification_report(Y_test, predict,digits = 6))
#print (confusion_matrix(Y_test, predict))
#print (accuracy_score(Y_test, predict))

In [250]:
# Reading test data
test_data = pd.read_csv("../data_models/test_data.csv", sep='\t')
test_data['Com_Followings_Ratio'] = test_data['Com_Followings_Ratio'].fillna(0)
# Normalization
test_data = (test_data-test_data.min())/(test_data.max()-test_data.min())
test_data = test_data.drop(columns=['Sink','Source','Distance'])

In [251]:
predict_test = model.predict(test_data)
predict_test 

array([[0.2525749 ],
       [0.39057952],
       [0.9201307 ],
       ...,
       [0.4676679 ],
       [0.45499492],
       [0.44608235]], dtype=float32)

In [252]:
classes = []
classes = predict_test.argmax(axis=-1)


In [254]:
prediction = pd.DataFrame([])
i = 0
for row in predict_test:
    i += 1
    prediction = prediction.append(pd.DataFrame({'Id': i, 'Predicted': row[0]},
                                                index=[0]), ignore_index=True) 
prediction.to_csv("../predictions/prediction_2020-09-13-4.csv", sep=',', index=False)