In [None]:
# General libraries

from collections import defaultdict
import random
from random import randrange
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
import math
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from timeit import default_timer as timer
# Global plot styling: seaborn's dark grid theme and a figure resolution of 108 dpi.
sns.set_style('darkgrid')
plt.rcParams.update({'figure.dpi': 108})

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, GridSearchCV

In [None]:
# dictionaries with the structure source -> array of followings
from sklearn.preprocessing import StandardScaler
# Adjacency map: source node id -> sorted list of the node ids it follows.
dictionary = defaultdict(list)
# One record per source node (id + number of followings). Records are collected
# in a plain list so the DataFrame is built once: growing a frame row by row
# copies it on every iteration, and DataFrame.append no longer exists in pandas 2.x.
source_records = []

# `with` guarantees the handle is closed; iterating the handle streams the file
# instead of materialising every line in memory first.
with open('data/train.txt', 'r') as file:
    for line in file:
        stripped = line.strip()
        if not stripped:
            # A blank (e.g. trailing) line would otherwise crash on int('').
            continue
        # Each line is tab-separated: the source id followed by its followings.
        split_string = list(map(int, stripped.split("\t")))
        source = split_string[0]
        # A source with no followings simply gets an empty list.
        dictionary[source] = sorted(split_string[1:])
        source_records.append({'Source': source,
                               'Source_Followings': len(dictionary[source])})

# Simple DataFrame holding each source and its count of followings.
data = pd.DataFrame(source_records, columns=['Source', 'Source_Followings'])

In [None]:
def followingInCommon(node1, node2, graph=None):
    """Count the node ids that both ``node1`` and ``node2`` follow.

    Parameters
    ----------
    node1, node2 : hashable
        Node ids whose following lists are compared.
    graph : mapping, optional
        Mapping of node id -> iterable of followed node ids. When omitted,
        the module-level ``dictionary`` is used.

    Returns
    -------
    int
        Number of distinct ids present in both following lists; 0 when either
        node is not a key of ``graph``.
    """
    if graph is None:
        graph = dictionary
    # .get() rather than [] so that looking up an unknown node does not insert
    # an empty entry into the defaultdict as a side effect.
    followings1 = graph.get(node1, ())
    followings2 = graph.get(node2, ())
    return len(set(followings1).intersection(followings2))

# Smoke test on one hard-coded node pair; the count is shown as the cell output.
followingInCommon(node1=3849054, node2=161276)

In [None]:
# Load the two tab-separated edge tables (real and fake edges) in one pass.
real_edges, fake_edges = (
    pd.read_csv(path, sep='\t')
    for path in ("model_data/real_edges.csv", "model_data/fake_edges.csv")
)

In [None]:
# Feature: number of followings shared by the Source and the Sink of each real edge.
real_edges['Common_Followings'] = real_edges.apply(
    lambda edge: followingInCommon(edge['Source'], edge['Sink']),
    axis=1,
)

In [None]:
# Feature: number of followings shared by the Source and the Sink of each fake edge.
fake_edges['Common_Followings'] = fake_edges.apply(
    lambda edge: followingInCommon(edge['Source'], edge['Sink']),
    axis=1,
)

In [None]:
#real_edges['Com_Followings_Ratio'] = real_edges['Common_Followings']/real_edges['Source_Followings']
#fake_edges['Com_Followings_Ratio'] = fake_edges['Common_Followings']/fake_edges['Source_Followings']
#real_edges['Followers_Ratio'] = real_edges['Sink_Followers']/real_edges['Source_Followers']
#fake_edges['Followers_Ratio'] = fake_edges['Sink_Followers']/fake_edges['Source_Followers']

In [None]:
#real_edges.head()

In [None]:
# Target column 'Real': 1 marks real edges, -1 marks fake edges.
real_edges = real_edges.assign(Real=1)
fake_edges = fake_edges.assign(Real=-1)

In [None]:
#real_edges.shape
#fake_edges.shape

In [None]:
# Stack real and fake edges into one table with a fresh 0..n-1 index.
frames = [real_edges, fake_edges]
dataset = pd.concat(frames).reset_index(drop=True)

In [None]:
# Shuffle the data set. The seed makes the row order (and therefore the
# data.csv written below) reproducible across kernel restarts; it matches the
# random_state used for the train/test split.
dataset = dataset.sample(frac=1, random_state=1).reset_index(drop=True)
#dataset.tail()
#dataset['Com_Followings_Ratio'] = dataset['Com_Followings_Ratio'].fillna(0)

In [None]:
# Replace infinite distances with a large finite sentinel (2000) so that the
# later scaling step stays finite. Vectorised replace instead of a row-wise
# apply: same values, no Python-level loop over every row.
dataset['Distance'] = dataset['Distance'].replace(math.inf, 2000)

# Node ids are identifiers, not model features.
dataset = dataset.drop(columns=['Source', 'Sink'])

# Persist the feature table (tab-separated, without the index column).
dataset.to_csv("model_data/data.csv", sep='\t', index=False)

In [None]:
#msk = np.random.rand(len(dataset)) < 0.8

#Y = dataset['Real']
#X = dataset.drop(columns=['Real'])

# 80/20 hold-out split with a fixed seed.
train_df, test_df = train_test_split(dataset, test_size=0.2, random_state=1)
train_features = train_df.drop('Real', axis=1)
test_features = test_df.drop('Real', axis=1)

# Fit the scaler on the TRAINING features only. A second fit() on the test
# split would discard the training statistics (fit() resets the estimator) and
# standardise both splits with statistics of the held-out data.
scaler = StandardScaler()
scaler.fit(train_features)

X_train = scaler.transform(train_features)
X_test = scaler.transform(test_features)

# The network below ends in a 2-unit softmax trained with
# sparse_categorical_crossentropy, which needs class indices in {0, 1}, and its
# argmax predictions are 0/1 as well. Map the stored labels accordingly:
# fake (-1) -> 0, real (1) -> 1.
Y_train = (train_df.Real == 1).astype(int)
Y_test = (test_df.Real == 1).astype(int)

# Normalization
#X = (X-X.min())/(X.max()-X.min())

#X_train = X[msk]
#X_test = X[~msk]
#Y_train = Y[msk]
#Y_test = Y[~msk]

print(len(X_test))
print(X_test)

In [None]:
# plt.scatter(X_train[Y_train==1,0], X_train[Y_train==1,1], X_train[Y_train==1,2],label="Real ($y=1$)", c='r')
# plt.scatter(X_train[Y_train==-1,0], X_train[Y_train==-1,1], X_train[Y_train==-1,2],label="Fake ($y=-1$)", c='b')
# #plt.xlabel("Heart weight")
# #plt.ylabel("Body weight")
# plt.legend()
# plt.show()

In [None]:
# C_range = np.logspace(-2, 5, 8)
# gamma_range = np.logspace(-6, 1, 16)
#
# # Visualise the grid
# xx, yy = np.meshgrid(C_range, gamma_range)
# plt.plot(xx, yy, 'ko')
# plt.xscale('log')
# plt.yscale('log')
# plt.xlabel('$C$')
# plt.ylabel(r'$\gamma$')
# plt.show()

In [None]:
#cv = StratifiedShuffleSplit(n_splits=30, test_size=0.1, random_state=1)
#grid = GridSearchCV(SVC(kernel='rbf'), param_grid={'gamma': gamma_range, 'C': C_range}, cv=cv)
#grid.fit(X_train, Y_train)
#print("The best parameters are {0.best_params_} with an accuracy of {0.best_score_:.3g}".format(grid))

In [None]:
# Libraries for developing a Neural Network
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.utils import to_categorical



In [None]:
# Model arguments passed to model.fit(): 40 epochs, 20% of the training rows
# held out for validation, one log line per epoch, shuffling between epochs.
args = dict(x = X_train,
            y = Y_train,
            epochs=40,
            validation_split=0.2,
            verbose=2,
            shuffle=True)

# Layer definition
# The input width follows the feature matrix instead of a hard-coded 7, so the
# network keeps matching the data when a feature column is added or removed.
n_features = X_train.shape[1]
input_layer = Input(shape=(n_features,))
# L1 activity regularisation; note that 10e-5 equals 1e-4.
hidden_layer_1 = Dense(7, activation='relu',activity_regularizer=regularizers.l1(10e-5))(input_layer)
#hidden_layer_1 = Dropout(0.3)(hidden_layer_1)
#hidden_layer_2 = Dense(8, activation='sigmoid')(hidden_layer_1)
hidden_layer_2 = Dense(7, activation='relu')(hidden_layer_1)
#hidden_layer_2 = Dropout(0.3)(hidden_layer_2)
# Two softmax units: one probability per class index (0 and 1).
output_layer = Dense(2, activation='softmax')(hidden_layer_2)
model = Model(inputs=input_layer, outputs=output_layer)

# Model set up
# sparse_categorical_crossentropy takes integer class indices as targets, so
# the labels fed to fit() must be 0 or 1 to match the two output units.
model.compile(tf.keras.optimizers.RMSprop(learning_rate=0.01),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

In [None]:
# Execute training with the keyword arguments assembled in `args`
# (training data, 40 epochs, 20% validation split, per-epoch logging).
model.fit(**args)

In [None]:
# Softmax outputs for the held-out split, then the index of the larger output per row.
predict = model.predict(X_test)
classes = np.argmax(predict, axis=-1)

In [None]:
# Show the predicted class index (argmax over the two softmax outputs) per held-out row.
print(classes)

In [None]:
# Fraction of held-out rows whose predicted class equals the label, 4 decimals.
holdout_accuracy = accuracy_score(Y_test, classes)
print('Accuracy: {:.4f}'.format(holdout_accuracy))


In [None]:
# Load the tab-separated table of edges to score for the submission.
test_data = pd.read_csv("model_data/test_data.csv", sep='\t')

In [None]:
# Preview the first rows of the freshly loaded test table.
test_data.head()

In [None]:
# Replace infinite distances with the same finite sentinel (2000) used for the
# training table. This must read the test frame's OWN Distance column: taking
# it from `dataset` would copy distances of unrelated, shuffled training rows
# into the test rows by index alignment.
test_data['Distance'] = test_data['Distance'].replace(math.inf, 2000)

# Feature: number of followings shared by the Source and the Sink of each test edge.
test_data['Common_Followings'] = test_data.apply(lambda x: followingInCommon(x['Source'], x['Sink']), axis=1)

In [None]:
# Common followings divided by the source's following count; NaN results
# (e.g. 0/0) are replaced by 0.
test_data['Com_Followings_Ratio'] = (
    test_data['Common_Followings'].div(test_data['Source_Followings']).fillna(0)
)
# Sink follower count divided by source follower count.
# NOTE(review): a zero Source_Followers yields inf/NaN here and is not cleaned up — confirm this is intended.
test_data['Followers_Ratio'] = test_data['Sink_Followers'].div(test_data['Source_Followers'])

In [None]:
# Preview the last rows of the test table, now including the derived columns.
test_data.tail()

In [None]:

# Normalization
# Scale the submission features exactly like the training features: the same
# columns in the same order, transformed by the StandardScaler already fitted
# above. Min-max scaling the whole test frame on its own would put the inputs
# on a different scale than the network was trained on and would leave id and
# extra columns in the matrix, so its width would not match the model input.
# Raises KeyError if the test table lacks one of the training feature columns.
# NOTE: re-running this cell would scale the already-scaled frame again.
feature_columns = train_df.drop('Real', axis=1).columns
test_data = pd.DataFrame(
    scaler.transform(test_data[feature_columns]),
    columns=feature_columns,
    index=test_data.index,
)
test_data.head()

In [None]:
# Softmax outputs for the submission rows, then the index of the larger output per row.
predict_test = model.predict(test_data)
classes = np.argmax(predict_test, axis=-1)

In [None]:
# Display the raw model outputs (one row of two softmax values per test row).
predict_test

In [None]:
# Display the class index chosen for each test row.
classes

In [None]:
# Submission table: a 1-based row id and the model output for class index 1.
# Built in a single step: DataFrame.append was removed in pandas 2.0, and
# appending one row at a time re-copies the whole frame on every iteration.
prediction = pd.DataFrame({
    'Id': range(1, len(predict_test) + 1),
    'Predicted': predict_test[:, 1],
})

In [None]:
# Preview the first rows of the submission table.
prediction.head()

In [None]:
# Write the submission as comma-separated values (the to_csv default) without the index.
prediction.to_csv("predictions/prediction_2020-09-10.csv", index=False)