# Sybil Detection

In [1]:
import os
import pandas as pd
import numpy as np

from avd.graph_learning_controller import GraphLearningController
from avd.learners.sklearner import SkLearner
from avd.configs import config
from avd.datasets.twitter import load_data

Specify output directory for classifiers.

In [2]:
# Classifier results are written under ./data/output.
# makedirs (unlike mkdir) also creates a missing "data" parent directory, and
# exist_ok=True makes the cell safe to re-run without a separate existence check.
output_folder = os.path.join(os.getcwd(), "data", "output")
os.makedirs(output_folder, exist_ok=True)

Define what is considered positive and negative labels.

In [3]:
# Maps the generic class keys used throughout the notebook to the label
# values of this dataset: "neg" is a real account, "pos" is a fake one.
labels = {
    "neg": "Real",
    "pos": "Fake",
}

We first load the Twitter graph.
`load_data` returns a graph object (`twitter_graph`) and a config object (`twitter_config`).

In [4]:
# Load the filtered Twitter dataset together with its label file.
twitter_graph, twitter_config = load_data(
    dataset_file_name="twitter_filtered.csv",
    labels_file_name="twitter_labels_filtered.csv",
    labels_map=labels,
    limit=5000000,
)
# Quick sanity check: number of vertices in the loaded graph.
print(len(twitter_graph.vertices))

Loading labels...
Loading graph...
Data loaded.
75624


Some of the extracted features can be useful for understanding the results, but they will not be used in the classification process.

In [5]:
# Descriptive columns passed to the controller as meta data. Per the notebook
# text they help interpret the results and are not used for classification.
# Directed graphs expose in/out degrees; undirected ones expose friend counts.
meta_data_cols = (
    ["dst", "src", "out_degree_v", "in_degree_v", "out_degree_u", "in_degree_u"]
    if twitter_graph.is_directed
    else ["dst", "src", "number_of_friends_u", "number_of_friends_v"]
)

Finally, we train the classification algorithm.
The Twitter dataset is incomplete and missing a lot of data.
In order to deal with this problem, we train 10 times and aggregate the results.

In [6]:
# Name the run first: the result file name below is derived from the config name.
twitter_config._name = "twitter_" + "RandomForest"

# No classifier setter is called, so SkLearner's default classifier is used
# (presumably a random forest, going by the run name -- confirm in SkLearner).
learner = SkLearner(labels=labels)
glc = GraphLearningController(learner, twitter_config)

result_path = os.path.join(output_folder, twitter_config.name + "res.csv")
glc.classify_by_links(
    twitter_graph,
    result_path,
    test_size={"neg": 2000, "pos": 200},
    train_size={"neg": 5000, "pos": 5000},
    meta_data_cols=meta_data_cols,
)

Setting training and test sets
Existing files were loaded.
Training 10-fold validation: {'accuracy': 0.8192999999999999, 'precision': 0.7824698666306154, 'recall': 0.8848, 'f1': 0.8303853250923815, 'auc': 0.85451, 'fpr': 0.24620000000000003, 'tnr': 0.7538}
Validate_prediction_by_links: {'accuracy': 0.7868181818181819, 'precision': 0.08359133126934984, 'recall': 0.135, 'f1': 0.10325047801147226, 'auc': 0.49845000000000006, 'fpr': 0.148}


## Logistic Regression

In [7]:
# Evaluate a logistic-regression learner on the same train/test sizes as the
# other classifiers in this notebook.
twitter_config._name = "twitter_" + "LogisticRegression"

# Previously named `IsoForest_learner`, which was misleading: the learner is
# configured with the logistic-regression classifier, not an isolation forest.
logistic_learner = SkLearner(labels=labels).set_logistic_regression_classifier()
glc = GraphLearningController(logistic_learner, twitter_config)

# NOTE: result_path is not passed to evaluate_classifier below; it is still
# assigned so the notebook-level variable reflects the current run name.
result_path = os.path.join(output_folder, twitter_config.name + "res.csv")
glc.evaluate_classifier(
    twitter_graph,
    test_size={"neg": 2000, "pos": 200},
    training_size={"neg": 5000, "pos": 5000},
    meta_data_cols=meta_data_cols,
)

Setting training and test sets


 23%|██▎       | 2299/10000 [00:00<00:00, 11362.73feature/s]

Extracting features for training set:
Graph loaded


100%|██████████| 10000/10000 [00:00<00:00, 13518.10feature/s]
 43%|████▎     | 946/2200 [00:00<00:00, 9457.75feature/s]

Features were written to: /Users/vhying/Desktop/CS 221/Project/Network-Anomaly-Detection/data/temp/twitter_LogisticRegression__train.csv
Extracting features for test set:
Graph loaded


70690feature [00:05, 13475.32feature/s]                  


Features were written to: /Users/vhying/Desktop/CS 221/Project/Network-Anomaly-Detection/data/temp/twitter_LogisticRegression__test.csv
Training 10-fold validation: {'accuracy': 0.829, 'precision': 0.7923047701868932, 'recall': 0.892, 'f1': 0.8391242529007122, 'auc': 0.8597158, 'fpr': 0.23399999999999999, 'tnr': 0.766}


## Adaboost

In [8]:
# AdaBoost run: same sample sizes as the other classifiers, results written to
# a file named after this run's config name.
twitter_config._name = "twitter_" + "Adaboost"

learner = SkLearner(labels=labels).set_adaboost_classifier()
glc = GraphLearningController(learner, twitter_config)

result_path = os.path.join(output_folder, twitter_config.name + "res.csv")
glc.classify_by_links(
    twitter_graph,
    result_path,
    test_size={"neg": 2000, "pos": 200},
    train_size={"neg": 5000, "pos": 5000},
    meta_data_cols=meta_data_cols,
)

Setting training and test sets
Existing files were loaded.
Training 10-fold validation: {'accuracy': 0.8255000000000001, 'precision': 0.7894622375429599, 'recall': 0.8884000000000001, 'f1': 0.8358838719938493, 'auc': 0.8602497999999998, 'fpr': 0.2374, 'tnr': 0.7626000000000001}
Validate_prediction_by_links: {'accuracy': 0.7836363636363637, 'precision': 0.08682634730538923, 'recall': 0.145, 'f1': 0.10861423220973783, 'auc': 0.46581249999999996, 'fpr': 0.1525}


## Bagging Random Forest

In [None]:
# Bagging random-forest run: same sample sizes as the other classifiers,
# results written to a file named after this run's config name.
twitter_config._name = "twitter_" + "RFBagging"

learner = SkLearner(labels=labels).set_rf_bagging_classifier()
glc = GraphLearningController(learner, twitter_config)

result_path = os.path.join(output_folder, twitter_config.name + "res.csv")
glc.classify_by_links(
    twitter_graph,
    result_path,
    test_size={"neg": 2000, "pos": 200},
    train_size={"neg": 5000, "pos": 5000},
    meta_data_cols=meta_data_cols,
)

Setting training and test sets
Existing files were loaded.


## Gradient Boosting

In [None]:
# Gradient-boosting run: same sample sizes as the other classifiers, results
# written to a file named after this run's config name.
twitter_config._name = "twitter_" + "GradientBoosting"

learner = SkLearner(labels=labels).set_gradient_boosting_classifier()
glc = GraphLearningController(learner, twitter_config)

result_path = os.path.join(output_folder, twitter_config.name + "res.csv")
glc.classify_by_links(
    twitter_graph,
    result_path,
    test_size={"neg": 2000, "pos": 200},
    train_size={"neg": 5000, "pos": 5000},
    meta_data_cols=meta_data_cols,
)

## Isolation Forest

In [None]:
# Isolation-forest run: same sample sizes as the other classifiers, results
# written to a file named after this run's config name.
twitter_config._name = "twitter_" + "IsolationForest"

learner = SkLearner(labels=labels).set_isolation_forest_classifier()
glc = GraphLearningController(learner, twitter_config)

result_path = os.path.join(output_folder, twitter_config.name + "res.csv")
glc.classify_by_links(
    twitter_graph,
    result_path,
    test_size={"neg": 2000, "pos": 200},
    train_size={"neg": 5000, "pos": 5000},
    meta_data_cols=meta_data_cols,
)

## Precision at K

In [None]:
def aggreagate_res(data_folder, res_path):
    """Average the result files found in ``data_folder`` per source vertex.

    Every matching CSV is read with its first column as the index, the frames
    are stacked, and the numeric columns are averaged per ``src_id``.

    Args:
        data_folder: Directory containing the result CSV files.
        res_path: File-name suffix that identifies a result file (the notebook
            passes ``"res.csv"``). Directory entries whose name does not end
            with it are skipped, so stray files are not parsed as results.
            A falsy value disables the filter and reads every entry.

    Returns:
        A DataFrame with one row per ``src_id`` (as a regular column) holding
        the mean of each numeric column over all files read.

    Raises:
        ValueError: If no matching result file exists in ``data_folder``.
    """
    # Sorted for a deterministic read order; os.listdir order is arbitrary.
    file_names = sorted(
        name for name in os.listdir(data_folder)
        if not res_path or name.endswith(res_path)
    )
    if not file_names:
        raise ValueError(
            "No result files ending with %r found in %r" % (res_path, data_folder)
        )

    # Read all files first and concatenate once: DataFrame.append was removed
    # in pandas 2.0, and appending inside the loop copied the frame each time.
    frames = [
        pd.read_csv(os.path.join(data_folder, name), index_col=0,
                    encoding='utf-8', engine='python')
        for name in file_names
    ]
    results_frame = pd.concat(frames)

    # numeric_only=True keeps the older pandas behaviour of leaving non-numeric
    # columns out of the mean instead of raising on them.
    results_frame = results_frame.groupby("src_id").mean(numeric_only=True)

    return results_frame.reset_index()

In [None]:
# Aggregate every classifier's results and rank by the mean link label,
# highest first, so the rows the notebook scores precision on come first.
df = (
    aggreagate_res(output_folder, "res.csv")
    .sort_values("mean_link_label", ascending=False)
)

In order to calculate precision at k, we add two additional columns:
1. The sum of the positive examples.
2. k – the total number of results up to and including the specific row (i.e. the row number).

In [None]:
# Running total of the "actual" column over the ranked rows.
df["actual_sum"] = df["actual"].cumsum()
# 1-based rank of each row: the cumulative sum of a column of ones.
df["k"] = pd.Series(1, index=df.index).cumsum()

In [None]:
# Preview the ten highest-ranked rows.
df.head(n=10)

We add an additional column that stores the p@k values by calculating the precision at k.

In [None]:
# Precision at k = running total of "actual" among the top k rows, divided by k.
# Column-wise division gives the same values as the previous row-by-row
# df.apply(..., axis=1) without calling a Python function for every row.
df["p@k"] = df["actual_sum"] / df["k"]

In [None]:
# Show k next to its precision for the ten highest-ranked rows.
df.loc[:, ["k", "p@k"]].head(10)

We can see in the p@k plot that the results are much better than random, which is about 6%.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

plt.figure()
df[["k", "p@k"]][:500].plot(x="k", y= "p@k")
plt.plot(df[["k"]].values, np.full((len(df[["k"]]),1), 0.06))