# Baseline Graph Model

## Connected Subgraphs as a measure of activity aggregation for detecting fraud accounts

In [None]:
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter

### Reading labels and test accounts

In [None]:
# Directory holding the preprocessed label and test-user files.
data_prefix = '../processed-data/'

# Label table, re-indexed by user id so a user's row can be fetched with .loc.
user_to_label = pd.read_csv(data_prefix + "tags.csv")
user_to_label = user_to_label.set_index('userId')

# One test user per line; surrounding whitespace/newlines are stripped and the
# ids are kept as strings here.
with open(data_prefix + "test_users.csv", "r") as fh:
    new_users_test = list(map(str.strip, fh))

### Reading edgelists

In [None]:
# The relations file is a gzip-compressed, tab-separated table; column names
# are supplied explicitly through `names`.
relations_df = pd.read_csv(
    "data/relations.csv.gz",
    sep='\t',
    compression='gzip',
    names=["day", "ms", "src", "dst", "relation"],
)

# Keep one row per distinct (src, dst) pair, regardless of day, ms or relation.
edges = relations_df.loc[:, ['src', 'dst']].drop_duplicates()

### Graph Construction

In [None]:
# Build a directed graph with one edge per distinct (src, dst) pair; nodes are
# created implicitly from the edge endpoints.
G = nx.DiGraph()
G.add_edges_from(
    (src, dst) for src, dst in zip(edges['src'].values, edges['dst'].values)
)

### Connected Subgraphs

In [None]:
# Map every node to the number of nodes in the component it belongs to.
#
# G is a DiGraph, and networkx's (undirected) connected-component functions
# are not implemented for directed graphs; `connected_component_subgraphs`
# was additionally removed from networkx in 2.4. Weakly connected components
# group nodes that are connected when edge direction is ignored, and each
# component is yielded as a set of nodes.
node_to_connected_graph_size = {}

num_components = 0
for component_nodes in nx.weakly_connected_components(G):
    num_v = len(component_nodes)
    num_components += 1
    for node in component_nodes:
        node_to_connected_graph_size[node] = num_v

In [None]:
# One entry per node (not per component): a component with k nodes contributes
# k entries of value k to the statistics below.
component_sizes = list(node_to_connected_graph_size.values())
largest_component = max(component_sizes)
size_distribution = Counter(component_sizes)

print("Number of connected components is {}".format(num_components))
print("Size of largest connected component is {}".format(largest_component))
print("Distribution of conneted component sizes is {}".format(size_distribution))

# Histogram of log10(component size) over nodes.
plt.hist(np.log10(component_sizes))

In [None]:
# Baseline rule: predict 1 for a test user when the component containing that
# user has strictly more than `thresh` nodes; users that never appear in the
# graph are predicted 0.
thresholds = [2, 3, 4, 5, 10]

# Test-user ids are read as strings; convert once so they can be looked up in
# the node -> component-size mapping.
test_user_ids = [int(raw_id) for raw_id in new_users_test]

baseline_predictions = {}
for thresh in thresholds:
    flags = []
    for uid in test_user_ids:
        component_size = node_to_connected_graph_size.get(uid)
        flags.append(int(component_size is not None and component_size > thresh))
    baseline_predictions[thresh] = flags

In [None]:
def compute_metrics(predictions, labels):
    """Compute binary classification metrics from parallel prediction/label lists.

    Each prediction and label is interpreted by truthiness (non-zero means
    positive).

    Args:
        predictions: Sequence of predicted classes, one per example.
        labels: Sequence of true classes, in the same order as `predictions`.

    Returns:
        A tuple `(confusion_matrix, f1, accuracy, precision, recall)` where
        `confusion_matrix` is a 2x2 numpy array laid out as
        `[[true_positives, false_positives], [false_negatives, true_negatives]]`
        (rows: predicted positive/negative, columns: label positive/negative).
        Any ratio whose denominator is zero is reported as 0.0.

    Raises:
        ValueError: If `predictions` and `labels` differ in length.
    """
    if len(predictions) != len(labels):
        raise ValueError(
            "predictions and labels must have the same length "
            "({} != {})".format(len(predictions), len(labels))
        )

    true_positives = false_positives = false_negatives = true_negatives = 0
    for pred, label in zip(predictions, labels):
        if pred and label:
            true_positives += 1
        elif pred:
            false_positives += 1
        elif label:
            false_negatives += 1
        else:
            true_negatives += 1

    confusion_matrix = np.array([[true_positives, false_positives],
                                 [false_negatives, true_negatives]])

    def _safe_ratio(numerator, denominator):
        # A classifier that predicts no positives (or data with no positive
        # labels) would otherwise raise ZeroDivisionError.
        return numerator / denominator if denominator else 0.0

    accuracy = _safe_ratio(true_positives + true_negatives, len(labels))
    precision = _safe_ratio(true_positives, true_positives + false_positives)
    recall = _safe_ratio(true_positives, true_positives + false_negatives)
    f1 = _safe_ratio(2 * (precision * recall), precision + recall)

    return (confusion_matrix, f1, accuracy, precision, recall)

In [None]:
# Ground-truth label for every test user, in the same order as the test list
# (and therefore as each per-threshold prediction list).
true_labels = []
for raw_id in new_users_test:
    true_labels.append(user_to_label.loc[int(raw_id)]["label"])

# Report the baseline's metrics separately for each component-size threshold.
for thresh, predictions in baseline_predictions.items():
    print("======= Baseline metrics with thresholds at {}".format(thresh))
    confusion_matrix, f1, accuracy, precision, recall = compute_metrics(predictions, true_labels)

    confusion_table = pd.DataFrame(
        confusion_matrix,
        columns=["labels positive", "labels negative"],
        index=["predicted positive", "predicted negative"],
    )
    print("Confusion Matrix:")
    print(confusion_table)
    print("f1: {:.4f}, precision: {:.4f}, recall: {:.4f}, acc: {:.4f}".format(f1, precision, recall, accuracy))
    print()