In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import networkx as nx

In [2]:
# Load the edge list (src/dst columns) and the list of known fake-account ids from CSV.
network = pd.read_csv(filepath_or_buffer="./data/output/twitter_res.csv")
fakeIds = pd.read_csv(filepath_or_buffer="./data/twitter_fake_ids.csv")

FileNotFoundError: [Errno 2] File b'./data/twitter.csv' does not exist: b'./data/twitter.csv'

In [None]:
# Empty per-user feature table; each column is filled in by a later cell.
feature_columns = ["id", "following", "followers", "label"]
df = pd.DataFrame(columns=feature_columns)

In [None]:
# Every user id that appears on either end of an edge, de-duplicated and stored in sorted order.
ids = set(
    np.concatenate((network["src"].unique(), network["dst"].unique()))
)
df["id"] = sorted(ids)

In [None]:
# Sets number of peers following for all users
# Map each id to how many times it appears in the "src" column of the edge list.
following_vals = dict(network["src"].value_counts())
def setFollowing(x):
    """Return how many edges have user id `x` as their source.

    Ids that never appear in network["src"] are absent from `following_vals`
    and get 0. `dict.get` supplies that default directly, so no exception
    handler is needed and unrelated errors are no longer silently swallowed.
    """
    return following_vals.get(x, 0)
df["following"] = df["id"].apply(setFollowing)

In [None]:
# Sets number of followers for all users
# Map each id to how many times it appears in the "dst" column of the edge list.
follower_vals = dict(network["dst"].value_counts())
def setFollowers(x):
    """Return how many edges have user id `x` as their destination.

    Ids that never appear in network["dst"] are absent from `follower_vals`
    and get 0. `dict.get` supplies that default directly, so no exception
    handler is needed and unrelated errors are no longer silently swallowed.
    """
    return follower_vals.get(x, 0)
df["followers"] = df["id"].apply(setFollowers)

In [None]:
# Sets labels for fake users (1 if fake, 0 otherwise)
# Build the lookup set once. `x in fakeIds["id"].values` rescanned the whole
# array for every user; a set makes each membership test a single hash lookup.
fake_id_set = set(fakeIds["id"].tolist())
def setLabels(x):
    """Return 1 if user id `x` is listed in fakeIds["id"], otherwise 0."""
    return 1 if x in fake_id_set else 0
df["label"] = df["id"].apply(setLabels)

In [None]:
# Checkpoint the feature table to disk (for testing).
# The row index is written too, since `index` is left at its default.
df.to_csv(path_or_buf="temp.csv")

In [None]:
# Reset df by reloading the checkpoint written to temp.csv.
df = pd.read_csv(filepath_or_buffer="temp.csv")

In [None]:
# Keep only users that follow at least one account AND have at least one follower.
# Boolean Series are combined element-wise with `&`; `&&` is not valid Python syntax.
df = df.loc[(df["following"] != 0) & (df["followers"] != 0)]

In [None]:
# Compare the number of rows labelled fake in df with the size of the fake-id list.
print("Anomalies in Pruned Network dataset:", len(df[df["label"] == 1]))
print("Anomalies in FakeIds dataset:", len(fakeIds["id"]))

In [None]:
# Summary statistics for every column from the third position onward.
# NOTE(review): after the temp.csv round-trip the first two columns are presumably
# the saved index and "id" - confirm before relying on the positional slice.
summary_columns = df.iloc[:, 2:]
summary_columns.describe()

In [None]:
# Scatter of following count against follower count, one small marker per user.
# Observation: this matches intuition - most users have fewer followers than accounts they follow.
plt.scatter(x=df["following"], y=df["followers"], s=0.5)
plt.title(label="Number Following vs. Number Followers")
plt.xlabel(xlabel="Following")
plt.ylabel(ylabel="Followers")

In [None]:
# Pruning costs us many anomalies -- could "following 0 users" itself be a sign of a fake user?
# It also shrinks the dataset from ~5.4 million users to just 80,480, which looks odd and needs more analysis.
# Open question for discussion: should users be pruned at all?
df.loc[:, "label"].value_counts()

In [None]:
# Draw the follow graph programmatically - Note: takes a REALLY long time
# NOTE(review): nx.Graph is undirected, so the src -> dst direction of each edge is
# dropped. Switch to nx.DiGraph() if direction matters for the picture.
G = nx.Graph()

print("Creating list of edges...")
edgesList = list(zip(network["src"], network["dst"]))
print("Done.")

print("Drawing edges...")
G.add_edges_from(edgesList) # draw all edges
print("Done.")

print("Coloring nodes representing fake users...")
# node_color must supply one entry per node, in G's iteration order. The old
# comprehension filtered with `if`, so it only produced entries for fake nodes and
# its length did not match the node count. Build the id set once so each
# membership test is a hash lookup instead of a scan over the whole id array.
fake_node_ids = set(fakeIds["id"].tolist())
color_map = [
    "red" if node in fake_node_ids else "#1f78b4"  # "#1f78b4" is networkx's default node color
    for node in G
]
print("Done.")

print("Drawing full graph...")
nx.draw(G, node_color=color_map, with_labels=True)
print("Done.")

plt.savefig("network.png") # save as png (must happen before show(), which clears the figure)
plt.show()