# working notebook

### Utility Functions:
- load_bitcoin_edge_data(filename)
- user_activity_dataframe(bitcoin_df) <-- uses user_stats(bitcoin_df)
- build_graph(bitcoin_df, user_lst=[], rating_type='all', maxdate='2016-01-24')
#### visualization functions:
- plot_timeline(bitcoin_df, title)
- add_user_to_graph(existing_graph, new_user, bitcoin_df)

### EDA Learnings:
The alpha network's timestamps have no time-of-day component (date only), so we cannot do velocity or bot analysis,
or sort ratings chronologically within a day.

### Interesting Fraud Examples:
otc_user = '2680'

In [15]:
import pandas as pd
import numpy as np
import datetime
import networkx as nx
import nxpd

import matplotlib.pyplot as plt

import sys
sys.path.insert(0, '../src')
import helpers as h
import visualizations as v
import model as m

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 300)
import time

In [2]:
# Load and preprocess data
# Alpha network: rating edges, then the user-activity frame derived from them
alpha_df = h.load_bitcoin_edge_data('../data/soc-sign-bitcoinalpha.csv.gz')
alpha_users = h.user_activity_dataframe(alpha_df)
# OTC network: same two steps
otc_df = h.load_bitcoin_edge_data('../data/soc-sign-bitcoinotc.csv.gz')
otc_users = h.user_activity_dataframe(otc_df)

In [3]:
# Load graph objects
# h.build_graph's return value is unpacked as a pair; only the second element
# is kept here (the first is discarded into `_`).
# rating_type='pos' presumably restricts the graph to positive ratings --
# TODO confirm against helpers.build_graph.
_ , alpha_G = h.build_graph(alpha_df)
_ , alpha_pos_G = h.build_graph(alpha_df, rating_type='pos')
_ , otc_G = h.build_graph(otc_df)
_ , otc_pos_G = h.build_graph(otc_df, rating_type='pos')

In [4]:
# v.plot_timeline(alpha_df, 'Alpha Bit Coin Ratings Activity')

In [5]:
# v.plot_timeline(otc_df, 'OTC Bit Coin Ratings Activity')

## Base Features

## Velocity Features

In [6]:
# # date velocity
# start_time = time.time()
# df_dv = m.feature_iteration_date_velocity(alpha_df)
# print(f"{(time.time() - start_time):.0f} seconds execution time")

# # Save File
# df_dv.to_csv('../data/df_dv.csv', index=False)

In [7]:
# Retrieve File
df_dv = pd.read_csv('../data/df_dv.csv')

In [8]:
# Sequential velocity features.
#
# Looks for runs of sequential negative-rating activity. Feature names noted
# by the author:
#   neg_cnt_last_1_rating
#   neg_cnt_last_2_rating
#   neg_cnt_last_3_rating

# Time the feature build (the recorded run of this cell printed 70 seconds).
start_time = time.time()
df_sv = m.feature_iteration_sequential_velocity(alpha_df)
print(f"{(time.time() - start_time):.0f} seconds execution time")

# Save the result so later sessions can reload it instead of recomputing
df_sv.to_csv('../data/df_sv.csv', index=False)

70 seconds execution time


In [None]:
# Retrieve File
df_sv = pd.read_csv('../data/df_sv.csv')

## Graph Features

- collusion features
- retaliatory rating features - see nodes 95-188-7 (when you run 7604)

In [45]:
# Reverse view: look at the network of users who are rating this user.
# TODO: use the triadic census (or something else) as a feature?
ego_user = 7604  # user whose radius-1 neighbourhood is inspected

start_time = time.time()
# Graph built from positive ratings only
_, g = h.build_graph(alpha_df[alpha_df['rating']>0])#, maxdate=rate_date)

# both directions --> use undirected=True
# in direction --> use reverse_view()
# test_g = nx.ego_graph(nx.reverse_view(g), 7604, radius=1)
#test_g = nx.ego_graph(g, 7604, undirected=True, radius=1)
test_g = nx.ego_graph(nx.reverse_view(g), ego_user, center=False, undirected=True, radius=1)
node_census = nx.triadic_census(test_g)
print(f"{(time.time() - start_time):.0f} seconds execution time")
# nx.draw_shell(test_g )
# (A bare `node_census` expression used to sit here; mid-cell it displays
# nothing, so it was dropped. Inspect node_census in its own cell.)
nxpd.draw(test_g )

1 seconds execution time


'/var/folders/b3/m0fdz7_d6sz58yt6vfj9mfc40000gn/T/nx_l61a5wu_.png'

In [35]:
node_census

{'003': 282,
 '012': 166,
 '102': 64,
 '021D': 0,
 '021U': 69,
 '021C': 4,
 '111D': 38,
 '111U': 6,
 '030T': 21,
 '030C': 0,
 '201': 2,
 '120D': 11,
 '120U': 7,
 '120C': 1,
 '210': 8,
 '300': 1}

## Networkx Functions

In [None]:
alpha_GU = alpha_G.to_undirected()
nx.number_of_cliques(alpha_GU, nodes=7551)

In [None]:
## Run the triadic census
census = nx.triadic_census(alpha_G)
census

In [None]:
# triad generator
# all_triads lives in the networkx namespace; the bare name is never imported
# in this notebook, so calling it unqualified raised NameError.
triads = nx.all_triads(alpha_G)

In [None]:
node_census = nx.triads_by_type(alpha_G)

In [None]:
# Print node_census as a markdown-style table.
#
# node_census can have one of two shapes depending on which cell produced it:
#   * {node: {triad_type: count}}   -> one row per node (the original layout)
#   * {triad_type: [triads]} or {triad_type: count}
#                                   -> one row per triad type with its count
# Dict views are not subscriptable in Python 3, so the items are materialised
# into a list before indexing.
census_rows = list(node_census.items())

if census_rows and all(isinstance(value, dict) for _, value in census_rows):
    # All per-node dicts are assumed to share the same keys; use the first.
    keys = list(census_rows[0][1].keys())

    ## Generate a table header
    print('| Node |', ' | '.join(keys))
    for k, counts in census_rows:
        print('|', k, '|', ' | '.join([str(count) for count in counts.values()]))
else:
    print('| Triad type | Count |')
    for k, members in census_rows:
        count = members if isinstance(members, int) else len(members)
        print('|', k, '|', count, '|')


In [None]:
node_census

In [None]:
len(census)

In [None]:
# nx.connected_component_subgraphs() was removed from networkx, and connected
# components are only defined for undirected graphs. Compute the components on
# an undirected view and take the induced subgraphs of alpha_G instead.
component_nodes = sorted(
    nx.connected_components(alpha_G.to_undirected(as_view=True)),
    key=len,
    reverse=True,
)
# list of component subgraphs, sorted largest to smallest
components = [alpha_G.subgraph(nodes).copy() for nodes in component_nodes]
# pick the first and largest component
cc = components[0]

In [None]:
from networkx.algorithms import community

communities_generator = community.girvan_newman(alpha_G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)

sorted(map(sorted, top_level_communities))



The Girvan–Newman algorithm detects communities by progressively removing edges from the original graph. The algorithm removes the “most valuable” edge, traditionally the edge with the highest betweenness centrality, at each step. As the graph breaks down into pieces, the tightly knit community structure is exposed and the result can be depicted as a dendrogram.

In [None]:
import networkx.generators.small

g = networkx.generators.small.krackhardt_kite_graph()

# Graph.adjacency_list() was removed in networkx 2.0. adjacency() yields
# (node, neighbour-dict) pairs, so the same list-of-lists is rebuilt from it.
[list(neighbors) for _, neighbors in g.adjacency()]

In [None]:
otc_users[otc_users['BotActivity']==True].sort_values('TimeActive')

# Need to create a visualization to define bot activity
In the Alpha network the only bot activity is with zero time delay - all same-day raters.
In the OTC network only 18 users have zero time delay; however, XXX
users have a delay under 1(?) minute (what is the delay?)

Need 3 EDA plots of the same-day rater distribution - day by hour, hour by minute, minute by second

In [None]:
print(f"Alpha Bot users: {alpha_users['BotActivity'].sum()}")
print(f"OTC Bot users: {otc_users['BotActivity'].sum()}")

# Prediction Visualizations

In [None]:
df = pd.read_csv("../data/alpha_with_ratee_stats.csv")

In [None]:
df_ato = m.feature_iteration_ato(alpha_df)

In [None]:
df_ato.reset_index(drop=True, inplace=True)
df_ato.head()

In [None]:
df = pd.concat([df, df_ato], axis=1)

In [None]:
df.head()

In [None]:
df.info()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split

# Feature matrix: everything in df except identifier/label-like columns;
# the 'class' column is split off as the target.
non_feature_cols = ['rater', 'ratee', 'rating','date', 'color', 'penwidth', 'binomial_rating']
X = df.drop(non_feature_cols, axis=1)
y = X.pop('class')

# Stratified, shuffled split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, shuffle=True, random_state=123
)

# Fit the forest and score its hard predictions on the held-out split
RF = RandomForestClassifier(n_jobs=-1, random_state=123).fit(X_train, y_train)
y_preds = RF.predict(X_test)
recall = recall_score(y_test, y_preds)
precision = precision_score(y_test, y_preds)

# print(X_test[(y_preds==0) & (X_test['num_neg_received']>0)].head(10))
print(recall)
print(precision)
RF.feature_importances_
 

In [None]:
import seaborn as sns
fig = plt.figure(figsize=(15,15))

name = "Random Forest"
indices = np.argsort(RF.feature_importances_)[::-1][:40]
ax1 = sns.barplot(y=X_train.columns[indices][:40],x = RF.feature_importances_[indices][:40] , orient='h')
ax1.set_xlabel("Relative importance",fontsize=12)
ax1.set_ylabel("Features",fontsize=12)
ax1.tick_params(labelsize=9)
ax1.set_title(name + " feature importance")

### Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

thresh = 0.2
pred_proba = RF.predict_proba(X_test)[:,1]
cnf_matrix = confusion_matrix(y_test, pred_proba>=thresh)
print(cnf_matrix)
tn, fp, fn, tp = cnf_matrix.ravel()
tn, fp, fn, tp
neg = tn + fp
pos = fn + tp
tnpct = tn/neg
fppct = fp/neg
fnpct = fn/pos
tppct = tp/pos
cnt_matrix_pct = np.round(np.array([tnpct,fppct,fnpct, tppct]), 2)
cnt_matrix_pct.reshape((2,2))

In [None]:
# Rows of df whose test-set prediction disagrees with the true label.
# (The original passed y_test - y_preds to iloc, i.e. used the -1/0/1
# differences as row positions instead of selecting the mismatches.)
df.loc[y_test.index[(y_test != y_preds).values]].shape

In [None]:
y_test-y_preds

In [None]:
X_test.loc[22765]

In [None]:
df.loc[22765]

In [None]:
# Iterating a DataFrame yields its column names, so zipping X_test directly
# paired predictions with column names and stopped after n_columns items.
# Pair each prediction with the row's index label instead (this also avoids
# shadowing the builtin `input`).
for row_label, prediction, label in zip(X_test.index, y_preds, y_test):
    if prediction != label:
        print(row_label, 'has been classified as ', prediction, 'and should be ', label)

In [None]:
def plot_confusion_matrix(ax, cm, title, classes=['Legitimate','Fraud'],
                          cmap=plt.cm.Blues, currency=False):
    """
    Plots a single confusion matrix on ``ax``. If currency=True the cell
    values are formatted as dollar amounts and the plot is titled
    "Cost Matrix" instead of "Confusion Matrix".

    Parameters
    ----------
    ax: matplotlib Axes to draw on
    cm: 2-D array (confusion matrix)
    title: String - first line of the plot title
    classes: sequence of tick labels, used for both the x and y axes
    cmap: colormap handed to ax.imshow
    currency: bool - format cell values as currency

    Returns
    -------
    None
    """
    # Cells above half the maximum get white text, the rest black.
    thresh = cm.max() / 2.
    # np.ndindex walks every (row, col) pair; this avoids depending on
    # itertools, which the notebook only imports in a later cell.
    for i, j in np.ndindex(cm.shape[0], cm.shape[1]):
        cost = cm[i, j]
        if currency:
            cost = f'${cost:0,.2f}'
        ax.text(j, i, cost, horizontalalignment="center",
                color="white" if cm[i, j] > thresh else "black")
    ax.imshow(cm, interpolation='nearest', cmap=cmap)

    if currency:
        ax.set_title(f'{title}\nCost Matrix')
    else:
        ax.set_title(f'{title}\nConfusion Matrix')

    tick_marks = np.arange(len(classes))
    ax.set_xticks(tick_marks)
    ax.set_xticklabels(classes, rotation=0)
    ax.set_yticks(tick_marks)
    ax.set_yticklabels(classes, rotation=90)

    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')



In [None]:
import itertools
fig, ax = plt.subplots()
plot_confusion_matrix(ax, cnf_matrix, "test", classes=['Pos Rating','Neg Rating'])

In [None]:
from sklearn.metrics import precision_recall_curve

# Make a function for an individual plot so it can be called over and over
def Plot_PR_Curve(X, y, scores=None, thresh=0.5):
    '''
    Calculates and plots the precision-recall curve for (X, y) and puts a
    marker on the curve at the point whose threshold is closest to ``thresh``.

    Parameters
    ----------
    X: feature matrix; only used to compute scores when ``scores`` is None
    y: true binary labels
    scores: optional positive-class scores for the rows of X. When omitted
        they are computed with the notebook-level ``RF`` classifier as
        ``RF.predict_proba(X)[:, 1]`` (the original read a global
        ``pred_proba`` and ignored X).
    thresh: float - decision threshold to highlight (default 0.5)

    Returns
    -------
    None - draws on the current matplotlib figure.
    '''
    if scores is None:
        scores = RF.predict_proba(X)[:, 1]

    # get points to plot on the PR Curve
    precision, recall, thresholds = precision_recall_curve(y, scores)

    # find the index of the record with closest threshold to desired threshold value
    threshold_idx = np.argmin(np.abs(thresholds - thresh))

    plt.plot(precision, recall)
    # Single marker at the (precision, recall) pair for the chosen threshold.
    # The original passed the index itself as plot data and used a plain
    # string, so the legend showed the literal "{thresh:.2f}".
    plt.plot(precision[threshold_idx], recall[threshold_idx], 'o',
             markersize=10, fillstyle='full',
             label=f"{thresh:.2f} threshold", mew=2)

    plt.legend(loc='center', frameon=False)
    plt.title('Precision-Recall Curve Comparison')
    plt.xlabel('Precision')
    plt.ylabel('Recall')
    plt.xlim(-0.05, 1.05)
    plt.ylim(-0.05, 1.05)

In [None]:
Plot_PR_Curve(X_test, y_test)

## Node2vec

In [None]:
# Node2vec settings used below: embedding dimension 14, 25 walks per node,
# walk length 30. (The original note also mentioned 15 iterations, but no
# such parameter is passed.)

from node2vec import Node2Vec

# NOTE(review): the original cell referenced `G`, `EMBEDDING_FILENAME` and
# `EMBEDDING_MODEL_FILENAME` without defining them. The values below are
# assumptions -- confirm the intended graph and output paths.
G = alpha_G
EMBEDDING_DIM = 14
EMBEDDING_FILENAME = './trimmed_network.emb'
EMBEDDING_MODEL_FILENAME = './trimmed_network.model'

# Precompute probabilities and generate walks
# node2vec = Node2Vec(G, dimensions=64, walk_length=30, num_walks=200, workers=4)
node2vec = Node2Vec(G, dimensions=EMBEDDING_DIM, walk_length=30, num_walks=25, workers=4)

# Embed
# window is the max distance from the node that the vector is going to be based on
# maybe I can move this to 1 or 2??
# Any keywords acceptable by gensim.Word2Vec can be passed; `dimensions` and
# `workers` are automatically passed (from the Node2Vec constructor)
model = node2vec.fit(window=5, min_count=1)#, batch_words=4)

# Look for most similar nodes
model.wv.most_similar('2')  # Output node names are always strings

# the 10 most similar nodes to this user
user = '1006'
model.wv.most_similar(user)

# embedding vector for user (length EMBEDDING_DIM)
vector = model.wv[user]
print(vector)

# Save embeddings and model first, so the file read below always exists and
# matches the model that was just trained.
model.wv.save_word2vec_format(EMBEDDING_FILENAME)
model.save(EMBEDDING_MODEL_FILENAME)

n = [] # node ids
e = [] # embeddings, one list of strings per node

with open(EMBEDDING_FILENAME) as fin:
    next(fin, None)  # skip the word2vec header line ("<count> <dimensions>")
    for line in fin:
        node_emb = line.strip().split()  # whitespace-separated: node id, then the vector
        n.append(int(node_emb[0]))       # node names are written as strings
        e.append(node_emb[1:])

embs = np.array(e, dtype=float)
# model.get_embedding() does not exist on the fitted model; build the frame
# from the vectors read back from the file instead.
embeddingsframe = pd.DataFrame(embs, index=n)
embs.shape

## Velocity

In [None]:
def date_velocity(bitcoin_df, user, rate_date, vel_parm, user_type):
    """Count a user's ratings inside a trailing time window.

    Selects the rows of ``bitcoin_df`` where column ``user_type`` equals
    ``user`` and ``date`` lies in the half-open window
    ``(rate_date - vel_parm hours, rate_date]``, then aggregates their
    ``class`` column.

    Parameters
    ----------
    bitcoin_df : DataFrame with at least the ``user_type``, 'date' and
        'class' columns. It is only read, never modified.
    user : value compared against ``bitcoin_df[user_type]``
    rate_date : upper (inclusive) bound of the window
    vel_parm : int - window length in hours
    user_type : str - column to match ``user`` against
        (callers in this notebook pass "ratee" or "rater")

    Returns
    -------
    np.ndarray of three values ``[vel_neg, vel_pos, vel_all]`` where
    vel_neg is the sum of 'class', vel_all the row count, and
    vel_pos = vel_all - vel_neg. NaNs are replaced by 0.
    """
    # No defensive copy: the frame is only filtered, and this function is
    # called several times per row by the feature builder.
    from_date = str(pd.Timestamp(rate_date) - pd.offsets.Hour(vel_parm))
    in_window = (
        (bitcoin_df[user_type] == user)
        & (bitcoin_df['date'] <= rate_date)
        & (bitcoin_df['date'] > from_date)
    )
    vel_neg, vel_all = bitcoin_df.loc[in_window, 'class'].agg(['sum', 'count'])
    vel_pos = vel_all - vel_neg
    A = np.array([vel_neg, vel_pos, vel_all])
    A[np.isnan(A)] = 0
    return A

In [None]:
import time
def feature_iteration_date_velocity(bitcoin_df):
    """Add 24h and 48h rating-velocity features for every rating row.

    For each row, the row's ratee is looked up both as a ratee ("in") and as
    a rater ("out") via ``date_velocity`` over trailing 24- and 48-hour
    windows ending at the row's date.

    Parameters
    ----------
    bitcoin_df : DataFrame with 'ratee', 'rater', 'rating', 'date' and
        'class' columns. It is not modified.

    Returns
    -------
    DataFrame with those five columns plus, for each window W in (24, 48):
    vel_W_in_pos, vel_W_in_neg, vel_W_in_all, vel_W_out_pos, vel_W_out_neg,
    vel_W_out_all and vel_W_all (= in_all + out_all), stored as floats.

    Also prints the elapsed wall-clock time.
    """
    start_time = time.time()
    df = bitcoin_df[['ratee', 'rater', 'rating','date','class']].copy()

    directions = (('in', 'ratee'), ('out', 'rater'))
    records = []
    for user, rate_date in zip(df['ratee'], df['date']):
        record = {}
        for hours in (24, 48):
            totals = {}
            for tag, user_type in directions:
                # The original called an undefined `velocity`; the helper
                # defined in this notebook is `date_velocity`.
                neg, pos, total = date_velocity(df, user, rate_date,
                                                vel_parm=hours, user_type=user_type)
                record[f'vel_{hours}_{tag}_pos'] = pos
                record[f'vel_{hours}_{tag}_neg'] = neg
                record[f'vel_{hours}_{tag}_all'] = total
                totals[tag] = total
            record[f'vel_{hours}_all'] = totals['in'] + totals['out']
        records.append(record)

    # Attach all feature columns at once (positionally, so a non-unique index
    # is fine) rather than growing the frame one cell at a time with df.at.
    features = pd.DataFrame(records, dtype=float)
    for column in features.columns:
        df[column] = features[column].to_numpy()

    print(f"{(time.time() - start_time):.0f} seconds execution time")
    # NOTE: the original ended with `df.drop(['class'], axis=1)` but discarded
    # the result, so 'class' has always been part of the returned frame. That
    # no-op is removed and the returned columns are left unchanged.
    return df

In [None]:
# Time the velocity feature build on the alpha network.
# NOTE(review): this rebinds `df`, which an earlier cell loads from
# '../data/alpha_with_ratee_stats.csv' for modelling -- re-running cells out
# of order will pick up the wrong frame.
# NOTE(review): an earlier (commented-out) cell calls
# m.feature_iteration_date_velocity; confirm that model.py really exposes
# feature_iteration_velocity under this name.
start_time = time.time()
df = m.feature_iteration_velocity(alpha_df)
print(f"{(time.time() - start_time):.0f} seconds execution time")

In [None]:
from matplotlib.dates import DateFormatter, DayLocator

df2 = df[df['ratee']==7512].copy()
df2.set_index('date', inplace=True)
fig, ax = plt.subplots(figsize=(16,4))
ax.bar(df2.index.values,
       df2['vel_24_all'],
       color='purple')
# ax = df2['vel_24_all'].plot(color='r', kind='bar', label='vel');

In [None]:
# Rows where user 7512 is either the rater or the ratee.
# Both halves of the mask now come from df itself; the original took the
# rater half from alpha_df, which only works if the two frames happen to
# share an index. Assumes df carries a 'rater' column -- TODO confirm.
df[(df['rater']==7512) | (df['ratee']==7512)].head()#[['date','vel_24_all','vel_24_in_all','vel_24_out_all']]

In [None]:
alpha_df.groupby(['rater', 'date'])['rating'].count().sort_values(ascending=False)

In [None]:
vel_24_in_neg

In [None]:
alpha_df[(alpha_df['rater']==185) | (alpha_df['ratee']==185)]

# new collusion feature

In [None]:
def update_bitcoin_df_attibutes():
    """Placeholder for a helper that was never written.

    The original cell held only an unfinished ``def`` line, which is a
    SyntaxError and stops the notebook from running top to bottom.
    NOTE(review): the intended parameters and behaviour are unknown; fill in
    or delete this stub.
    """
    raise NotImplementedError("update_bitcoin_df_attibutes is not implemented yet")

In [None]:
def feature_creation_collusion2(bitcoin_df, user, rate_date):
    """Build the feature vector for one user as of ``rate_date``.

    Input:
        bitcoin_df:  Dataframe containing bitcoin ratings as edges
        user: int
        rate_date: date used for feature generation
    Output:
        numpy array of 8 values, in order: ratings received, sum of 'class'
        over those ratings (named num_neg_received), their difference
        (num_pos_received), the negative share, sum of 'rating', days since
        the earliest received rating, clustering coefficient, number of
        cliques. NaNs are replaced by 0. All zeros when the user has no
        qualifying received ratings.
    """
    df = bitcoin_df.copy()

    # Ratings received by `user`: anything strictly before rate_date, plus
    # positive ratings dated exactly rate_date.
    is_ratee = df['ratee'] == user
    earlier = df['date'] < rate_date
    same_date_positive = (df['date'] == rate_date) & (df['rating'] > 0)
    user_data_in = df[is_ratee & (earlier | same_date_positive)]
    if not len(user_data_in):
        return np.zeros(8)

    # Counts and sums over the received ratings
    num_ratings_received = len(user_data_in)
    num_neg_received = user_data_in['class'].sum()
    num_pos_received = num_ratings_received - num_neg_received
    neg_ratings_pct = num_neg_received / num_ratings_received
    rating_sum = user_data_in['rating'].sum()
    first_rating_date = user_data_in['date'].min()
    days_active = (rate_date - first_rating_date).days

    # Graph features: clustering on the graph as built, cliques on its
    # undirected version
    _, g = h.build_graph(df, maxdate=rate_date)
    cluster_coef = nx.clustering(g, user)
    num_cliques = nx.number_of_cliques(g.to_undirected(), user)

    features = np.array([
        num_ratings_received,
        num_neg_received,
        num_pos_received,
        neg_ratings_pct,
        rating_sum,
        days_active,
        cluster_coef,
        num_cliques,
    ])
    features[np.isnan(features)] = 0
    return features