# Basic Model

In [3]:
import pandas as pd
import numpy as np
import datetime
import networkx as nx

import matplotlib.pyplot as plt
plt.style.use('ggplot')

import sys
sys.path.insert(0, '../src')
import helpers as h
import visualizations as v

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 300)

In [4]:
# Load and preprocess data
otc_df = h.load_bitcoin_edge_data('../data/soc-sign-bitcoinotc.csv.gz')
alpha_df = h.load_bitcoin_edge_data('../data/soc-sign-bitcoinalpha.csv.gz')
alpha_users = h.user_activity_dataframe(alpha_df)
otc_users = h.user_activity_dataframe(otc_df)
alpha_G = h.load_bitcoin_graph(alpha_df)
oct_G = h.load_bitcoin_graph(otc_df)

What is the range of user numbers in the networks?

In [3]:
otc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35592 entries, 0 to 35591
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   rater   35592 non-null  int64         
 1   ratee   35592 non-null  int64         
 2   rating  35592 non-null  int64         
 3   date    35592 non-null  datetime64[ns]
 4   class   35592 non-null  int64         
dtypes: datetime64[ns](1), int64(4)
memory usage: 1.4 MB


In [44]:
# number of unique days in dataset 
# This is the number of iterations of node2vec that will need to be run
# may take 2 days - need to test time on maximum size graph (last day)

alpha["date"].dt.normalize().nunique()

1647

In [4]:
alpha['class'].mean()

0.06350781443810469

In [20]:
# first fraud
alpha_df[alpha_df['rating']<0].sort_values('date')[270:290]

Unnamed: 0,rater,ratee,rating,date,class,binomial_rating,color,penwidth
4551,11,183,-10,2012-07-20 21:00:00,1,-1,red,4
6385,20,7565,-2,2012-07-21 21:00:00,1,-1,red,2
21757,593,7441,-9,2012-07-21 21:00:00,1,-1,red,4
21758,593,7397,-10,2012-07-21 21:00:00,1,-1,red,4
17447,1762,188,-1,2012-07-22 21:00:00,1,-1,red,1
18036,690,216,-1,2012-07-26 21:00:00,1,-1,red,1
2189,4,211,-5,2012-07-29 21:00:00,1,-1,red,3
19171,410,282,-5,2012-08-06 21:00:00,1,-1,red,3
5041,13,7565,-10,2012-08-13 21:00:00,1,-1,red,4
24025,7565,5342,-10,2012-08-13 21:00:00,1,-1,red,4


In [14]:
def get_ratee_features(bitcoin_df, user, rate_date):
    df = bitcoin_df.copy()
    user_data_in = df[(df['ratee']==user) & (df['date'] < rate_date)]
    user_data_out = df[(df['rater']==user) & (df['date'] < rate_date)]

    num_ratings_received = len(user_data_in)
    num_neg_received = user_data_in['class'].sum()
    num_pos_received = num_ratings_received - num_neg_received

    num_ratings_given = len(user_data_out)
    num_neg_given = user_data_out['class'].sum()
    num_pos_given = num_ratings_given - num_neg_given

    A = np.array([num_ratings_received,num_neg_received,num_pos_received, 
                  num_ratings_given,num_neg_given,num_pos_given])
    A[np.isnan(A)] = 0
    return A

user = 430
rate_date = '2014-03-08 21:00:00'
get_ratee_features(alpha, user, rate_date)

NameError: name 'alpha' is not defined

In [8]:

def add_ratee_stats(bitcoin_df):
    df = bitcoin_df.copy()
    for i, row in df.iterrows():
        user = row['ratee']
        rate_date = row['date']
        num_ratings_received, num_neg_received, num_pos_received, \
        num_ratings_given, num_neg_given, num_pos_given = get_ratee_features(df, user, rate_date)
        df.at[(i,'num_ratings_received')] = num_ratings_received
        df.at[(i,'num_neg_received')] = num_neg_received
        df.at[(i,'num_pos_received')] = num_pos_received
        df.at[(i,'num_ratings_given')] = num_ratings_given
        df.at[(i,'num_neg_given')] = num_neg_given
        df.at[(i,'num_pos_given')] = num_pos_given
    return df
alpha_with_ratee_stats = add_ratee_stats(alpha)

In [9]:
alpha_with_ratee_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24186 entries, 0 to 24185
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   rater                 24186 non-null  int64         
 1   ratee                 24186 non-null  int64         
 2   rating                24186 non-null  int64         
 3   date                  24186 non-null  datetime64[ns]
 4   class                 24186 non-null  int64         
 5   num_ratings_received  24186 non-null  float64       
 6   num_neg_received      24186 non-null  float64       
 7   num_pos_received      24186 non-null  float64       
 8   num_ratings_given     24186 non-null  float64       
 9   num_neg_given         24186 non-null  float64       
 10  num_pos_given         24186 non-null  float64       
dtypes: datetime64[ns](1), float64(6), int64(4)
memory usage: 2.0 MB


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
X = alpha_with_ratee_stats.copy()
X = X.drop(['rater', 'ratee', 'rating','date'], axis=1)
y = X.pop('class')


X_train, X_test, y_train, y_test = train_test_split(X, y)

RF = RandomForestClassifier(n_jobs=-1, random_state=1)
RF.fit(X_train, y_train)
y_preds = RF.predict(X_test)
recall = recall_score(y_test, y_preds)
precision = precision_score(y_test, y_preds)

# print(X_test[(y_preds==0) & (X_test['num_neg_received']>0)].head(10))
print(recall)
print(precision)
RF.feature_importances_

0.375
0.746031746031746


array([0.13256784, 0.35505707, 0.13585419, 0.1439359 , 0.0997925 ,
       0.13279251])

In [11]:
# decision based on the existence of a negative rating
recall = recall_score(y_test, X_test['num_neg_received']>0)
precision = precision_score(y_test, X_test['num_neg_received']>0)
print(recall)
print(precision)

0.4973404255319149
0.26600284495021337


In [16]:
y_test[18671]

1

In [12]:
X_test[y_preds==1]

Unnamed: 0,num_ratings_received,num_neg_received,num_pos_received,num_ratings_given,num_neg_given,num_pos_given
16908,160.0,5.0,155.0,202.0,18.0,184.0
11593,38.0,0.0,38.0,43.0,1.0,42.0
18671,27.0,23.0,4.0,17.0,1.0,16.0
23692,12.0,1.0,11.0,23.0,11.0,12.0
4547,43.0,13.0,30.0,52.0,9.0,43.0
10201,26.0,0.0,26.0,31.0,2.0,29.0
16525,38.0,0.0,38.0,43.0,1.0,42.0
7018,7.0,3.0,4.0,7.0,2.0,5.0
24145,32.0,2.0,30.0,13.0,0.0,13.0
10297,12.0,8.0,4.0,2.0,0.0,2.0


In [18]:
# Create Graph Object:
import networkx as nx
import nxpd

node_lst = alpha[['ratee','rater']].values.tolist()

G = nx.DiGraph()
G.add_edges_from(node_lst)

In [40]:
# embedding dimension to 14, the number of walks to 25, and the number of iterations to 15.

from node2vec import Node2Vec

# Precompute probabilities and generate walks
# node2vec = Node2Vec(G, dimensions=64, walk_length=30, num_walks=200, workers=4)
node2vec = Node2Vec(G, dimensions=14, walk_length=30, num_walks=25, workers=4)

HBox(children=(HTML(value='Computing transition probabilities'), FloatProgress(value=0.0, max=3783.0), HTML(va…




In [41]:
# Embed
# windows is the number of max distance from the node that the vector is going to be based on
# maybe i can move this to 1 or 2??
model = node2vec.fit(window=5, min_count=1)#, batch_words=4)  # Any keywords acceptable by gensim.Word2Vec can be passed, `diemnsions` and `workers` are automatically passed (from the Node2Vec constructor)

# Look for most similar nodes
model.wv.most_similar('2')  # Output node names are always strings


[('1998', 0.8824143409729004),
 ('2404', 0.879304051399231),
 ('2459', 0.8626430034637451),
 ('2006', 0.8611680269241333),
 ('2497', 0.8588654398918152),
 ('2446', 0.8540571331977844),
 ('2698', 0.8517052531242371),
 ('2488', 0.8372464179992676),
 ('2499', 0.8294756412506104),
 ('381', 0.7843669652938843)]

In [42]:
# giving me 10 most similar to user
user = '1006'
model.wv.most_similar(user) 

[('1671', 0.9400119781494141),
 ('1011', 0.8523332476615906),
 ('2607', 0.8320485353469849),
 ('1650', 0.8249363899230957),
 ('1681', 0.8230453729629517),
 ('5029', 0.8208856582641602),
 ('1672', 0.7976632118225098),
 ('1246', 0.7975524663925171),
 ('1451', 0.7842124700546265),
 ('2662', 0.7791309356689453)]

In [39]:
# gives 64 length vector for user
vector = model.wv[user]
print(vector)

[ 4.677279   -0.14440085 -2.0062332  -1.3968614  -2.1195004   0.6520846
  0.25759023  1.7289563  -6.4360104  -4.766796   -3.6646328   1.9258972
  0.9211684  -0.02154114 -1.5147105   1.4741819  -2.0353732  -2.5083094
  1.0531923   1.2944742  -1.1803691  -1.2185649   2.4793582  -0.7870794
 -1.8255451  -4.7876205   3.1776264   1.2976794  -1.3398994  -0.18072475
 -1.1531967   2.2801034   2.0099773   0.1105336  -4.9227095   1.6961759
 -0.2999797   2.7208598   0.77425224 -1.4751661  -1.2134589  -5.677068
 -3.8328917  -0.51765317  4.0102754   2.1206903  -3.466651   -0.6337631
  1.2065107  -4.0630584  -0.6360448   1.0586778   5.7647915  -2.9969475
  4.499282    0.23898387 -4.1326427  -3.2803533   3.3385267   0.8167142
  3.614496   -2.587229    4.2410693  -2.459809  ]


In [30]:
embeddingsframe = pd.DataFrame(model.get_embedding())

AttributeError: 'Word2Vec' object has no attribute 'get_embedding'

In [29]:
n = [] # node list?
e = [] # embeddings list

with open('./trimmed_network.emb') as fin:
    for line in model:
        node_emb = line.strip().split()  # turns into a list and removes white spaces at beginning and ending of string
        n.append(node_emb[0])
        e.append(node_emb[1:])

n = n[1:]
n = [int(i) for i in n] #  converts node to an int datatype
embs = np.zeros([len(e)-1,14])
for i in range(1,len(e)):
    embs[i-1] = e[i]
embs.shape

TypeError: 'int' object is not iterable

In [None]:
# Save embeddings for later use
model.wv.save_word2vec_format(EMBEDDING_FILENAME)

# Save model for later use
model.save(EMBEDDING_MODEL_FILENAME)