In [1]:
import networkx as nx
import math
import sys
import pandas as pd
# MAKE SURE TO RUN "data_preprocessing.sh" BEFORE RUNNING THE JUPYTER NOTEBOOK

# Preprocessing

In [2]:
df = pd.read_csv('./otc.csv')
df = df.drop(['timestamp'], axis = 1)
df['rating'] = df['rating']/10
df.to_csv('OTCNet.csv', index = False)

# Fairness and Goodness

In [3]:
# Sets initial fairness goodness scores to each node in the graph
# method returns list of fairness and goodness initialized values
def initialize_scores(G):
    fairness = {}
    goodness = {}
    
    nodes = G.nodes()
    for node in nodes:
        fairness[node] = 1
        try:
            goodness[node] = G.in_degree(node, weight='weight')*1.0/G.in_degree(node)
        except:
            goodness[node] = 0
    return fairness, goodness

In [4]:
# Algorithm to calculate fairness and goodness for a given WSN - weigthed signed graph
# method returns list of fairness and goodness calculated values
def compute_fairness_goodness(G):
    fairness, goodness = initialize_scores(G)
    
    nodes = G.nodes()
    iter = 0
    while iter < 100:
        df = 0
        dg = 0
        
        for node in nodes:
            inedges = G.in_edges(node, data='weight')
            #print(inedges)
            g = 0
            for edge in inedges:
                g += fairness[edge[0]]*edge[2]

            try:
                dg += abs(g/len(inedges) - goodness[node])
                goodness[node] = g/len(inedges)
            except:
                pass

        for node in nodes:
            outedges = G.out_edges(node, data='weight')
            f = 0
            for edge in outedges:
                f += 1.0 - abs(edge[2] - goodness[edge[1]])/2.0
            try:
                df += abs(f/len(outedges) - fairness[node])
                fairness[node] = f/len(outedges)
            except:
                pass
        
        if df < math.pow(10, -6) and dg < math.pow(10, -6):
            break
        iter+=1
    
    return fairness, goodness

In [5]:
# creating a holder for directed graph
G = nx.DiGraph()

In [6]:
# opens graph dataset
f = open("./OTCNet.csv","r")

In [7]:
for l in f:
    try:
        ls = l.strip().split(",")
        G.add_edge(ls[0], ls[1], weight = float(ls[2])) ## the weight should already be in the range of -1 to 1
    except:
        continue
f.close()

In [8]:
# Computing fairness and goodness
fairness, goodness = compute_fairness_goodness(G)

In [9]:
df_f = pd.read_csv('./OTCNet.csv')
df = pd.DataFrame()

df['Id'] = df_f[['node1', 'node2']].apply(lambda x: '-'.join(x.fillna('').map(str)), axis=1)
df['Id'] = df['Id'].str.strip('-')

result_f = {}
for index, row in df_f.iterrows():
    node = str(int(row['node1']))
    result_f[index] = fairness[node]
    
result_g = {}
for index, row in df_f.iterrows():
    node = str(int(row['node2']))
    result_g[index] = goodness[node]

In [10]:
df['fairness'] = pd.Series(result_f, index=df_f.index, dtype=float)
df['goodness'] = pd.Series(result_g, index=df_f.index, dtype=float)

In [11]:
df

Unnamed: 0,Id,fairness,goodness
0,6-2,0.895726,0.269531
1,6-5,0.895726,0.214168
2,1-15,0.922436,0.144465
3,4-3,0.891287,-0.035796
4,13-16,0.945563,0.756450
5,13-10,0.945563,0.552062
6,7-5,0.943844,0.214168
7,2-21,0.893744,0.224765
8,2-20,0.893744,0.182410
9,21-2,0.880717,0.269531


# Bias and Deserve

In [12]:
# Sets initial bias deserve scores to each node in the graph
# method returns list of bias and deserve initialized values
def initialize_scores_bd(G):
    bias = {}
    deserve = {}
    
    nodes = G.nodes()
    #print(nodes)
    for node in nodes:
        bias[node] = -1
        try:
            deserve[node] = G.in_degree(node, weight='weight')*1.0/G.in_degree(node)
        except:
            deserve[node] = 0
    return bias, deserve

In [14]:
# Algorithm to calculate bias and deserve for a given WSN - weigthed signed graph
# method returns list of bias and deserve calculated values
def compute_bias_deserve(G):
    bias, deserve = initialize_scores_bd(G)
    
    nodes = G.nodes()
    iter = 0
    while iter < 100:
        d = 0
        db = 0
        
        for node in nodes:
            inedges = G.in_edges(node, data='weight')
            #print(inedges)
            d = 0
            for edge in inedges:
                Xkj = max(0, bias[node]*edge[2])
                d += edge[2]*(1 - Xkj)
            try:
                dd += abs(d/len(inedges) - deserve[node])
                deserve[node] = d/len(inedges)
            except:
                pass

        for node in nodes:
            outedges = G.out_edges(node, data='weight')
            b = 0
            for edge in outedges: 
                b += (edge[2] - deserve[edge[1]])
            try:
                db += abs(b/(2*len(outedges)) - bias[node])
                bias[node] = b/(2*len(outedges))
            except:
                pass
        
        
        if db < math.pow(10, -6):
            break
        #print('Differences in bias score score = %.2f' % db)
        iter+=1
    
    return bias, deserve

In [15]:
G = nx.DiGraph()

In [16]:
f = open("OTCNet.csv","r")

In [17]:
for l in f:
    try:
        ls = l.strip().split(",")
        G.add_edge(ls[0], ls[1], weight = float(ls[2])) ## the weight should already be in the range of -1 to 1
    except:
        continue
f.close()

In [18]:
bias, deserve = compute_bias_deserve(G)

In [19]:
result_b = {}
for index, row in df_f.iterrows():
    node = str(int(row['node1']))
    result_b[index] = bias[node]

In [20]:
result_d = {}
for index, row in df_f.iterrows():
    node = str(int(row['node2']))
    #print(bias[node])
    result_d[index] = deserve[node]

In [21]:
df['bias'] = pd.Series(result_b, index=df.index, dtype=float)
df['deserve'] = pd.Series(result_d, index=df.index, dtype=float)

In [22]:
df

Unnamed: 0,Id,fairness,goodness,bias,deserve
0,6-2,0.895726,0.269531,0.031592,0.300000
1,6-5,0.895726,0.214168,0.031592,0.233333
2,1-15,0.922436,0.144465,0.024508,0.153846
3,4-3,0.891287,-0.035796,0.057172,-0.028571
4,13-16,0.945563,0.756450,0.002174,0.800000
5,13-10,0.945563,0.552062,0.002174,0.600000
6,7-5,0.943844,0.214168,0.030493,0.233333
7,2-21,0.893744,0.224765,0.066708,0.250000
8,2-20,0.893744,0.182410,0.066708,0.200000
9,21-2,0.880717,0.269531,0.092503,0.300000


In [23]:
# Combined result of fairness-goodness and bias-deserve along with the actual ratings
df['rating'] = pd.Series(df_f['rating'], index=df.index, dtype=float)
df.to_csv('./LRA.csv')
df

Unnamed: 0,Id,fairness,goodness,bias,deserve,rating
0,6-2,0.895726,0.269531,0.031592,0.300000,0.4
1,6-5,0.895726,0.214168,0.031592,0.233333,0.2
2,1-15,0.922436,0.144465,0.024508,0.153846,0.1
3,4-3,0.891287,-0.035796,0.057172,-0.028571,0.7
4,13-16,0.945563,0.756450,0.002174,0.800000,0.8
5,13-10,0.945563,0.552062,0.002174,0.600000,0.8
6,7-5,0.943844,0.214168,0.030493,0.233333,0.1
7,2-21,0.893744,0.224765,0.066708,0.250000,0.5
8,2-20,0.893744,0.182410,0.066708,0.200000,0.5
9,21-2,0.880717,0.269531,0.092503,0.300000,0.5


# Linear Regression Model

In [24]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import math
import sys
import pandas as pd
import numpy as np

In [25]:
# Opeing the combined results .csv
df_mix = pd.read_csv('./LRA.csv')

# Linear regression on the data
rating = df_mix['rating']
X =  df_mix.drop(['bias', 'Id', 'rating'], axis = 1)

# Split X and y into X_
X_train, X_test, y_train, y_test = train_test_split(X, rating, test_size=0.25, random_state=1)

lr = LinearRegression()

lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [26]:
# calculating prediction on test dataset
pred = lr.predict(X_test)

# RMSE on linear regression model
math.sqrt(mean_squared_error(y_test, pred))

0.2626121187197187

In [27]:
# Pearson correlation coefficient
np.corrcoef(y_test, pred)

array([[1.        , 0.66034658],
       [0.66034658, 1.        ]])

# Leave One Out

Make sure to atleast run the itr for 10
NOTE: Results in the report might not match with scripts since the results were obtained for different iterations due to computational constraints 

In [36]:
from sklearn.model_selection import LeaveOneOut
import networkx as nx
import math
import sys
import pandas as pd

fairness and goodness

In [37]:
df = pd.read_csv('./OTCNet.csv')
X =  df

X = X.values

# count for number of iterations
loo = LeaveOneOut()
loo.get_n_splits(X)

35592

In [46]:
fairness = {}
goodness = {}
fairness_test = {}
goodness_test = {}

# set no. of iterations. By default it will run for 35592 (loo.get_n_splits(X) above)
# Results calculated for itr = 5000
itr = 10

for train_index, test_index in loo.split(X):
    if test_index[0] < (itr):
        X_train, X_test = X[train_index], X[test_index]
        G = nx.DiGraph()
        for index in train_index:
            G.add_edge(X_train[index-1][0], X_train[index-1][1], weight = float(X_train[index-1][2])) ## the weight should already be in the range of -1 to 1
        fairness, goodness = compute_fairness_goodness(G)
        try:
            fairness_test[test_index[0]] = fairness[X_test[0][0]]
        except:
            fairness_test[test_index[0]] = 0
            pass
        try:
            goodness_test[test_index[0]] = goodness[X_test[0][1]]
        except:
            goodness_test[test_index[0]] = 0
            pass
    else:
        break
    print('iteration', test_index[0])

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9


In [47]:
df['fairness'] = pd.Series(fairness_test, index=df.index, dtype=float)
df['goodness'] = pd.Series(goodness_test, index=df.index, dtype=float)

In [48]:
df = df.dropna()
df.to_csv('./leaveoneout_fairness_goodness.csv')

In [49]:
# avg RMSE for leave one out
RMSE_fg_leave = 0
RMSE_fg_leave += math.sqrt(mean_squared_error(df['rating'], df['fairness']*df['goodness']))

In [50]:
RMSE_fg_leave

0.4095057156102173

In [51]:
# Pearson correlation coefficient
np.corrcoef(df['rating'], df['fairness']*df['goodness'])

array([[ 1.        , -0.12052684],
       [-0.12052684,  1.        ]])

Bias and Deserve

In [52]:
df = pd.read_csv('./OTCNet.csv')
X =  df

X = X.values

loo = LeaveOneOut()
loo.get_n_splits(X)

35592

In [53]:
bias = {}
deserve = {}
bias_test = {}
deserve_test = {}

# set no. of iterations. By default it will run for 35592 (loo.get_n_splits(X) above)
itr = 10

for train_index, test_index in loo.split(X):
    if test_index[0] < (itr):
        X_train, X_test = X[train_index], X[test_index]
        G = nx.DiGraph()
        for index in train_index:
            G.add_edge(X_train[index-1][0], X_train[index-1][1], weight = float(X_train[index-1][2])) ## the weight should already be in the range of -1 to 1
        bias, deserve = compute_bias_deserve(G)
        try:
            bias_test[test_index[0]] = bias[X_test[0][0]]
        except:
            bias_test[test_index[0]] = 0
            pass
        try:
            deserve_test[test_index[0]] = deserve[X_test[0][1]]
        except:
            deserve_test[test_index[0]] = 0
            pass
    else:
        break
    print('iteration', test_index[0])

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9


In [54]:
df['deserve'] = pd.Series(deserve_test, index=df.index, dtype=float)

In [55]:
df = df.dropna()
df.to_csv('./leaveoneout_bias_deserve.csv')

In [56]:
# avg RMSE for leave one out
RMSE_bd_leave = 0
RMSE_bd_leave += math.sqrt(mean_squared_error(df['rating'], df['deserve']))

In [57]:
RMSE_bd_leave

0.39558439540620016

In [58]:
# Pearson correlation coefficient
np.corrcoef(df['rating'], df['deserve'])

array([[ 1.       , -0.1190947],
       [-0.1190947,  1.       ]])

# Leave N Out

Make sure to atleast run the itr for 10
NOTE: Results in the report might not match with scripts since the results were obtained for different iterations due to computational constraints 

In [59]:
from sklearn.model_selection import LeavePOut
from sklearn.metrics import mean_squared_error
import networkx as nx
import math
import sys
import pandas as pd
import numpy as np

Fairness and Goodness

In [60]:
df = pd.read_csv('./OTCNet.csv')

N = 5

X =  df
#y =  df['rating']
X = X.values
#y = y.values

lpo = LeavePOut(N)
lpo.get_n_splits(X)

475837529743534588728

In [61]:
# set no. of iterations. By default it will run for 475837529743534588728 (lpo.get_n_splits(X) above)
# results were taken for 100 iterations
itr = 10
fairness = {}
goodness = {}
predicted = pd.DataFrame(np.random.randn(5, 1), columns=['rat'])
test = pd.DataFrame(np.random.randn(5, 1), columns=['rat'])
goodness_test = {}
RMSE = 0
PCC = 0
for train_index, test_index in lpo.split(X):
    if test_index[4] < (itr+N):
        X_train, X_test = X[train_index], X[test_index]
        G = nx.DiGraph()
        test['rat'][0] = X_test[0][2]
        test['rat'][1] = X_test[1][2]
        test['rat'][2] = X_test[2][2]
        test['rat'][3] = X_test[3][2]
        test['rat'][4] = X_test[4][2]
        for index in range(0, 35587):
            G.add_edge(X_train[index][0], X_train[index][1], weight = float(X_train[index][2])) ## the weight should already be in the range of -1 to 1
        try:
            fairness, goodness = compute_fairness_goodness(G)
        except:
            pass
        try:
            predicted['rat'][0] = fairness[X_test[0][0]]*goodness[X_test[0][1]]
        except:
            predicted['rat'][0] = 0
            continue
        try:
            predicted['rat'][1] = fairness[X_test[1][0]]*goodness[X_test[1][1]]
        except:
            predicted['rat'][1] = 0
            continue
        try:
            predicted['rat'][2] = fairness[X_test[2][0]]*goodness[X_test[2][1]]
        except:
            predicted['rat'][2] = 0
            continue
        try:
            predicted['rat'][3] = fairness[X_test[3][0]]*goodness[X_test[3][1]]
        except:
            predicted['rat'][3] = 0
            continue
        try:
            predicted['rat'][4] = fairness[X_test[4][0]]*goodness[X_test[4][1]]
        except:
            predicted['rat'][4] = 0
            continue
    else:
        break

    RMSE += math.sqrt(mean_squared_error(test['rat'], predicted['rat']))
    PCC += np.corrcoef(test['rat'], predicted['rat'])
    print('iteration', test_index[4] - N)

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9


In [62]:
# avg RMSE for leave N out
RMES_fg_leaveNout = RMSE/itr
RMES_fg_leaveNout

0.406733391531918

In [63]:
# Pearson correlation coefficient
PCC_fg_leaveNout = PCC/itr
PCC_fg_leaveNout

array([[ 1.        , -0.37802359],
       [-0.37802359,  1.        ]])

Bias and Deserve

In [64]:
# set no. of iterations. By default it will run for 475837529743534588728 (lpo.get_n_splits(X) above)
itr = 10

bias = {}
deserve = {}
predicted = pd.DataFrame(np.random.randn(5, 1), columns=['rat'])
test = pd.DataFrame(np.random.randn(5, 1), columns=['rat'])

goodness_test = {}
RMSE = 0
PCC = 0
for train_index, test_index in lpo.split(X):
    if test_index[4] < (itr+N):
        X_train, X_test = X[train_index], X[test_index]
        G = nx.DiGraph()
        test['rat'][0] = X_test[0][2]
        test['rat'][1] = X_test[1][2]
        test['rat'][2] = X_test[2][2]
        test['rat'][3] = X_test[3][2]
        test['rat'][4] = X_test[4][2]
        for index in range(0, 35587):
            G.add_edge(X_train[index][0], X_train[index][1], weight = float(X_train[index][2])) ## the weight should already be in the range of -1 to 1
        try:
            bias, deserve = compute_bias_deserve(G)
        except:
            pass
        try:
            predicted['rat'][0] = deserve[X_test[0][1]]
        except:
            predicted['rat'][0] = 0
            continue
        try:
            predicted['rat'][1] = deserve[X_test[1][1]]
        except:
            predicted['rat'][1] = 0
            continue
        try:
            predicted['rat'][2] = deserve[X_test[2][1]]
        except:
            predicted['rat'][2] = 0
            continue
        try:
            predicted['rat'][3] = deserve[X_test[3][1]]
        except:
            predicted['rat'][3] = 0
            continue
        try:
            predicted['rat'][4] = deserve[X_test[4][1]]
        except:
            predicted['rat'][4] = 0
            continue
    else:
        break
    RMSE += math.sqrt(mean_squared_error(test['rat'], predicted['rat']))
    PCC += np.corrcoef(test['rat'], predicted['rat'])
    print('iteration', test_index[4] - N)

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9


In [65]:
# Computing avg RMSE for leave N out
RMES_bd_leaveNout = RMSE/itr
RMES_bd_leaveNout

0.3986889987620701

In [66]:
# Pearson correlation coefficient for leave N out
PCC_bd_leaveNout = PCC/itr
PCC_bd_leaveNout

array([[ 1.        , -0.34813613],
       [-0.34813613,  1.        ]])

# RMSE for each

In [67]:
df_mix = pd.read_csv('./LRA.csv')
rating = df_mix['rating']

In [68]:
# RMSE on fairness and goodness multiply
math.sqrt(mean_squared_error(rating, df_mix['fairness']*df_mix['goodness']))

0.2749148059676137

In [69]:
# RMSE on deserve 
math.sqrt(mean_squared_error(rating, df_mix['deserve']))

0.2712422053218507

In [70]:
# RMSE on fairness and goodness avg
math.sqrt(mean_squared_error(rating, (df_mix['fairness']+df_mix['goodness']))/2)

0.6812191995220688

In [71]:
# calculating PCC on product(fairness(node1), goodness(node2))
np.corrcoef(rating, df_mix['fairness']*df_mix['goodness'])

array([[1.       , 0.6525159],
       [0.6525159, 1.       ]])

In [72]:
# calculating PCC on avg(fairness(node1), goodness(node2))
np.corrcoef(rating, (df_mix['fairness']+df_mix['goodness'])/2)

array([[1.        , 0.60702733],
       [0.60702733, 1.        ]])

In [73]:
# calculating PCCon on deserve
np.corrcoef(rating, df_mix['deserve'])

array([[1.        , 0.64818942],
       [0.64818942, 1.        ]])

In [74]:
# calclating fNg avg on ratings greater than 0.3
avg = {}
f = {}
g = {}
count = -1

In [75]:
for index, row in df_mix.iterrows():
    try:
        if row['rating'] > 0.3:
            count = count + 1
            avg[count] = row['rating']
            f[count] = row['fairness']*row['goodness']
            g[count] = (row['fairness']+row['goodness'])/2
    except:
        continue

In [76]:
df_gt = pd.DataFrame()

In [77]:
df_gt['rating'] = pd.Series(avg, index=df_mix.index, dtype=float)
df_gt['fxg'] = pd.Series(f, index=df_mix.index, dtype=float)
df_gt['avg'] = pd.Series(g, index=df_mix.index, dtype=float)

In [78]:
df_gt = df_gt.dropna()

In [79]:
# RMSE on fairness and goodness multiply
math.sqrt(mean_squared_error(df_gt['rating'], df_gt['fxg']))

0.519159565075064

In [80]:
# RMSE on fairness and goodness avg
math.sqrt(mean_squared_error(df_gt['rating'], df_gt['avg']))

0.2915986280345191