In [1]:
import networkx as nx
import pandas as pd
import numpy as np
import pickle
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.preprocessing import (StandardScaler, MinMaxScaler)
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [5]:
G = pickle.load(open('assets/email_prediction_NEW.txt', 'rb'))
print(f"Graph with {nx.number_of_nodes(G)} nodes and {nx.number_of_edges(G)} edges")

Graph with 1005 nodes and 16706 edges


# ***2A***

In [15]:
# features to consider

# clustering, degree, degree_centrality, closeness_centrality, betweenness_centrality, page_rank ,is_management

# So, whenever a nx function returns a dict, the keys are in the same order as G.nodes()'s return.
list(nx.clustering(G).keys()) == list(G.nodes())

True

In [25]:
clustering = nx.clustering(G)
degree = dict(nx.degree(G))
degree_centrality = nx.degree_centrality(G)
closeness_centrality = nx.closeness_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G)
page_rank = nx.pagerank(G)
managerial = nx.get_node_attributes(G, "ManagementSalary")

In [28]:
df = pd.DataFrame({
    "clustering": clustering, "degree": degree, "degree_centrality": degree_centrality,
    "closeness_centrality": closeness_centrality, "betweenness_centrality": betweenness_centrality,
    "page_rank": page_rank, "managerial": managerial
}, index = G.nodes())

df.head()

Unnamed: 0,clustering,degree,degree_centrality,closeness_centrality,betweenness_centrality,page_rank,managerial
0,0.276423,44,0.043825,0.421991,0.001124,0.001224,0.0
1,0.265306,52,0.051793,0.42236,0.001195,0.001426,
581,0.248276,32,0.031873,0.412974,0.000378,0.000894,0.0
6,0.155183,115,0.114542,0.475805,0.012387,0.003146,1.0
65,0.17288,91,0.090637,0.461049,0.012473,0.002857,


In [35]:
# training data

scaler = StandardScaler()

train = df.loc[~df.managerial.isna(), :]
train_x = train.drop("managerial", axis = 1)
train_y = train.managerial

train_x_scaled = scaler.fit_transform(train_x)

In [46]:
# testing data

test = df.loc[df.managerial.isna(), :]
test_x = test.drop("managerial", axis = 1)

test_x_scaled = scaler.transform(test_x)

In [47]:
rforest = RandomForestClassifier(n_estimators = 1000)

In [48]:
rforest.fit(train_x_scaled, train_y)

In [49]:
rforest.predict_proba(test_x_scaled)[:, 1];

In [50]:
pd.Series(rforest.predict_proba(test_x_scaled)[:, 1], index = test_x.index)

1      0.017
65     0.971
18     0.102
215    0.955
283    0.999
       ...  
691    0.000
788    0.001
944    0.000
798    0.000
808    0.000
Length: 252, dtype: float64

In [51]:
# GOOD :)

def salary_predictions():
    
    from sklearn.preprocessing import StandardScaler
    from sklearn.ensemble import RandomForestClassifier
    

    clustering = nx.clustering(G)
    degree = dict(nx.degree(G))
    degree_centrality = nx.degree_centrality(G)
    closeness_centrality = nx.closeness_centrality(G)
    betweenness_centrality = nx.betweenness_centrality(G)
    page_rank = nx.pagerank(G)
    managerial = nx.get_node_attributes(G, "ManagementSalary")


    df = pd.DataFrame({
    "clustering": clustering, "degree": degree, "degree_centrality": degree_centrality,
    "closeness_centrality": closeness_centrality, "betweenness_centrality": betweenness_centrality,
    "page_rank": page_rank, "managerial": managerial
    }, index = G.nodes())


    # training data

    scaler = StandardScaler()
    
    train = df.loc[~df.managerial.isna(), :]
    train_x = train.drop("managerial", axis = 1)
    train_y = train.managerial
    
    train_x_scaled = scaler.fit_transform(train_x)
    
    # testing data
    
    test = df.loc[df.managerial.isna(), :]
    test_x = test.drop("managerial", axis = 1)
    
    test_x_scaled = scaler.transform(test_x)

    rforest = RandomForestClassifier(n_estimators = 1000)
    rforest.fit(train_x_scaled, train_y)
    
    return pd.Series(rforest.predict_proba(test_x_scaled)[:, 1], index = test_x.index)

# ***2B***

In [2]:
future_connections = pd.read_csv('assets/Future_Connections.csv', index_col=0, converters={0: eval})
future_connections.head(10)

Unnamed: 0,Future Connection
"(6, 840)",0.0
"(4, 197)",0.0
"(620, 979)",0.0
"(519, 872)",0.0
"(382, 423)",0.0
"(97, 226)",1.0
"(349, 905)",0.0
"(429, 860)",0.0
"(309, 989)",0.0
"(468, 880)",0.0


In [3]:
future_connections.isna().sum()/ future_connections.shape[0]

Future Connection    0.250001
dtype: float64

In [6]:
# we are only interested in the edges in future_connections
# which include edges that do not exist in the graph
G.number_of_edges(), future_connections.shape[0]

(16706, 488446)

In [7]:
ncommon_neighbours = [len(list(nx.common_neighbors(G, node_1, node_2))) for (node_1, node_2) in future_connections.index]
jaccards_coef = [tup[2] for tup in list(nx.jaccard_coefficient(G))]
pref_attach = [tup[2] for tup in list(nx.preferential_attachment(G))]
comm_common_neighbours = [tup[2] for tup in list(nx.cn_soundarajan_hopcroft(G, community = "Department"))]

In [8]:
df = pd.DataFrame({"common_neighbours": ncommon_neighbours,
             "jaccards_coef": jaccards_coef,             
             "prefer_attach": pref_attach,
             "commu_cneigh": comm_common_neighbours}, index = future_connections.index)

In [9]:
merged_df = df.merge(future_connections, how = "inner", left_index = True, right_index = True)\
            .rename({"Future Connection": "future_connection"}, axis = 1)

In [10]:
merged_df.future_connection.isna().sum()

122112

In [11]:
merged_df

Unnamed: 0,common_neighbours,jaccards_coef,prefer_attach,commu_cneigh,future_connection
"(6, 840)",9,0.045802,4180,6,0.0
"(4, 197)",2,0.027273,3124,3,0.0
"(620, 979)",0,0.022222,4224,3,0.0
"(519, 872)",2,0.036364,3168,4,0.0
"(382, 423)",0,0.012821,1628,1,0.0
...,...,...,...,...,...
"(165, 923)",0,0.000000,10,0,
"(673, 755)",0,0.000000,10,0,
"(939, 940)",0,0.000000,1,0,
"(555, 905)",0,0.000000,1,0,


In [12]:
scaler = StandardScaler()

train = merged_df.loc[~merged_df.future_connection.isna(), :]
train_x = train.drop("future_connection", axis = 1)
train_y = train.future_connection

train_x_scaled = scaler.fit_transform(train_x)

In [13]:
test = merged_df.loc[merged_df.future_connection.isna(), :]
test_x = test.drop("future_connection", axis = 1)
test_x_scaled = scaler.transform(test_x)

In [14]:
rforest = RandomForestClassifier(n_estimators = 100)

In [15]:
rforest.fit(train_x_scaled, train_y)

In [24]:
pd.Series(rforest.predict_proba(np.asarray(test_x_scaled))[:, 1], index = test_x.index)

(107, 348)    0.000000
(542, 751)    0.000000
(20, 426)     0.800000
(50, 989)     0.012321
(942, 986)    0.000000
                ...   
(165, 923)    0.000000
(673, 755)    0.000000
(939, 940)    0.000000
(555, 905)    0.000000
(75, 101)     0.001333
Length: 122112, dtype: float64

In [22]:
np.asarray(test_x_scaled)

array([[ 0.0566939 , -0.61475701, -0.37936143, -0.49708651],
       [-0.44041357, -0.61475701, -0.13358809, -0.49708651],
       [ 2.04512376,  0.01612632, -0.48469286, -0.28712177],
       ...,
       [-0.44041357, -0.61475701, -0.62461844, -0.49708651],
       [-0.44041357, -0.61475701, -0.62461844, -0.49708651],
       [-0.19185984, -0.61475701, -0.62461844, -0.49708651]])

In [25]:
def new_connections_predictions():
    
    from sklearn.preprocessing import StandardScaler
    from sklearn.ensemble import RandomForestClassifier
        
    ncommon_neighbours = [len(list(nx.common_neighbors(G, node_1, node_2))) for (node_1, node_2) in future_connections.index]
    jaccards_coef = [tup[2] for tup in list(nx.jaccard_coefficient(G))]
    pref_attach = [tup[2] for tup in list(nx.preferential_attachment(G))]
    comm_common_neighbours = [tup[2] for tup in list(nx.cn_soundarajan_hopcroft(G, community = "Department"))]
    
    df = pd.DataFrame({"common_neighbours": ncommon_neighbours,
                 "jaccards_coef": jaccards_coef,             
                 "prefer_attach": pref_attach,
                 "commu_cneigh": comm_common_neighbours}, index = future_connections.index)
    
    merged_df = df.merge(future_connections, how = "inner", left_index = True, right_index = True)\
                .rename({"Future Connection": "future_connection"}, axis = 1)
    
    scaler = StandardScaler()
    
    train = merged_df.loc[~merged_df.future_connection.isna(), :]
    train_x = train.drop("future_connection", axis = 1)
    train_y = train.future_connection
    
    train_x_scaled = scaler.fit_transform(train_x)
    
    test = merged_df.loc[merged_df.future_connection.isna(), :]
    test_x = test.drop("future_connection", axis = 1)
    test_x_scaled = scaler.transform(test_x)
    
    rforest = RandomForestClassifier(n_estimators = 100)
    rforest.fit(train_x_scaled, train_y)
    
    return pd.Series(rforest.predict_proba(np.asarray(test_x_scaled))[:, 1], index = test_x.index)

In [26]:
new_connections_predictions()

(107, 348)    0.000000
(542, 751)    0.000000
(20, 426)     0.810000
(50, 989)     0.012413
(942, 986)    0.000000
                ...   
(165, 923)    0.000000
(673, 755)    0.000000
(939, 940)    0.000000
(555, 905)    0.000000
(75, 101)     0.000000
Length: 122112, dtype: float64