# Management Salary Prediction from email network

In [1]:
! head -n 2 datadump/email_prediction_vf.pkl

€•      Œnetworkx.classes.graph”ŒGraph”“”)�”}”(Œgraph_attr_dict_factory”builtins”Œdict”“”Œnode_dict_factory”Œnode_attr_dict_factory”Œadjlist_outer_dict_factory”Œadjlist_inner_dict_factory”Œedge_attr_dict_factory”Œgraph”}”Œ_node”}”(K }”(Œ
Department”Œnumpy.core.multiarray”Œscalar”“”Œnumpy”Œdtype”“”Œi8”K K‡”R”(KŒ<”NNNJÿÿÿÿJÿÿÿÿK t”b       ”†”R”ŒManagementSalary”hhŒf8”K K‡”R”(KhNNNJÿÿÿÿJÿÿÿÿK t”b        ”†”R”uK}”(hhh       ”†”R”h"hh%      ø”†”R”uK}”(hhh       ”†”R”h"hh%      ø”†”R”uK}”(hhh       ”†”R”h"hh%      ð?”†”R”uK}”(hhh       ”†”R”h"hh%      ð?”†”R”uK}”(hhh       ”†”R”h"hh%      ø”†”R”uK}”(hhh       ”†”R”h"hh%      ð?”†”R”uK}”(hhh       ”†”R”h"hh%        ”†”R”u}”(hhh       ”†”R”h"hh%      ø”†”R”uK	}”(hhh       ”†”R”h"hh%        ”†”R”uK


The `email_prediction.txt` contains the gpickle file of the network

In [2]:
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
%matplotlib widget


In [4]:
G = nx.read_gpickle('datadump/email_prediction_vf.pkl')

In [5]:
print(nx.info(G))

Name: 
Type: Graph
Number of nodes: 1005
Number of edges: 16706
Average degree:  33.2458


In [6]:
list(G.nodes(data=True))[1:20]

[(1, {'Department': 1, 'ManagementSalary': nan}),
 (2, {'Department': 21, 'ManagementSalary': nan}),
 (3, {'Department': 21, 'ManagementSalary': 1.0}),
 (4, {'Department': 21, 'ManagementSalary': 1.0}),
 (5, {'Department': 25, 'ManagementSalary': nan}),
 (6, {'Department': 25, 'ManagementSalary': 1.0}),
 (7, {'Department': 14, 'ManagementSalary': 0.0}),
 (8, {'Department': 14, 'ManagementSalary': nan}),
 (9, {'Department': 14, 'ManagementSalary': 0.0}),
 (10, {'Department': 9, 'ManagementSalary': 0.0}),
 (11, {'Department': 14, 'ManagementSalary': 0.0}),
 (12, {'Department': 14, 'ManagementSalary': 1.0}),
 (13, {'Department': 26, 'ManagementSalary': 1.0}),
 (14, {'Department': 4, 'ManagementSalary': nan}),
 (15, {'Department': 17, 'ManagementSalary': 0.0}),
 (16, {'Department': 34, 'ManagementSalary': 0.0}),
 (17, {'Department': 1, 'ManagementSalary': 0.0}),
 (18, {'Department': 1, 'ManagementSalary': nan}),
 (19, {'Department': 14, 'ManagementSalary': 0.0})]

Here we note that each node is an employee and they have 2 attributes. `Department` refers to their Department code and `ManagementSalary` is a binary variable with
* `1` indicating that they draw a management salary
* `0` indicating that they don't
* `nan` indicating the missing information which we need to predict from the data given (acts as test set)

In [7]:
list(G.edges(data=True))[1:20]

[(0, 17, {}),
 (0, 316, {}),
 (0, 146, {}),
 (0, 581, {}),
 (0, 268, {}),
 (0, 221, {}),
 (0, 218, {}),
 (0, 18, {}),
 (0, 734, {}),
 (0, 178, {}),
 (0, 380, {}),
 (0, 0, {}),
 (0, 459, {}),
 (0, 215, {}),
 (0, 250, {}),
 (0, 148, {}),
 (0, 73, {}),
 (0, 74, {}),
 (0, 248, {})]

In [8]:
df = pd.DataFrame(index=G.nodes())
df["Department"] = pd.Series(nx.get_node_attributes(G,"Department"))
df["ManagementSalary"] = pd.Series(nx.get_node_attributes(G,"ManagementSalary"))

In [9]:
df

Unnamed: 0,Department,ManagementSalary
0,1,0.0
1,1,
2,21,
3,21,1.0
4,21,1.0
...,...,...
1000,4,
1001,21,
1002,1,0.0
1003,6,0.0


In [10]:
df['clustering'] = pd.Series(nx.clustering(G))
df['degree'] = pd.Series(dict(G.degree()))

In [11]:
df

Unnamed: 0,Department,ManagementSalary,clustering,degree
0,1,0.0,0.276423,44
1,1,,0.265306,52
2,21,,0.297803,95
3,21,1.0,0.384910,71
4,21,1.0,0.318691,96
...,...,...,...,...
1000,4,,0.600000,6
1001,21,,0.844444,10
1002,1,0.0,0.000000,1
1003,6,0.0,0.000000,1


### Understanding the degree distribution of the network

In [12]:
df.hist("degree")
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [13]:
f, (ax1,ax2) = plt.subplots(1, 2, sharey=True, sharex = True)
pd.plotting.hist_frame(df[df["ManagementSalary"] == 1], "degree", ax=ax1,density=True,)
ax1.set_title("ManagementSalary == 1")
pd.plotting.hist_frame(df[df["ManagementSalary"] == 0], "degree", ax=ax2, density=True)
ax2.set_title("ManagementSalary == 0")
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

The folks with management salary seem to have a large degree as opposed to the other group

In [14]:
f, (ax1,ax2) = plt.subplots(1, 2, sharey=True, sharex = True)
pd.plotting.hist_frame(df[df["ManagementSalary"] == 1], "clustering", ax=ax1,density=True,)
ax1.set_title("ManagementSalary == 1")
pd.plotting.hist_frame(df[df["ManagementSalary"] == 0], "clustering", ax=ax2, density=True)
ax2.set_title("ManagementSalary == 0")
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [15]:
management_nodes = [node_detail[0] for node_detail in list(G.nodes(data=True)) if node_detail[1]["ManagementSalary"] == 1]
print(management_nodes)

[3, 4, 6, 12, 13, 21, 28, 29, 35, 36, 44, 47, 57, 58, 63, 64, 81, 82, 83, 84, 86, 87, 96, 105, 106, 107, 114, 115, 121, 128, 129, 131, 132, 133, 135, 136, 137, 138, 147, 151, 153, 160, 165, 166, 170, 171, 183, 187, 189, 191, 197, 198, 201, 209, 210, 211, 232, 269, 280, 281, 282, 285, 290, 292, 301, 318, 327, 329, 333, 337, 340, 355, 361, 376, 377, 379, 388, 397, 405, 411, 414, 417, 418, 419, 423, 424, 432, 444, 446, 453, 454, 462, 481, 489, 493, 495, 498, 509, 527, 543, 546, 548, 550, 552, 560, 573, 594, 747, 782, 809, 821, 828, 840, 859, 880, 882, 895, 925, 971]


In [16]:
non_management_nodes = [node_detail[0] for node_detail in list(G.nodes(data=True)) if node_detail[1]["ManagementSalary"] == 0]
print(non_management_nodes)

[0, 7, 9, 10, 11, 15, 16, 17, 19, 20, 22, 23, 24, 25, 26, 32, 33, 38, 39, 41, 42, 43, 46, 48, 49, 50, 51, 52, 53, 56, 59, 61, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 78, 80, 85, 88, 89, 90, 91, 92, 93, 94, 95, 98, 99, 100, 102, 104, 109, 110, 111, 112, 116, 117, 118, 119, 120, 123, 124, 125, 126, 127, 130, 134, 139, 140, 143, 146, 148, 149, 152, 155, 156, 157, 159, 161, 162, 163, 164, 167, 168, 169, 172, 173, 174, 176, 177, 178, 179, 180, 182, 184, 185, 186, 188, 190, 192, 194, 195, 199, 203, 205, 206, 207, 208, 212, 213, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 233, 234, 235, 237, 238, 241, 242, 243, 244, 245, 246, 248, 250, 254, 256, 257, 258, 259, 260, 263, 265, 268, 273, 274, 275, 276, 277, 284, 286, 287, 288, 289, 291, 293, 296, 297, 300, 302, 303, 304, 305, 307, 309, 313, 314, 321, 322, 323, 324, 325, 326, 328, 330, 332, 334, 335, 336, 338, 339, 341, 343, 344, 345, 346, 347, 348, 349, 350, 353, 354, 357, 358, 359, 360, 364, 365, 368, 369, 370, 371

In [17]:
nan_nodes = [node_detail[0] for node_detail in list(G.nodes(data=True)) if np.isnan(node_detail[1]["ManagementSalary"])]
print(nan_nodes)

[1, 2, 5, 8, 14, 18, 27, 30, 31, 34, 37, 40, 45, 54, 55, 60, 62, 65, 77, 79, 97, 101, 103, 108, 113, 122, 141, 142, 144, 145, 150, 154, 158, 175, 181, 193, 196, 200, 202, 204, 214, 215, 230, 231, 236, 239, 240, 247, 249, 251, 252, 253, 255, 261, 262, 264, 266, 267, 270, 271, 272, 278, 279, 283, 294, 295, 298, 299, 306, 308, 310, 311, 312, 315, 316, 317, 319, 320, 331, 342, 351, 352, 356, 362, 363, 366, 367, 372, 380, 382, 384, 385, 386, 389, 395, 399, 402, 406, 408, 409, 412, 420, 434, 435, 443, 447, 451, 456, 457, 458, 465, 466, 471, 477, 482, 483, 484, 485, 487, 492, 494, 496, 500, 503, 505, 513, 516, 518, 520, 522, 524, 529, 530, 531, 533, 538, 545, 557, 571, 578, 582, 583, 602, 604, 605, 612, 613, 615, 625, 636, 640, 641, 646, 647, 651, 655, 656, 664, 666, 669, 670, 671, 676, 678, 682, 683, 685, 691, 708, 710, 713, 717, 720, 725, 729, 736, 738, 741, 742, 743, 748, 758, 760, 764, 765, 766, 768, 773, 776, 783, 786, 787, 788, 789, 798, 799, 800, 808, 817, 818, 819, 820, 822, 825, 826,

In [18]:
def get_management_neighbour_count(node_id):
    neighbours = set(list(G.neighbors(node_id)))
    management_count = len(neighbours.intersection(management_nodes))
    return management_count
    

In [19]:
def get_nonmanagement_neighbour_count(node_id):
    neighbours = set(list(G.neighbors(node_id)))
    non_management_count = len(neighbours.intersection(non_management_nodes))
    return non_management_count

In [20]:
df["management_neighbours_count"] = pd.Series(dict(map(lambda x: (x, get_management_neighbour_count(x)), list(df.index))))
df["management_neighbours_pct"] = df["management_neighbours_count"]/ df["degree"]
df["nonmanagement_neighbours_count"] = pd.Series(dict(map(lambda x: (x, get_nonmanagement_neighbour_count(x)), list(df.index))))
df["nonmanagement_neighbours_pct"] = df["nonmanagement_neighbours_count"]/ df["degree"]
df

Unnamed: 0,Department,ManagementSalary,clustering,degree,management_neighbours_count,management_neighbours_pct,nonmanagement_neighbours_count,nonmanagement_neighbours_pct
0,1,0.0,0.276423,44,6,0.136364,26,0.590909
1,1,,0.265306,52,15,0.288462,27,0.519231
2,21,,0.297803,95,31,0.326316,34,0.357895
3,21,1.0,0.384910,71,24,0.338028,25,0.352113
4,21,1.0,0.318691,96,35,0.364583,36,0.375000
...,...,...,...,...,...,...,...,...
1000,4,,0.600000,6,3,0.500000,1,0.166667
1001,21,,0.844444,10,3,0.300000,4,0.400000
1002,1,0.0,0.000000,1,1,1.000000,0,0.000000
1003,6,0.0,0.000000,1,0,0.000000,1,1.000000


### Understanding the neighbours linked with managers

In [21]:
f, (ax1,ax2) = plt.subplots(1, 2, sharey=True, sharex = True)
pd.plotting.hist_frame(df[df["ManagementSalary"] == 1],"management_neighbours_pct" , ax=ax1,density=True,)
ax1.set_title("ManagementSalary == 1")
pd.plotting.hist_frame(df[df["ManagementSalary"] == 0], "management_neighbours_pct", ax=ax2, density=True)
ax2.set_title("ManagementSalary == 0")
f.suptitle("Distribution of management neighbours as percentage of node degree")
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [22]:
f, (ax1,ax2) = plt.subplots(1, 2, sharey=True, sharex = True)
pd.plotting.hist_frame(df[df["ManagementSalary"] == 1],"nonmanagement_neighbours_pct" , ax=ax1,density=True,)
ax1.set_title("ManagementSalary == 1")
pd.plotting.hist_frame(df[df["ManagementSalary"] == 0], "nonmanagement_neighbours_pct", ax=ax2, density=True)
ax2.set_title("ManagementSalary == 0")
f.suptitle("Distribution of non-management neighbours as percentage of node degree")
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### Understanding the centrality scores of the nodes

* _Closeness Centrality_ gives  the reciprocal of the average shortest path distance to `u` over all `n-1` reachable nodes

* _Betweenness centrality_ of a node $v$ is the sum of the
fraction of all-pairs shortest paths that pass through $v$ $ c_B(v) =\sum_{s,t \in V} \frac{\sigma(s, t|v)}{\sigma(s, t)} $ where $V$ is the set of nodes, $\sigma(s, t)$ is the number of shortest $(s, t)$-paths,  and $\sigma(s, t|v)$ is the number of those paths  passing through some  node $v$ other than $s, t$.
If $s = t$, $\sigma(s, t) = 1$, and if $v \in {s, t}$, $\sigma(s, t|v) = 0$ .

In [23]:
df["betweenness_centrality"] = pd.Series(nx.betweenness_centrality(G))
df["closeness_centrality"] = pd.Series(nx.closeness_centrality(G))

In [24]:
hubs, authorities = nx.hits(G)
df["hubs"] = pd.Series(hubs)
df["authorities"] = pd.Series(authorities)
df

Unnamed: 0,Department,ManagementSalary,clustering,degree,management_neighbours_count,management_neighbours_pct,nonmanagement_neighbours_count,nonmanagement_neighbours_pct,betweenness_centrality,closeness_centrality,hubs,authorities
0,1,0.0,0.276423,44,6,0.136364,26,0.590909,0.001124,0.421991,0.000944,0.000944
1,1,,0.265306,52,15,0.288462,27,0.519231,0.001195,0.422360,0.001472,0.001472
2,21,,0.297803,95,31,0.326316,34,0.357895,0.006570,0.461490,0.002680,0.002680
3,21,1.0,0.384910,71,24,0.338028,25,0.352113,0.001654,0.441663,0.002369,0.002369
4,21,1.0,0.318691,96,35,0.364583,36,0.375000,0.005547,0.462152,0.003055,0.003055
...,...,...,...,...,...,...,...,...,...,...,...,...
1000,4,,0.600000,6,3,0.500000,1,0.166667,0.000004,0.355934,0.000161,0.000161
1001,21,,0.844444,10,3,0.300000,4,0.400000,0.000004,0.339789,0.000194,0.000194
1002,1,0.0,0.000000,1,1,1.000000,0,0.000000,0.000000,0.297983,0.000017,0.000017
1003,6,0.0,0.000000,1,0,0.000000,1,1.000000,0.000000,0.298167,0.000024,0.000024


In [25]:
def plot_comparison(column_name, title):
    f, (ax1,ax2) = plt.subplots(1, 2, sharey=True, sharex = True)
    pd.plotting.hist_frame(df[df["ManagementSalary"] == 1],column_name , ax=ax1,density=True,)
    ax1.set_title("ManagementSalary == 1")
    pd.plotting.hist_frame(df[df["ManagementSalary"] == 0], column_name, ax=ax2, density=True)
    ax2.set_title("ManagementSalary == 0")
    f.suptitle(title)
    plt.show()

In [26]:
plot_comparison(column_name="betweenness_centrality",title="Distribution of betweennness centrality")

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [27]:
plot_comparison(column_name="closeness_centrality",title="Distribution of closeness centrality")

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [28]:
plot_comparison(column_name="hubs",title="Distribution of hub score")

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [29]:
plot_comparison(column_name="authorities",title="Distribution of authorities score")

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [30]:
df[df["ManagementSalary"] == 1].describe()

Unnamed: 0,Department,ManagementSalary,clustering,degree,management_neighbours_count,management_neighbours_pct,nonmanagement_neighbours_count,nonmanagement_neighbours_pct,betweenness_centrality,closeness_centrality,hubs,authorities
count,119.0,119.0,119.0,119.0,119.0,119.0,119.0,119.0,119.0,119.0,119.0,119.0
mean,17.680672,1.0,0.382361,81.941176,26.344538,0.353486,35.680672,0.411299,0.006493,0.446433,0.002658,0.002658
std,10.914097,0.0,0.204973,58.125856,17.673324,0.12872,28.343514,0.108583,0.010271,0.042038,0.001812,0.001812
min,0.0,1.0,0.054423,2.0,1.0,0.083333,0.0,0.0,0.0,0.306198,3e-05,3e-05
25%,10.0,1.0,0.24902,38.0,13.0,0.293774,14.5,0.36675,0.000536,0.423842,0.001318,0.001318
50%,15.0,1.0,0.335484,63.0,22.0,0.333333,25.0,0.415929,0.004163,0.443081,0.002235,0.002235
75%,25.5,1.0,0.461247,119.5,37.0,0.39039,52.0,0.473684,0.008093,0.475688,0.003911,0.003911
max,38.0,1.0,1.0,347.0,94.0,1.0,164.0,0.714286,0.087415,0.573848,0.00837,0.00837


In [31]:
df[df["ManagementSalary"] == 0].describe()

Unnamed: 0,Department,ManagementSalary,clustering,degree,management_neighbours_count,management_neighbours_pct,nonmanagement_neighbours_count,nonmanagement_neighbours_pct,betweenness_centrality,closeness_centrality,hubs,authorities
count,634.0,634.0,634.0,634.0,634.0,634.0,634.0,634.0,634.0,634.0,634.0,634.0
mean,12.996845,0.0,0.39755,24.613565,6.697161,0.283558,11.65142,0.458239,0.000664,0.366926,0.0007050412,0.0007050412
std,9.964391,0.0,0.260008,23.895584,7.943904,0.230709,11.153812,0.216296,0.001135,0.074726,0.0008324019,0.0008324019
min,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,6.057999000000001e-60,4.675065e-58
25%,4.0,0.0,0.273066,5.0,1.0,0.142857,2.0,0.362648,3e-06,0.33674,0.0001140876,0.0001140876
50%,13.0,0.0,0.372597,19.0,4.0,0.25,9.0,0.474547,0.00012,0.376235,0.0004235229,0.0004235229
75%,19.0,0.0,0.526316,35.0,9.0,0.355157,18.0,0.571429,0.000756,0.41293,0.0009607271,0.0009607271
max,41.0,0.0,1.0,129.0,48.0,1.0,54.0,1.0,0.005881,0.487568,0.005141574,0.005141574


In [32]:
def plot_scatter_by_management(x,y):
    plt.figure()
    plt.scatter(df[df["ManagementSalary"] == 0][x],df[df["ManagementSalary"] == 0][y],marker="o", color="blue")
    plt.scatter(df[df["ManagementSalary"] == 1][x],df[df["ManagementSalary"] == 1][y],marker="*", color="red")
    plt.show()

plot_scatter_by_management("degree","hubs")

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [33]:
plot_scatter_by_management("degree","management_neighbours_pct")

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [34]:
plot_scatter_by_management("nonmanagement_neighbours_pct", "management_neighbours_pct")

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [35]:
validation_df = df.loc[nan_nodes]
validation_df

Unnamed: 0,Department,ManagementSalary,clustering,degree,management_neighbours_count,management_neighbours_pct,nonmanagement_neighbours_count,nonmanagement_neighbours_pct,betweenness_centrality,closeness_centrality,hubs,authorities
1,1,,0.265306,52,15,0.288462,27,0.519231,0.001195,0.422360,0.001472,0.001472
2,21,,0.297803,95,31,0.326316,34,0.357895,0.006570,0.461490,0.002680,0.002680
5,25,,0.107002,171,50,0.292398,83,0.485380,0.030995,0.501484,0.004011,0.004011
8,14,,0.447059,37,9,0.243243,22,0.594595,0.000557,0.413151,0.000791,0.000791
14,4,,0.215784,80,15,0.187500,44,0.550000,0.003726,0.442068,0.001856,0.001856
...,...,...,...,...,...,...,...,...,...,...,...,...
992,4,,0.000000,3,1,0.333333,0,0.000000,0.000000,0.332998,0.000072,0.000072
994,21,,0.000000,1,1,1.000000,0,0.000000,0.000000,0.320517,0.000033,0.000033
996,14,,0.000000,1,1,1.000000,0,0.000000,0.000000,0.300578,0.000019,0.000019
1000,4,,0.600000,6,3,0.500000,1,0.166667,0.000004,0.355934,0.000161,0.000161


In [36]:
train_indices = management_nodes.copy()
train_indices.extend(non_management_nodes)

train_df = df.loc[train_indices]
train_df

Unnamed: 0,Department,ManagementSalary,clustering,degree,management_neighbours_count,management_neighbours_pct,nonmanagement_neighbours_count,nonmanagement_neighbours_pct,betweenness_centrality,closeness_centrality,hubs,authorities
3,21,1.0,0.384910,71,24,0.338028,25,0.352113,0.001654,0.441663,0.002369,0.002369
4,21,1.0,0.318691,96,35,0.364583,36,0.375000,0.005547,0.462152,0.003055,0.003055
6,25,1.0,0.155183,115,34,0.295652,52,0.452174,0.012387,0.475805,0.002554,0.002554
12,14,1.0,0.302597,58,13,0.224138,29,0.500000,0.004715,0.433151,0.001500,0.001500
13,26,1.0,0.110900,180,53,0.294444,93,0.516667,0.023565,0.503050,0.004325,0.004325
...,...,...,...,...,...,...,...,...,...,...,...,...
998,14,0.0,0.000000,1,0,0.000000,0,0.000000,0.000000,0.313753,0.000026,0.000026
999,15,0.0,0.000000,1,0,0.000000,0,0.000000,0.000000,0.311729,0.000040,0.000040
1002,1,0.0,0.000000,1,1,1.000000,0,0.000000,0.000000,0.297983,0.000017,0.000017
1003,6,0.0,0.000000,1,0,0.000000,1,1.000000,0.000000,0.298167,0.000024,0.000024


In [37]:
features = set(df.columns)
features = list(features.difference(['ManagementSalary','nonmanagement_neighbours_count', 'management_neighbours_count']))
print(features)

['management_neighbours_pct', 'nonmanagement_neighbours_pct', 'hubs', 'closeness_centrality', 'authorities', 'Department', 'degree', 'clustering', 'betweenness_centrality']


In [38]:
target = 'ManagementSalary'

In [39]:
from sklearn.model_selection import train_test_split

In [40]:
X_train, X_test, y_train, y_test = train_test_split(train_df[features],train_df[target],test_size = 0.25)

In [41]:
from sklearn.model_selection import cross_val_score
from sklearn import svm

In [42]:
clf_svm = svm.SVC(kernel='rbf', C=1)

In [43]:
scores = cross_val_score(clf_svm, X_train, y_train, cv=5)

In [44]:
scores

array([0.87610619, 0.88495575, 0.85840708, 0.90265487, 0.85714286])

In [45]:
clf_svm.fit(X_train, y_train)

SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [46]:
from sklearn.ensemble import RandomForestClassifier

In [47]:
clf_rf = RandomForestClassifier(n_jobs=-1, n_estimators=500)

In [48]:
scores = cross_val_score(clf_rf, X_train, y_train, cv=5)

In [49]:
scores

array([0.91150442, 0.92920354, 0.88495575, 0.90265487, 0.875     ])

In [50]:
clf_rf = clf_rf.fit(X_train, y_train)

In [51]:
from sklearn.ensemble import GradientBoostingClassifier

In [52]:
clf_gb = GradientBoostingClassifier(n_estimators=500)

In [53]:
scores = cross_val_score(clf_gb, X_train, y_train, cv=5)
scores

array([0.92035398, 0.91150442, 0.90265487, 0.90265487, 0.89285714])

In [54]:
clf_gb.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=500,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [63]:
from sklearn.metrics import roc_curve, roc_auc_score, classification_report

In [56]:
svm_pred = clf_svm.predict(X_train)

In [57]:
fpr, tpr , thresholds = roc_curve(y_train, svm_pred)

In [58]:
roc_auc = roc_auc_score(y_train, svm_pred)
roc_auc

0.6568917018284106

In [59]:
def plot_roc(y_true, y_pred, method):
    fpr, tpr , thresholds = roc_curve(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_pred)
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange',
             lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'Receiver operating characteristic - {method}')
    plt.legend(loc="lower right")
    plt.show()
    

In [65]:
svm_test_pred = clf_svm.predict(X_test)
plot_roc(y_test, svm_test_pred, "SVM - Test")
print(classification_report(y_test, svm_test_pred))

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

              precision    recall  f1-score   support

         0.0       0.91      0.99      0.95       160
         1.0       0.87      0.45      0.59        29

    accuracy                           0.90       189
   macro avg       0.89      0.72      0.77       189
weighted avg       0.90      0.90      0.89       189



In [66]:
rf_pred_test = clf_rf.predict(X_test)
plot_roc(y_test, rf_pred_test, "Random Forest- Test")
print(classification_report(y_test, rf_pred_test))

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

              precision    recall  f1-score   support

         0.0       0.94      0.96      0.95       160
         1.0       0.76      0.66      0.70        29

    accuracy                           0.92       189
   macro avg       0.85      0.81      0.83       189
weighted avg       0.91      0.92      0.91       189



In [67]:
gb_pred_test = clf_gb.predict(X_test)
plot_roc(y_test, gb_pred_test, "GBT - Test")
print(classification_report(y_test, gb_pred_test))

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

              precision    recall  f1-score   support

         0.0       0.96      0.96      0.96       160
         1.0       0.76      0.76      0.76        29

    accuracy                           0.93       189
   macro avg       0.86      0.86      0.86       189
weighted avg       0.93      0.93      0.93       189

