# Using network analysis to efficiently spread data awareness

In [None]:
import plotly.plotly as py
import plotly.graph_objs as go
import seaborn as sns

import networkx as nx
import pandas as pd
import numpy as np


from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)


In [None]:
# CSV file holding the anonymised staff records.
fname = 'AnonymisedStaff.csv'

# Read the file once, forcing the columns used by the analysis to text so that
# identifiers and manager references are compared as strings and are never
# coerced to numbers.
colTypes = {'ID': str, 'Department': str, 'Manager': str, 'Location': str, 'DataFan': str}
allstaffdata = pd.read_csv(fname, encoding = "ISO-8859-1", dtype = colTypes)



In [None]:
# Quick look at the column labels and the first rows. These are bare
# expressions: they do not modify the data.
allstaffdata.columns
allstaffdata.head()
## For anonymised data:
# Names of the input columns that the rest of the notebook refers to through
# these variables (department, staff identifier, line manager, location).
# Presumably these are the values to change for a differently labelled extract.
department = 'Department'
identifier = 'ID' 
manager = 'Manager'
location = 'Location'


## Where are the data fans?
* Which departments?
* Which company locations?
* Which teams (indicated by line managers)?

In [None]:
# NOTE: this is an alias, not a copy, so the columns added at the end of this
# cell are also visible through `allstaffdata`.
staffdata = allstaffdata

# Staff flagged as data fans in the input file.
datafans = staffdata[staffdata['DataFan']=='yes']

# Sorted distinct values per grouping, for the data fans and for all staff.
# Missing values are dropped first: they are not a real department/location/
# manager, and np.unique cannot sort NaN together with strings.
#List of departments covered by data fans
DFDepartments = np.unique(datafans[department].dropna())
Departments = np.unique(staffdata[department].dropna())
#List of locations covered by data fans
DFLocations = np.unique(datafans[location].dropna())
Locations = np.unique(staffdata[location].dropna())
#List of data fan line managers
DFManagers = np.unique(datafans[manager].dropna())
Managers = np.unique(staffdata[manager].dropna())

print('#Data fans: {}'.format(len(datafans)))

#Coverage of data fans:
print('#Departments covered by data fans: {} out of {}'.format(len(DFDepartments), len(Departments)))
print('#staff covered: {} out of {}\n'.format(len(staffdata[staffdata[department].isin(DFDepartments)]), len(staffdata)))
print(DFDepartments)
print('#Locations covered by data fans: {} out of {}'.format(len(DFLocations), len(Locations)))
print('#staff covered: {} out of {}\n'.format(len(staffdata[staffdata[location].isin(DFLocations)]), len(staffdata)))
print(DFLocations)
print('#Managers covered by data fans: {} out of {}'.format(len(DFManagers), len(Managers)))
print('#staff covered: {} out of {}\n'.format(len(staffdata[staffdata[manager].isin(DFManagers)]), len(staffdata)))
print(DFManagers)

#Identify more susceptible staff members in terms of sharing the same departments, locations or teams as existing fans.
staffdata['PotentialDepartmentDataFan'] = np.where(staffdata[department].isin(DFDepartments), 'yes', 'no')
staffdata['PotentialLocationDataFan'] = np.where(staffdata[location].isin(DFLocations), 'yes', 'no')
staffdata['PotentialManagerDataFan'] = np.where(staffdata[manager].isin(DFManagers), 'yes', 'no')

# Existing data fans always count as potential fans, even when their own
# department/location/manager value is missing. The assignment goes through
# .loc with a boolean mask: indexing twice (frame[mask][col] = ...) would write
# to a temporary copy and leave `staffdata` unchanged. The mask is built from
# the identifier *column* (a Series), not a one-column DataFrame, because
# isin() on a DataFrame argument would compare against its column labels.
isdatafan = staffdata[identifier].isin(datafans[identifier])
potentialcols = ['PotentialLocationDataFan', 'PotentialDepartmentDataFan', 'PotentialManagerDataFan']
staffdata.loc[isdatafan, potentialcols] = 'yes'

### Data fans in the staff network


In [None]:
import networkx as nx
# Background reading: https://plot.ly/python/igraph-networkx-comparison/
# (a comparison of two of the main Python network libraries).

# Line-management network: a directed edge from each manager to each of
# their reports.
G = nx.from_pandas_edgelist(
    staffdata,
    source=manager,
    target=identifier,
    create_using=nx.DiGraph,
)

# The same manager -> report relation as a plain two-column table.
edges = pd.DataFrame({
    'target': staffdata[identifier],
    'source': staffdata[manager],
})

# Node table: output column label -> staffdata column it is taken from.
# The identifier is exposed twice, as 'node' and as 'name'.
node_fields = {
    'node': identifier,
    'name': identifier,
    'Department': department,
    'Manager': manager,
    'Location': location,
    'DataFan': 'DataFan',
    'PotentialDepartmentDataFan': 'PotentialDepartmentDataFan',
    'PotentialLocationDataFan': 'PotentialLocationDataFan',
    'PotentialManagerDataFan': 'PotentialManagerDataFan',
}
nodes = pd.DataFrame({label: staffdata[column] for label, column in node_fields.items()})



In [None]:
# Team network: line management is used as a proxy for team membership, so
# two employees are linked whenever they share the same line manager.
from itertools import permutations, chain

# One team per distinct value in the manager column, keyed by that value.
managers = staffdata[manager].unique()
manager_teams = {
    mgr: staffdata.loc[staffdata[manager] == mgr, identifier].tolist()
    for mgr in managers
}

# Every ordered pair of team mates becomes a link, so each pair of colleagues
# ends up connected in both directions.
team_links = list(chain.from_iterable(
    permutations(team, 2) for team in manager_teams.values()
))

# Layer the team links on top of the existing line-management graph.
G.add_edges_from(team_links)
  


In [None]:
# Some centrality measures, computed on G after the team links were added.
# d: node degrees; iterated as (node, degree) pairs and indexed by node later on.
# c: degree centrality per node (loaded into a DataFrame keyed by node later on).
# b: betweenness centrality per node (loaded the same way).
d = nx.degree(G)
c = nx.degree_centrality(G)
b = nx.betweenness_centrality(G)

In [None]:
## NB: G can hold more nodes than staffdata has records, because some managers
## are externals who appear in the graph but have no row of their own.

degreesdict = {node: deg for node, deg in d}

# One single-column frame per measure, indexed by node.
degrees = pd.DataFrame.from_dict(degreesdict, orient='index', columns=['Degree'])
centralities = pd.DataFrame.from_dict(c, orient='index', columns=['DegreeCentrality'])
betweenness = pd.DataFrame.from_dict(b, orient='index', columns=['BetweennessCentrality'])

# Top 20 nodes for each measure, highest first.
for frame in (degrees, centralities, betweenness):
    print(frame.sort_values(by=frame.columns[0], ascending=False).head(20))

# The same measures restricted to the existing data fans.
datafannames = datafans[identifier].tolist()
fandegrees = degrees.loc[degrees.index.isin(datafannames)]
fancentralities = centralities.loc[centralities.index.isin(datafannames)]
fanbetweenness = betweenness.loc[betweenness.index.isin(datafannames)]

# Leading rows (head() default) for the data fans, highest first.
for frame in (fandegrees, fancentralities, fanbetweenness):
    print(frame.sort_values(by=frame.columns[0], ascending=False).head())


In [None]:
# Join the degree and degree-centrality values onto the staff records.
# Each metric frame is matched on its *own* index (the node name); the second
# merge previously reused degrees.index while merging `centralities`.
# merge() defaults to an inner join, so graph nodes without a staff record
# (e.g. external managers) do not add rows.
staffdata = staffdata.merge(degrees, left_on=identifier, right_on=degrees.index)
staffdata = staffdata.merge(centralities, left_on=identifier, right_on=centralities.index)

In [None]:
def plotMetricHist(df, colname='Degree'):
    """Display an inline histogram of one column of ``df``.

    df      -- table-like object; ``df[colname]`` supplies the values.
    colname -- name of the column to plot ('Degree' by default).

    Both axes are auto-ranged. Nothing is returned; the figure is shown
    through ``iplot``.
    """
    histogram = go.Histogram(x=df[colname])
    # Add type='log' to either axis dict for a logarithmic axis.
    axes = go.Layout(
        xaxis={'autorange': True},
        yaxis={'autorange': True},
    )
    iplot(go.Figure(data=[histogram], layout=axes))
    

In [None]:
# Histogram of the 'Degree' column of the staff table.
plotMetricHist(staffdata, 'Degree')
#plotMetricHist(staffdata, 'DegreeCentrality')

In [None]:
# Preview the staff table, which by now includes the merged network metrics.
staffdata.head()

In [None]:
#Network drawing function.
def drawnetwork(staffdata=staffdata, field='DataFan', d=d, dthreshold=2, sizefield='Degree', 
                sizemultiplier=3, G=G):
    """Draw an interactive plot of the staff network, coloured by ``field``.

    Parameters
    ----------
    staffdata : DataFrame
        One row per staff member. Must contain the ``identifier`` column,
        the ``field`` column and, unless ``sizefield`` is '', the
        ``sizefield`` column. Node colours and sizes are read from this frame.
    field : str
        Column whose values determine the node colours; also the plot title.
    d : node -> degree lookup
        Used for the degree filter and for the hover text.
    dthreshold : number
        Only nodes whose degree is strictly greater than this are drawn.
    sizefield : str
        Column giving each node's size; '' draws every node at size 1.
    sizemultiplier : number
        Factor applied to every node size.
    G : networkx graph
        Graph the drawn subgraph is taken from. It is not modified.

    Nodes whose ``field`` value is missing are left out of the node trace and
    counted in the printed summary. Returns None; the figure is shown with
    ``iplot``.

    Note: the defaults for ``staffdata``, ``d`` and ``G`` are captured when
    this ``def`` is executed, so re-run the cell after rebuilding them.
    """
    # One record per staff member (first row wins), turned into plain lookups
    # so each node costs one dict access instead of a scan of the frame.
    records = staffdata.drop_duplicates(subset=identifier)
    fieldlookup = dict(zip(records[identifier], records[field]))
    if sizefield == '':
        sizelookup = None
    else:
        sizelookup = dict(zip(records[identifier], records[sizefield]))

    #Formatting settings
    #Set colour mappings to field
    flist = staffdata[field].unique()
    colours = sns.color_palette("hls", len(flist))
    colourmappings = dict(zip(flist, colours))

    #Filtering by degree
    #Keep only staff that are in G and whose degree (d) is above the threshold (dthreshold)
    selected_nodes = [n for n in records[identifier] if n in G and d[n] > dthreshold]
    plotgraph = G.subgraph(selected_nodes)

    # Node positions are kept in a local dict rather than written into the
    # graph's node attributes, so G is left untouched.
    pos = nx.spring_layout(plotgraph, seed=0)

    excluded = []
    xlist = []
    ylist = []
    textlist = []
    sizelist = []
    colourlist = []

    for node in plotgraph.nodes():
        f = fieldlookup[node]
        if pd.isna(f):
            # No value to colour by: skip the node before anything is
            # appended, so all per-node lists stay the same length.
            excluded.append(node)
            continue

        x, y = pos[node]
        xlist.append(x)
        ylist.append(y)

        ## Add node labels for hover over text
        textlist.append('{} <br>#connections: {}'.format(node, d[node]))

        ## Size the node depending on sizefield and sizemultiplier
        size = 1 if sizelookup is None else sizelookup[node]
        sizelist.append(size * sizemultiplier)

        ## Map the colours to the nodes depending on the field values.
        ## seaborn returns channels in the range 0-1, whereas rgba() takes
        ## 0-255, so scale before formatting.
        red, green, blue = (int(round(255 * channel)) for channel in colourmappings[f])
        colourlist.append('rgba({}, {}, {}, {})'.format(red, green, blue, .8))

    print('Number of nodes excluded because {} not given: {}\n'.format(field, len(excluded)))

    ## Create the visualisation
    # Each edge contributes its two end points followed by None, which breaks
    # the line so that separate edges are not joined together.
    xlistedge = []
    ylistedge = []
    for source, target in plotgraph.edges():
        x0, y0 = pos[source]
        x1, y1 = pos[target]
        xlistedge += [x0, x1, None]
        ylistedge += [y0, y1, None]

    # Create edge trace (no hover text: hovering is disabled for edges):
    edge_trace = go.Scatter(x = xlistedge, y = ylistedge,
                    line = go.scatter.Line(width = 0.5, color = '#888'),
                    mode = 'lines', hoverinfo = 'none')

    # Create node trace:
    node_trace = go.Scatter(x = xlist, y = ylist, text = textlist, mode = 'markers',
                    hoverinfo='text',
                    marker = go.scatter.Marker(
                    color = colourlist,
                    size = sizelist,
                    line = dict(color='rgb(50,50,50)', width=0.5)))

    # Both axes are purely positional, so hide every axis decoration.
    hiddenaxis = dict(
        autorange=True,
        showgrid=False,
        zeroline=False,
        showline=False,
        ticks='',
        showticklabels=False,
    )
    data = [node_trace, edge_trace]
    layout = go.Layout(title=field,
                       showlegend=False,
                       xaxis=dict(hiddenaxis),
                       yaxis=dict(hiddenaxis))

    fig = go.Figure(data=data, layout=layout)
    iplot(fig, filename='Staff network')
    

In [None]:
# Draw the network twice: first with every node at the same size
# (sizefield=''), then with the defaults, which size nodes by 'Degree'.
drawnetwork(sizefield='')
drawnetwork()

## Making Marketing Decisions based on network analysis

We can use very simple network analyses to decide how best to use our data fans to spread the message. For example, we can compare the networks that would result if they spread the message to staff in their departments vs. focusing on staff in their location vs. spreading to their team (as indicated by shared managers).  


In [None]:
## Show what might happen over time:

def networkevolution(df=staffdata, numsteps=3, transfield=department, infectfield='DataFan'):
    """Simulate and draw how the 'infected' flag spreads through the staff.

    At every time step the current state is drawn, and then a staff member
    is marked 'yes' for the next step when any of these holds:

    * they share a ``transfield`` value with a currently infected person;
    * they are already infected;
    * their identifier is itself one of the infected ``transfield`` values
      (e.g. they are the line manager of an infected person).

    Parameters
    ----------
    df : DataFrame
        Staff records; copied, so the caller's frame is not modified.
    numsteps : int
        Number of time steps to simulate and draw.
    transfield : str
        Column along which the flag spreads.
    infectfield : str
        Column holding the 'yes'/'no' flag to evolve.

    Returns None; progress is printed and each step is drawn with
    ``drawnetwork``. The default for ``df`` is captured when this ``def``
    is executed.
    """
    network = df.copy(deep=True)
    print('Network evolution over {} time steps based on shared {}s: '.format(numsteps, transfield))

    for i in range(numsteps):
        drawnetwork(staffdata=network, field=infectfield)

        infected = network[network[infectfield]=='yes']

        #List of field values covered by infected. Missing values are dropped:
        #they are not a shared group and cannot be sorted together with strings.
        infectedVals = np.unique(infected[transfield].dropna())
        vals = np.unique(network[transfield].dropna())

        print('#{}s covered by infected: {} out of {}'.format(transfield, len(infectedVals), len(vals)))
        print('#staff infected: {} out of {} at t {}'.format(len(infected), len(network), i))

        print(infectedVals)

        # Build the next state from one combined mask. Assigning through a
        # doubly-indexed frame (frame[mask]['next'] = ...) would only change
        # a temporary copy and have no effect on `network`.
        sharesvalue = network[transfield].isin(infectedVals)
        alreadyinfected = network[identifier].isin(infected[identifier])
        #If the field is also a member of staff (e.g. manager), then they also need to be infected.
        namedbyfield = network[identifier].isin(infectedVals)
        network['next'] = np.where(sharesvalue | alreadyinfected | namedbyfield, 'yes', 'no')
        print('#staff next infected: {} out of {}'.format(len(network[network['next']=='yes']), len(network)))

        network.loc[:, infectfield] = network['next'].values
        print('Check reset: {}\n'.format(network[infectfield].equals(network['next'])))
        
        

In [None]:
# Run the spread three times, transmitting via shared department, shared line
# manager and shared location respectively, so the outcomes can be compared.
networkevolution(transfield=department)
networkevolution(transfield=manager)
networkevolution(transfield=location)

### Who should I target next?


In [None]:
#Get the top x most connected individuals
numtop = 20  # number of rows to print
# Staff records ordered from highest to lowest 'Degree'.
sortdegrees = staffdata.sort_values(by='Degree', ascending=False)
print(sortdegrees.head(numtop))

### How many disconnected networks are there (in terms of the management structure)?
It is useful to know how many disconnected networks there are so that 'islands' can be targeted separately.

In [None]:
# One independent subgraph per weakly connected component (edge direction is
# ignored when deciding connectivity), ordered from largest to smallest.
# Built from nx.weakly_connected_components because
# nx.weakly_connected_component_subgraphs was removed in networkx 2.4.
subgraphs = sorted(
    (G.subgraph(component).copy() for component in nx.weakly_connected_components(G)),
    key=len,
    reverse=True,
)
print('#Distinct staff networks: {}\n'.format(len(subgraphs)))
subgraphsizes = [nx.number_of_nodes(subgraph) for subgraph in subgraphs]
print(subgraphsizes)

# Draw the two largest subgraphs (the list is sorted by size above); each
# draw is skipped when that subgraph does not exist.
if subgraphs:
    drawnetwork(field='DataFan', G=subgraphs[0])
if len(subgraphs) > 1:
    drawnetwork(field='DataFan', G=subgraphs[1], dthreshold=0)


