In [16]:
#Import the python package pandas for reading in data
import pandas as pd
import networkx as nx
from networkx.algorithms import bipartite
import community
from networkx.readwrite import json_graph
import json

In [17]:
#Read in our workshop data with the read_csv
data = pd.read_csv('workshop_survey_data.csv')

In [18]:
#Show our column names so that we can check that we have everything
print(data.columns)

Index(['Timestamp', 'first_name', 'twitter', 'department', 'university',
       'place_uni', 'year', 'year_uni', 'today', 'research', 'conference',
       'experience_in_DH', 'DH_methods', 'rationale', 'DH_means',
       'hogwarts_house', 'game_of_thrones', 'keanu'],
      dtype='object')


In [19]:
#Replace any empty values with a set of empty string
data.fillna('', inplace=True)

In [20]:
#Show our data
print(data)

            Timestamp    first_name          twitter  \
0  5/24/2018 11:39:18           Zoe     @Zoe_LeBlanc   
1  5/24/2018 11:48:35         Keanu     @keanuthings   
2  5/28/2018 18:46:25        Golnar     @GolnarNemat   
3   5/29/2018 8:39:27          Paul                    
4  5/29/2018 11:20:15       Richard                    
5  5/29/2018 12:15:19       Taylor   @taylormariemal   
6  5/29/2018 14:56:24          Jack                    
7  5/29/2018 18:34:10         Lily        @lilyibrew   
8  5/29/2018 18:42:59  Sandra Kruse   @Sandi_Peaches   
9  5/29/2018 20:23:10       Richard       @RLHeppner   

                               department                 university  \
0                  History, Scholars' Lab     University of Virginia   
1                  Comparative Literature                       UCLA   
2         History of Art and Architecture  University of Pittsburgh    
3                                 English                        CMU   
4                      

In [21]:
#Use the splitDataFrameList Method to split up any answers that have multiple values (eg. DH methods or research interests)
def splitDataFrameList(df,target_column,separator):
    ''' df = dataframe to split,
    target_column = the column containing the values to split
    separator = the symbol used to perform the split
    returns: a dataframe with each entry for the target column separated, with each element moved into a new row. 
    The values in the other columns are duplicated across the newly divided rows.
    '''
    def splitListToRows(row,row_accumulator,target_column,separator):
        split_row = row[target_column].split(separator)
        for s in split_row:
            new_row = row.to_dict()
            new_row[target_column] = s
            row_accumulator.append(new_row)
    new_rows = []
    df.apply(splitListToRows,axis=1,args = (new_rows,target_column,separator))
    new_df = pd.DataFrame(new_rows)
    return new_df

# Call the function here, passing in our data and the column name we want to split the values for.
# Try changing the column names
first_split_data = splitDataFrameList(data, 'DH_methods', ',')
second_split_data = splitDataFrameList(first_split_data, 'university', ',')

In [22]:
# Put our source and target columns to lower case so that they are read as same items
second_split_data['DH_methods'] = second_split_data['DH_methods'].str.lower()
second_split_data['university'] = second_split_data['university'].str.lower()

In [42]:
# Let's check some metrics
# Is this a bipartite graph? 
G=nx.from_pandas_edgelist(second_split_data, 'first_name', 'university')
bipartite_graph = nx.is_bipartite(G)
print(bipartite_graph)
#Is this graph connected?
connected = nx.is_connected(G)
print(connected)
# Can we detect a community?
partition = community.best_partition(G)
print(partition)
# If yes, color the partition
color = bipartite.color(G)

# Add partition and color value sto nodes
for d, v in G.nodes(data=True):
    v['group'] = color[d]
    v['community'] = partition[d]
    
# Create a json file of nodes and write to file
data = json_graph.node_link_data(G)
with open('bipartite_data.json', 'w') as outfile:
    json.dump(data, outfile)

True
False
{'Zoe': 0, 'university of virginia': 0, 'Keanu': 1, 'ucla': 1, 'Golnar': 2, 'university of pittsburgh ': 2, 'Paul': 3, 'cmu': 3, 'Richard': 4, 'Taylor ': 4, 'duquesne university ': 4, 'Jack': 3, 'Lily ': 2, 'Sandra Kruse': 3}


In [None]:
# THAT SHOULD NOT WORK! 
#Community is an algorithm known as Louvain that is designed to work on unipartite networks 
# https://en.wikipedia.org/wiki/Louvain_Modularity
#What happens if our dataset gets bigger???

In [53]:
# Reshape our table so that first_name and university are the same entity in the bipartite graph
df = pd.melt(second_split_data, id_vars=["DH_methods"], value_vars=["first_name", "university"], value_name='entity').drop('variable', 1)
#Create our graph
G=nx.from_pandas_edgelist(df, 'DH_methods', 'entity')
bipartite_graph = nx.is_bipartite(G)
print(bipartite_graph)
#Is this graph connected?
connected = nx.is_connected(G)
print(connected)
# Can we detect a community?
partition = community.best_partition(G)
print(partition)
color = bipartite.color(G)
for d, v in G.nodes(data=True):
    v['group'] = color[d]
    v['community'] = partition[d]
data = json_graph.node_link_data(G)
with open('larger_bipartite_data.json', 'w') as outfile:
    json.dump(data, outfile)

True
True
{'networks': 0, 'Zoe': 1, ' text mining': 1, ' mapping': 0, ' data visualization': 0, ' databases': 1, ' blogging': 1, ' pedagogy': 1, 'pedagogy': 2, 'Keanu': 2, ' machine learning/ai/buzzword tech stuff': 2, 'Golnar': 0, ' website creation': 0, 'Paul': 2, 'Richard': 0, 'data visualization': 1, 'Taylor ': 1, 'Jack': 0, 'Lily ': 0, 'Sandra Kruse': 1, 'university of virginia': 1, 'ucla': 2, 'university of pittsburgh ': 0, 'cmu': 1, 'duquesne university ': 1}


In [None]:
# How can we create a unipartite network from our data??

In [48]:
df = pd.crosstab(second_split_data.university, second_split_data.first_name)
idx = df.columns.union(df.index)
df = df.reindex(index = idx, columns=idx, fill_value=0)
T = nx.from_pandas_adjacency(df)
partition = community.best_partition(T)
for d, v in T.nodes(data=True):
    v['community'] = partition[d]
data = json_graph.node_link_data(T)
data
with open('matrix_data.json', 'w') as outfile:
    json.dump(data, outfile)

In [51]:
df = second_split_data[['first_name', 'university', 'DH_methods']]
df2 = pd.melt(df, id_vars=["DH_methods"], value_vars=["first_name", "university"], value_name='entity').drop('variable', 1)
df = df2
df = pd.crosstab(df.DH_methods, df.entity)
idx = df.columns.union(df.index)
df = df.reindex(index = idx, columns=idx, fill_value=0)
print(df)
T = nx.from_pandas_adjacency(df)
degree = nx.degree(T)
partition = community.best_partition(T)
for d, v in T.nodes(data=True):
    v['value'] = degree[d]
    v['community'] = partition[d]

for u,v,d in T.edges(data=True):
    d['value'] = degree[u]
data = json_graph.node_link_data(T)
data
with open('matrix_data.json', 'w') as outfile:
    json.dump(data, outfile)

                                           blogging   data visualization  \
 blogging                                         0                    0   
 data visualization                               0                    0   
 databases                                        0                    0   
 machine learning/ai/buzzword tech stuff          0                    0   
 mapping                                          0                    0   
 pedagogy                                         0                    0   
 text mining                                      0                    0   
 website creation                                 0                    0   
Golnar                                            0                    0   
Jack                                              0                    0   
Keanu                                             0                    0   
Lily                                              0                    0   
Paul        