In [40]:
#Import the python package pandas for reading in data
import pandas as pd
import networkx as nx
from networkx.algorithms import bipartite
import community

In [2]:
#Read in our workshop data with the read_csv
data = pd.read_csv('workshop_survey_data.csv')

In [3]:
#Show our column names so that we can check that we have everything
print(data.columns)

Index(['Timestamp', 'first_name', 'twitter', 'department', 'university',
       'place_uni', 'year', 'year_uni', 'today', 'research', 'conference',
       'experience_in_DH', 'DH_methods', 'rationale', 'DH_means',
       'hogwarts_house', 'game_of_thrones', 'keanu'],
      dtype='object')


In [4]:
#Replace any empty values with a set of empty string
data.fillna('', inplace=True)

In [140]:
#Show our data
print(data)

            Timestamp    first_name          twitter  \
0  5/24/2018 11:39:18           Zoe     @Zoe_LeBlanc   
1  5/24/2018 11:48:35         Keanu     @keanuthings   
2  5/28/2018 18:46:25        Golnar     @GolnarNemat   
3   5/29/2018 8:39:27          Paul                    
4  5/29/2018 11:20:15       Richard                    
5  5/29/2018 12:15:19       Taylor   @taylormariemal   
6  5/29/2018 14:56:24          Jack                    
7  5/29/2018 18:34:10         Lily        @lilyibrew   
8  5/29/2018 18:42:59  Sandra Kruse   @Sandi_Peaches   
9  5/29/2018 20:23:10       Richard       @RLHeppner   

                               department                 university  \
0                  History, Scholars' Lab     University of Virginia   
1                  Comparative Literature                       UCLA   
2         History of Art and Architecture  University of Pittsburgh    
3                                 English                        CMU   
4                      

In [5]:
#Use the splitDataFrameList Method to split up any answers that have multiple values (eg. DH methods or research interests)
def splitDataFrameList(df,target_column,separator):
    ''' df = dataframe to split,
    target_column = the column containing the values to split
    separator = the symbol used to perform the split
    returns: a dataframe with each entry for the target column separated, with each element moved into a new row. 
    The values in the other columns are duplicated across the newly divided rows.
    '''
    def splitListToRows(row,row_accumulator,target_column,separator):
        split_row = row[target_column].split(separator)
        for s in split_row:
            new_row = row.to_dict()
            new_row[target_column] = s
            row_accumulator.append(new_row)
    new_rows = []
    df.apply(splitListToRows,axis=1,args = (new_rows,target_column,separator))
    new_df = pd.DataFrame(new_rows)
    return new_df

# Call the function here, passing in our data and the column name we want to split the values for.
# Try changing the column names
first_split_data = splitDataFrameList(data, 'DH_methods', ',')
second_split_data = splitDataFrameList(first_split_data, 'university', ',')

In [6]:
# Put our source and target columns to lower case so that they are read as same items
second_split_data['DH_methods'] = second_split_data['DH_methods'].str.lower()
second_split_data['university'] = second_split_data['university'].str.lower()

In [36]:
# Let's create a network from our table
# First we create the edges from our two columns
G=nx.from_pandas_edgelist(second_split_data, 'first_name', 'university')
# If we want to include other attributes our nodes uncomment these lines
# node_data = second_split_data.set_index('first_name').to_dict('index').items()
# G.add_nodes_from(node_data)

In [66]:
# Let's check some metrics
# Is this a bipartite graph? 
bipartite_graph = nx.is_bipartite(G)
print(bipartite_graph)
#Is this graph connected?
connected = nx.is_connected(G)
print(connected)

True
True


In [67]:
# Let's compare the game of thrones network
GOT = pd.read_csv('GameOfThronesNetwork.txt')
T=nx.from_pandas_edgelist(GOT, 'Source', 'Target')
# Is this a bipartite graph? 
bipartite_graph = nx.is_bipartite(T)
print(bipartite_graph)
#Is this graph connected?
connected = nx.is_connected(T)
print(connected)

False
True


In [63]:
df = second_split_data[['first_name', 'university', 'DH_methods']]
# stacked_df = df.stack(level=0)
# stacked_df
df2 = pd.melt(df, id_vars=["DH_methods"], value_vars=["first_name", "university"], value_name='entity').drop('variable', 1)
df = df2
df = pd.crosstab(df.DH_methods, df.entity)
idx = df.columns.union(df.index)
df = df.reindex(index = idx, columns=idx, fill_value=0)
print(df)
T = nx.from_pandas_adjacency(df)
T.edges(data=True)
# degree = nx.degree(T)
# clustering = nx.average_clustering(T)
# partition = community.best_partition(T)
# connected = nx.is_connected(T)
# triangles = nx.triangles(T)
# transitivity = nx.transitivity(T)
# clusters = nx.clustering(T)
# cliques = nx.find_cliques(T)
# bip = bipartite.is_bipartite(T)
# print( bip, connected, triangles, list(cliques), )

                                           blogging   data visualization  \
 blogging                                         0                    0   
 data visualization                               0                    0   
 databases                                        0                    0   
 machine learning/ai/buzzword tech stuff          0                    0   
 mapping                                          0                    0   
 pedagogy                                         0                    0   
 text mining                                      0                    0   
 website creation                                 0                    0   
Golnar                                            0                    0   
Jack                                              0                    0   
Keanu                                             0                    0   
Lily                                              0                    0   
Paul        

EdgeDataView([(' blogging', 'Golnar', {'weight': 1}), (' blogging', 'Jack', {'weight': 1}), (' blogging', 'Paul', {'weight': 1}), (' blogging', 'Richard', {'weight': 1}), (' blogging', 'Sandra Kruse', {'weight': 1}), (' blogging', 'Taylor ', {'weight': 1}), (' blogging', 'Zoe', {'weight': 1}), (' blogging', 'cmu', {'weight': 3}), (' blogging', 'duquesne university ', {'weight': 2}), (' blogging', 'university of pittsburgh ', {'weight': 1}), (' blogging', 'university of virginia', {'weight': 1}), (' data visualization', 'Golnar', {'weight': 1}), (' data visualization', 'Jack', {'weight': 1}), (' data visualization', 'Paul', {'weight': 1}), (' data visualization', 'Richard', {'weight': 2}), (' data visualization', 'Zoe', {'weight': 1}), (' data visualization', 'cmu', {'weight': 3}), (' data visualization', 'duquesne university ', {'weight': 1}), (' data visualization', 'university of pittsburgh ', {'weight': 1}), (' data visualization', 'university of virginia', {'weight': 1}), (' databa