In [136]:
#Import the python package pandas for reading in data
import pandas as pd

In [137]:
#Load the workshop survey responses from the CSV file into a pandas DataFrame
data = pd.read_csv(filepath_or_buffer='workshop_survey_data.csv')

In [138]:
#Print the column index so that we can confirm every expected survey field was loaded
survey_columns = data.columns
print(survey_columns)

Index(['Timestamp', 'first_name', 'twitter', 'department', 'university',
       'place_uni', 'year', 'year_uni', 'today', 'research', 'conference',
       'experience_in_DH', 'DH_methods', 'rationale', 'DH_means',
       'hogwarts_house', 'game_of_thrones', 'keanu'],
      dtype='object')


In [139]:
#Replace any missing values (NaN) with an empty string so that every cell can be
#treated as text by the later string operations (splitting, lower-casing).
#Re-assigning the returned frame instead of using inplace=True keeps this cell
#safe to re-run and follows the recommended pandas style.
data = data.fillna('')

In [140]:
#Print the survey table for a visual check of the loaded values
# NOTE(review): this prints every row, including respondents' first names and
# Twitter handles -- consider print(data.head()) and clearing the output before sharing.
print(data)

            Timestamp    first_name          twitter  \
0  5/24/2018 11:39:18           Zoe     @Zoe_LeBlanc   
1  5/24/2018 11:48:35         Keanu     @keanuthings   
2  5/28/2018 18:46:25        Golnar     @GolnarNemat   
3   5/29/2018 8:39:27          Paul                    
4  5/29/2018 11:20:15       Richard                    
5  5/29/2018 12:15:19       Taylor   @taylormariemal   
6  5/29/2018 14:56:24          Jack                    
7  5/29/2018 18:34:10         Lily        @lilyibrew   
8  5/29/2018 18:42:59  Sandra Kruse   @Sandi_Peaches   
9  5/29/2018 20:23:10       Richard       @RLHeppner   

                               department                 university  \
0                  History, Scholars' Lab     University of Virginia   
1                  Comparative Literature                       UCLA   
2         History of Art and Architecture  University of Pittsburgh    
3                                 English                        CMU   
4                      

In [141]:
#Use the splitDataFrameList function to split up any answers that have multiple values (eg. DH methods or research interests)
def splitDataFrameList(df,target_column,separator):
    ''' Split a multi-valued column into one row per value.

    df = dataframe to split (it is not modified),
    target_column = the column containing the values to split
    separator = the symbol used to perform the split
    returns: a new dataframe, with the same columns in the same order as df and a
    fresh 0..n-1 index, in which each entry of the target column is separated,
    with each element moved into a new row.
    The values in the other columns are duplicated across the newly divided rows.

    Cells of the target column that are not strings (for example NaN for an
    unanswered question) cannot be split; they are kept unchanged as a single row
    instead of raising an AttributeError.
    An empty input returns an empty dataframe that still has all of df's columns.
    The split pieces are not whitespace-trimmed: 'a, b' split on ',' gives 'a' and ' b'.
    raises: KeyError if target_column is not a column of df.
    '''
    # Fail early and consistently; this also covers an empty df, where a
    # row-by-row lookup would never run and so never report the problem.
    if target_column not in df.columns:
        raise KeyError(target_column)

    new_rows = []
    # Iterate over plain dict records rather than using df.apply for its side effects.
    for record in df.to_dict('records'):
        value = record[target_column]
        # Only strings can be split; anything else stays as one untouched piece.
        pieces = value.split(separator) if isinstance(value, str) else [value]
        for piece in pieces:
            new_row = dict(record)
            new_row[target_column] = piece
            new_rows.append(new_row)

    # Passing the columns explicitly keeps their order and keeps them present
    # even when there are no rows at all.
    return pd.DataFrame(new_rows, columns=df.columns)

# Run the split twice: first on the DH methods answers, then on the university answers
# of that result, so each row ends up holding a single method and a single university.
# Try changing the column names
first_split_data = splitDataFrameList(df=data, target_column='DH_methods', separator=',')
second_split_data = splitDataFrameList(df=first_split_data, target_column='university', separator=',')

In [142]:
# Normalise our source and target columns so that identical answers are read as the same items:
# first trim surrounding whitespace (splitting 'a, b' on ',' leaves ' b', and some raw
# answers carry trailing spaces), then put everything in lower case.
second_split_data['DH_methods'] = second_split_data['DH_methods'].str.strip().str.lower()
second_split_data['university'] = second_split_data['university'].str.strip().str.lower()

In [143]:
# Keep only the two columns that are passed on to network navigator
network_navigator_data = second_split_data.loc[:, ['DH_methods', 'university']]
# Save the selection as a csv file, leaving out the dataframe index
network_navigator_data.to_csv('navigator.csv', index=False)