# HIDVL metadata spreadsheet reshape script
Sometimes, an HIDVL batch will only include new records. As part of the process to generate draft HIDVL MARC records by batch, we need to reshape the spreadsheet exported from the Airtable metadata submission form.

In [1]:
#import modules and libraries
import pandas as pd
import numpy as np
from datetime import datetime, date, time
filetime = datetime.now()
filetime = filetime.strftime("%Y-%m-%d_%I-%M_%p")

In [2]:
post_2019_dmd = input("enter file name and if appropriate filepath of airtable metadata csv: ")

enter file name and if appropriate filepath of airtable metadata csv:  /Users/alexandra/Desktop/hidvl-experiments/hidvl_august2020/Metadata-August 2020 publication cycle.csv


In [3]:
# load new metadata dataframe from csv
df_post_2019_dmd = pd.read_csv(post_2019_dmd,na_filter=False,quotechar = '"')

In [4]:
df_post_2019_dmd = df_post_2019_dmd.replace(r'^\s*$', np.nan, regex=True)

In [5]:
df_post_2019_dmd = df_post_2019_dmd.fillna(np.nan)

In [6]:
# specify new column names for the incoming metadata column headers
# for august 2020 batch, mapped Run time rounded to Run_Time
post_2019_dmd_newcols = {
    "HI #" : "HI",
    "Inventory": "NOID",
    "Publication cycle" : "Publication_Cycle",
    "Date of event" : "Date_of_Production",
    "Location information": "Location_Venue",
    "Language note": "Language_Note",
    "Language": "Language_List",
    "Main production credits": "Main_Production_Credits",
    "Event type" : "Worktypes",
    "Subject": "Subjects_653",
    "Copyright holder": "Rights_Holder",
    "Artist bio": "Artist_Bio",
    "Run time rounded":"Run_Time",
    "Collection": "Series_Title",
    "Conference":"Meeting_Information"
}

In [7]:
#rename the column headers
df_post_2019_dmd.rename(columns=post_2019_dmd_newcols, inplace=True)
#see what columns we have in the dataframe now:
#print("new df",df_post_2019_dmd)
print("new df",df_post_2019_dmd.info())
#Alternate titles have imported as non-null float64 values, and I'm not sure why!

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 36 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   HI                                 35 non-null     object 
 1   NOID                               35 non-null     object 
 2   DMD Finalized                      35 non-null     object 
 3   Title                              35 non-null     object 
 4   Alternate title 1                  1 non-null      object 
 5   Alternate title 2                  0 non-null      float64
 6   Alternate title 3                  0 non-null      float64
 7   Alternate title 4                  0 non-null      float64
 8   Alternate title 5                  0 non-null      float64
 9   Series_Title                       35 non-null     object 
 10  Meeting_Information                0 non-null      float64
 11  Worktypes                          35 non-null     object 
 

In [8]:
#add an empty column for 650 subjects
#based on https://stackoverflow.com/questions/16327055/how-to-add-an-empty-column-to-a-dataframe
df_post_2019_dmd["Subjects_650"] = np.nan

In [9]:
#do some concatenation to populate the format field
df_post_2019_dmd ["Format"] = df_post_2019_dmd["How many source media form items?"].astype(str) + " " + df_post_2019_dmd["Source media format"]

In [10]:
#combine copyright holder contact info into a single field
#based on https://stackoverflow.com/questions/60724940/concatenate-strings-across-columns-that-are-not-null
df_post_2019_dmd["Copyright_Contact"] = df_post_2019_dmd[["Copyright contact designation","Copyright address","Copyright business phone","Copyright mobile phone","Copyright fax","Copyright email 1","Copyright email 2","Copyright email 3","Copyright website"]].apply(lambda x: ', '.join(x.dropna()), axis=1)
df_post_2019_dmd["Copyright_Contact"] = df_post_2019_dmd["Copyright_Contact"].replace('\\n', ', ', regex=True)
print(df_post_2019_dmd["Copyright_Contact"])


0                                                      
1                                                      
2                                                      
3                                                      
4                                                      
5                                                      
6                                                      
7                                                      
8                                                      
9                                                      
10                                                     
11                                                     
12                                                     
13                                                     
14                                                     
15                                                     
16                                                     
17                                              

In [11]:
#combine alternate titles into a single cell
#based on https://stackoverflow.com/questions/60724940/concatenate-strings-across-columns-that-are-not-null
#this may not actually concatenate any alternate titles, because there is usually only ever one
df_post_2019_dmd["Alternate_Titles"] = df_post_2019_dmd[["Alternate title 1","Alternate title 2","Alternate title 3","Alternate title 4","Alternate title 5"]].apply(lambda x: '|'.join(x.dropna()), axis=1)

In [12]:
#tried filling the blank cells in this field with np.nan but it didn't work...
df_post_2019_dmd["Alternate_Titles"].fillna(np.nan)

0                                              
1                                              
2     The kidnapping of the minister of culture
3                                              
4                                              
5                                              
6                                              
7                                              
8                                              
9                                              
10                                             
11                                             
12                                             
13                                             
14                                             
15                                             
16                                             
17                                             
18                                             
19                                             
20                                      

In [13]:
#I still wanted to check to see if the values had become null values!
print(df_post_2019_dmd ["Alternate_Titles"].isnull())

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
32    False
33    False
34    False
Name: Alternate_Titles, dtype: bool


In [14]:
#get rid of any newline characters
df_post_2019_dmd["Alternate_Titles"] = df_post_2019_dmd["Alternate_Titles"].replace('\\n', '', regex=True)
df_post_2019_dmd["Main_Production_Credits"] = df_post_2019_dmd["Main_Production_Credits"].replace('\\n', '', regex=True)
df_post_2019_dmd["Summary"] = df_post_2019_dmd["Summary"].replace('\\n', '', regex=True)
df_post_2019_dmd["Artist_Bio"] = df_post_2019_dmd["Artist_Bio"].replace('\\n', '', regex=True)
df_post_2019_dmd["Participants"] = df_post_2019_dmd["Participants"].replace('\\n', '', regex=True)


In [15]:
#print a sample record that had newlines
print(df_post_2019_dmd.loc[8,"Artist_Bio"])

Deborah Castillo is a Venezuelan-born, Brooklyn-based multidisciplinary artist. She holds an MFA and BFA from Armando Reverón Higher Education School of Fine Arts in Caracas, Venezuela. Castillo has been granted numerous awards and residencies including NYFA Immigrant Artist Mentoring Program, (2015), NYC, The Banff  Center. Artist in Residence  Program in Visual Arts (2015) Canada, Atlantic Center for the Arts (2014), Florida and London Print Studio, (2007) UK as well as “Premio Armando Reverón”; AVAP in the “Young Artist Category” (2013), “XI Salón Eugenio Mendoza” Award, Sala Mendoza, (2003); VI Salón CANTV, Jóvenes con FIA” Award, (2003) Caracas, Venezuela and more. Her work has been exhibited at Museum of Arts and Design (US), New Museum (US), Rufino Tamayo Museum (Mexico), Carrillo Gil  Museum (Mexico); Escuela de Bellas Artes, Bolivian Biennial SIART (Bolivia), UCLA (US), ICA (UK), Palais de Tokyo (France, The Broad Museum (US), Smack Mellon (US), and the Hemispheric Institute (

In [16]:
#drop unwanted columns and see what remains
#for august 2020, dropped Run time instead of Run time rounded
df_post_2019_dmd.drop(["Run time","DMD Finalized","How many source media form items?","Source media format","Alternate title 1","Alternate title 2","Alternate title 3","Alternate title 4","Alternate title 5","Copyright contact designation","Copyright address","Copyright business phone","Copyright mobile phone","Copyright fax","Copyright email 1","Copyright email 2","Copyright email 3","Copyright website"], axis=1, inplace=True)

In [17]:
print(df_post_2019_dmd.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   HI                       35 non-null     object 
 1   NOID                     35 non-null     object 
 2   Title                    35 non-null     object 
 3   Series_Title             35 non-null     object 
 4   Meeting_Information      0 non-null      float64
 5   Worktypes                35 non-null     object 
 6   Date_of_Production       31 non-null     object 
 7   Location_Venue           27 non-null     object 
 8   Subjects_653             35 non-null     object 
 9   Summary                  35 non-null     object 
 10  Artist_Bio               35 non-null     object 
 11  Rights_Holder            35 non-null     object 
 12  Main_Production_Credits  22 non-null     object 
 13  Participants             18 non-null     object 
 14  Run_Time                 35 

In [18]:
df_post_2019_dmd = df_post_2019_dmd.sort_index(axis=1)

In [19]:
#df_combined_dmd = pd.concat([df_pre_2019_dmd,df_post_2019_dmd],ignore_index=True,keys=['pre', 'post'])
#print(df_combined_dmd)
print(df_post_2019_dmd.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Alternate_Titles         35 non-null     object 
 1   Artist_Bio               35 non-null     object 
 2   Copyright_Contact        35 non-null     object 
 3   Date_of_Production       31 non-null     object 
 4   Format                   35 non-null     object 
 5   HI                       35 non-null     object 
 6   Language_List            35 non-null     object 
 7   Language_Note            13 non-null     object 
 8   Location_Venue           27 non-null     object 
 9   Main_Production_Credits  22 non-null     object 
 10  Meeting_Information      0 non-null      float64
 11  NOID                     35 non-null     object 
 12  Participants             18 non-null     object 
 13  Publication_Cycle        35 non-null     object 
 14  Rights_Holder            35 

In [20]:
df_post_2019_dmd.to_csv("hidvl_metadata_reshaped_%s.csv"%filetime, index=False)

The end!