# UPDATE MASTER CSV

Follow the instructions below to save all metadata in a single master CSV.

## Step 1

Run the cell below.

You will be prompted to follow a link to authorize access to the Drive. Click the link and make sure that you sign in as ptrstoryteam@gmail.com. Copy the authorization token, return to this page, paste the token into the provided box, and hit enter on your keyboard. This completes the authorization.

In [None]:
# LIBRARIES

import os
import shutil
import glob
import csv
from datetime import datetime

from google.colab import drive
import pandas as pd

# MOUNTING THE DRIVE/AUTHORIZATION

# Mount Google Drive at /content/drive/ so that the paths used by the later
# cells (which all start with /content/drive/) can be read and written.
# Running this is what triggers the authorization prompt described in Step 1.
# NOTE(review): force_remount=True presumably remounts even when the Drive is
# already mounted in this session -- confirm against the Colab documentation.
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


## Step 2

Run the cell below to complete initial setup.

In [None]:
# DIRECTORIES

# Root folder on the mounted Drive; the master CSV and the three sub-folders
# below are all addressed relative to this path (note the trailing slash).
FULL_PATH_CSV = "/content/drive/My Drive/Protect the Results Story Team/10 PTR DISTRIBUTION/Footage Library/METADATA/"

# Sub-folder holding CSVs that still have to be processed
INCOMING_CSV_FOLDER = "incoming_csv/"

# Sub-folder that CSVs are moved into once they have been processed
PROCESSED_CSV_FOLDER = "processed_csv/"

# Sub-folder receiving one text log per processed CSV
PROCESSED_LOGS_FOLDER = "processed_logs/"

# FILES

# File name of the master CSV (stored directly inside FULL_PATH_CSV)
NAME_MASTER_CSV = "master_csv_test.csv"

# OTHER CONSTANTS

# Columns copied from each incoming CSV into the master CSV, in output order
COLUMNS = [
    "hash_id",
    "project_id",
    "contributor_email",
    "contributor_name",
    "time_uploaded",
    "shot_title",
    "caption",
    "media_type",
    "latitude",
    "longitude",
    "video_duration",
    "tags",
]

# FUNCTIONS

def processed_csv(csv_full_path, already_processed_hash_id,
                  master_csv_path=None, columns=None):
  """Append the not-yet-recorded rows of one incoming CSV to the master CSV.

  Each row whose ``hash_id`` is not already known is reduced to ``columns``,
  its ``tags`` cell is turned into a list of whitespace-separated tags, and
  the result is appended (with its row index, without a header line) to the
  master CSV.

  Args:
    csv_full_path: Path of the incoming CSV to read.
    already_processed_hash_id: Iterable of hash IDs already present in the
      master CSV. It is only read, never modified.
    master_csv_path: File the rows are appended to. Defaults to
      ``FULL_PATH_CSV + NAME_MASTER_CSV``.
    columns: Columns copied into the master CSV, in output order. Defaults
      to ``COLUMNS``.

  Returns:
    A tuple ``(potentially_changed_hashid, error_hashid)``: the hash IDs that
    were skipped because they were already recorded (including repeats inside
    this same file), and a list of ``(hash_id, error message)`` pairs for
    rows that could not be written.
  """
  # Resolve the defaults lazily so the module constants are only needed when
  # the caller does not supply explicit values.
  if master_csv_path is None:
    master_csv_path = FULL_PATH_CSV + NAME_MASTER_CSV
  columns = list(COLUMNS if columns is None else columns)

  df_raw = pd.read_csv(csv_full_path)

  # Set for O(1) membership tests; it also grows as rows are written so that
  # a hash ID repeated inside this file is not appended twice.
  seen_hash_ids = set(already_processed_hash_id)

  # Record the hashid that has potentially changed
  potentially_changed_hashid = []

  # Record hashid that is not processed
  error_hashid = []

  # Iterate through each row
  for i, row in df_raw.iterrows():
    hashid = row["hash_id"]
    if hashid in seen_hash_ids:
      potentially_changed_hashid.append(hashid)
      continue
    try:
      # Obtain subset to insert into master CSV
      record = row[columns].to_dict()

      if "tags" in record:
        raw_tags = record["tags"]
        # str.split() with no argument splits on any run of whitespace, so
        # newline-separated tags need no prior replace. An empty cell (NaN)
        # becomes an empty tag list instead of failing the whole row.
        record["tags"] = [] if pd.isna(raw_tags) else str(raw_tags).split()

      # dtype=object keeps every value exactly as read; index=[i] preserves
      # the row's index from the incoming file in the first output column.
      subset_csv = pd.DataFrame([record], index=[i], columns=columns, dtype=object)

      # Insert into CSV
      subset_csv.to_csv(master_csv_path, header=False, mode='a')

    except Exception as e:
      print("\n")
      print(str(e))
      # str() because hash IDs read from a CSV are not guaranteed to be strings
      print("Unable to process: " + str(hashid))
      error_hashid.append((hashid, str(e)))
      continue

    seen_hash_ids.add(hashid)
  return potentially_changed_hashid, error_hashid

## Step 3+

Run the cell below to update the master CSV with data from incoming CSVs. Re-run the cell every time new CSVs are uploaded.

In [None]:
# Full path of the master CSV, built once and reused below
master_csv_path = FULL_PATH_CSV + NAME_MASTER_CSV

if os.path.exists(master_csv_path):
  print("MASTER CSV ALREADY EXISTS")
else:
  # Create a csv file with the columns if it does not exist
  print("NO MASTER CSV FOUND, CREATING NEW FILE")
  pd.DataFrame(columns=COLUMNS).to_csv(master_csv_path)

# Make sure the destination folders exist before anything is moved or logged
os.makedirs(FULL_PATH_CSV + PROCESSED_CSV_FOLDER, exist_ok=True)
os.makedirs(FULL_PATH_CSV + PROCESSED_LOGS_FOLDER, exist_ok=True)

# Keep track of hashids that have been processed
already_processed_hash_id = pd.read_csv(master_csv_path)["hash_id"].to_list()

# Everything currently sitting in the incoming folder (printed for reference)
all_files_in_dir = os.listdir(FULL_PATH_CSV + INCOMING_CSV_FOLDER)
print(all_files_in_dir)

# Only the .csv files directly inside the incoming folder; the pattern is not
# recursive, so other METADATA sub-folders are never searched.
all_csv_files_in_dir = glob.glob(FULL_PATH_CSV + INCOMING_CSV_FOLDER + '*.csv')
print(all_csv_files_in_dir)

for csv_full_path in all_csv_files_in_dir:
  csv_filename = os.path.basename(csv_full_path)
  print("Processing " + csv_filename + "\n")
  potentially_changed_hashid, error_hashid = processed_csv(csv_full_path, already_processed_hash_id)

  # Refresh from the master so hash IDs appended by this file are skipped
  # when they show up again in a later file of the same run.
  already_processed_hash_id = pd.read_csv(master_csv_path)["hash_id"].to_list()

  try:
    # Move file to processed_folder
    shutil.move(csv_full_path, FULL_PATH_CSV + PROCESSED_CSV_FOLDER + csv_filename)

    print("Moving " + csv_filename + " to processed folder")

    # Record log: same base name as the CSV, with a .txt extension
    log_basename = os.path.splitext(csv_filename)[0]
    full_path_filename_log = FULL_PATH_CSV + PROCESSED_LOGS_FOLDER + log_basename + ".txt"
    with open(full_path_filename_log, "w") as log_file:
      print("List of hashids skipped:\n", file=log_file)
      print(*potentially_changed_hashid, sep="\n", file=log_file)

      print("\nList of hashids with errors:\n", file=log_file)
      # Each entry is a (hash_id, error message) pair from processed_csv
      for error_entry in error_hashid:
        print(' '.join(str(x) for x in error_entry), file=log_file)

    print("\n Processing done \n")

  except Exception as e:
    # Moving or logging failed; report the file and carry on with the next one
    print("\n")
    print(str(e))
    print("Unable to process: " + csv_filename)


MASTER CSV ALREADY EXISTS
['swingstates_footage_export.csv']
['/content/drive/My Drive/Protect the Results Story Team/10 PTR DISTRIBUTION/Footage Library/METADATA/incoming_csv/swingstates_footage_export.csv']
Processing swingstates_footage_export.csv

Moving swingstates_footage_export.csv to processed folder

 Processing done 



### Attribution

This code was written by Jenisha Patel and very very slightly edited and reorganized by Brienne Hayes.