# This notebook performs a train / val / test split on the wildlife IDs, generates the corresponding video train / val / test matrices, and stores both the ID lists and the matrices so that results are reproducible.

In [None]:
import os
from tqdm import tqdm
import numpy as np
import tensorflow as tf
import pandas as pd
import requests
import re
import json
import pickle
from path import Path

In [None]:
print(tf.__version__)
print(pd.__version__)
! python --version

Please set the base path where the data is stored, and the path to the cloned repository.

In [None]:
# this should be the path where you are storing the entire YT8M dataset
# (the 'video' and 'frame' directories used later are resolved relative to it)
base_path = Path('/nfs/turbo/seas-nhcarter/human_wildlife_interactions')
# this should be where you are storing the cloned repo
# (the location of hunting_dict.json is resolved relative to it)
repo_path = Path("/nfs/turbo/seas-nhcarter/human_wildlife_interactions/repo")

In [None]:
# derive the remaining locations from base_path and repo_path
# (the `/` operator already yields a Path, so no extra wrapping is needed)
video_path = base_path / 'video'
frame_path = base_path / 'frame'
hunting_dict = repo_path / "human_wildlife_interactions/data/processed/hunting_dict.json"

In [None]:
# load the pickled video data
# NOTE: unpickling runs code embedded in the file, so only load pickles you trust
with open(video_path / 'classifier_video_data/video_data.pkl', 'rb') as pickle_file:
    data = pickle.load(pickle_file, encoding = 'utf-8')
# load the clustering results (JSON) that supply the label for each ID
with open(hunting_dict) as json_file:
    cluster_results = json.load(json_file)

### Train / Val / Test split

In [None]:
# remove the entries that didn't make it through clustering
# (loop over a snapshot of the keys, because entries are deleted from `data` inside the loop)
keylist = list(data)
y_vals = []
for key in keylist:
    # test membership on the dict itself: O(1) per key instead of scanning a list of keys
    if key in cluster_results:
        # labels are collected in the same order as the keys that remain in `data`
        y_vals.append(cluster_results[key])
    else:
        del data[key]

# train-test split
# NOTE(review): train_test_split is not imported in any visible cell; the call matches
# sklearn.model_selection.train_test_split -- confirm and import it before running this cell
# first hold out 15% of the IDs as the test set, stratified by label
X_train, X_test, y_train, y_test = train_test_split(list(data), y_vals, test_size=.15, random_state=42, stratify=y_vals)
# then hold out 15% of the remaining IDs as the validation set, again stratified by label
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=.15, random_state=42, stratify=y_train)
# sanity check
print(len(X_train), len(X_val), len(X_test), len(y_train), len(y_val), len(y_test))

### Generate Matrices

In [None]:
def matrix_generator(data_ids):
    """Build one row per ID: mean_rgb values, mean_audio values, then the label.

    Reads the notebook-level ``data`` and ``cluster_results`` mappings.

    Args:
        data_ids: iterable of keys to look up in ``data`` and ``cluster_results``.

    Returns:
        The list of rows (in the order of ``data_ids``) converted with ``np.array``.
    """
    rows = []

    for video_id in data_ids:
        features = data[video_id]['video_example'].features.feature
        rows.append([
            *features['mean_rgb'].float_list.value,
            *features['mean_audio'].float_list.value,
            # the label goes in the last column of the row
            cluster_results[video_id],
        ])

    return np.array(rows)

# build the train, test and val matrices (in that order) from their ID lists
train_matrix, test_matrix, val_matrix = [
    matrix_generator(ids) for ids in (X_train, X_test, X_val)
]

### Write to Storage

In [None]:
# NOTE(review): the input pickle is read from video_path / 'classifier_video_data', while the
# outputs below go to base_path / 'classifier_video_data' -- confirm this location is intended
write_path = base_path / 'classifier_video_data'

# train / val / test id lists
# each list is pickled to its own *_ids.pkl file, separate from the *_mat.csv matrices below
with open(write_path / 'train_ids.pkl', 'wb') as id_file:
    pickle.dump(X_train, id_file)
with open(write_path / 'val_ids.pkl', 'wb') as id_file:
    pickle.dump(X_val, id_file)
with open(write_path / 'test_ids.pkl', 'wb') as id_file:
    pickle.dump(X_test, id_file)

# matrices
# each row holds the features followed by the label in the last column (see matrix_generator)
train_df = pd.DataFrame(train_matrix)
test_df = pd.DataFrame(test_matrix)
val_df = pd.DataFrame(val_matrix)
train_df.to_csv(write_path / 'train_mat.csv')
test_df.to_csv(write_path / 'test_mat.csv')
val_df.to_csv(write_path / 'val_mat.csv')