<a href="https://colab.research.google.com/github/YinterestingProjects/human-wildlife-interactions/blob/main/label_eda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from google.colab import drive
# Mount Google Drive at /content/drive; later cells read and write files under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


### Reading in Vocab Metadata

In [3]:
# Load the updated YouTube-8M vocabulary directly from the hosted CSV and preview two rows.
new_url = 'https://research.google.com/youtube8m/csv/2/vocabulary.csv'
new_vocab = pd.read_csv(filepath_or_buffer=new_url)
new_vocab.head(n=2)

Unnamed: 0,Index,TrainVideoCount,KnowledgeGraphId,Name,WikiUrl,Vertical1,Vertical2,Vertical3,WikiDescription
0,0,788288,/m/03bt1gh,Game,https://en.wikipedia.org/wiki/Game,Games,,,"A game is structured form of play, usually und..."
1,1,539945,/m/01mw1,Video game,https://en.wikipedia.org/wiki/Video_game,Games,,,A video game is an electronic game that involv...


In [4]:
# Show the vocabulary row(s) whose Name is exactly 'Wildlife' (empty frame if absent).
new_vocab.loc[new_vocab['Name'] == 'Wildlife']

Unnamed: 0,Index,TrainVideoCount,KnowledgeGraphId,Name,WikiUrl,Vertical1,Vertical2,Vertical3,WikiDescription
364,363,4243,/m/01280g,Wildlife,https://en.wikipedia.org/wiki/Wildlife,Pets & Animals,,,Wildlife traditionally refers to undomesticate...


In [5]:
# Keep every vocabulary row whose first or second vertical is 'Pets & Animals'
# (per the original author, this vertical only shows up in Vertical1/Vertical2).
animal_df = new_vocab.loc[
    new_vocab[['Vertical1', 'Vertical2']].eq('Pets & Animals').any(axis=1)
]
print(f'There are {animal_df["Name"].nunique()} unique entities in Pets & Animal Vertical')

# Total the training-video counts per (Name, KnowledgeGraphId) pair, write the
# summary to Drive as CSV (without the index), and preview its last five rows.
summary_df = (
    animal_df
    .groupby(['Name', 'KnowledgeGraphId'])
    .agg({'TrainVideoCount': 'sum'})
    .reset_index()
)
summary_df.to_csv('/content/drive/MyDrive/MADS/MADS_Capstone/data/animal_entities_summary.csv', index=False)
summary_df.tail(5)

There are 163 unique entities in Pets & Animal Vertical


Unnamed: 0,Name,KnowledgeGraphId,TrainVideoCount
158,Welsh Corgi,/m/01ksq5,286
159,Whale,/m/084zz,1179
160,White-tailed deer,/m/02r0zt,529
161,Wildlife,/m/01280g,4243
162,Zebra,/m/0898b,202


In [6]:
# Map each entity name to its KnowledgeGraphId with the first three characters
# removed (e.g. '/m/01280g' -> '01280g').
entity2id = {
    name: kg_suffix
    for name, kg_suffix in zip(
        summary_df['Name'],
        summary_df['KnowledgeGraphId'].str.slice(start=3),
    )
}

#### pull all video IDs for entity (API based)

In [31]:
import requests
import re

def get_entity_videoIds(entity_name, timeout=30):
  '''Get the list of video ids in the YT8M training dataset tagged with an entity.

  Args:
    entity_name: entity name; must be a key of the module-level ``entity2id`` dict.
    timeout: seconds passed to ``requests.get`` so that a stalled connection
      raises instead of blocking the notebook indefinitely.

  Returns:
    list[str]: every word token of the response body after the first two.

  Raises:
    KeyError: if ``entity_name`` is not a key of ``entity2id``.
    requests.HTTPError: if the server responds with an error status.
    requests.Timeout: if no response arrives within ``timeout`` seconds.
  '''
  entity_id = entity2id[entity_name]

  url = f'https://storage.googleapis.com/data.yt8m.org/2/j/v/{entity_id}.js'
  response = requests.get(url, timeout=timeout)
  response.raise_for_status()

  # Split the payload into runs of word characters and drop the first two
  # tokens -- presumably the JS callback name and the entity id itself;
  # TODO confirm against a raw response.
  ids = re.findall(r'\w+', response.text)[2:]
  print(f'{entity_name}({entity_id}): {len(ids)} videos found')

  return ids

In [33]:
# Fetch the ids of all videos tagged 'Wildlife' through the API-based lookup.
videoIds = get_entity_videoIds(entity_name='Wildlife')

Wildlife(01280g): 4243 videos found


#### pull all video IDs for entity (file-based)

In [9]:
# pickle file extracted from Daniel's Github https://github.com/danielgordon10/youtube8m-data
# NOTE(review): unpickling can execute arbitrary code and this file originates from a
# third-party repository -- only load it if that source is trusted.
train_label_dict = pd.read_pickle('/content/drive/MyDrive/MADS/MADS_Capstone/data/parsed_data_renamed_train.pkl')
# Inspect the top-level keys of the loaded object.
train_label_dict.keys()  

dict_keys(['videos', 'vocabulary'])

In [10]:
# Vocabulary lookup in both directions: label -> name and name -> label.
label2vocab = train_label_dict['vocabulary']
vocab2label = {name: label for label, name in label2vocab.items()}

# Invert video -> labels into label -> list of videos carrying that label.
video2label = train_label_dict['videos']
label2video = {}
for video, labels in video2label.items():
  for label in labels:
    label2video.setdefault(label, []).append(video)

In [11]:
# Look up the label stored for the 'Wildlife' entity name in the pickle's vocabulary.
vocab2label['Wildlife']

363

In [12]:
# Videos labelled 'Wildlife' in the pickle. The label is resolved through
# vocab2label rather than hard-coded, so it stays correct if the vocabulary changes.
wildlife_youtubeIds = label2video[vocab2label['Wildlife']]

In [13]:
# Number of videos labelled 'Wildlife' in the pickle. The recorded run shows 4159 here
# versus 4243 from the API-based lookup -- NOTE(review): the gap is unexplained.
len(wildlife_youtubeIds)

4159

### Archive (2017 vocab.csv)
The ***Wildlife*** entity is not present in the 2017 version of the vocabulary.

In [14]:
# Load the archived 2017 vocabulary from Drive and preview the first five rows.
vocab_df = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/MADS/MADS_Capstone/data/vocabulary.csv')
vocab_df.head(n=5)

Unnamed: 0,Index,TrainVideoCount,KnowledgeGraphId,Name,WikiUrl,Vertical1,Vertical2,Vertical3,WikiDescription
0,3,378135,/m/01jddz,Concert,https://en.wikipedia.org/wiki/Concert,Arts & Entertainment,,,A concert is a live music performance in front...
1,7,200813,/m/0k4j,Car,https://en.wikipedia.org/wiki/Car,Autos & Vehicles,,,"A car is a wheeled, self-powered motor vehicle..."
2,8,181579,/m/026bk,Dance,https://en.wikipedia.org/wiki/Dance,Arts & Entertainment,,,Dance is a performance art form consisting of ...
3,11,135357,/m/02wbm,Food,https://en.wikipedia.org/wiki/Food,Food & Drink,,,Food is any substance consumed to provide nutr...
4,12,130835,/m/02vx4,Association football,https://en.wikipedia.org/wiki/Association_foot...,Sports,,,"Association football, more commonly known as f..."


In [15]:
# Row count of the 2017 vocabulary.
vocab_df.shape[0]

1000

In [16]:
# Number of entries in the 'Index' column of the 2017 vocabulary.
vocab_df['Index'].size

1000

In [17]:
# Report how many distinct (non-null) categories each vertical column holds.
print(f'There are {vocab_df["Vertical1"].nunique()} unique vertical 1 categories')
print(f'There are {vocab_df["Vertical2"].nunique()} unique vertical 2 categories')
print(f'There are {vocab_df["Vertical3"].nunique()} unique vertical 3 categories')

There are 24 unique vertical 1 categories
There are 18 unique vertical 2 categories
There are 6 unique vertical 3 categories


In [18]:
# Distinct values of the first vertical column.
vocab_df['Vertical1'].unique()

array(['Arts & Entertainment', 'Autos & Vehicles', 'Food & Drink',
       'Sports', 'Pets & Animals', 'Internet & Telecom',
       'Hobbies & Leisure', 'Business & Industrial', 'Beauty & Fitness',
       'Computers & Electronics', 'Home & Garden', 'Shopping', 'Travel',
       'Real Estate', 'Jobs & Education', 'Reference', 'Games', 'Science',
       'Law & Government', 'Books & Literature', 'People & Society',
       'News', 'Finance', 'Health'], dtype=object)

In [19]:
# Distinct values of the second vertical column (missing values included).
vocab_df['Vertical2'].unique()

array([nan, 'Travel', 'Law & Government', 'Pets & Animals',
       'Internet & Telecom', 'Real Estate', 'Hobbies & Leisure',
       'Business & Industrial', 'Games', 'Sports', 'Home & Garden',
       'Reference', 'Science', 'People & Society', 'Food & Drink',
       'Books & Literature', 'Shopping', 'News', 'Arts & Entertainment'],
      dtype=object)

In [20]:
# Distinct values of the third vertical column (missing values included).
vocab_df['Vertical3'].unique()

array([nan, 'Science', 'Law & Government', 'Home & Garden', 'Travel',
       'Sports', 'Food & Drink'], dtype=object)

In [21]:
# 2017 vocabulary rows whose first or second vertical is 'Pets & Animals'.
animals_df = vocab_df.loc[
    vocab_df[['Vertical1', 'Vertical2']].eq('Pets & Animals').any(axis=1)
]

In [22]:
# Report the number of distinct entity names among the 2017 animal rows.
print(f'There are {animals_df["Name"].nunique()} unique name categories')

There are 50 unique name categories


In [23]:
# List the distinct entity names among the 2017 animal rows.
animals_df['Name'].unique()

array(['Pet', 'Fish', 'Cat', 'Aquarium', 'Pony', 'Parrot', 'Lion',
       'Reptile', 'Shark', 'Deer', 'Bear', 'Penguin', 'Reef aquarium',
       'German Shepherd', 'Turtle', 'Duck', 'Rabbit', 'Largemouth bass',
       'Whale', 'Hamster', 'Dolphin', 'Snake', 'Pig', 'Bulldog',
       'Elephant', 'Monkey', 'Giant panda', 'Black cat', 'Lizard',
       'Goldfish', 'Golden Retriever', 'Tiger', 'Guinea pig', 'Mouse',
       'Mexican Creole hairless pig', 'Dog agility', 'Border Collie',
       'Frog', 'Eagle', 'Killer whale', 'Koi', 'Rock dove',
       'Domestic pigeon', 'Rottweiler', 'Hedgehog', 'Collie', 'Catfish',
       'White-tailed deer', 'Pug', 'Siamese fighting fish'], dtype=object)

In [24]:
# Look for a 'Wildlife' row in the 2017 vocabulary (the recorded run returned none).
vocab_df.loc[vocab_df['Name'] == 'Wildlife']

Unnamed: 0,Index,TrainVideoCount,KnowledgeGraphId,Name,WikiUrl,Vertical1,Vertical2,Vertical3,WikiDescription
