In [38]:
import pandas as pd
import json
import csv

# Converting json to csv file

In [84]:
with open('/Users/kyle/awn_data/analyst/Takeout/YouTube and YouTube Music/history/watch-history.json') as file:
    data = json.load(file)

data_file = open('watch-history.csv', 'w', newline='')
csv_writer = csv.writer(data_file)

filtering the data to remove any ads, survey questions, and other marketing related videos that aren't representative of my actual watch history

In [85]:
count = 0

#filter out the youtube ads that appear in watch history
#filter out the youtube survey questions
filtered_data = [item for item in data if "details" not in item]
filtered_data = [item for item in filtered_data if (item['title'] != "Answered survey question")]
filtered_data = [item for item in filtered_data if "titleUrl" in item]

#create the csv file from the filtered data
for entry in filtered_data:
    if count == 0:
        header = entry.keys()
        csv_writer.writerow(header)
        count += 1
    csv_writer.writerow(entry.values())

data_file.close()

Here's what the dataframe looks like

In [86]:
df = pd.read_csv("/Users/kyle/awn_data/analyst/Takeout/watch-history.csv")
df.head

<bound method NDFrame.head of        header                                              title  \
0     YouTube  Watched TWICE - SIGNAL (late 2000s Kpop flip b...   
1     YouTube   Watched NewJeans - ASAP (pluggnb flip by ilypvc)   
2     YouTube         Watched How to actually become productive.   
3     YouTube  Watched You probably won’t survive 2024... Top...   
4     YouTube  Watched the ChatGPT store is about to launch… ...   
...       ...                                                ...   
8919  YouTube      Watched How to Learn ANY Language with Movies   
8920  YouTube               Watched What Is Your Language Level?   
8921  YouTube                                     Watched Reverb   
8922  YouTube     Watched Fashion Tools That’ll CHANGE YOUR LIFE   
8923  YouTube  Watched What Writers Can Learn From The Enneag...   

                                         titleUrl  \
0     https://www.youtube.com/watch?v=I8VgI2bFlfQ   
1     https://www.youtube.com/watch?v=F5vI8qmb2

# Using the Youtube Data v3 API

### Building the API

In [53]:
key = 'AIzaSyAAYRz4ctksjVf_VYKcmU6Zj5opYKQqUgE'
connURL = 'https://www.googleapis.com/youtube/v3/videos'

In [78]:
from googleapiclient.discovery import build

service = build('youtube', 'v3', developerKey=key)

### Testing its capabilities

Here, we will grab a random video to test with

In [91]:
testString = "https://www.youtube.com/watch?v=I8VgI2bFlfQ" # picked from the csv file

testRequestVideo = service.videos().list( # grabs more data on the video (specifically want the categoryId)
    part = 'snippet',
    id = ['I8VgI2bFlfQ', 'ZgYFpWivJ5o']
)
testRequestVideo.execute()

{'kind': 'youtube#videoListResponse',
 'etag': 'gJPn2djiss44M6ZAXkiH6DYMQ_Y',
 'items': [{'kind': 'youtube#video',
   'etag': 'FSzvjj0ubllfB9Ff_QOCHJCN_GA',
   'id': 'I8VgI2bFlfQ',
   'snippet': {'publishedAt': '2023-02-06T12:04:42Z',
    'channelId': 'UC_IGhptKgpbsVdsGDpmimIg',
    'title': 'TWICE - SIGNAL (late 2000s Kpop flip by ilypvc)',
    'description': '#twice #트와이스 #pluggnb \ni tried to make twice song to some old kpop type flip !! lol\ndont use without permissions',
    'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/I8VgI2bFlfQ/default.jpg',
      'width': 120,
      'height': 90},
     'medium': {'url': 'https://i.ytimg.com/vi/I8VgI2bFlfQ/mqdefault.jpg',
      'width': 320,
      'height': 180},
     'high': {'url': 'https://i.ytimg.com/vi/I8VgI2bFlfQ/hqdefault.jpg',
      'width': 480,
      'height': 360},
     'standard': {'url': 'https://i.ytimg.com/vi/I8VgI2bFlfQ/sddefault.jpg',
      'width': 640,
      'height': 480},
     'maxres': {'url': 'https://i.ytimg

### Grabbing important information from the API
As you can see in the test above, the categoryId is a numerical variable that corresponds to the category it belongs to. In order to convert this number into thte actual category, we will use Youtube's dictionary of categories, accessing it through the requests library in python.

In [89]:
import requests
params = {
    'key': key,
    'part':'snippet',
}
response = requests.get("https://www.googleapis.com/youtube/v3/videoCategories?part=snippet&regionCode=US&key=AIzaSyAAYRz4ctksjVf_VYKcmU6Zj5opYKQqUgE", params=params)
categories = {}
for item in response.json()['items']:
    categories[item['id']] = item['snippet']['title']

categories

Music


Here, we are cleaning up any videos that ended up being taken down, which makes accessing information on it impossible to do currently. So, we will be checking for this particular pattern that shows up for every request to a publicly unavailable video.

In [None]:
pattern = '"playabilityStatus":{"status":"ERROR","reason":"Video unavailable"'
pattern2 = "This is a private video."
def try_site(index_url):
    request = requests.get(index_url[1], allow_redirects=False)
    return [index_url[0], pattern in request.text]

In [None]:
index_urls = []
for index, row in df.iterrows():
    index_urls.append([index, row['titleUrl']])

In [None]:
# using parallel processing to make this code faster
import concurrent.futures

to_remove = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = {executor.submit(try_site, index_url) for index_url in index_urls}
    for future in concurrent.futures.as_completed(futures):
        if (future.result()[1]):
            to_remove.append(future.result()[0])
print(to_remove)

In [None]:
print(f'\nThe total number of videos removed from the watch history for no longer being publicly available: {len(to_remove)}\n')
test1 = to_remove[0]
print(f"Before removing index {test1}: {df.iloc[test1]}")
df = df.drop(index=to_remove)
print(f"After removing index {test1}: {df.iloc[test1]}")

Now that we have removed the publicly unavailable rows from the dataframe, we can continue with getting information on each video. We have a way of grabbing the video ids of any video url, as well as the informaiton of any video id. However, since the dataframe is so large, we need a speedier and more efficient way that will also limit the amount of free credits we use with this api! So, we will be doing batch requests.

In [None]:
videoIds = [[]]
batch_size = 20
for index, row in df.iterrows():
    url = row["titleUrl"]
    videoId = url[url.rfind("=")+1:]
    videoIds[len(videoIds) - 1].append(videoId)
    if (index + 1) % batch_size == 0:
        videoIds.append([])

In [None]:
print(f"Number of batches of size 20: {len(videoIds)}")

In [None]:
from googleapiclient.errors import HttpError
import googleapiclient

In [None]:
categoryCol = []
descriptionCol = []
tagsCol = []
def fetch_video_details(video_ids):
    try:
        request = service.videos().list(
            part="snippet",
            id=video_ids
        )
        response = request.execute()
        for i in range(len(video_ids)):
            if(response['items']):
                area = response['items'][i]['snippet']
                descriptionCol.append(area['description'])
                categoryCol.append(categories[area['categoryId']])
                if "tags" in area:
                    tagsCol.append(response['items'][i]['snippet']['tags'])
                else:
                    tagsCol.append([''])
            else:
                categoryCol.append(None)
                descriptionCol.append(None)
                tagsCol.append(None)
        return response
    except googleapiclient.errors.HttpError as e:
        print(f"An HTTP error {e.resp.status} occurred:\n{e.content}")
        return response

In [None]:
for batch in videoIds:
    fetch_video_details(batch)