## Get Channel ID

In [1]:
from googleapiclient.discovery import build

In [3]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if one exists) into the process
# environment so the API key never has to be hard-coded in the notebook.
load_dotenv()

# Read the YouTube Data API key. Fail fast with an actionable message instead
# of passing None to the API client and getting an opaque error later on.
api_key = os.getenv("API_KEY")
if not api_key:
    raise RuntimeError(
        "API_KEY is not set. Define it in a .env file or in the environment "
        "before running this notebook."
    )

In [5]:
# Create a YouTube API client
youtube = build('youtube', 'v3', developerKey=api_key)

# Search for the channel by name; only the best match is requested.
channel_query = 'Lex Fridman'
request = youtube.search().list(
    q=channel_query,
    type='channel',
    part='snippet',
    maxResults=1
)
response = request.execute()

# Get the channel ID. An empty result list is reported explicitly rather than
# surfacing as a bare IndexError on items[0].
channel_items = response.get('items', [])
if not channel_items:
    raise ValueError(f"No channel found for search query {channel_query!r}")

channel_id = channel_items[0]['snippet']['channelId']
print("Channel ID:", channel_id)

Channel ID: UCSHZKyawb77ixDdsGog4iWA


## Fetch All Videos From Past 2 Years

#### 1.  Import Required Libraries

In [7]:
# Import libraries
import pandas as pd
import time
from datetime import datetime, timedelta, timezone
import os

## Define and Create Directory Paths

To keep outputs organized and reproducible, we programmatically create the following directories if they don't already exist:

- **raw data**
- **processed data**
- **results**
- **documentation**

These directories will store intermediate and final outputs for reproducibility.

In [9]:
# Resolve the project root as the parent of the current working directory.
current_dir = os.getcwd()
project_root_dir = os.path.dirname(current_dir)

# Data folder, split into raw and processed sub-folders.
data_dir = os.path.join(project_root_dir, 'data')
raw_dir, processed_dir = (
    os.path.join(data_dir, sub_folder) for sub_folder in ('raw', 'processed')
)

# Results folder and documentation folder sit directly under the project root.
results_dir, docs_dir = (
    os.path.join(project_root_dir, top_folder) for top_folder in ('results', 'docs')
)

# Create every output directory; exist_ok makes re-running this cell harmless.
for folder in (raw_dir, processed_dir, results_dir, docs_dir):
    os.makedirs(folder, exist_ok=True)

#### 2. Set Up Time Range (Last 2 Years) 

In [11]:
# Length of the look-back window in days (2 x 365).
LOOKBACK_DAYS = 730

# Capture the current UTC instant once so both bounds of the window are derived
# from the same moment and the window is exactly LOOKBACK_DAYS long.
now_utc = datetime.now(timezone.utc)

# Today's date and 2 years back, as ISO 8601 strings with a UTC offset
today = now_utc.isoformat()
two_years_ago = (now_utc - timedelta(days=LOOKBACK_DAYS)).isoformat()

#### 3. Fetch Videos Metadata

In [13]:
# One dict per video; turned into a DataFrame in the next step.
video_data = []

# Paginate through all videos
next_page_token = None

while True:
    # One page (up to 50 results) of the channel's videos in the time window.
    # The channel ID found earlier in the notebook is reused here instead of
    # repeating it as a string literal.
    request = youtube.search().list(
        part="snippet",
        channelId=channel_id,
        maxResults=50,
        publishedAfter=two_years_ago,
        publishedBefore=today,
        order="date",
        type="video",
        pageToken=next_page_token,
    )
    response = request.execute()

    search_items = response.get("items", [])
    page_video_ids = [item["id"]["videoId"] for item in search_items]

    # Get video stats for the whole page with a single videos.list call
    # (comma-separated IDs) instead of one request per video.
    details_by_id = {}
    if page_video_ids:
        video_request = youtube.videos().list(
            part="statistics,snippet",
            id=",".join(page_video_ids),
        )
        video_response = video_request.execute()
        details_by_id = {v["id"]: v for v in video_response.get("items", [])}

    # Walk the search results in their original (date) order so the row order
    # does not depend on the order of the details response.
    for item in search_items:
        video_id = item["id"]["videoId"]
        details = details_by_id.get(video_id)
        if details is None:
            # No details returned for this ID: skip it, as before.
            continue

        stats = details.get("statistics", {})
        snippet = details.get("snippet", {})
        video_data.append({
            "videoId": video_id,
            # Title and publish date still come from the search result.
            "title": item["snippet"]["title"],
            "publishedAt": item["snippet"]["publishedAt"],
            # Counters missing from the statistics are recorded as 0.
            "viewCount": int(stats.get("viewCount", 0)),
            "likeCount": int(stats.get("likeCount", 0)),
            "commentCount": int(stats.get("commentCount", 0)),
            "tags": snippet.get("tags", []),
            "description": snippet.get("description", ""),
        })

    # Check if more pages exist
    next_page_token = response.get("nextPageToken")
    if not next_page_token:
        break

#### 4. Save Video Data to CSV

In [15]:
# Explicit column list: keeps the schema stable and guarantees the columns
# exist even when no videos were fetched, so later cells can still select
# video_df['videoId'] without a KeyError.
VIDEO_COLUMNS = [
    "videoId",
    "title",
    "publishedAt",
    "viewCount",
    "likeCount",
    "commentCount",
    "tags",
    "description",
]

video_df = pd.DataFrame(video_data, columns=VIDEO_COLUMNS)

# Persist the raw video metadata under the raw data directory.
video_filename = os.path.join(raw_dir, "lex_fridman_videos.csv")
video_df.to_csv(video_filename, index=False)
print("Saved video metadata. Total videos:", len(video_df))
print(f"\nvideo dataset saved to: {video_filename}")

Saved video metadata. Total videos: 94

video datase saved to: C:\Users\user\Documents\tekHer\YouTube-Channel-Insights\data\raw\lex_fridman_videos.csv


##  Fetch Top 50 Comments for Each Video

For each video:

- Get the top 50 comments (at most 50 top-level comments per video)
- Store the following fields for each comment:
  - `videoId`
  - `authorDisplayName`
  - `textDisplay`
  - `likeCount`
  - `publishedAt`

In [17]:
import time

# One dict per top-level comment; turned into a DataFrame in the next step.
comment_data = []

for video_id in video_df['videoId']:
    try:
        # Request a single page of up to 50 comment threads. order="relevance"
        # asks for the top comments; without it the API's default ordering is
        # used, which does not match the "top 50" goal of this section.
        request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=50,
            order="relevance",
            textFormat="plainText",
        )
        response = request.execute()

        for item in response.get('items', []):
            snippet = item['snippet']['topLevelComment']['snippet']
            comment_data.append({
                "videoId": video_id,
                "authorDisplayName": snippet.get("authorDisplayName", ""),
                "textDisplay": snippet.get("textDisplay", ""),
                "likeCount": snippet.get("likeCount", 0),
                "publishedAt": snippet.get("publishedAt", "")
            })

    except Exception as e:
        # Best effort: a failure for one video is reported and the loop
        # continues with the next one.
        print(f"Error fetching comments for {video_id}: {e}")

    finally:
        # Delay to avoid hitting rate limits; applied after failures too so
        # a run of errors does not turn into a burst of back-to-back requests.
        time.sleep(1)

#### Save Comments to CSV

In [19]:
# Explicit column list: keeps the schema stable and guarantees the CSV has a
# header row even when no comments were collected.
COMMENT_COLUMNS = [
    "videoId",
    "authorDisplayName",
    "textDisplay",
    "likeCount",
    "publishedAt",
]

comments_df = pd.DataFrame(comment_data, columns=COMMENT_COLUMNS)

# Persist the raw comments under the raw data directory.
comments_filename = os.path.join(raw_dir, "lex_fridman_comments.csv")
comments_df.to_csv(comments_filename, index=False)
print("Saved comments. Total:", len(comments_df))
# The message names the comments dataset (it previously said "video").
print(f"\ncomments dataset saved to: {comments_filename}")

Saved comments. Total: 4684

video datase saved to: C:\Users\user\Documents\tekHer\YouTube-Channel-Insights\data\raw\lex_fridman_comments.csv
