In [3]:
from googleapiclient.discovery import build
import pandas as pd
from datetime import datetime, timedelta
import logging
 
# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler("youtube_data_collection.log"), logging.StreamHandler()]
)
 
# API key from Google Cloud Console
API_KEY = ''
 
# Set up YouTube API client
youtube = build('youtube', 'v3', developerKey=API_KEY)
 
def get_video_details(video_id):
    """Fetch additional details for a specific video by its ID."""
    logging.info(f"Fetching details for video ID: {video_id}")
    request = youtube.videos().list(
        part="snippet,statistics",
        id=video_id
    )
    response = request.execute()
   
    if response['items']:
        item = response['items'][0]
        video_details = {
            'title': item['snippet']['title'],
            'description': item['snippet']['description'],
            'published_at': item['snippet']['publishedAt'],
            'view_count': item['statistics'].get('viewCount', 'N/A'),
            'like_count': item['statistics'].get('likeCount', 'N/A'),
            'comment_count': item['statistics'].get('commentCount', 'N/A'),
            'video_id': video_id,
            'url': f"https://www.youtube.com/watch?v={video_id}",
            'thumbnails': item['snippet']['thumbnails']
        }
        logging.info(f"Fetched details for video ID: {video_id}")
        return video_details
    else:
        logging.warning(f"No details found for video ID: {video_id}")
        return None
 
def search_youtube_videos(keywords, hashtags):
    # Combine keywords and hashtags into a single search query
    query = ' '.join(keywords + hashtags)
    logging.info(f"Searching YouTube for query: {query}")
 
    # Define date limit for the last 2 years
    two_years_ago = (datetime.now() - timedelta(days=450)).isoformat("T") + "Z"
 
    # Prepare to collect all video data
    videos = []
    next_page_token = None
    page_count = 0
 
    while True:
        # Search for videos on YouTube with pagination support
        request = youtube.search().list(
            q=query,
            part='snippet',
            type='video',
            maxResults=50,  # YouTube API allows a max of 50 results per page
            publishedAfter=two_years_ago,
            pageToken=next_page_token
        )
        response = request.execute()
        page_count += 1
        logging.info(f"Processing page {page_count} of results for query: {query}")
 
        # Process each video search result
        for item in response.get('items', []):
            video_id = item['id']['videoId']
            video_info = get_video_details(video_id)
            if video_info:
                video_info['hashtag'] = ', '.join(hashtags)  # Store hashtags as a single string
                videos.append(video_info)
 
        # Check if there are more pages
        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            logging.info(f"No more pages left for query: {query}")
            break  # Exit loop if no more pages are available
 
    logging.info(f"Completed search for query: {query}")
    return videos
 
# Hashtags and keywords from your image
hashtags = [ 
"#bath & bodywork",
"#bath and bodywork",
"#bathandbodyworkssale",
"#bath & body work",
"#bathandbodyworkshaul",
"#bathandbodyworksdailyfind",
"#bathandbodyworkscoupons",
"#bathandbodyworksmurah",
"#bathandbodyworksaddict",
"#bathandbodyworksph",
"#bathandbodyworkshk",
"#bathandbodyworks_code"  
]
keywords = [" "]
 
# Run the search and collect data for each hashtag
all_videos = []
for hashtag in hashtags:
    logging.info(f"Starting search for hashtag: {hashtag}")
    results = search_youtube_videos(keywords, [hashtag])
    all_videos.extend(results)
    logging.info(f"Finished search for hashtag: {hashtag}. Found {len(results)} videos.")
 
# Create a DataFrame from the results
df = pd.DataFrame(all_videos)
 


2025-01-03 12:20:20,101 - INFO - file_cache is only supported with oauth2client<4.0.0
2025-01-03 12:20:20,111 - INFO - Starting search for hashtag: #bathandbodywork
2025-01-03 12:20:20,129 - INFO - Searching YouTube for query:   #bathandbodywork
2025-01-03 12:20:21,781 - INFO - Processing page 1 of results for query:   #bathandbodywork
2025-01-03 12:20:21,781 - INFO - Fetching details for video ID: PHGiwT2GMBQ
2025-01-03 12:20:21,927 - INFO - Fetched details for video ID: PHGiwT2GMBQ
2025-01-03 12:20:21,943 - INFO - Fetching details for video ID: A8DixgAQjyQ
2025-01-03 12:20:22,077 - INFO - Fetched details for video ID: A8DixgAQjyQ
2025-01-03 12:20:22,077 - INFO - Fetching details for video ID: xq69eiNb-Eo
2025-01-03 12:20:22,195 - INFO - Fetched details for video ID: xq69eiNb-Eo
2025-01-03 12:20:22,195 - INFO - Fetching details for video ID: QWRd1xPygZI
2025-01-03 12:20:22,286 - INFO - Fetched details for video ID: QWRd1xPygZI
2025-01-03 12:20:22,288 - INFO - Fetching details for vide

HttpError: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/videos?part=snippet%2Cstatistics&id=HsjtvBbJjkI&key=AIzaSyCNbpBbr5HCKzue7eP7fvN5OmFk75DtnS8&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.', 'domain': 'youtube.quota', 'reason': 'quotaExceeded'}]">

In [4]:
df = pd.DataFrame(all_videos)

In [None]:
df.to_excel(r"D:\Avinash.Vishwakarma_Data\OneDrive - Course5 Intelligence Limited\Avinash-code\Non Billable Scrape\bath and body\Extended Data\bath and body_v6.xlsx")

Same code but with Date Filter

In [20]:
from googleapiclient.discovery import build
import pandas as pd
from datetime import datetime, timedelta
import logging

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler("youtube_data_collection.log"), logging.StreamHandler()]
)

# API key from Google Cloud Console
API_KEY = 'AIzaSyBAULgCdV0R5v5nTmi5kW6bDpqWuMPf3xE'

# Set up YouTube API client
youtube = build('youtube', 'v3', developerKey=API_KEY)

def get_video_details(video_id):
    """Fetch additional details for a specific video by its ID."""
    logging.info(f"Fetching details for video ID: {video_id}")
    request = youtube.videos().list(
        part="snippet,statistics",
        id=video_id
    )
    response = request.execute()

    if response['items']:
        item = response['items'][0]
        video_details = {
            'title': item['snippet']['title'],
            'description': item['snippet']['description'],
            'published_at': item['snippet']['publishedAt'],
            'view_count': item['statistics'].get('viewCount', 'N/A'),
            'like_count': item['statistics'].get('likeCount', 'N/A'),
            'comment_count': item['statistics'].get('commentCount', 'N/A'),
            'video_id': video_id,
            'url': f"https://www.youtube.com/watch?v={video_id}",
            'thumbnails': item['snippet']['thumbnails']
        }
        logging.info(f"Fetched details for video ID: {video_id}")
        return video_details
    else:
        logging.warning(f"No details found for video ID: {video_id}")
        return None

def search_youtube_videos(keywords, hashtags):
    # Combine keywords and hashtags into a single search query
    query = ' '.join(keywords + hashtags)
    logging.info(f"Searching YouTube for query: {query}")

    # Set the new start date (December 20, 2024)
    start_date = datetime(2024, 12, 20)  # Set the start date to December 20, 2024
    published_after = start_date.isoformat("T") + "Z"  # Convert to ISO format with Z for UTC

    # Prepare to collect all video data
    videos = []
    next_page_token = None
    page_count = 0

    while True:
        # Search for videos on YouTube with pagination support
        request = youtube.search().list(
            q=query,
            part='snippet',
            type='video',
            maxResults=50,  # YouTube API allows a max of 50 results per page
            publishedAfter=published_after,
            pageToken=next_page_token
        )
        response = request.execute()
        page_count += 1
        logging.info(f"Processing page {page_count} of results for query: {query}")

        # Process each video search result
        for item in response.get('items', []):
            video_id = item['id']['videoId']
            video_info = get_video_details(video_id)
            if video_info:
                video_info['hashtag'] = ', '.join(hashtags)  # Store hashtags as a single string
                videos.append(video_info)

        # Check if there are more pages
        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            logging.info(f"No more pages left for query: {query}")
            break  # Exit loop if no more pages are available

    logging.info(f"Completed search for query: {query}")
    return videos

# Hashtags for the search
hashtags = [ 
    "#irishspring",
"#irishspringsoap",
"#irishspringclean",
"#irishspringfresh",
"#IrishSpringScent",
"#SoftsoapFresh",
"#SoftsoapLove",
"#SoftsoapClean",
"#softsoapbodywash",
"#softsoap"

]
keywords = [" "]  # Empty keywords to focus only on hashtags

# Run the search and collect data for each hashtag
all_videos = []
for hashtag in hashtags:
    logging.info(f"Starting search for hashtag: {hashtag}")
    results = search_youtube_videos(keywords, [hashtag])
    all_videos.extend(results)
    logging.info(f"Finished search for hashtag: {hashtag}. Found {len(results)} videos.")

# Create a DataFrame from the results
df = pd.DataFrame(all_videos)

# Optionally, save the DataFrame to a CSV file
df.to_csv('youtube_video_data.csv', index=False)
logging.info(f"Saved the data to 'youtube_video_data.csv'.")


2025-01-03 13:20:51,089 - INFO - file_cache is only supported with oauth2client<4.0.0
2025-01-03 13:20:51,123 - INFO - Starting search for hashtag: #irishspring
2025-01-03 13:20:51,123 - INFO - Searching YouTube for query:   #irishspring
2025-01-03 13:20:52,734 - INFO - Processing page 1 of results for query:   #irishspring
2025-01-03 13:20:52,734 - INFO - Fetching details for video ID: dVxS36traME
2025-01-03 13:20:52,853 - INFO - Fetched details for video ID: dVxS36traME
2025-01-03 13:20:52,855 - INFO - Fetching details for video ID: cYpxsxnynig
2025-01-03 13:20:52,947 - INFO - Fetched details for video ID: cYpxsxnynig
2025-01-03 13:20:52,947 - INFO - Fetching details for video ID: 4j4q9olYMXQ
2025-01-03 13:20:53,067 - INFO - Fetched details for video ID: 4j4q9olYMXQ
2025-01-03 13:20:53,068 - INFO - Fetching details for video ID: zyB2SILz4MI
2025-01-03 13:20:53,162 - INFO - Fetched details for video ID: zyB2SILz4MI
2025-01-03 13:20:53,166 - INFO - Fetching details for video ID: pifdrg

In [15]:
df = pd.DataFrame(all_videos)

In [17]:
df.to_excel(r'D:\Avinash.Vishwakarma_Data\OneDrive - Course5 Intelligence Limited\Avinash-code\Non Billable Scrape\bath and body\Extended Data\ytbathand body.xlsx', index=False)

In [24]:
import pandas as pd

df = pd.read_excel(r"D:\Avinash.Vishwakarma_Data\OneDrive - Course5 Intelligence Limited\Avinash-code\Non Billable Scrape\bath and body\Extended Data\irishspring_instagram-scraper_2025-01-02_12-55-05-014.xlsx",sheet_name='Data')

# Select only the required columns
columns_to_keep = ["caption","commentsCount","dimensionsHeight","dimensionsWidth","displayUrl","firstComment","id","inputUrl","likesCount","ownerFullName","ownerId","ownerUsername","shortCode","timestamp","type","url"]
df_filtered = df[columns_to_keep]

df_filtered.to_excel(r"D:\Avinash.Vishwakarma_Data\OneDrive - Course5 Intelligence Limited\Avinash-code\Non Billable Scrape\bath and body\Extended Data\irishspring cleared.xlsx", index=False)

