In [1]:
# !pip install google-api-python-client pandas vaderSentiment textblob python-dotenv

In [2]:
import csv
import os
import time

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
from dotenv import load_dotenv
from googleapiclient.discovery import build
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud

In [None]:

load_dotenv()


class YouTubeDataFetcher:
    def __init__(self):
        """
        Initializes the YouTubeDataFetcher object with the API key and output file.
        """
        self.api_key = os.getenv('YOUTUBE_API_KEY')
        self.output_file = 'youtube_data.csv'
        self.youtube = self.get_youtube_client()
        self.headers = ["video_id", "title", "description", "view_count", "like_count",
                        "dislike_count", "comment_count", "duration", "favorite_count", "comments", "sentiment_score"]

        # Initialize CSV file (with headers)
        try:
            with open(self.output_file, mode='w', newline='', encoding='utf-8') as file:
                writer = csv.DictWriter(file, fieldnames=self.headers)
                writer.writeheader()
        except FileNotFoundError:
            print("Could not create output file.")

    def get_youtube_client(self):
        """
        Initializes and returns the YouTube API client.
        """
        return build("youtube", "v3", developerKey=self.api_key)

    def fetch_video_data(self, video_id):
        """
        Fetches the video details (views, likes, comments) and comments for the given video_id.
        """
        try:
            # Get the video details (view count, like count, etc.)
            video_response = self.youtube.videos().list(
                part="snippet,statistics,contentDetails",
                id=video_id
            ).execute()

            if not video_response["items"]:
                raise Exception(f"No video found for ID: {video_id}")

            video_info = video_response["items"][0]
            title = video_info["snippet"]["title"]
            description = video_info["snippet"].get("description", "No description")
            view_count = int(video_info["statistics"].get("viewCount", 0))
            like_count = int(video_info["statistics"].get("likeCount", 0))
            dislike_count = int(video_info["statistics"].get("dislikeCount", 0))
            comment_count = int(video_info["statistics"].get("commentCount", 0))
            duration = video_info["contentDetails"]["duration"]
            favorite_count = int(video_info["statistics"].get("favoriteCount", 0))

            # Fetching comments (max 100 comments)
            comments = []
            comment_response = self.youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                maxResults=100
            ).execute()

            for item in comment_response["items"]:
                comment_text = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
                comments.append(comment_text)

            return {
                "video_id": video_id,
                "title": title,
                "description": description,
                "view_count": view_count,
                "like_count": like_count,
                "dislike_count": dislike_count,
                "comment_count": comment_count,
                "duration": duration,
                "favorite_count": favorite_count,
                "comments": comments
            }

        except Exception as e:
            print(f"Error fetching data for {video_id}: {str(e)}")
            return None

    @staticmethod
    def analyze_sentiment(comments):
        """
        Analyzes the sentiment of the provided comments using VADER Sentiment Analysis.
        Returns a sentiment score.
        """
        analyzer = SentimentIntensityAnalyzer()
        sentiment_score = 0
        num_comments = len(comments)

        for comment in comments:
            sentiment = analyzer.polarity_scores(comment)
            sentiment_score += sentiment['compound']

        # Calculate the average sentiment score
        if num_comments > 0:
            sentiment_score /= num_comments
        return sentiment_score

    def save_to_csv(self, video_data):
        """
        Saves the fetched video data into the CSV file.
        """
        try:
            with open(self.output_file, mode='a', newline='', encoding='utf-8') as file:
                writer = csv.DictWriter(file, fieldnames=self.headers)
                writer.writerow({
                    "video_id": video_data["video_id"],
                    "title": video_data["title"],
                    "description": video_data["description"],
                    "view_count": video_data["view_count"],
                    "like_count": video_data["like_count"],
                    "dislike_count": video_data["dislike_count"],
                    "comment_count": video_data["comment_count"],
                    "duration": video_data["duration"],
                    "favorite_count": video_data["favorite_count"],
                    "comments": " | ".join(video_data["comments"]),
                    "sentiment_score": video_data["sentiment_score"]
                })
        except Exception as e:
            print(f"Error saving data for {video_data['video_id']}: {str(e)}")

    def fetch_and_process_data(self, df):
        """
        Fetches and processes data for each video in the provided DataFrame.
        Saves the results into a CSV file.
        """
        for index, row in df.iterrows():
            video_id = row['youtubeId']
            print(f"Processing video: {video_id}")

            # Fetch video data
            video_data = self.fetch_video_data(video_id)

            if video_data:
                # Analyze sentiment of the comments
                sentiment_score = self.analyze_sentiment(video_data["comments"])
                video_data["sentiment_score"] = sentiment_score

                # Save the data to the CSV file
                self.save_to_csv(video_data)

            # Sleep to avoid rate-limiting
            time.sleep(1)  # Adjust the sleep time as needed to prevent rate limit errors

        print(f"Data fetching and saving complete. All data saved in {self.output_file}.")

    @staticmethod
    def generate_wordcloud(df):
        """
        Generates a word cloud from the comments of all videos.
        """
        all_comments = " ".join(df["comments"].dropna())
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_comments)

        plt.figure(figsize=(10, 6))
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis("off")
        plt.show()

    @staticmethod
    def generate_graphs(df):
        """
        Generates various plots for video statistics using Plotly Express.
        """
        # Top 10 most viewed videos
        top_10 = df.nlargest(10, "view_count")
        fig = px.bar(top_10, x='title', y='view_count', title='Top 10 Most Viewed Videos')
        fig.show()

        # Bottom 10 least viewed videos
        bottom_10 = df.nsmallest(10, "view_count")
        fig = px.bar(bottom_10, x='title', y='view_count', title='Bottom 10 Least Viewed Videos')
        fig.show()

        # Most liked video
        most_liked = df.loc[df['like_count'].idxmax()]
        fig = px.bar(x=[most_liked['title']], y=[most_liked['like_count']], title='Most Liked Video')
        fig.show()

        # Least liked video
        least_liked = df.loc[df['like_count'].idxmin()]
        fig = px.bar(x=[least_liked['title']], y=[least_liked['like_count']], title='Least Liked Video')
        fig.show()

        # Video with the highest duration
        highest_duration = df.loc[df['duration'].idxmax()]
        fig = px.bar(x=[highest_duration['title']], y=[highest_duration['duration']],
                     title='Video with Highest Duration')
        fig.show()


# Load the CSV containing the video data (e.g., vdoLinks.csv)
df = pd.read_csv('vdoLinks.csv')

# Set your YouTube API key
api_key = 'YOUR_YOUTUBE_API_KEY'

# Create an instance of YouTubeDataFetcher and process the data
fetcher = YouTubeDataFetcher()
fetcher.fetch_and_process_data(df)

# After fetching and saving the data, generate the word cloud and plots
fetcher.generate_wordcloud(df)
fetcher.generate_graphs(df)


Processing video: K26_sDKnvMU
Error fetching data for K26_sDKnvMU: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=K26_sDKnvMU&maxResults=100&key=AIzaSyAtthA89DQi0fovAsijA2YgfdqhTvT96t4&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">
Processing video: 3LPANjHlPxo
Error fetching data for 3LPANjHlPxo: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=3LPANjHlPxo&maxResults=100&key=AIzaSyAtthA89DQi0fovAsijA2YgfdqhTvT96t4&alt=json returned "The video identified by the <code><a h