● Data Extraction

1. Data Collection:

In [None]:
import googleapiclient.discovery
from googleapiclient.errors import HttpError
import io
import json
import os
import csv
import pandas as pd

In [None]:
# Read the API key from the environment instead of hard-coding it, so the
# credential is never stored in (or shared with) the notebook itself.
# NOTE: any key that was previously committed in this cell should be treated
# as leaked and rotated in the Google Cloud console.
API_KEY = os.environ.get('YOUTUBE_API_KEY')
if not API_KEY:
    raise RuntimeError(
        "Set the YOUTUBE_API_KEY environment variable to a YouTube Data API v3 key"
    )

# Initialize the YouTube Data API v3 client
youtube = googleapiclient.discovery.build('youtube', 'v3', developerKey=API_KEY)

In [None]:
def search_videos(query, max_results=50):
    """Return up to ``max_results`` video IDs matching ``query``.

    The search endpoint returns at most 50 results per request, so larger
    values of ``max_results`` are served by following ``nextPageToken`` until
    enough IDs have been collected or the results run out.

    Args:
        query: Free-text search string passed as the ``q`` parameter.
        max_results: Maximum number of video IDs to return.

    Returns:
        A list of video ID strings, in the order the API returned them.

    Raises:
        googleapiclient.errors.HttpError: propagated from ``request.execute()``.
    """
    page_limit = 50  # per-request cap of the search endpoint
    videos = []
    page_token = None

    while len(videos) < max_results:
        params = {
            'q': query,
            'type': 'video',
            'part': 'id',
            'maxResults': min(page_limit, max_results - len(videos)),
        }
        if page_token:
            params['pageToken'] = page_token

        response = youtube.search().list(**params).execute()

        items = response.get('items', [])
        videos.extend(item['id']['videoId'] for item in items)

        page_token = response.get('nextPageToken')
        # Stop when the API has no further page, or returned an empty page.
        if not page_token or not items:
            break

    return videos[:max_results]

def get_video_comments_with_details(video_id):
    """Fetch the first page of top-level comments for one video.

    Only a single ``commentThreads().list`` request is made (``maxResults=100``),
    so at most 100 top-level comments are returned; replies are not included.

    Args:
        video_id: ID of the video whose comments are requested.

    Returns:
        A list of dicts with the keys ``VideoID``, ``AuthorName``, ``AuthorID``
        and ``Comment``. The list is empty when the request fails (for example
        when comments are disabled); the failure is reported via ``print``.
        ``AuthorID`` is ``None`` when the API payload carries no author channel.
    """
    comments_data = []

    # Keep the try body to the network call only, so that unrelated errors in
    # the parsing below are not mistaken for HTTP failures.
    try:
        request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            textFormat="plainText",
            maxResults=100
        )
        response = request.execute()
    except HttpError as e:
        if "commentsDisabled" in str(e):
            print(f"Comments are disabled for video: {video_id}")
        else:
            # Include the underlying error so quota/permission problems are visible.
            print(f"Error fetching comments for video: {video_id} ({e})")
        return comments_data

    for thread in response.get('items', []):
        top_level_comment = thread['snippet']['topLevelComment']['snippet']
        # 'authorChannelId' can be missing from the payload; indexing it
        # directly would raise KeyError and abort the whole collection run.
        author_channel = top_level_comment.get('authorChannelId') or {}

        comments_data.append({
            'VideoID': video_id,
            'AuthorName': top_level_comment.get('authorDisplayName'),
            'AuthorID': author_channel.get('value'),
            'Comment': top_level_comment['textDisplay']
        })

    return comments_data

if __name__ == '__main__':
    # Replace 'renewable energy sustainability' with your desired query
    query = 'renewable energy sustainability'

    # Single source of truth for the output path, so the file that is written
    # and the file that is reported can never disagree.
    output_path = 'video_comments_with_details.csv'

    # Search for videos
    videos = search_videos(query)

    # Fetch the comments of every video into one flat list of records
    comments_details = []
    for video_id in videos:
        video_comments = get_video_comments_with_details(video_id)
        comments_details.extend(video_comments)

    # Convert the list of comments with details into a DataFrame.
    # Explicit columns keep the CSV header present even when no comments were
    # collected, so the preprocessing step can still read the file.
    comments_df = pd.DataFrame(
        comments_details,
        columns=['VideoID', 'AuthorName', 'AuthorID', 'Comment'],
    )

    # Save the DataFrame to a CSV file
    comments_df.to_csv(output_path, index=False)

    print(f"Comments with details have been stored in '{output_path}'")

2. Data Preprocessing:
    Clean and preprocess your data, including removing duplicates, handling missing data, and performing text normalization. 

In [None]:
import pandas as pd

# Minimum number of whitespace-separated words a comment must have to be kept
min_comment_length = 3  # Adjust this threshold as needed


def preprocess_comments(df, min_comment_length=3):
    """Return a cleaned copy of ``df`` with a normalised ``Comment`` column.

    Steps, in order: drop rows with a missing comment, drop duplicate
    comments, drop whitespace-only comments, drop comments with fewer than
    ``min_comment_length`` words, then lowercase the text and strip every
    character that is neither a word character nor whitespace.

    Args:
        df: DataFrame containing a ``Comment`` column.
        min_comment_length: Minimum word count for a comment to be kept.

    Returns:
        A new DataFrame; the input is not modified.
    """
    cleaned = df.dropna(subset=['Comment']).drop_duplicates(subset='Comment')

    comments = cleaned['Comment']
    has_text = comments.str.strip() != ''
    long_enough = comments.str.split().str.len() >= min_comment_length

    # .copy() so the column assignment below works on an independent frame
    # rather than on a filtered view of the input.
    cleaned = cleaned[has_text & long_enough].copy()

    # regex=True is required: since pandas 2.0 str.replace treats the pattern
    # as a literal string by default, which would leave punctuation untouched.
    cleaned['Comment'] = (
        cleaned['Comment'].str.lower().str.replace(r'[^\w\s]', '', regex=True)
    )
    return cleaned


if __name__ == '__main__':
    # Load your data into a DataFrame (this example assumes you have a CSV file)
    df = pd.read_csv('video_comments_with_details.csv')  # Replace with your data file

    df = preprocess_comments(df, min_comment_length)

    # Save the preprocessed data to a new file
    df.to_csv('preprocessed_comments_data.csv', index=False)


3. Data Filtering

In [None]:
import pandas as pd

# Step 1: Read the CSV file into a DataFrame
# NOTE(review): no cell visible here writes 'comments_with_sentiment.csv' (the
# preprocessing step saves 'preprocessed_comments_data.csv'); presumably a
# sentiment-scoring step produces it — confirm before running.
df = pd.read_csv('comments_with_sentiment.csv')

# Step 2: Define common keywords (replace with your list)
# Each keyword appears exactly once and is written in lowercase, because the
# comments are lowercased before matching and a mixed-case entry could never
# match. Order follows first appearance: topic phrases first, then single words.
common_keywords = [
    "renewable", "sustainable", "climate", "clean energy", "conservation",
    "solar", "wind", "hydroelectric", "geothermal", "biomass", "tidal energy",
    "sustainable practices", "environmental sustainability", "social sustainability",
    "economic sustainability", "sustainable development",
    "climate action", "carbon footprint", "greenhouse gases", "climate mitigation",
    "climate adaptation",
    "clean power", "green energy", "clean technology", "low-carbon energy", "energy efficiency",
    "energy-saving", "energy conservation", "power reduction",
    "eco-conscious", "environmentally friendly", "eco-friendly products", "green living",
    "electric vehicles", "public transportation", "cycling", "sustainable mobility",
    "biodiversity", "habitat preservation", "conservation efforts",
    "recycling", "reuse", "reduce", "circular economy principles",
    "sustainable architecture", "leed certification", "green building",
    "renewable sources", "renewable technology", "renewable power", "sustainable solutions",
    "sustainable living", "renewable practices", "sustainable initiatives",
    "clean environment", "green initiatives", "environmental conservation",
    "renewable economy", "sustainability goals", "renewable infrastructure", "sustainable transportation",
    "clean fuel", "green infrastructure", "sustainable consumption", "renewable innovations",
    "sustainable policies", "renewable investments", "sustainability measures",
    "clean power generation", "sustainable urban planning", "renewable solutions", "sustainability standards",
    "renewable technologies", "sustainability projects", "clean energy sources", "sustainable resources",
    "renewable initiatives", "sustainability practices", "clean energy solutions", "sustainable business",
    "renewable management", "sustainability assessment", "clean energy systems",
    "sustainable development goals", "renewable conservation", "renewable strategies", "sustainability efforts",
    "clean energy policies", "green energy solutions",
    "energy", "nuclear", "green", "waste", "gas", "electricity",
    "fuel", "fossil", "oil", "plants", "coal", "water", "storage", "renewables", "technology",
    "earth", "greenhouse", "batteries", "fuels", "expensive", "hydrogen", "plant", "global",
    "turbines", "land", "heat", "carbon", "co2", "sun", "environment", "human",
    "emissions", "uranium", "government", "battery", "pollution", "tax", "price",
    "tons", "natural", "high", "production", "time", "use",
]



# Step 3: Create a function to check if a comment contains common keywords
def contains_common_keywords(comment, keywords=None):
    """Return True if ``comment`` contains any keyword as a substring.

    Matching is case-insensitive: both the comment and each keyword are
    lowercased before comparison. It is a plain substring test, so a short
    keyword such as "use" also matches inside longer words ("because").

    Args:
        comment: The comment text. Non-string values (e.g. NaN produced when
            an empty CSV cell is read) are treated as containing no keywords
            instead of raising AttributeError.
        keywords: Iterable of keyword strings. Defaults to the module-level
            ``common_keywords`` list.

    Returns:
        bool
    """
    if not isinstance(comment, str):
        return False
    if keywords is None:
        keywords = common_keywords

    text = comment.lower()
    return any(keyword.lower() in text for keyword in keywords)

# Step 4: Build a per-row flag (True when the comment mentions a keyword)
# and keep only the flagged rows
keyword_mask = df['Comment'].apply(contains_common_keywords)
filtered_comments = df[keyword_mask]

# Step 5: Write the keyword-matching rows out as a new CSV (no index column)
filtered_comments.to_csv('filtered_comments2.csv', index=False)


Identify misconceptions or gaps in knowledge related to
renewable sources by analyzing the top keywords in comments

In [None]:
import pandas as pd
import nltk
from nltk import FreqDist
from nltk.corpus import stopwords
import string

# Load the keyword-filtered comments; the code below reads the 'Comment' and
# 'Sentiment Category' columns from this file.
df = pd.read_csv('filtered_comments2.csv')

# Drop rows whose comment is missing (empty CSV cells are read back as NaN);
# a NaN here would later break the ' '.join over the comment text.
df = df.dropna(subset=['Comment']).copy()

# Normalise the comments: lowercase, then strip everything that is neither a
# word character nor whitespace. regex=True is required because pandas >= 2.0
# treats the pattern as a literal string by default.
df['Comment'] = df['Comment'].str.lower()
df['Comment'] = df['Comment'].str.replace(r'[^\w\s]', '', regex=True)

# Separate comments by sentiment
positive_comments = df[df['Sentiment Category'] == 'Positive']['Comment']
negative_comments = df[df['Sentiment Category'] == 'Negative']['Comment']
neutral_comments = df[df['Sentiment Category'] == 'Neutral']['Comment']

# Define a list of stopwords
# NOTE(review): this needs the NLTK 'stopwords' corpus to be available locally;
# no download call is visible in this notebook — confirm it is installed.
stop_words = set(stopwords.words("english"))

# Common keywords for identifying frequency
# Each keyword appears exactly once and is written in lowercase, because the
# comments are lowercased before they are tokenized and compared to this list.
common_keywords = [
    "renewable", "sustainable", "climate", "clean energy", "conservation",
    "solar", "wind", "hydroelectric", "geothermal", "biomass", "tidal energy",
    "sustainable practices", "environmental sustainability", "social sustainability",
    "economic sustainability", "sustainable development",
    "climate action", "carbon footprint", "greenhouse gases", "climate mitigation",
    "climate adaptation",
    "clean power", "green energy", "clean technology", "low-carbon energy", "energy efficiency",
    "energy-saving", "energy conservation", "power reduction",
    "eco-conscious", "environmentally friendly", "eco-friendly products", "green living",
    "electric vehicles", "public transportation", "cycling", "sustainable mobility",
    "biodiversity", "habitat preservation", "conservation efforts",
    "recycling", "reuse", "reduce", "circular economy principles",
    "sustainable architecture", "leed certification", "green building",
    "renewable sources", "renewable technology", "renewable power", "sustainable solutions",
    "sustainable living", "renewable practices", "sustainable initiatives",
    "clean environment", "green initiatives", "environmental conservation",
    "renewable economy", "sustainability goals", "renewable infrastructure", "sustainable transportation",
    "clean fuel", "green infrastructure", "sustainable consumption", "renewable innovations",
    "sustainable policies", "renewable investments", "sustainability measures",
    "clean power generation", "sustainable urban planning", "renewable solutions", "sustainability standards",
    "renewable technologies", "sustainability projects", "clean energy sources", "sustainable resources",
    "renewable initiatives", "sustainability practices", "clean energy solutions", "sustainable business",
    "renewable management", "sustainability assessment", "clean energy systems",
    "sustainable development goals", "renewable conservation", "renewable strategies", "sustainability efforts",
    "clean energy policies", "green energy solutions",
    "energy", "nuclear", "green", "waste", "gas", "electricity",
    "fuel", "fossil", "oil", "plants", "coal", "water", "storage", "renewables", "technology",
    "earth", "greenhouse", "batteries", "fuels", "expensive", "hydrogen", "plant", "global",
    "turbines", "land", "heat", "carbon", "co2", "sun", "environment", "human",
    "emissions", "uranium", "government", "battery", "pollution", "tax", "price",
    "tons", "natural", "high", "production", "time", "use",
]

# Function to extract keywords and calculate frequency for a given sentiment
def extract_keywords(sentiment_comments, common_keywords):
    """Count how often each known keyword occurs in a group of comments.

    The comments are joined into one text, tokenized with
    ``nltk.word_tokenize``, and every token that is punctuation, a stopword
    (module-level ``stop_words``) or not in ``common_keywords`` is discarded.

    Args:
        sentiment_comments: Iterable of comment strings. Non-string entries
            (e.g. NaN) are skipped instead of making ``str.join`` raise.
        common_keywords: Iterable of keywords to count.

    Returns:
        A list of ``(keyword, frequency)`` tuples for the 100 most frequent
        keywords, most frequent first.
    """
    # A set gives O(1) membership tests; the list would be scanned per token.
    keyword_set = set(common_keywords)

    combined_text = ' '.join(
        comment for comment in sentiment_comments if isinstance(comment, str)
    )

    # Tokenize the text
    tokens = nltk.word_tokenize(combined_text)

    # NOTE(review): tokens are compared one at a time, so multi-word phrases in
    # the keyword list presumably never match here — confirm whether phrase
    # counting is wanted.
    matching_tokens = [
        token for token in tokens
        if token not in string.punctuation
        and token not in stop_words
        and token in keyword_set
    ]

    # Kept in a separate name: the original rebound the `common_keywords`
    # parameter to this result, which hid the input list.
    return FreqDist(matching_tokens).most_common(100)  # Change the number as needed

# Count keyword frequencies once per sentiment group (positive, negative, neutral)
positive_keywords, negative_keywords, neutral_keywords = (
    extract_keywords(group, common_keywords)
    for group in (positive_comments, negative_comments, neutral_comments)
)

# Wrap each (keyword, frequency) list in a two-column DataFrame
keyword_columns = ['Keyword', 'Frequency']
df_positive = pd.DataFrame(positive_keywords, columns=keyword_columns)
df_negative = pd.DataFrame(negative_keywords, columns=keyword_columns)
df_neutral = pd.DataFrame(neutral_keywords, columns=keyword_columns)

# Write one CSV per sentiment, without the index column
for frame, csv_path in (
    (df_positive, 'positive_keywords.csv'),
    (df_negative, 'negative_keywords.csv'),
    (df_neutral, 'neutral_keywords.csv'),
):
    frame.to_csv(csv_path, index=False)
