In [1]:
!pip install spacy vaderSentiment youtube-transcript-api gradio pandas fpdf openpyxl google-api-python-client wordcloud matplotlib
!python -m spacy download en_core_web_sm
import spacy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
from googleapiclient.discovery import build
from fpdf import FPDF
import pandas as pd
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import urllib.parse
import gradio as gr


Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Collecting youtube-transcript-api
  Downloading youtube_transcript_api-0.6.3-py3-none-any.whl.metadata (17 kB)
Collecting gradio
  Downloading gradio-5.14.0-py3-none-any.whl.metadata (16 kB)
Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.8-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.0 (from gradio)
  Downloading gradio_client-1.7.0-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (f

In [2]:
# Initialize Spacy and VADER
import os  # imported here because it is only needed to look up the API key

# English spaCy pipeline; split_long_sentences() relies on it for sentence
# segmentation and tokenisation.
nlp = spacy.load("en_core_web_sm")
# VADER analyzer; analyze_sentiment_for_keywords() calls polarity_scores() on it.
sia = SentimentIntensityAnalyzer()

# YouTube Data API key
# Read the key from the YOUTUBE_API_KEY environment variable so that a real
# credential never has to be saved inside the notebook. The placeholder is
# kept as a fallback so the cell still runs when the variable is not set.
YOUTUBE_API_KEY = os.environ.get("YOUTUBE_API_KEY", "XXXXXXXXXXXXXXXXXXXXXXXXXX")


In [3]:

def extract_video_id(url):
    """Extract the video ID from a YouTube URL.

    Recognised shapes:
      * ``youtube.com/watch?v=<id>`` (also on the ``www.``, ``m.`` and
        ``music.`` hosts), with any additional query parameters
      * ``youtube.com/.../<id>`` such as ``/embed/<id>`` or ``/shorts/<id>``,
        with or without an unrelated query string
      * ``youtu.be/<id>``

    A URL typed without a scheme (``youtu.be/<id>``) is accepted as well.

    Args:
        url: The URL string entered by the user.

    Returns:
        The video ID as a string, or ``None`` when the input is empty, is not
        a string, is not a YouTube URL, or carries no recognisable ID.
    """
    if not url or not isinstance(url, str):
        return None

    try:
        candidate = url.strip()
        # Without a scheme urlparse() puts the host into ``path``; add one so
        # pasted links such as "youtu.be/<id>" are parsed correctly.
        if '://' not in candidate:
            candidate = 'https://' + candidate

        parsed_url = urllib.parse.urlparse(candidate)
        host = parsed_url.netloc.lower()

        if host in ('www.youtube.com', 'youtube.com', 'm.youtube.com', 'music.youtube.com'):
            # Prefer the ``v`` parameter, but do not fail when the query string
            # only holds unrelated parameters (e.g. /shorts/<id>?feature=share).
            query_ids = urllib.parse.parse_qs(parsed_url.query).get('v')
            if query_ids:
                return query_ids[0]

            path = parsed_url.path.rstrip('/')
            # "/watch" without a ``v`` parameter carries no ID; returning the
            # last path segment here would yield the bogus ID "watch".
            if path == '/watch':
                return None

            match = re.search(r'/([a-zA-Z0-9_-]+)$', path)
            return match.group(1) if match else None

        if host in ('youtu.be', 'www.youtu.be'):
            # The ID is the first non-empty path segment.
            segments = [segment for segment in parsed_url.path.split('/') if segment]
            return segments[0] if segments else None

        return None
    except Exception as e:
        print(f"Error parsing URL: {e}")
        return None


In [4]:
def fetch_video_metadata(video_url):
    """Fetch basic metadata for a video through the YouTube Data API.

    Args:
        video_url: URL of the video; the ID is obtained via extract_video_id().

    Returns:
        A ``(metadata, error)`` tuple. On success ``metadata`` is a dict with
        the keys ``channel_name``, ``video_title``, ``views``, ``likes``,
        ``dislikes`` and ``posted_date`` and ``error`` is ``None``. On failure
        ``metadata`` is ``None`` and ``error`` is a message string. Fields the
        API response does not contain are reported as ``'N/A'``.
    """
    video_id = extract_video_id(video_url)
    if not video_id:
        return None, "Invalid Youtube URL format."

    try:
        # The client is built inside the try block so that a failure while
        # constructing it is reported through the (None, error) contract too.
        youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)

        request = youtube.videos().list(part="snippet,statistics", id=video_id)
        response = request.execute()

        items = response.get('items')
        if not items:
            return None, "Video not found with provided url."

        video_data = items[0]
        snippet = video_data.get('snippet', {})
        statistics = video_data.get('statistics', {})

        # Use .get() throughout: a missing field should not turn the whole
        # lookup into an error whose message is just the missing key name.
        # NOTE(review): 'dislikeCount' is presumably no longer returned for
        # public requests, so this is expected to stay 'N/A' - confirm.
        metadata = {
            "channel_name": snippet.get('channelTitle', 'N/A'),
            "video_title": snippet.get('title', 'N/A'),
            "views": statistics.get('viewCount', 'N/A'),
            "likes": statistics.get('likeCount', 'N/A'),
            "dislikes": statistics.get('dislikeCount', 'N/A'),
            "posted_date": snippet.get('publishedAt', 'N/A')
        }

        return metadata, None

    except VideoUnavailable:
        # NOTE(review): VideoUnavailable is imported from youtube_transcript_api;
        # the Data API client is not expected to raise it. Kept for compatibility.
        return None, "Video is unavailable."
    except Exception as e:
        return None, str(e)


def fetch_transcript(video_url):
    """Download a video's transcript and flatten it into a single string.

    Args:
        video_url: URL of the video; the ID is obtained via extract_video_id().

    Returns:
        A ``(text, error)`` tuple. On success ``text`` holds all transcript
        segments joined by single spaces and ``error`` is ``None``; otherwise
        ``text`` is ``None`` and ``error`` describes what went wrong.
    """
    video_id = extract_video_id(video_url)
    if video_id:
        try:
            segments = YouTubeTranscriptApi.get_transcript(video_id)
            # Each segment is indexed by 'text'; timing data is not used.
            joined = " ".join(segment['text'] for segment in segments)
        except (TranscriptsDisabled, VideoUnavailable):
            return None, "Transcript not available for this video."
        except Exception as exc:
            return None, str(exc)
        return joined, None

    return None, "Invalid Youtube URL format"


def split_long_sentences(text, max_words=25):
    """Split text into sentences, breaking overly long ones into smaller chunks.

    The text is segmented with the module-level spaCy pipeline ``nlp``.
    Sentences with at most ``max_words`` whitespace-separated words are kept
    as they are. Longer ones are cut after sentence-final punctuation
    (``.``, ``!``, ``?``) and after the conjunctions "and", "but", "because"
    and "so" once the running chunk holds more than three tokens.

    Chunks are rebuilt from ``token.text_with_ws`` so that the original
    spacing is preserved. Joining bare token texts with spaces would turn
    "all-wheel" into "all - wheel" and "don't" into "do n't", which breaks
    substring keyword matching on the result.

    Args:
        text: The text to segment.
        max_words: Word count above which a sentence is split further.

    Returns:
        A list of non-empty sentence / chunk strings in document order.
    """
    doc = nlp(text)  # Tokenize into sentences using Spacy
    sentences = []
    conjunctions = {"and", "but", "because", "so"}
    terminators = {".", "!", "?"}

    def _flush(chunk):
        # Append the chunk's text unless it is only whitespace.
        chunk_text = "".join(chunk).strip()
        if chunk_text:
            sentences.append(chunk_text)

    for sent in doc.sents:
        if len(sent.text.split()) <= max_words:
            stripped = sent.text.strip()
            if stripped:
                sentences.append(stripped)
            continue

        current_chunk = []
        for token in sent:
            current_chunk.append(token.text_with_ws)
            ends_sentence = token.is_punct and token.text in terminators
            # A conjunction only closes a chunk that already has some content,
            # so very short fragments are not emitted.
            ends_clause = token.text.lower() in conjunctions and len(current_chunk) > 3
            if ends_sentence or ends_clause:
                _flush(current_chunk)
                current_chunk = []

        if current_chunk:
            _flush(current_chunk)

    return sentences

In [5]:
def read_keywords(file_path):
    """Load the attribute-to-keywords mapping from an Excel sheet.

    Every column of the sheet is one attribute; the non-empty cells of that
    column are its keywords.

    Args:
        file_path: Path (or other object accepted by ``pd.read_excel``) of the
            keyword workbook.

    Returns:
        A ``(keywords, attributes)`` tuple: ``keywords`` maps each column
        name to the list of its non-null cell values and ``attributes`` is
        the list of column names in sheet order.
    """
    sheet = pd.read_excel(file_path)
    column_names = sheet.columns.tolist()

    # One entry per column, with empty cells dropped.
    keyword_map = {
        name: sheet[name].dropna().tolist()
        for name in column_names
    }

    return keyword_map, column_names



def match_keywords_in_sentences(sentences, keywords):
    """Group sentences by the attributes whose keywords they mention.

    Matching is a case-insensitive substring test. A sentence is listed at
    most once per attribute, even when it contains several of that
    attribute's keywords. It may still appear under several attributes.

    Args:
        sentences: Iterable of sentence strings.
        keywords: Dict mapping an attribute name to its list of keywords.
            Non-string keywords (e.g. numeric spreadsheet cells) are converted
            with ``str()``; blank keywords are ignored because an empty
            string would match every sentence.

    Returns:
        Dict mapping every attribute in ``keywords`` to the list of matching
        sentences, in input order.
    """
    # Normalise the keywords once instead of once per sentence.
    needles_by_attribute = {}
    for attribute, sub_keywords in keywords.items():
        lowered_keywords = (str(keyword).lower() for keyword in sub_keywords)
        needles_by_attribute[attribute] = [kw for kw in lowered_keywords if kw.strip()]

    matched_keywords = {attribute: [] for attribute in keywords}

    for sentence in sentences:
        lowered_sentence = sentence.lower()
        for attribute, needles in needles_by_attribute.items():
            # any() stops at the first hit, so the sentence is added only once.
            if any(needle in lowered_sentence for needle in needles):
                matched_keywords[attribute].append(sentence)

    return matched_keywords


In [6]:
def analyze_sentiment_for_keywords(matched_keywords, sentences):
    """Score each matched sentence with VADER and bucket it by polarity.

    Args:
        matched_keywords: Dict mapping an attribute to its matched sentences.
        sentences: Not used by this function; part of the existing signature.

    Returns:
        Dict mapping each attribute to
        ``{"positive": [(sentence, score), ...], "negative": [...]}``.
        A sentence is positive when its compound score is above 0.05 and
        negative when it is below -0.05; anything in between is left out.
    """
    results_by_attribute = {}

    for attribute in matched_keywords:
        buckets = {"positive": [], "negative": []}

        for line in matched_keywords[attribute]:
            compound = sia.polarity_scores(line)['compound']
            entry = (line.strip(), compound)
            if compound > 0.05:
                buckets["positive"].append(entry)
            elif compound < -0.05:
                buckets["negative"].append(entry)

        results_by_attribute[attribute] = buckets

    return results_by_attribute




In [7]:
import pandas as pd
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment

def generate_excel(sentiment_results, attributes, output_file="sentiment_analysis_results.xlsx"):
    """Write the sentiment results to an Excel workbook.

    The sheet has one row per scored sentence with the columns
    "Attribute", "Sentiment", "Sentence" and "Sentiment Score".

    Args:
        sentiment_results: Dict mapping an attribute to a dict of
            ``sentiment_type -> [(sentence, score), ...]``.
        attributes: Not used by this function; part of the existing signature.
        output_file: Path of the workbook to write.

    Returns:
        The path the workbook was saved to.
    """
    wb = Workbook()
    ws = wb.active
    ws.title = "Sentiment Analysis Results"

    # Set header font style
    header_font = Font(bold=True)
    alignment = Alignment(horizontal='center')

    # Write headers
    headers = ["Attribute", "Sentiment", "Sentence", "Sentiment Score"]
    for col_num, header in enumerate(headers, 1):
        cell = ws.cell(row=1, column=col_num)
        cell.value = header
        cell.font = header_font
        cell.alignment = alignment

    row_num = 2
    for attribute, sentiment_data in sentiment_results.items():
        for sentiment_type, sentences in sentiment_data.items():
            for sentence, score in sentences:
                ws.cell(row=row_num, column=1).value = attribute
                ws.cell(row=row_num, column=2).value = sentiment_type.capitalize()
                ws.cell(row=row_num, column=3).value = sentence
                ws.cell(row=row_num, column=4).value = score
                row_num += 1

    # Auto-adjust column width
    for col in ws.columns:
        # Measure the text form of every value: scores are floats, so calling
        # len() on the raw value would raise. Empty cells are skipped.
        max_length = max(
            (len(str(cell.value)) for cell in col if cell.value is not None),
            default=0,
        )
        column = col[0].column_letter  # Get the column name
        # Excel does not accept column widths above 255 characters.
        ws.column_dimensions[column].width = min((max_length + 2) * 1.2, 255)

    # Save the Excel file
    wb.save(output_file)
    return output_file



def generate_word_clouds(matched_keywords):
    """Build and display a word cloud for every attribute that has text.

    Attributes without matched sentences are skipped, as are attributes for
    which the word cloud library finds no usable words. It raises
    ``ValueError`` in that case, which would otherwise abort the pipeline.

    Args:
        matched_keywords: Dict mapping an attribute to its matched sentences.

    Returns:
        Dict mapping each attribute that produced a cloud to its
        ``WordCloud`` object. Skipped attributes are absent from the dict.
    """
    wordclouds = {}

    for attribute, sentences_list in matched_keywords.items():
        text = " ".join(sentences_list).strip()
        if not text:
            # Nothing matched this attribute, so there is nothing to draw.
            continue

        try:
            wordcloud = WordCloud(width=800, height=400, background_color="white").generate(text)
        except ValueError:
            # Raised when no words remain to plot (e.g. the text consists of
            # stopwords only).
            continue
        wordclouds[attribute] = wordcloud

        fig = plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.title(f"Word Cloud for {attribute}")
        plt.show()
        # Release the figure so repeated runs do not accumulate open figures.
        plt.close(fig)

    return wordclouds


In [8]:
def generate_pdf_with_sections(metadata, sentiment_results, wordclouds, output_file="Analysis_Report.pdf"):
    """Render the analysis as a PDF report.

    Layout: a title page with the video metadata, then one section per
    attribute listing its positive and negative sentences with their scores.
    When a word cloud exists for an attribute it is placed on its own page
    after that section, so it never overlaps the text.

    Args:
        metadata: Dict of video metadata, or a falsy value to omit the block.
        sentiment_results: Dict mapping an attribute to
            ``{"positive": [(line, score), ...], "negative": [...]}``.
        wordclouds: Dict mapping an attribute to its word cloud image object.
        output_file: Path of the PDF to write.

    Returns:
        ``output_file``.

    Side effects:
        Writes one ``<attribute>_wordcloud.png`` file per word cloud into the
        current working directory.
    """
    def _latin1(value):
        # The built-in "Arial" font is limited to Latin-1; replace anything
        # else (emoji, curly quotes, non-Latin scripts) with "?" instead of
        # letting PDF generation fail with an encoding error.
        return str(value).encode("latin-1", "replace").decode("latin-1")

    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()

    # Report title
    pdf.set_font("Arial", "B", 16)
    pdf.cell(200, 10, "Auto-Insight: YouTube Video Sentiment & Attribute Analysis Report", ln=True, align="C")
    pdf.ln(10)

    # Add Metadata
    if metadata:
        pdf.set_font("Arial", "B", 14)
        pdf.cell(0, 10, "Video Metadata", ln=True)
        pdf.set_font("Arial", size=12)
        for key, value in metadata.items():
            # multi_cell wraps long values (e.g. video titles) instead of
            # letting them run past the right margin.
            pdf.multi_cell(0, 10, _latin1(f"{key.replace('_', ' ').title()}: {value}"))
        pdf.ln(10)

    # Add Sections for Each Attribute
    for attribute, sentiments in sentiment_results.items():
        pdf.add_page()
        pdf.set_font("Arial", "B", 14)
        pdf.cell(0, 10, _latin1(f"Attribute: {attribute}"), ln=True)
        pdf.ln(5)

        # Positive block first, then negative; both share the same layout.
        for heading, polarity in (("Positive Sentiments:", "positive"),
                                  ("Negative Sentiments:", "negative")):
            pdf.set_font("Arial", "B", 12)
            pdf.cell(0, 10, heading, ln=True)
            pdf.set_font("Arial", size=12)
            for line, score in sentiments[polarity]:
                pdf.multi_cell(0, 10, _latin1(f"Line: {line}\nScore: {score}"))
                pdf.ln(2)

        # Add Word Cloud
        if attribute in wordclouds:
            # Attribute names come from spreadsheet headers and may contain
            # characters that are not valid in file names.
            safe_name = re.sub(r'[^A-Za-z0-9_.-]+', '_', str(attribute))
            image_path = f"{safe_name}_wordcloud.png"

            fig = plt.figure(figsize=(10, 5))
            plt.imshow(wordclouds[attribute], interpolation='bilinear')
            plt.axis("off")
            plt.savefig(image_path, bbox_inches="tight")
            plt.close(fig)

            # A fresh page keeps the image clear of the sentence list, which
            # can be of any length.
            pdf.add_page()
            pdf.image(image_path, x=10, y=10, w=180)

    pdf.output(output_file)
    return output_file

In [10]:
def process_keywords_and_video(url, excel_file):
    """Run the full analysis pipeline for one video and one keyword workbook.

    Steps: fetch metadata and transcript, split the transcript into
    sentences, match them against the keywords, score sentiment, build word
    clouds, and write the PDF and Excel reports.

    Args:
        url: YouTube video URL.
        excel_file: Keyword workbook, passed on to read_keywords().

    Returns:
        A ``(status, pdf_path)`` tuple. ``pdf_path`` is ``None`` whenever the
        run did not complete; ``status`` then holds the reason.
    """
    # Validate the inputs up front so a missing value yields a readable
    # status message instead of an exception deep inside the pipeline.
    if not url or not str(url).strip():
        return "Please enter a YouTube video URL.", None
    if not excel_file:
        return "Please upload an Excel file with keywords.", None

    metadata, error = fetch_video_metadata(url)
    if error:
        return error, None

    transcript, error = fetch_transcript(url)
    if error:
        return error, None

    try:
        sentences = split_long_sentences(transcript)
        keywords, attributes = read_keywords(excel_file)
        matched_keywords = match_keywords_in_sentences(sentences, keywords)
        sentiment_results = analyze_sentiment_for_keywords(matched_keywords, sentences)
        wordclouds = generate_word_clouds(matched_keywords)
        pdf_file = generate_pdf_with_sections(metadata, sentiment_results, wordclouds)
        # Also write the Excel report, as the other definitions of this
        # function in the notebook do.
        generate_excel(sentiment_results, attributes)
    except Exception as e:
        # Report the failure through the status value, in the same way the
        # fetch_* helpers report theirs.
        return f"Processing failed: {e}", None

    return "Processing completed successfully!", pdf_file

In [11]:
import spacy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
from googleapiclient.discovery import build
from fpdf import FPDF
import pandas as pd
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import urllib.parse
import gradio as gr
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment

# Initialize Spacy and VADER
import os  # imported here because it is only needed to look up the API key

# English spaCy pipeline; split_long_sentences() relies on it for sentence
# segmentation and tokenisation.
nlp = spacy.load("en_core_web_sm")
# VADER analyzer; analyze_sentiment_for_keywords() calls polarity_scores() on it.
sia = SentimentIntensityAnalyzer()

# YouTube Data API key
# Read the key from the YOUTUBE_API_KEY environment variable so that a real
# credential never has to be saved inside the notebook. The placeholder is
# kept as a fallback so the cell still runs when the variable is not set.
YOUTUBE_API_KEY = os.environ.get("YOUTUBE_API_KEY", "XXXXXXXXXXXXXXXXXXXXXXXX")

def process_keywords_and_video(url, excel_file):
    """Run the full analysis pipeline for one video and one keyword workbook.

    Steps: fetch metadata and transcript, split the transcript into
    sentences, match them against the keywords, score sentiment, build word
    clouds, and write the PDF and Excel reports.

    Args:
        url: YouTube video URL.
        excel_file: Keyword workbook, passed on to read_keywords().

    Returns:
        A ``(status, pdf_path)`` tuple. ``pdf_path`` is ``None`` whenever the
        run did not complete; ``status`` then holds the reason.
    """
    # Validate the inputs up front so a missing value yields a readable
    # status message instead of an exception deep inside the pipeline.
    if not url or not str(url).strip():
        return "Please enter a YouTube video URL.", None
    if not excel_file:
        return "Please upload an Excel file with keywords.", None

    metadata, error = fetch_video_metadata(url)
    if error:
        return error, None

    transcript, error = fetch_transcript(url)
    if error:
        return error, None

    try:
        sentences = split_long_sentences(transcript)
        keywords, attributes = read_keywords(excel_file)
        matched_keywords = match_keywords_in_sentences(sentences, keywords)
        sentiment_results = analyze_sentiment_for_keywords(matched_keywords, sentences)
        wordclouds = generate_word_clouds(matched_keywords)
        pdf_file = generate_pdf_with_sections(metadata, sentiment_results, wordclouds)
        generate_excel(sentiment_results, attributes)
    except Exception as e:
        # Report the failure through the status value, in the same way the
        # fetch_* helpers report theirs.
        return f"Processing failed: {e}", None

    return "Processing completed successfully!", pdf_file


# Gradio App
# Builds the styled web UI: a URL box and keyword-file upload as inputs, a
# status box plus PDF and Excel download slots as outputs.
with gr.Blocks(
    css="""
    body {
        background: linear-gradient(120deg, #fdfbfb, #f8f6f7);
        font-family: 'Arial', sans-serif;
        color: #444;
    }
    .gr-textbox textarea, .gr-file input {
        border: 1px solid #ccc;
        border-radius: 8px;
        padding: 10px;
        box-shadow: 0 4px 6px rgba(0,0,0,0.1);
        background: #fff;
    }
    .gr-textbox textarea:focus, .gr-file input:focus {
        outline: none;
        border-color: #ff9a9e;
        box-shadow: 0 4px 10px rgba(255,154,158,0.5);
    }
    .btn-custom {
        background-color: #ff6f61;
        color: white;
        padding: 12px 24px;
        font-size: 16px;
        font-weight: bold;
        border: none;
        border-radius: 8px;
        cursor: pointer;
    }
    .btn-custom:hover {
        background-color: #e65550;
    }
    .title-box {
        background-color: #ff6f61;
        color: white;
        padding: 20px;
        text-align: center;
        font-size: 2.5em;
        font-weight: bold;
        border-radius: 8px;
        box-shadow: 0 4px 10px rgba(0,0,0,0.2);
        margin-bottom: 20px;
    }
    """
) as iface:
    gr.Markdown('<div class="title-box">Auto-Insight: YouTube Video Analyzer for Automobiles</div>')
    with gr.Row():
        video_url = gr.Textbox(label="YouTube Video URL", placeholder="Enter the YouTube video URL")
        excel_file = gr.File(label="Upload Excel File with Keywords")
    with gr.Row():
        process_button = gr.Button("Analyze Video", elem_classes=["btn-custom"])
    with gr.Row():
        processing_status = gr.Textbox(label="Processing Status", interactive=False)
    with gr.Row():
        pdf_output = gr.File(label="Download Sentiment Report (PDF)")
        excel_output = gr.File(label="Download Sentiment Report (Excel)")

    def process_with_excel(url, excel_file):
        """Run the pipeline and return (status, pdf_path, excel_path) for the UI."""
        status, pdf_path = process_keywords_and_video(url, excel_file)
        # Offer the workbook only when the run produced a report. On failure
        # the file is either missing or left over from an earlier run, so the
        # download slot is cleared and the status message stays visible.
        excel_path = "sentiment_analysis_results.xlsx" if pdf_path else None
        return status, pdf_path, excel_path

    process_button.click(
        process_with_excel,
        inputs=[video_url, excel_file],
        outputs=[processing_status, pdf_output, excel_output]
    )

# share=True exposes the app through a temporary public URL.
iface.launch(share=True)




Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f5e27c3414970f8326.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [9]:
import spacy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
from googleapiclient.discovery import build
from fpdf import FPDF
import pandas as pd
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import urllib.parse
import gradio as gr
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment

# Initialize Spacy and VADER
import os  # imported here because it is only needed to look up the API key

# English spaCy pipeline; split_long_sentences() relies on it for sentence
# segmentation and tokenisation.
nlp = spacy.load("en_core_web_sm")
# VADER analyzer; analyze_sentiment_for_keywords() calls polarity_scores() on it.
sia = SentimentIntensityAnalyzer()

# YouTube Data API key
# Read the key from the YOUTUBE_API_KEY environment variable so that a real
# credential never has to be saved inside the notebook. The placeholder is
# kept as a fallback so the cell still runs when the variable is not set.
YOUTUBE_API_KEY = os.environ.get("YOUTUBE_API_KEY", "XXXXXXXXXXXXXX")



def process_keywords_and_video(url, excel_file):
    """Run the full analysis pipeline for one video and one keyword workbook.

    Steps: fetch metadata and transcript, split the transcript into
    sentences, match them against the keywords, score sentiment, build word
    clouds, and write the PDF and Excel reports.

    Args:
        url: YouTube video URL.
        excel_file: Keyword workbook, passed on to read_keywords().

    Returns:
        A ``(status, pdf_path)`` tuple. ``pdf_path`` is ``None`` whenever the
        run did not complete; ``status`` then holds the reason.
    """
    # Validate the inputs up front so a missing value yields a readable
    # status message instead of an exception deep inside the pipeline.
    if not url or not str(url).strip():
        return "Please enter a YouTube video URL.", None
    if not excel_file:
        return "Please upload an Excel file with keywords.", None

    metadata, error = fetch_video_metadata(url)
    if error:
        return error, None

    transcript, error = fetch_transcript(url)
    if error:
        return error, None

    try:
        sentences = split_long_sentences(transcript)
        keywords, attributes = read_keywords(excel_file)
        matched_keywords = match_keywords_in_sentences(sentences, keywords)
        sentiment_results = analyze_sentiment_for_keywords(matched_keywords, sentences)
        wordclouds = generate_word_clouds(matched_keywords)
        pdf_file = generate_pdf_with_sections(metadata, sentiment_results, wordclouds)
        # The Excel report is written last, from the same sentiment results
        # that went into the PDF.
        generate_excel(sentiment_results, attributes)
    except Exception as e:
        # Report the failure through the status value, in the same way the
        # fetch_* helpers report theirs.
        return f"Processing failed: {e}", None

    return "Processing completed successfully!", pdf_file


# Gradio App
# Builds the unstyled web UI: a URL box and keyword-file upload as inputs, a
# status box plus PDF and Excel download slots as outputs.
with gr.Blocks() as iface:
    gr.Markdown("<h1>Auto-Insight: YouTube Video Analyzer for Automobiles</h1>")
    video_url = gr.Textbox(label="YouTube Video URL", placeholder="Enter the YouTube video URL")
    excel_file = gr.File(label="Upload Excel File with Keywords")
    process_button = gr.Button("Analyze Video")
    processing_status = gr.Textbox(label="Processing Status", interactive=False)
    pdf_output = gr.File(label="Download Sentiment Report (PDF)")
    excel_output = gr.File(label="Download Sentiment Report (Excel)")

    def process_with_excel(url, excel_file):
        """Run the pipeline and return (status, pdf_path, excel_path) for the UI."""
        status, pdf_path = process_keywords_and_video(url, excel_file)
        # Offer the workbook only when the run produced a report. On failure
        # the file is either missing or left over from an earlier run, so the
        # download slot is cleared and the status message stays visible.
        excel_path = "sentiment_analysis_results.xlsx" if pdf_path else None
        return status, pdf_path, excel_path

    process_button.click(
        process_with_excel,
        inputs=[video_url, excel_file],
        outputs=[processing_status, pdf_output, excel_output]
    )

# share=True exposes the app through a temporary public URL.
iface.launch(share=True)



Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://004de2daac533efcc0.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


