<a href="https://colab.research.google.com/github/azizhina51-svg/NLP/blob/main/emotion_tracker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Colab Notebook: KidWatch - automated YouTube emotional profiling using OpenAI or Google NLP

To run:
1) Runtime -> Change runtime type -> ensure Python 3. A GPU is only useful if you add a local Whisper model yourself; the transcription code in this notebook calls the OpenAI API.
2) Install required packages (below).
3) Fill in API keys as instructed.
4) Run cells in order.

This notebook:
- Accepts a list of YouTube URLs (paste)
- Extracts title/description and captions (if available)
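- If no captions are available, downloads the audio and transcribes it through the OpenAI API (only when OpenAI is the chosen provider); otherwise falls back to the title and description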
- Uses either OpenAI Chat completions or Google Cloud Natural Language for emotion classification
- Produces CSV and simple charts

In [3]:
!pip install --quiet youtube-transcript-api pytube transformers openai google-cloud-language plotly streamlit matplotlib pandas

# 1) Imports
import os
import re
import csv
import time
import pandas as pd
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
from pytube import YouTube
import matplotlib.pyplot as plt
from IPython.display import display
from collections import Counter
from datetime import datetime


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/6.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/6.9 MB[0m [31m8.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/6.9 MB[0m [31m42.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m6.9/6.9 MB[0m [31m78.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m56.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
# Optional provider SDKs. Each import is attempted independently and the name is
# bound to None when it fails, so later code can test availability with
# `is None` and raise a clear error instead of hitting a NameError.
try:
    from google.cloud import language_v1
except Exception:
    # Any import-time failure marks the Google client as unavailable.
    language_v1 = None

try:
    import openai
except Exception:
    # Any import-time failure marks the OpenAI client as unavailable.
    openai = None

In [7]:
def extract_video_id(url):
    """Return the 11-character YouTube video ID found in ``url``, or None.

    An ID is recognized when it follows ``v=`` (watch URLs) or a ``/``
    (youtu.be, /embed/, /shorts/ style paths).

    Args:
        url: URL string. Non-string values (e.g. None) yield None.

    Returns:
        The video ID, or None when no ID-shaped token is present.
    """
    if not isinstance(url, str):
        return None
    # The negative lookahead rejects longer tokens (channel names, playlist ids, ...)
    # whose first 11 characters merely look like a video ID.
    m = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])", url)
    return m.group(1) if m else None

def clean_text(s):
    """Normalize free text before classification.

    Removes http(s) URLs, collapses every run of whitespace (including
    newlines) to a single space, and trims both ends.

    Args:
        s: Text to clean. None yields "".

    Returns:
        The cleaned string.
    """
    if s is None:
        return ""
    s = re.sub(r'http\S+', '', s)
    # Strip last: removing a leading or trailing URL leaves a stray space behind.
    return re.sub(r'\s+', ' ', s).strip()

In [8]:
def get_metadata_pytube(video_url):
    """Look up a video's title and description through pytube.

    Best-effort: any exception raised while building the ``YouTube`` object or
    reading its attributes is suppressed.

    Returns:
        ``(title, description)`` on success, ``(None, None)`` on any failure.
    """
    try:
        video = YouTube(video_url)
        metadata = (video.title, video.description)
    except Exception:
        metadata = (None, None)
    return metadata

In [9]:
def fetch_captions(video_id, languages=['en']):
    """Fetch a video's captions and return them as one cleaned string.

    Works with both youtube-transcript-api call styles: the legacy static
    ``YouTubeTranscriptApi.get_transcript(...)`` and, when that attribute is
    absent, the instance method ``YouTubeTranscriptApi().fetch(...)``.

    Args:
        video_id: YouTube video ID.
        languages: Caption language codes passed through to the library.

    Returns:
        The caption text passed through ``clean_text``, or None when captions
        cannot be retrieved for any reason (best-effort lookup).
    """
    try:
        if hasattr(YouTubeTranscriptApi, "get_transcript"):
            transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=languages)
        else:
            transcript = YouTubeTranscriptApi().fetch(video_id, languages=languages)
        # Legacy entries are dicts; the instance API yields snippet objects with .text.
        text = " ".join(
            t['text'] if isinstance(t, dict) else t.text for t in transcript
        )
        return clean_text(text)
    except Exception:
        return None


In [11]:
def download_audio(video_url, out_path="/tmp"):
    """Download a video's first audio-only stream with pytube.

    Args:
        video_url: URL of the YouTube video.
        out_path: Passed to ``stream.download`` as ``output_path``.

    Returns:
        The file path returned by ``stream.download``.

    Raises:
        RuntimeError: If the video exposes no audio-only stream.
    """
    yt = YouTube(video_url)
    stream = yt.streams.filter(only_audio=True).first()
    if stream is None:
        # An empty query yields None; fail with a clear message instead of an
        # AttributeError on None.download.
        raise RuntimeError(f"No audio-only stream available for {video_url}")
    return stream.download(output_path=out_path)

In [12]:
def transcribe_with_openai_whisper(audio_file_path, model="gpt-4o-mini-transcribe"):
    """Transcribe an audio file through the OpenAI transcription API.

    Supports both client generations: ``openai.audio.transcriptions.create``
    (openai>=1.0) and the legacy ``openai.Audio.transcribe`` (openai<1.0).
    You must set OPENAI_API_KEY in the environment (or configure the key on the
    client) before calling.

    Args:
        audio_file_path: Path of the audio file to upload.
        model: Transcription model name sent with the request.

    Returns:
        The transcribed text.

    Raises:
        RuntimeError: If the openai package could not be imported.
    """
    if openai is None:
        raise RuntimeError("openai package not available. Install openai.")
    with open(audio_file_path, "rb") as f:
        # openai>=1.0 exposes a lowercase `audio` resource; older releases only
        # have the `Audio` class.
        audio_api = getattr(openai, "audio", None)
        if audio_api is not None:
            resp = audio_api.transcriptions.create(file=f, model=model)
        else:
            resp = openai.Audio.transcribe(model, f)
    return resp["text"] if isinstance(resp, dict) and "text" in resp else resp.text

In [13]:
# System prompt for the OpenAI classifier. It is sent verbatim (never passed
# through str.format), so the JSON example uses single braces.
OPENAI_CLASS_PROMPT = """
You are an assistant that classifies short text into one of these parent-friendly categories:
- aggressive
- humorous
- calm
- overstimulating
- scary
- educational
Return ONLY a single-line JSON object {"label":"<one of the above>", "confidence":<number between 0 and 1>, "notes":"<one-sentence rationale>"}
Make decisions focused on children's content; base it on the text provided.
"""

In [17]:
def _parse_label_json(raw):
    """Return the first JSON object embedded in ``raw`` as a dict, or None."""
    import json

    decoder = json.JSONDecoder()
    # Try every "{" as a candidate start so code fences, surrounding prose or
    # doubled braces around the object do not defeat parsing.
    for start, ch in enumerate(raw):
        if ch != "{":
            continue
        try:
            obj, _ = decoder.raw_decode(raw, start)
        except ValueError:
            continue
        if isinstance(obj, dict):
            return obj
    return None


def classify_with_openai(text, max_tokens=120, model="gpt-4o-mini"):
    """Classify ``text`` with an OpenAI chat model using OPENAI_CLASS_PROMPT.

    Supports both client generations: ``openai.chat.completions.create``
    (openai>=1.0) and the legacy ``openai.ChatCompletion.create``.

    Args:
        text: Text to classify; sent as the user message.
        max_tokens: Completion token limit for the reply.
        model: Chat model name.

    Returns:
        The JSON object parsed from the model reply (expected to carry the keys
        label, confidence and notes). When no JSON object can be parsed, a
        fallback dict with label "neutral", confidence 0.6 and the first 200
        characters of the raw reply as notes.

    Raises:
        RuntimeError: If the openai package could not be imported.
    """
    if openai is None:
        raise RuntimeError("OpenAI client not available")
    messages = [{"role":"system","content":OPENAI_CLASS_PROMPT},
                {"role":"user","content":text}]
    # openai>=1.0 exposes a lowercase `chat` resource and returns objects with
    # attribute access; older releases use ChatCompletion and dict-style access.
    chat_api = getattr(openai, "chat", None)
    if chat_api is not None:
        resp = chat_api.completions.create(
            model=model,
            messages=messages,
            max_tokens=max_tokens,
            temperature=0.0
        )
        out = resp.choices[0].message.content
    else:
        resp = openai.ChatCompletion.create(
            model=model,
            messages=messages,
            max_tokens=max_tokens,
            temperature=0.0
        )
        out = resp['choices'][0]['message']['content']
    out = (out or "").strip()

    obj = _parse_label_json(out)
    if obj is None:
        # fallback: keep the raw reply so the caller can see what came back
        obj = {"label": "neutral", "confidence": 0.6, "notes": out[:200]}
    return obj

In [18]:
# This cell was merged into bUwT8inLh99D to fix the NameError.

In [19]:
def _label_from_sentiment(score, mag, text):
    """Map a sentiment score/magnitude plus keyword cues to a category label."""
    lowered = text.lower()
    # Checked before the generic "aggressive" rule: that rule also matches every
    # (mag > 1.5 and score < 0) input and would otherwise make this unreachable.
    if mag > 1.5 and score < 0 and ("scream" in lowered or "fight" in lowered):
        return "scary"
    if score < -0.4 or (mag > 1.5 and score < 0):
        return "aggressive"
    if score > 0.4 and mag > 0.5:
        return "humorous"
    if mag > 1.5 and score > 0 and "laugh" in lowered:
        return "humorous"
    if mag > 1.5 and score > 0 and "sing" in lowered:
        return "calm"
    return "calm" if score >= 0 else "overstimulating"


def classify_with_google(text):
    """Classify ``text`` from Google Cloud Natural Language sentiment.

    Runs ``analyze_sentiment`` on the text and maps the resulting score and
    magnitude, plus a few keyword cues, to a label with a heuristic.

    Args:
        text: Plain text to analyze.

    Returns:
        Dict with keys ``label``, ``confidence`` (0.5 plus half the absolute
        score, capped at 0.99) and ``notes`` (the raw score and magnitude).

    Raises:
        RuntimeError: If the google-cloud-language package could not be imported.
    """
    if language_v1 is None:
        raise RuntimeError("Google language client not available")
    client = language_v1.LanguageServiceClient()
    document = language_v1.Document(content=text, type_=language_v1.Document.Type.PLAIN_TEXT)
    sentiment = client.analyze_sentiment(request={'document': document}).document_sentiment
    score = sentiment.score  # -1 to 1
    mag = sentiment.magnitude
    # classify_text requires categories API and may need large text; a heuristic
    # on the sentiment values is used instead.
    label = _label_from_sentiment(score, mag, text)
    confidence = min(0.99, 0.5 + abs(score)*0.5)
    notes = f"sentiment_score={score:.2f}, magnitude={mag:.2f}"
    return {"label": label, "confidence": confidence, "notes": notes}

In [20]:
def analyze_youtube_list(urls, provider="openai", whisper_if_missing=True, languages=['en']):
    """Build an emotional-profile table for a list of YouTube URLs.

    For each URL the text to classify is the title, the description and a
    transcript: captions when available, otherwise (only when
    ``whisper_if_missing`` is set and the provider is "openai") a transcription
    of the downloaded audio. The cleaned text is then classified by the chosen
    provider.

    Args:
        urls: Iterable of YouTube URLs.
        provider: "openai" or "google".
        whisper_if_missing: Transcribe the audio when captions are missing
            (OpenAI provider only).
        languages: Caption language codes passed to ``fetch_captions``.

    Returns:
        pandas.DataFrame with columns url, video_id, title, description, text,
        label, confidence and notes (one row per URL; the columns are present
        even when ``urls`` is empty).

    Raises:
        ValueError: If ``provider`` is neither "openai" nor "google".
    """
    columns = ["url", "video_id", "title", "description",
               "text", "label", "confidence", "notes"]
    results = []
    for url in urls:
        vid = extract_video_id(url)
        title, desc = get_metadata_pytube(url)
        transcript = fetch_captions(vid, languages=languages)
        if not transcript and whisper_if_missing and provider == "openai":
            print(f"No captions for {vid}. Downloading audio and transcribing (may take time)...")
            audio_path = download_audio(url)
            transcript = transcribe_with_openai_whisper(audio_path)
        # Title and description are always part of the text, so a missing
        # transcript contributes nothing extra; adding them as the "transcript"
        # as well would duplicate them.
        combined_text = clean_text(" ".join(filter(None, [title, desc, transcript])))
        # classify
        if provider == "openai":
            classify = classify_with_openai
        elif provider == "google":
            classify = classify_with_google
        else:
            raise ValueError("provider must be 'openai' or 'google'")
        if combined_text:
            label_obj = classify(combined_text)
        else:
            # Nothing to classify: skip the provider call and leave the label empty.
            label_obj = {"label": None, "confidence": None,
                         "notes": "no text available to classify"}
        results.append({
            "url": url, "video_id": vid, "title": title, "description": desc,
            "text": combined_text, "label": label_obj.get("label"),
            "confidence": label_obj.get("confidence"),
            "notes": label_obj.get("notes")
        })
    return pd.DataFrame(results, columns=columns)


In [21]:
def show_summary(df, title="Emotional profile"):
    """Draw a pie chart of how often each label occurs in ``df``.

    Missing labels are counted under "unknown". The chart is displayed with
    ``plt.show()``; nothing is returned.
    """
    label_counts = df['label'].fillna("unknown").value_counts()
    fig, ax = plt.subplots(figsize=(6,4))
    # Draw on the axes created above rather than relying on the implicit
    # "current axes".
    label_counts.plot(kind='pie', autopct="%1.1f%%", ylabel="", ax=ax)
    ax.set_title(title)
    plt.show()

In [22]:
import streamlit as st
import pandas as pd
import os
import re
from youtube_transcript_api import YouTubeTranscriptApi
from pytube import YouTube
import matplotlib.pyplot as plt
from collections import Counter


In [23]:
def extract_video_id(url):
    """Return the 11-character YouTube video ID found in ``url``, or None.

    An ID is recognized when it follows ``v=`` (watch URLs) or a ``/``
    (youtu.be, /embed/, /shorts/ style paths).

    Args:
        url: URL string. Non-string values (e.g. None) yield None.

    Returns:
        The video ID, or None when no ID-shaped token is present.
    """
    if not isinstance(url, str):
        return None
    # The negative lookahead rejects longer tokens (channel names, playlist ids, ...)
    # whose first 11 characters merely look like a video ID.
    m = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])", url)
    return m.group(1) if m else None

def clean_text(s):
    """Normalize free text before classification.

    Removes http(s) URLs, collapses every run of whitespace (including
    newlines) to a single space, and trims both ends.

    Args:
        s: Text to clean. Any falsy value (None, "") yields "".

    Returns:
        The cleaned string.
    """
    if not s:
        return ""
    s = re.sub(r'http\S+', '', s)
    # Strip last: removing a leading or trailing URL leaves a stray space behind.
    return re.sub(r'\s+', ' ', s).strip()

def get_metadata_pytube(video_url):
    """Return ``(title, description)`` for a video via pytube.

    Best-effort lookup: if constructing ``YouTube`` or reading either attribute
    raises, the error is suppressed and ``(None, None)`` is returned.
    """
    try:
        video = YouTube(video_url)
        metadata = (video.title, video.description)
    except Exception:
        metadata = (None, None)
    return metadata

def fetch_captions(video_id, languages=['en']):
    """Fetch a video's captions and return them as one cleaned string.

    Works with both youtube-transcript-api call styles: the legacy static
    ``YouTubeTranscriptApi.get_transcript(...)`` and, when that attribute is
    absent, the instance method ``YouTubeTranscriptApi().fetch(...)``.

    Args:
        video_id: YouTube video ID.
        languages: Caption language codes passed through to the library.

    Returns:
        The caption text passed through ``clean_text``, or None when captions
        cannot be retrieved for any reason (best-effort lookup).
    """
    try:
        if hasattr(YouTubeTranscriptApi, "get_transcript"):
            transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=languages)
        else:
            transcript = YouTubeTranscriptApi().fetch(video_id, languages=languages)
        # Legacy entries are dicts; the instance API yields snippet objects with .text.
        text = " ".join(
            t['text'] if isinstance(t, dict) else t.text for t in transcript
        )
        return clean_text(text)
    except Exception:
        return None

In [30]:
# Streamlit front end: collects URLs and provider settings, runs the analysis on
# "Analyze", then shows the result table, the label pie chart and a CSV download.
st.set_page_config(page_title="KidWatch - Emotional Snapshot", layout="centered")
st.title("KidWatch — Emotional Snapshot for Children's YouTube")

st.markdown("""
Paste YouTube URLs (one per line). Choose a provider (OpenAI or Google Cloud) for classification.
**OpenAI** will also transcribe audio using Whisper when captions are missing (if you supply OpenAI key).
""")

urls_text = st.text_area("YouTube URLs", height=160, placeholder="https://youtu.be/xxxx\nhttps://www.youtube.com/watch?v=yyyy")
provider = st.selectbox("Classification provider", ["openai", "google"])
use_whisper = st.checkbox("If captions missing: transcribe audio with OpenAI Whisper (OpenAI only)", value=True)

col1, col2 = st.columns(2)
with col1:
    if provider == "openai":
        openai_key = st.text_input("OpenAI API key (sk-...)", type="password")
        if openai_key:
            os.environ["OPENAI_API_KEY"] = openai_key
            # The openai module may already be imported, in which case it may
            # not re-read the environment; set the key on the module as well.
            _openai_module = globals().get("openai")
            if _openai_module is not None:
                _openai_module.api_key = openai_key
with col2:
    if provider == "google":
        st.info("Set GOOGLE_APPLICATION_CREDENTIALS in your environment pointing to the JSON key file before running the app.")

if st.button("Analyze"):
    urls = [u.strip() for u in urls_text.splitlines() if u.strip()]
    if not urls:
        st.warning("Paste at least one YouTube URL.")
    else:
        st.info(f"Analyzing {len(urls)} videos (provider={provider}) — this may take a moment.")

        # Call the main analysis function defined earlier in the notebook.
        # The functions themselves check for client availability and raise
        # RuntimeError when a provider package is missing.
        try:
            results_df = analyze_youtube_list(urls, provider=provider, whisper_if_missing=use_whisper)

            st.subheader("Analysis Results")
            st.dataframe(results_df)

            st.subheader("Emotional Profile Summary")
            show_summary(results_df)
            # show_summary draws with matplotlib and ends with plt.show(), which
            # does not render inside Streamlit; hand the figure to st.pyplot.
            summary_fig = plt.gcf()
            st.pyplot(summary_fig)
            plt.close(summary_fig)  # avoid accumulating figures across reruns

            # Optionally provide CSV download
            csv_export = results_df.to_csv(index=False).encode('utf-8')
            st.download_button(
                label="Download data as CSV",
                data=csv_export,
                file_name=f"kidwatch_analysis_{datetime.now().strftime('%Y%m%d%H%M%S')}.csv",
                mime="text/csv",
            )
        except RuntimeError as e:
            st.error(f"Error during analysis: {e}. Please ensure API keys are correctly set and dependencies are installed.")
        except Exception as e:
            st.error(f"An unexpected error occurred: {e}")



In [32]:
# The content of this cell has been merged into cell H8wfdm7hkj_y
# to consolidate the Streamlit application logic and fix the NameError.
# The logic for iterating through URLs and performing analysis is now
# executed within the 'Analyze' button click block in the Streamlit app.

In [35]:
#   if provider == "openai":
#                 # prompt-based classification — returns JSON-like text; be cautious about model choice
#                 prompt = f"""
# Classify the following text into one label among: aggressive, humorous, calm, overstimulating, scary, educational.
# Return JSON exactly: {{\"label\":\"<label>\",\"confidence\":0.00,\"notes\":\"one-sentence rationale\"}}
# Text:
# {combined}
# """
#                 resp = openai.ChatCompletion.create(model="gpt-4o-mini", messages=[{"role":"user","content":prompt}], temperature=0.0)
#                 out = resp['choices'][0]['message']['content']
#                 import json
#                 try:
#                     obj = json.loads(out)
#                 except Exception:
#                     obj = {"label":"unknown","confidence":0.5,"notes":out[:150]}
#             else:

In [38]:
#   client = language_v1.LanguageServiceClient()
#                 doc = language_v1.Document(content=combined, type_=language_v1.Document.Type.PLAIN_TEXT)
#                 sentiment = client.analyze_sentiment(request={'document':doc}).document_sentiment
#                 score = sentiment.score
#                 mag = sentiment.magnitude
#                 if score < -0.4:
#                     label = "aggressive"
#                 elif score > 0.4:
#                     label = "humorous"
#                 else:
#                     label = "calm"
#                 obj = {"label":label,"confidence":min(0.99,0.5+abs(score)/2),"notes":f"sent={score:.2f},mag={mag:.2f}"}
#             rows.append({
#                 "url":url,
#                 "video_id":vid,
#                 "title":title,
#                 "label":obj.get("label"),
#                 "confidence":obj.get("confidence"),
#                 "notes":obj.get("notes")
#             })
#         df = pd.DataFrame(rows)
#         st.success("Analysis complete")
#         st.dataframe(df)
#         # show pie
#         counts = df['label'].value_counts()
#         fig, ax = plt.subplots(figsize=(5,4))
#         counts.plot(kind='pie', autopct="%1.1f%%", ylabel="")
#         st.pyplot(fig)
#         csv = df.to_csv(index=False).encode('utf-8')
#         st.download_button("Download CSV report", csv, file_name="kidwatch_report.csv", mime="text/csv")
# How these two pieces work tog