In [1]:
import pandas as pd
import numpy as np
from googleapiclient.discovery import build
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re
import os

In [2]:
# API key for the YouTube Data API.
# Read from the YOUTUBE_API_KEY environment variable so the real key does not
# have to be saved inside the notebook; the placeholder below is only a
# fallback and must be replaced (or the variable set) before running.
api_key = os.environ.get('YOUTUBE_API_KEY') or 'YOUR_KEY_HERE'

# Module-level VADER sentiment analyzer, created once here and read as a
# global by the scoring functions defined later in the notebook.
analyzer = SentimentIntensityAnalyzer()

# Function to extract YouTube video ID from trailer URL
def extract_video_id(url):
    """Return the YouTube video ID embedded in ``url``, or ``None``.

    Recognised forms:
      * ``...watch?v=<id>`` (``v`` must be a real query parameter, so
        ``?rev=123`` is not mistaken for a video ID)
      * ``.../embed/<id>``
      * ``youtu.be/<id>``
      * ``.../shorts/<id>``

    Args:
        url: Trailer URL. Anything that is not a non-empty string (``None``,
            NaN, ``pd.NA``, numbers, ``""``) yields ``None``.

    Returns:
        str | None: The ID, made of the characters ``A-Z a-z 0-9 _ -`` only,
        so trailing ``&t=..``, ``?autoplay=..`` or ``#fragment`` parts are
        never included.
    """
    # isinstance() comes first so NaN / pd.NA / numeric cells never reach
    # truthiness checks or re.search(), both of which can raise on them.
    if not isinstance(url, str) or not url:
        return None

    match = re.search(
        r'(?:[?&]v=|embed/|youtu\.be/|/shorts/)([A-Za-z0-9_-]+)',
        url,
    )
    return match.group(1) if match else None

In [3]:
# Function to fetch YouTube comments
def fetch_comments(youtube, video_id, max_results=100):
    """Collect up to ``max_results`` top-level comments for one video.

    Args:
        youtube: YouTube Data API client; only ``commentThreads().list`` and
            ``commentThreads().list_next`` are called on it.
        video_id: ID of the video whose comment threads are requested.
        max_results: Upper bound on the number of comments returned.

    Returns:
        list[str]: Plain-text bodies of the top-level comments, never longer
        than ``max_results``. Fetching is best-effort: if the API call fails
        (for example because comments are disabled) the error is printed and
        whatever was gathered so far is returned, possibly an empty list.
    """
    comments = []
    if max_results <= 0:
        return comments

    try:
        request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            # commentThreads.list documents maxResults as 1-100 per page, so
            # larger limits are served through pagination instead.
            maxResults=min(max_results, 100),
            textFormat="plainText"
        )
        # Stop as soon as enough comments are in hand or the pages run out
        # (list_next returns None after the last page).
        while request is not None and len(comments) < max_results:
            response = request.execute()
            for item in response.get('items', []):
                comments.append(
                    item['snippet']['topLevelComment']['snippet']['textDisplay']
                )
            request = youtube.commentThreads().list_next(request, response)
    except Exception as e:
        # Keep the original best-effort contract, but say what went wrong
        # instead of silently discarding the error.
        print(f"Could not fetch comments for video {video_id}: {e}")

    # A page may overshoot the limit; never hand back more than requested.
    return comments[:max_results]

In [4]:
# Function to compute average sentiment score
def compute_sentiment(comments):
    """Average the compound polarity score of every comment.

    Scores come from the module-level ``analyzer``. When ``comments`` is
    empty (or otherwise falsy) a notice is printed and ``None`` is returned.
    """
    if comments:
        compounds = []
        for text in comments:
            compounds.append(analyzer.polarity_scores(text)['compound'])

        # Guard for a truthy iterable that nevertheless yields nothing.
        if not compounds:
            return 0.0
        return sum(compounds) / len(compounds)

    print("No comments found.")
    return None


In [5]:
def fetch_example_comment(comments, target_score):
    """Return a short excerpt of the comment scoring nearest ``target_score``.

    Every comment is scored with the module-level ``analyzer``; the one whose
    compound score has the smallest absolute distance to ``target_score`` is
    chosen (the earliest comment wins a tie). The excerpt keeps only the first
    two '.'-separated segments and is capped at 130 characters, the last three
    being "..." when it had to be cut. Returns ``None`` for empty ``comments``.
    """
    if not comments:
        return None

    # Score everything up front, keeping texts and scores in parallel lists.
    texts = []
    compounds = []
    for text in comments:
        texts.append(text)
        compounds.append(analyzer.polarity_scores(text)['compound'])

    # min() returns the first index among equally close candidates.
    nearest = min(
        range(len(texts)),
        key=lambda i: abs(compounds[i] - target_score),
    )

    # Only the first two dot-delimited pieces are ever needed.
    leading_parts = texts[nearest].split(".", 2)[:2]
    excerpt = ".".join(leading_parts).strip()

    max_length = 130
    if len(excerpt) <= max_length:
        return excerpt
    return excerpt[:max_length - 3] + "..."

In [6]:
# Function to process each movie trailer
def process_trailers(api_key, df):
    """Fill ``sentiment_score`` and ``comment`` for every row of ``df``.

    For each row the video ID is taken from the ``trailer`` value, its
    comments are fetched, and the average sentiment plus one representative
    comment are written back. Rows with no usable trailer URL, no comments,
    or an ``HttpError`` during processing get ``None`` in both columns.

    Args:
        api_key: Developer key passed to the YouTube API client.
        df: DataFrame of movies; modified in place.

    Returns:
        The same ``df`` object.
    """
    from googleapiclient.errors import HttpError

    youtube = build('youtube', 'v3', developerKey=api_key)

    for index, row in df.iterrows():
        video_id = extract_video_id(row.get('trailer', None))

        if video_id:
            try:
                comments = fetch_comments(youtube, video_id)
                if comments:
                    # Success path: store both results and move to the next row.
                    score = compute_sentiment(comments)
                    df.at[index, 'sentiment_score'] = score
                    df.at[index, 'comment'] = fetch_example_comment(comments, score)
                    continue
            except HttpError:
                # e.g. comments disabled; fall through to the blank-out below.
                pass

        # Reached when there is no video ID, no comments, or an HttpError.
        df.at[index, 'sentiment_score'] = None
        df.at[index, 'comment'] = None

    return df

In [7]:
# Calculate and scale sentiment scores using z-scores and sigmoid function
def calculate_scaled_scores(df):
    """Rescale raw sentiment scores into (0, 1) and store them as strings.

    Each non-missing value of ``df['sentiment_score']`` is standardised with
    the column's mean and sample standard deviation (z-score), passed through
    the logistic sigmoid, and rendered as a string rounded to two decimals
    (e.g. ``"0.67"``). Missing values become ``None``.

    When the spread cannot be used -- only one scored row (the sample standard
    deviation is NaN) or all scores identical (it is 0) -- every scored row
    gets a z-score of 0 and therefore ``"0.50"``, rather than ``"nan"``.

    Args:
        df: DataFrame with a numeric ``sentiment_score`` column that may
            contain ``None``/NaN. Modified in place.

    Returns:
        The same ``df`` object, with ``sentiment_score`` replaced.
    """
    # The column is usually object dtype (floats mixed with None); convert it
    # once so the statistics run on plain floats with NaN for missing rows.
    raw = pd.to_numeric(df['sentiment_score'])
    valid = raw.dropna()

    mean = valid.mean()
    std = valid.std()
    # NaN != 0 is True, so NaN has to be excluded explicitly.
    has_spread = pd.notna(std) and std != 0

    def scale(value):
        z = (value - mean) / std if has_spread else 0.0
        return f"{1 / (1 + np.exp(-z)):.2f}"

    # dtype=object keeps missing entries as None next to the formatted strings.
    df['sentiment_score'] = pd.Series(
        [scale(value) if pd.notna(value) else None for value in raw],
        index=df.index,
        dtype=object,
    )
    return df

In [8]:
os.makedirs("csvs", exist_ok=True)

# Every "<franchise>_movies.csv" found in csvs/ is scored and then written
# back under the same name, so each input file is overwritten in place.
franchise_csvs = list(
    filter(lambda name: name.endswith('_movies.csv'), os.listdir("csvs"))
)
for franchise_csv in franchise_csvs:
    franchise_name = franchise_csv.replace('_movies.csv', '')
    csv_path = os.path.join("csvs", franchise_csv)

    # Start with empty result columns: one for the sentiment score, one for
    # the example comment. process_trailers fills them row by row.
    movies = pd.read_csv(csv_path).assign(sentiment_score=None, comment=None)

    # Fetch and score comments, then rescale the raw scores.
    movies = calculate_scaled_scores(process_trailers(api_key, movies))

    filename = f"csvs/{franchise_name}_movies.csv"
    movies.to_csv(filename, index=False)