In [None]:
import os
import pandas as pd
import gc
from yt_dlp import YoutubeDL
from youtube_transcript_api import YouTubeTranscriptApi

# Function to load the CSV file into a DataFrame
def load_csv_as_dataframe(file_path):
    """Read the CSV at ``file_path`` into a DataFrame.

    The file is parsed with ``pd.read_csv`` using its default settings. Any
    exception raised while reading is reported on stdout and swallowed.

    Args:
        file_path: Location of the CSV file to read.

    Returns:
        The loaded DataFrame, or ``None`` when reading failed.
    """
    try:
        return pd.read_csv(file_path)
    except Exception as load_error:
        print(f"Error loading CSV file: {load_error}")
    return None

# Function to download YouTube videos and save to specified path
def download_youtube_videos(df, video_id_column, save_path, batch_size=10):
    """Fill in video metadata/transcripts on ``df`` and download each video file.

    For every row whose ``video_title``, ``description`` or ``transcriptions``
    value is missing, the title and description are read through ``YoutubeDL``
    and the transcript through ``YouTubeTranscriptApi``. Each video ID is
    looked up at most once per call, and the result is written to every row
    that shares the ID. Afterwards the video is downloaded unless
    ``<save_path>/<video_id>.mp4`` already exists.

    Args:
        df: DataFrame to process. It is mutated in place: the three columns
            named above are created when absent and then filled.
        video_id_column: Name of the column holding the YouTube video IDs.
        save_path: Directory receiving the downloaded files; created if missing.
        batch_size: Number of rows processed before the collected details are
            written back to ``df``. Must be at least 1.

    Returns:
        A tuple ``(df, failed_video_ids)``. ``failed_video_ids`` is a list of
        the IDs whose metadata lookup or download raised an exception.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    os.makedirs(save_path, exist_ok=True)

    failed_videos = set()  # IDs whose metadata lookup or download raised
    fetched_ids = set()    # IDs already looked up, so duplicate rows do not hit the network again

    # Ensure the required columns exist
    for column in ('video_title', 'description', 'transcriptions'):
        if column not in df.columns:
            df[column] = None

    ydl_opts = {
        'outtmpl': f'{save_path}/%(id)s.%(ext)s',
        'format': 'best',
        'quiet': True
    }

    # Process in batches
    for start in range(0, len(df), batch_size):
        batch_df = df.iloc[start:start + batch_size]
        batch_details = {}  # only what this batch fetched; earlier batches are already in df

        for index, row in batch_df.iterrows():
            video_id = row[video_id_column]
            if video_id in failed_videos:
                continue

            missing_metadata = (
                pd.isna(row['video_title'])
                or pd.isna(row['description'])
                or pd.isna(row['transcriptions'])
            )
            if missing_metadata and video_id not in fetched_ids:
                try:
                    print(f"Trying to get title and description for video ID {video_id} at index {index}")
                    batch_details[video_id] = _fetch_video_details(video_id)
                    fetched_ids.add(video_id)
                except Exception as e:
                    print(f"Error retrieving metadata for video {video_id}: {e}")
                    failed_videos.add(video_id)
                    continue

            # Download video if it doesn't exist.
            # NOTE(review): the check assumes an ".mp4" file, but the output
            # template uses %(ext)s; a download saved with another extension
            # would not be detected here — confirm the expected container.
            video_file_path = os.path.join(save_path, f"{video_id}.mp4")
            if not os.path.exists(video_file_path):
                try:
                    with YoutubeDL(ydl_opts) as ydl:
                        ydl.download([f'https://www.youtube.com/watch?v={video_id}'])
                    print(f"Downloaded: {video_id}")
                except Exception as e:
                    print(f"Error downloading video {video_id}: {e}")
                    failed_videos.add(video_id)
                    continue

        # Write this batch's details to every row sharing the video ID
        for video_id, details in batch_details.items():
            same_video = df[video_id_column] == video_id
            df.loc[same_video, 'video_title'] = details['title']
            df.loc[same_video, 'description'] = details['description']
            df.loc[same_video, 'transcriptions'] = details['transcript']

    return df, list(failed_videos)


def _fetch_video_details(video_id):
    """Return the title, description and transcript text of one video as a dict.

    Exceptions from the metadata extraction propagate to the caller. A failing
    transcript lookup does not raise: the transcript is replaced by a
    "Transcript not available: ..." message instead.
    """
    with YoutubeDL({'quiet': True}) as ydl:
        info_dict = ydl.extract_info(f'https://www.youtube.com/watch?v={video_id}', download=False)
        title = info_dict.get('title', None)
        description = info_dict.get('description', None)

    # NOTE(review): newer youtube-transcript-api releases appear to have changed
    # this entry point; if get_transcript is unavailable, every transcript would
    # silently become "Transcript not available: ..." — confirm the installed version.
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        transcript_text = " ".join(item['text'] for item in transcript)
    except Exception as e:
        transcript_text = f"Transcript not available: {e}"

    return {
        'title': title,
        'description': description,
        'transcript': transcript_text
    }

# Prefix prepended to every input/output location ("" leaves them relative
# to the current working directory).
path = ""

# Column holding the YouTube IDs, the input CSV, and the output locations.
video_id_column = 'video_id'
file_path = path + "NEW_IMSyPP_EN_YouTube_comments_evaluation_context_1517_PREPROCESSED.csv"
save_path = path + "NEW_Hatebase_dataset_downloaded_videos_THIRD_TEST"
intermediate_path = path + "intermediate_csvs_TEST/"

for output_dir in (save_path, intermediate_path):
    os.makedirs(output_dir, exist_ok=True)

# The input file is read as semicolon-separated.
df = pd.read_csv(file_path, sep=';')

# pd.read_csv raises on failure instead of returning None, so this guard is
# expected to always pass; it is kept unchanged.
if df is not None:
    updated_df, failed_videos = download_youtube_videos(df, video_id_column, save_path)

    if failed_videos:
        # Drop every row that belongs to a video that could not be processed,
        # then store the filtered frame under the intermediate name.
        is_failed_row = updated_df[video_id_column].isin(failed_videos)
        updated_df = updated_df[~is_failed_row]
        updated_df.to_csv(
            path + 'NEW_IMSyPP_EN_YouTube_comments_evaluation_context_1517_PREPROCESSED_no_restricted_videos_inte_THIRD_TEST.csv',
            index=False,
        )

    # Final output, written whether or not any video failed.
    updated_df.to_csv(
        path + 'NEW_IMSyPP_EN_YouTube_comments_evaluation_context_1517_PREPROCESSED_no_restricted_videos_THIRD_TEST.csv',
        index=False,
    )
    print(updated_df.head())
    print(f"Failed videos: {failed_videos}")


In [None]:
# Display the DataFrame produced by the download cell above.
updated_df

In [None]:
# Failed videos when downloading with yt_dlp's YoutubeDL: 'lTSetpETzFA', '_MDo0UhgIys', '0gx2FJIITFo'

In [4]:
# Save the current updated_df as CSV without the index column (pandas' default
# comma separator is used).
# NOTE(review): this file name says 1483 while the input file name says 1517 —
# presumably row counts; confirm the name matches the data being written.
updated_df.to_csv(path + 'NEW_IMSyPP_EN_YouTube_comments_evaluation_context_1483_PREPROCESSED_no_restricted_videos_THIRD_TEST-redownloaded_with_yt_dlp.csv', index=False)

In [8]:
# Independent copy of updated_df, used as the starting point of the cleaning cells below.
df1 = updated_df.copy()

In [9]:
import pandas as pd
# Lift pandas' display limits: None means "no limit", so all rows, all
# columns and the full content of every cell are shown.
for display_option in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [None]:
# Label printed before the filtering step (the frame itself is not printed).
print("DataFrame before removing empty 'video_title' rows:")

# Keep rows whose 'video_title' is neither missing nor an empty string ...
has_title = df1['video_title'].notna() & (df1['video_title'] != '')
df2 = df1[has_title]

# ... and, of those, rows whose 'transcriptions' is neither missing nor empty.
has_transcription = df2['transcriptions'].notna() & (df2['transcriptions'] != '')
df_cleaned = df2[has_transcription]

# Report how many rows survived both filters.
print("\nDataFrame after removing empty 'video_title' rows:")
print(len(df_cleaned))

In [None]:
# Function to remove rows with "Transcript not available:" in the transcriptions column
def remove_unavailable_transcripts(df, transcription_column='transcriptions'):
    """Return ``df`` without the rows flagged as having no transcript.

    A row is dropped when its ``transcription_column`` value contains the text
    "Transcript not available:". Rows with a missing value in that column are
    kept, since missing values are treated as not containing the text.

    Args:
        df: DataFrame to filter; it is not modified.
        transcription_column: Name of the column holding the transcript text.

    Returns:
        A filtered DataFrame.
    """
    is_unavailable = df[transcription_column].str.contains("Transcript not available:", na=False)
    return df.loc[~is_unavailable]



# Drop the rows whose transcription carries the "Transcript not available:" text,
# then show how many rows remain.
df_cleaned_trans = remove_unavailable_transcripts(df_cleaned)
len(df_cleaned_trans)

In [None]:

# Function to remove rows where "type_reply" is None or NaN
def remove_none_nan_type_reply(df):
    """Return ``df`` without rows that miss any of the required fields.

    Despite the name, four columns are checked: a row is dropped when its
    ``type_reply``, ``transcriptions``, ``reply`` or ``comment`` value is
    missing. The input frame is not modified.
    """
    required_columns = ['type_reply', 'transcriptions', 'reply', 'comment']
    return df.dropna(subset=required_columns)


# Drop rows missing type_reply, transcriptions, reply or comment, then show
# how many rows remain.
df_trans_nans = remove_none_nan_type_reply(df_cleaned_trans)
len(df_trans_nans)

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

# Work on a copy so the helper column is not added to df_trans_nans itself.
df_trans_nans_token = df_trans_nans.copy()

# Number of whitespace-separated tokens in every transcription.
df_trans_nans_token['token_count'] = df_trans_nans_token['transcriptions'].map(
    lambda text: len(text.split())
)

# Summary statistics, including the low and high percentiles.
print("Detailed Statistics:")
token_percentiles = [.01, .05, .10, .25, .50, .75, .90, .95, .99]
print(df_trans_nans_token['token_count'].describe(percentiles=token_percentiles))

# Number of bins passed to the histogram: one per 2000 tokens up to the
# largest observed count.
bin_size = 2000
max_tokens = df_trans_nans_token['token_count'].max()
num_bins = int((max_tokens / bin_size) + 1)

# Interactive histogram of the token counts.
fig = px.histogram(
    df_trans_nans_token,
    x='token_count',
    nbins=num_bins,
    title='Granular Distribution of Token Counts per Transcription',
)

# Axis labels and spacing between the bars.
fig.update_layout(
    xaxis_title='Number of Tokens',
    yaxis_title='Frequency (Number of Texts)',
    bargap=0.2,
)

# Hover text showing the token value and its frequency.
fig.update_traces(
    hovertemplate='Number of Tokens: %{x}<br>Frequency: %{y}<extra></extra>',
)

fig.show()

In [None]:
# Recompute the whitespace token count of every transcription.
df_trans_nans_token['token_count'] = df_trans_nans_token['transcriptions'].map(
    lambda text: len(text.split())
)

# Label only; the frame itself is not printed.
print("Original DataFrame:")

# Keep the transcriptions of at most 2000 tokens.
within_token_limit = df_trans_nans_token['token_count'] <= 2000
df_trans_nans_token = df_trans_nans_token[within_token_limit]

# The helper column is not part of the final dataset; show the final row count.
df_trans_nans_token_FINAL = df_trans_nans_token.drop(columns=['token_count'])
len(df_trans_nans_token_FINAL)

In [None]:
# Write the final filtered dataset as a semicolon-separated CSV.
# NOTE(review): unlike the earlier to_csv calls in this notebook, index=False is
# not passed here, so the DataFrame index is written as an extra first column —
# confirm this is intended. The "697" in the file name is hard-coded; presumably
# the row count of one particular run.
df_trans_nans_token_FINAL.to_csv('FINAL_IMSyPP_EN_697_PREPROCESSED_no_restricted_videos_features_2ktoken_THIRD_TEST-redownloaded_with_yt_dlp.csv', sep = ';')