#### combined_df has may 27th, 28th,29th data
#### text folder has all the transcribed text files

In [1]:
from pytube import YouTube, Channel
import whisper
from moviepy.editor import AudioFileClip
import os
from googleapiclient.discovery import build
from datetime import datetime, timedelta, timezone
import pandas as pd
import pytz
import numpy as np
import re
import asyncio
import requests

from dotenv import load_dotenv

In [2]:
# load environment variables from .env file
load_dotenv("API_KEYS.env")


True

In [3]:
API_KEY = os.getenv("YT_API_KEY")

### Create Data directories

In [6]:
def create_dirs(path):
    directory_path = path

    # Check if the directory exists
    if not os.path.exists(directory_path):
        # If it doesn't exist, create it
        os.makedirs(directory_path)
        print("Directory created:", directory_path)
    else:
        print("Directory already exists:", directory_path)

In [7]:
create_dirs('data')
create_dirs(os.path.join('data','audio'))
create_dirs(os.path.join('data','text'))

Directory already exists: data
Directory already exists: data\audio
Directory already exists: data\text


### Generate Video URLs

In [10]:
def generate_video_urls(channel_name, API_KEY= API_KEY):

    import requests

    #load your API key
    if not API_KEY:
        raise ValueError("API key not found. Please check your API_KEYS.env file.")

    # Create a YouTube API service
    youtube = build('youtube', 'v3', developerKey=API_KEY)

    # Set the YouTube channel ID
    search_channel_name = channel_name
    channel_id= requests.get(f'https://www.googleapis.com/youtube/v3/search?part=id&q={search_channel_name}&type=channel&key={API_KEY}').json()['items'][0]['id']['channelId']
    print(f"Channel ID for {search_channel_name}: {channel_id}")

    # Number of videos to fetch
    NUM_VIDEOS = 50

    # Fetch the latest videos for the channel
    request = youtube.search().list(
        part='snippet',
        
        channelId= channel_id,
        order='date',

        maxResults=NUM_VIDEOS,
        type='video',
        fields="items(id,snippet(publishedAt,channelId,channelTitle,title))"
    )
    response = request.execute()

    # we can also take the time right now and then fetch the videos released 24 hrs prior
    # tz='US/Eastern'
    # a= datetime.now()
    # a= a.astimezone(timezone(tz))

    # take the latest date when the video was released and fetch the videos released 24 hrs prior
    tz=pytz.timezone('US/Eastern')
    latest_date= datetime.strptime(response['items'][0]['snippet']['publishedAt'], "%Y-%m-%dT%H:%M:%SZ")
    latest_date= latest_date.astimezone(tz) #converting datetime to EST timezone

    video_urls = []
    selected_response = []
    video_ids = []
    date_published = []
    for item in response['items']:
        t= datetime.strptime(item['snippet']['publishedAt'], "%Y-%m-%dT%H:%M:%SZ")
        t= t.astimezone(tz)

        # if latest_date - t <= timedelta(days= 2):
        if (latest_date - t).days==0:
            video_urls.append('https://www.youtube.com/watch?v=' + item['id']['videoId'])
            selected_response.append(item)
            video_ids.append(item['id']['videoId'])
            date_published.append(str(t.date()))

            

    return video_urls, selected_response, video_ids, date_published


In [None]:
ursl, res, id, date=generate_video_urls('Bloomberg Television')

#### Generate Video URLs for the given list of channels

In [None]:
# list the channels taht you want to use to fetch the videos
list_channels= ['Yahoo finance', 'Bloomberg television', 'World economics Forum', 'The Economist', 'Financial Times', 'Coin Bureau', 'Reuters', 
                'The Wall Street Journal', 'Al Jazeera English', 'Washington Post' ]

# creating a dataframe to collect metadat for the fetched videos
final_df = pd.DataFrame(columns= ['video_urls', 'response_metadata','video_ids', 'publishedAt'])

for channel in list_channels:
    urls, response, ids, date = generate_video_urls(channel)
    df= pd.DataFrame(columns= ['video_urls', 'response_metadata','video_ids', 'publishedAt'])
    df.video_urls= urls
    df.response_metadata= response
    df.video_ids= ids
    df.publishedAt= date
    final_df = pd.concat([final_df,df])

final_df = final_df.reset_index(drop = True)

In [159]:
final_df['channel_name']= final_df.response_metadata.apply(lambda x: x['snippet']['channelTitle'])
# final_df['video_title']= final_df.response_metadata.apply(lambda x: x['snippet']['title'])

In [160]:
final_df.to_pickle('final_df.pkl')

### Download Youtube Videos

In [3]:
final_df = pd.read_pickle('final_df.pkl')

In [162]:
# Download audio function and converts mp4 video to mp3 audio

def download_audio(url, output_path='data\\audio'):

    # Handle unaccepted special characters to save the audio file and replace spaces with underscore
    def _handle_filenames(filename):
        import re
        new_string = re.sub('[^A-z0-9 -.]', '', filename).lower().replace(" ", "_")
        return new_string

    # Create a YouTube object with the URL
    yt = YouTube(url)
    filename= yt.title

    # Get the audio stream with the highest quality
    audio_stream = yt.streams.get_audio_only()

    # if the audio file exists do not download
    if os.path.exists(os.path.join(output_path, audio_stream.default_filename)):
        print(f'File already exists : {os.path.join(output_path, audio_stream.default_filename)}')
        mp3_filename = _handle_filenames(f"{filename}.mp3")
        mp3_file_path = os.path.join(output_path, mp3_filename)
        print(mp3_file_path)
        return mp3_file_path

    # Download the audio stream
    temp_file = audio_stream.download(output_path=output_path)

    # Load the downloaded file
    audio_clip = AudioFileClip(temp_file)

    # Set the filename for the MP3 file
    mp3_filename = filename if filename.endswith('.mp3') else f"{filename}.mp3"

    mp3_filename = _handle_filenames(mp3_filename)
    
    print(mp3_filename)
    mp3_file_path = os.path.join(output_path, mp3_filename)

    # Convert to MP3 and save

    audio_clip.write_audiofile(mp3_file_path, codec="libmp3lame")

    # Remove the temporary file
    # os.remove(temp_file)

    print(f"Downloaded and converted to MP3: {mp3_file_path}")
    return mp3_file_path
# Example usage


In [163]:
# extracts the video title from the video urls in the dataframe

def get_video_title(url):
    # Create a YouTube object with the URL
    yt = YouTube(url)
    return yt.title


In [164]:
# extracts puclishedAt date from the video urls in the dataframe
# this function is not being used

def get_date(item):
    tz=pytz.timezone('US/Eastern')
    t= datetime.strptime(item['snippet']['publishedAt'], "%Y-%m-%dT%H:%M:%SZ")
    t= t.astimezone(tz)
    return str(t.date())



#### Download audio files for the extracted Youtube video URLs

In [None]:
final_df['mp3_file_path'] = np.nan
for i,url in enumerate(final_df.video_urls):
    try:
        final_df.loc[i, 'video_title'] = get_video_title(url)
        file_path = download_audio(url= url)
        final_df.loc[i, 'mp3_file_path']= file_path
    except Exception as e:
        print(repr(e))
        continue

In [166]:
final_df.to_pickle('final_df.pkl')

In [167]:
final_df[final_df['mp3_file_path'].isna()]

Unnamed: 0,video_urls,response_metadata,video_ids,publishedAt,channel_name,mp3_file_path,video_title
35,https://www.youtube.com/watch?v=jljPk2qrw1k,"{'id': {'kind': 'youtube#video', 'videoId': 'j...",jljPk2qrw1k,2024-05-28,Bloomberg Television,,"Biden's chances at reelection ""diminished"" say..."
46,https://www.youtube.com/watch?v=g6uQU_LnCso,"{'id': {'kind': 'youtube#video', 'videoId': 'g...",g6uQU_LnCso,2024-05-28,Bloomberg Television,,Markets and iPhones | Bloomberg Surveillance |...
130,https://www.youtube.com/watch?v=kL_ookoWoIo,"{'id': {'kind': 'youtube#video', 'videoId': 'k...",kL_ookoWoIo,2024-05-28,Al Jazeera English,,Rafah's main hospital shuts as Israel attacks ...
142,https://www.youtube.com/watch?v=aGZqN4914jc,"{'id': {'kind': 'youtube#video', 'videoId': 'a...",aGZqN4914jc,2024-05-28,Al Jazeera English,,Kuwaiti Hospital in Rafah forced to shut down ...


In [168]:
# final_df[final_df['mp3_file_path'].isna()]                                                
final_df= final_df.dropna(how= 'any')
final_df= final_df.reset_index(drop=True)
final_df[final_df['mp3_file_path'].isna()]

Unnamed: 0,video_urls,response_metadata,video_ids,publishedAt,channel_name,mp3_file_path,video_title


In [169]:
final_df.to_pickle('final_df.pkl')

In [170]:
from IPython.display import Audio
Audio(final_df.mp3_file_path[0]) # To check whether the mp3 file plays

### Transcribe : OpenAI - Whisper using async IO

In [100]:
final_df['mp3_file_path'].tolist()[5:8]

['data\\audio\\israel-hamas_war_assault_on_rafah_kills_dozens__bloomberg_the_pulse_052724.mp3',
 "data\\audio\\south_africa_elections_tourism_minister_on_ramaphosa's_future_cabinet.mp3",
 "data\\audio\\why_uk_stock_market_is_'attractive'_according_to_lombard_odier.mp3"]

In [171]:
import whisper
import os
import asyncio

async def transcribe_audio(audio_file):
    model = whisper.load_model("small")  # You can choose other models like 'tiny', 'small', 'medium', 'large'
    result = model.transcribe(audio_file)

    return result

async def main():
    # audio_files = [f for f in os.listdir("data/audio") if f.endswith('.mp3')][0:2]
    audio_files = final_df['mp3_file_path'].tolist()
    tasks = [transcribe_audio(audio_file) for audio_file in audio_files]
    transcriptions = await asyncio.gather(*tasks)

    txt_file_paths = []
    for audio_file, transcription in zip(audio_files, transcriptions):
        base_path = os.path.join('data', 'text')
        filename = (audio_file.split('\\')[2]).replace('.mp3','.txt')
        file_path = os.path.join(base_path, filename)
        txt_file_paths.append(file_path)
        with open(file_path, "w") as f:
            f.write(transcription['text'])
        print(f'Text file saved to  : {file_path}')

    # final_df['txt_file_path'] = txt_file_paths

if __name__ == "__main__":
    asyncio.run(main())


Text file saved to  : data\text\stock_market_is_growth_oriented_with_room_for_earnings_to_expand,_analyst_says.txt
Text file saved to  : data\text\how_americans_feel_about_the_economy_versus_what_the_data_shows.txt
Text file saved to  : data\text\sam_bankman-fried's_doing_surprisingly_well_in_prison_reporter.txt
Text file saved to  : data\text\inflation_is_having_a_'creeper_effect'_on_all_consumer_income_levels,_analyst_says.txt
Text file saved to  : data\text\fed_could_start_cutting_rates_as_early_as_september,_asset_manager_says.txt
Text file saved to  : data\text\tesla_ceo_elon_musk_new_china_ev_tariffs_are_not_good_#shorts.txt
Text file saved to  : data\text\white_house_press_secretary_karine_jean-pierre_holds_briefing.txt
Text file saved to  : data\text\home_prices_hit_new_9th_consecutive_all-time-high_in_march_s&p_corelogic_case-shiller_index.txt
Text file saved to  : data\text\3_places_to_invest_your_money_right_now_cashing_in_on_a_rally.txt
Text file saved to  : data\text\stock

In [172]:
base_path = os.path.join('data', 'text')
final_df['txt_file_path'] = final_df['mp3_file_path'].apply(lambda x: 
                                                            os.path.join(base_path, x.split('\\')[2].
                                                            replace('.mp3', '.txt')))

In [173]:
final_df.to_pickle('final_df.pkl')

### Transcribe : OpenAI - Whisper

In [104]:
def transcribe_audio(audio_file):

    model = whisper.load_model("small")  # You can choose other models like 'tiny', 'small', 'medium', 'large'
    result = model.transcribe(audio_file)
   
    return result

In [None]:

final_df['txt_file_path'] = np.nan
for i in range(3):
    result= transcribe_audio(audio_file= final_df.loc[i, 'mp3_file_path'])
    # print(f'Transcription of file {final_df.loc[i, 'mp3_file_path']} complete')

    base_path = os.path.join('data', 'text')
    filename = (final_df.loc[i, 'video_title'])
    filename = re.sub('[^A-z0-9 -.]', '', filename).lower().replace(" ", "_")
    filename = filename + '.txt'
    file_path = os.path.join(base_path, filename)

    final_df.loc[i, 'txt_file_path']= file_path
    print(i)   
    
    with open(file_path, "w") as file:
        file.write(result['text'])
    print(f'Text file saved to  : {file_path}')



#### Extract Comments

In [179]:
def get_youtube_comments(video_id, max_results=50):
    # Build the YouTube API client
    API_KEY = "AIzaSyATIBuk9YgT0xznkIsLVGnsZTknMrVGrgE" #other email
    youtube = build('youtube', 'v3', developerKey=API_KEY)
    
    # Make a request to the YouTube API to get comment threads
    request = youtube.commentThreads().list(
        part='snippet',
        videoId=video_id,
        maxResults=max_results,
        textFormat='plainText'
    )
    response = request.execute()
    
    comments = []
    
    # Extract comments from the response
    for item in response['items']:
        comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
        comments.append(comment)
    
    return comments

# Replace with your video ID
# video_id = 'VIDEO_ID'


In [None]:
final_df['list_of_comments'] = [[] for _ in range(len(final_df))]

# Function to retrieve comments and handle exceptions
def safe_get_comments(video_id):
    try:
        return get_youtube_comments(video_id)
    except Exception as e:
        print(f"Error retrieving comments for video ID {video_id}: {repr(e)}")
        return []

# Apply the function to each video ID in the DataFrame
final_df['list_of_comments'] = final_df['video_ids'].apply(safe_get_comments)

In [181]:
final_df['list_of_comments'].apply(len).max()

50

In [210]:
# final_df[final_df['list_of_comments'] == 'dGKsedppvgo']
final_df[final_df['list_of_comments'].apply(lambda x: len(x) == 0)]

Unnamed: 0,video_urls,response_metadata,video_ids,publishedAt,channel_name,mp3_file_path,video_title,txt_file_path,list_of_comments
9,https://www.youtube.com/watch?v=2cruhc8pyBc,"{'id': {'kind': 'youtube#video', 'videoId': '2...",2cruhc8pyBc,2024-05-28,Yahoo Finance,data\audio\stock_market_today_stocks_mixed_as_...,Stock market today: Stocks mixed as focus turn...,data\text\stock_market_today_stocks_mixed_as_f...,[]
22,https://www.youtube.com/watch?v=acXt2_ZbZVg,"{'id': {'kind': 'youtube#video', 'videoId': 'a...",acXt2_ZbZVg,2024-05-29,Bloomberg Television,data\audio\bloomberg_daybreak_asia_05292024.mp3,Bloomberg Daybreak: Asia 05/29/2024,data\text\bloomberg_daybreak_asia_05292024.txt,[]
...,...,...,...,...,...,...,...,...,...
103,https://www.youtube.com/watch?v=gkIXNspkJ34,"{'id': {'kind': 'youtube#video', 'videoId': 'g...",gkIXNspkJ34,2024-05-28,Reuters,data\audio\live_ukrainian_president_volodymyr_...,LIVE: Ukrainian President Volodymyr Zelenskiy ...,data\text\live_ukrainian_president_volodymyr_z...,[]
104,https://www.youtube.com/watch?v=TLxCW2nWcM0,"{'id': {'kind': 'youtube#video', 'videoId': 'T...",TLxCW2nWcM0,2024-05-28,Reuters,data\audio\live_ukrainian_president_volodymyr_...,LIVE: Ukrainian President Volodymyr Zelenskiy ...,data\text\live_ukrainian_president_volodymyr_z...,[]


In [183]:
final_df.to_pickle('final_df.pkl')

In [4]:
final_df_2= pd.read_pickle('final_df_2.pkl')

In [5]:
combined_df= pd.concat([final_df_2, final_df], ignore_index=True)

In [12]:
combined_df.to_pickle('combined_df.pkl')

In [8]:
combined_df[combined_df.duplicated(subset=['txt_file_path'])]

Unnamed: 0,video_urls,response_metadata,video_ids,publishedAt,channel_name,mp3_file_path,video_title,list_of_comments,txt_file_path
143,https://www.youtube.com/watch?v=dGKsedppvgo,"{'id': {'kind': 'youtube#video', 'videoId': 'd...",dGKsedppvgo,2024-05-27,World Economic Forum,data\audio\special_meeting_2024_is_education_r...,Special Meeting 2024: Is Education Ready for AI?,[],data\text\special_meeting_2024_is_education_re...
144,https://www.youtube.com/watch?v=2oOVIfdRLHw,"{'id': {'kind': 'youtube#video', 'videoId': '2...",2oOVIfdRLHw,2024-05-24,The Economist,data\audio\natos_boss_wants_to_free_ukraine_to...,NATO’s boss wants to free Ukraine to strike in...,"[A. This Clown is not a Boss of NATO, he's wor...",data\text\natos_boss_wants_to_free_ukraine_to_...
145,https://www.youtube.com/watch?v=hwglagHMQ_E,"{'id': {'kind': 'youtube#video', 'videoId': 'h...",hwglagHMQ_E,2024-05-27,Financial Times,data\audio\will_tiktok_be_banned_in_the_us__ft...,Will TikTok be banned in the US? | FT #shorts,"[But didn't Trump ban tik tok as well?, Remove...",data\text\will_tiktok_be_banned_in_the_us__ft_...


In [10]:
combined_df= combined_df.drop_duplicates(subset=['txt_file_path'])