In [2]:
from pytube import YouTube, Channel

import whisper

from moviepy.editor import AudioFileClip
import os


#### Create Data directories

In [35]:
def create_dirs(path):
    directory_path = path

    # Check if the directory exists
    if not os.path.exists(directory_path):
        # If it doesn't exist, create it
        os.makedirs(directory_path)
        print("Directory created:", directory_path)
    else:
        print("Directory already exists:", directory_path)

In [39]:
create_dirs('data')
create_dirs(os.path.join('data','audio'))
create_dirs(os.path.join('data','text'))

Directory created: data2
Directory created: data2\audio
Directory created: data2\text


#### Download Youtube Videos

In [15]:
def download_audio(url, output_path='data\\audio'):
    # Create a YouTube object with the URL
    yt = YouTube(url)
    filename= yt.title

    # Get the audio stream with the highest quality
    audio_stream = yt.streams.get_audio_only()

    # Download the audio stream
    temp_file = audio_stream.download(output_path=output_path)

    # Load the downloaded file
    audio_clip = AudioFileClip(temp_file)

    # Set the filename for the MP3 file
    mp3_filename = filename if filename.endswith('.mp3') else f"{filename}.mp3"

    mp3_file_path = os.path.join(output_path, mp3_filename)
    # mp3_file_path= mp3_file_path.replace(os.sep, '/')

    # Convert to MP3 and save
    audio_clip.write_audiofile(mp3_file_path, codec='mp3')

    # Remove the temporary file
    os.remove(temp_file)

    print(f"Downloaded and converted to MP3: {mp3_file_path}")
    return mp3_file_path
# Example usage


In [26]:
video_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"  # Replace 'example' with the actual video ID
audio_file_path= download_audio(url= video_url)

MoviePy - Writing audio in data\audio\Never Gonna Give You Up.mp3


                                                                      

MoviePy - Done.
Downloaded and converted to MP3: data\audio\Never Gonna Give You Up.mp3




In [27]:
print(audio_file_path)
print(os.path.basename(audio_file_path).split('.mp3')[0])

data\audio\Never Gonna Give You Up.mp3
Never Gonna Give You Up


In [None]:
# print(yt.title)
# print(yt.publish_date)
# print(yt.check_availability())
# print(yt.rating)
# # print(yt.streaming_data)
# print(yt.author)

In [28]:
from IPython.display import Audio
Audio(audio_file_path)

In [33]:
base_path= 'data\\text'
filename= (os.path.basename(audio_file_path).split('.mp3')[0]) + '.txt'
os.path.join(base_path, filename)


'data\\text\\Never Gonna Give You Up.txt'

#### Transcribe : OpenAI - Whisper 

In [40]:
def transcribe_audio(audio_file):

    model = whisper.load_model("small")  # You can choose other models like 'tiny', 'small', 'medium', 'large'
    result = model.transcribe(audio_file)
    print(result["text"])
    
    base_path = os.path.join('data', 'text')
    filename= (os.path.basename(audio_file_path).split('.mp3')[0]) + '.txt'
    file_path = os.path.join(base_path, filename)
    with open(file_path, "w") as file:
        file.write(result["text"])
    print(f'Text file saved to  : {file_path}')

    return 

In [41]:
transcribe_audio(audio_file= audio_file_path)

#save text into a text file
# data\text\os.path.basename(audio_file_name)


 Music We're no strangers to love You know the rules and so do I I feel commitments while I'm thinking of You wouldn't get this from any other guy I just wanna tell you how I'm feeling Gotta make you understand Never gonna give you up Never gonna let you down Never gonna run around and desert you Never gonna make you cry Never gonna say goodbye Never gonna tell goodbye And hurt you We've known each other for so long Your heart's been aching but You're too shy to say it It's how we both know what's been going on We know the game and we're gonna play it And if you ask me how I'm feeling Don't tell me or to my disease Never gonna give you up Never gonna let you down Never gonna run around and desert you Never gonna make you cry Never gonna say goodbye Never gonna tell goodbye And hurt you Never gonna give you up Never gonna let you down Never gonna run around and desert you Never gonna make you cry Never gonna say goodbye Never gonna tell goodbye And hurt you Give you up Give you up Never

### Sandbox

In [71]:
from googleapiclient.discovery import build
from datetime import datetime, timedelta

# Set your API key
API_KEY = 'AIzaSyBTuh0ypeNqiDVhNIvn5ofhuIvx8qmt-r4'

# Set the YouTube channel ID
CHANNEL_ID = 'UCEAZeUIeJs0IjQiqTCdVSIg'

# Number of videos to fetch
NUM_VIDEOS = 120

# Create a YouTube API service
youtube = build('youtube', 'v3', developerKey=API_KEY)

target_date = datetime(2024, 5, 12)
target_date_str = target_date.strftime('%Y-%m-%dT00:00:00Z')

# Fetch the latest videos for the channel
request = youtube.search().list(
    part='snippet',
    
    channelId=CHANNEL_ID,
    order='date',

    maxResults=NUM_VIDEOS,
    type='video'
)
response = request.execute()

# Extract video URLs from the response
video_urls = ['https://www.youtube.com/watch?v=' + item['id']['videoId'] for item in response['items']]

# Print the list of video URLs
for url in video_urls:
    print(url)

#    publishedAfter=target_date_str,publishedBefore=target_date_str + 'T23:59:59Z',

https://www.youtube.com/watch?v=O9gGIZOFFCM
https://www.youtube.com/watch?v=L_wVEmFtnCc
https://www.youtube.com/watch?v=D88oKOE0GDw
https://www.youtube.com/watch?v=vpd-APW9R-4
https://www.youtube.com/watch?v=d38DvjSTLYc
https://www.youtube.com/watch?v=SPqUcP0dh2U
https://www.youtube.com/watch?v=dHaKddYAWoA
https://www.youtube.com/watch?v=HIZ9oNygHAk
https://www.youtube.com/watch?v=aPvIXQuGPMc
https://www.youtube.com/watch?v=jfT7rMVDy-E
https://www.youtube.com/watch?v=AQzV_cNrtsU
https://www.youtube.com/watch?v=RF7TG2HmMX0
https://www.youtube.com/watch?v=glyvUMEof3E
https://www.youtube.com/watch?v=gpI2fLVodEE
https://www.youtube.com/watch?v=Qe2wumx7M9w
https://www.youtube.com/watch?v=r_5QrepMjOs
https://www.youtube.com/watch?v=FLImrvqcchw
https://www.youtube.com/watch?v=in7wGXFe5Gw
https://www.youtube.com/watch?v=Ac7Z5A86emw
https://www.youtube.com/watch?v=FrNMDOI5iK0
https://www.youtube.com/watch?v=jhIW_e1Md_g
https://www.youtube.com/watch?v=T9lO_ksB3w8
https://www.youtube.com/watch?v=

In [72]:
d=[]
for item in response['items']:
    # print(item['snippet']['publishTime'])
    print((datetime.strptime(item['snippet']['publishTime'], "%Y-%m-%dT%H:%M:%SZ")))
    d.append((datetime.strptime(item['snippet']['publishTime'], "%Y-%m-%dT%H:%M:%SZ")))
    

2024-05-14 02:00:13
2024-05-14 02:00:13
2024-05-14 00:00:04
2024-05-13 23:00:33
2024-05-13 22:43:02
2024-05-13 22:28:04
2024-05-13 22:23:36
2024-05-13 22:07:32
2024-05-13 22:02:17
2024-05-13 21:57:28
2024-05-13 21:12:11
2024-05-13 20:04:30
2024-05-13 19:40:10
2024-05-13 19:27:50
2024-05-13 18:53:22
2024-05-13 18:48:47
2024-05-13 18:32:29
2024-05-13 18:28:59
2024-05-13 16:54:38
2024-05-13 16:45:13
2024-05-13 16:43:25
2024-05-13 15:52:37
2024-05-13 15:27:14
2024-05-13 14:49:45
2024-05-13 14:02:03
2024-05-13 12:43:09
2024-05-12 20:00:14
2024-05-12 18:00:20
2024-05-12 16:00:39
2024-05-12 14:00:03
2024-05-12 12:00:05
2024-05-12 00:00:07
2024-05-11 20:00:02
2024-05-11 18:00:18
2024-05-11 16:00:17
2024-05-11 14:00:43
2024-05-11 02:00:24
2024-05-11 02:00:02
2024-05-11 00:45:21
2024-05-10 23:00:15
2024-05-10 21:25:36
2024-05-10 21:22:05
2024-05-10 21:17:43
2024-05-10 21:09:47
2024-05-10 20:09:02
2024-05-10 19:51:24
2024-05-10 18:45:45
2024-05-10 18:00:13
2024-05-10 16:38:00
2024-05-10 16:29:58


In [74]:
for item in response['items']:
    # print(item['id']['videoId'])

    t= datetime.strptime(item['snippet']['publishTime'], "%Y-%m-%dT%H:%M:%SZ")

    if t >= d[0] + timedelta(days=-1):
        print('https://www.youtube.com/watch?v=' + item['id']['videoId']) 
        print(t)

https://www.youtube.com/watch?v=O9gGIZOFFCM
2024-05-14 02:00:13
https://www.youtube.com/watch?v=L_wVEmFtnCc
2024-05-14 02:00:13
https://www.youtube.com/watch?v=D88oKOE0GDw
2024-05-14 00:00:04
https://www.youtube.com/watch?v=vpd-APW9R-4
2024-05-13 23:00:33
https://www.youtube.com/watch?v=d38DvjSTLYc
2024-05-13 22:43:02
https://www.youtube.com/watch?v=SPqUcP0dh2U
2024-05-13 22:28:04
https://www.youtube.com/watch?v=dHaKddYAWoA
2024-05-13 22:23:36
https://www.youtube.com/watch?v=HIZ9oNygHAk
2024-05-13 22:07:32
https://www.youtube.com/watch?v=aPvIXQuGPMc
2024-05-13 22:02:17
https://www.youtube.com/watch?v=jfT7rMVDy-E
2024-05-13 21:57:28
https://www.youtube.com/watch?v=AQzV_cNrtsU
2024-05-13 21:12:11
https://www.youtube.com/watch?v=RF7TG2HmMX0
2024-05-13 20:04:30
https://www.youtube.com/watch?v=glyvUMEof3E
2024-05-13 19:40:10
https://www.youtube.com/watch?v=gpI2fLVodEE
2024-05-13 19:27:50
https://www.youtube.com/watch?v=Qe2wumx7M9w
2024-05-13 18:53:22
https://www.youtube.com/watch?v=r_5QrepM

In [59]:
d[0] + timedelta(days=-2) 

datetime.datetime(2024, 5, 12, 2, 0, 13)

In [29]:
response['items']

[{'kind': 'youtube#searchResult',
  'etag': 'Q8SfxNPXTae-Xe0GWLLV0UkSpYI',
  'id': {'kind': 'youtube#video', 'videoId': 'O9gGIZOFFCM'},
  'snippet': {'publishedAt': '2024-05-14T02:00:13Z',
   'channelId': 'UCEAZeUIeJs0IjQiqTCdVSIg',
   'title': 'Inflation history: How past price increases and monetary policy impacted the consumer',
   'description': 'Inflation is not a new phenomenon for the U.S. economy. Although the Federal Reserve has sustained a higher-for-longer inflation ...',
   'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/O9gGIZOFFCM/default.jpg',
     'width': 120,
     'height': 90},
    'medium': {'url': 'https://i.ytimg.com/vi/O9gGIZOFFCM/mqdefault.jpg',
     'width': 320,
     'height': 180},
    'high': {'url': 'https://i.ytimg.com/vi/O9gGIZOFFCM/hqdefault.jpg',
     'width': 480,
     'height': 360}},
   'channelTitle': 'Yahoo Finance',
   'liveBroadcastContent': 'none',
   'publishTime': '2024-05-14T02:00:13Z'}},
 {'kind': 'youtube#searchResult',
  'etag': 

In [17]:
import requests
from bs4 import BeautifulSoup
import re

def get_channel_id(channel_url):
    response = requests.get(channel_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    script_tag = soup.find('script', string=re.compile(r'"channelId":'))
    if script_tag:
        match = re.search(r'"channelId":"([^"]+)"', script_tag.string)
        if match:
            return match.group(1)
    return None

channel_url = "https://www.youtube.com/c/YahooFinance"
channel_id = get_channel_id(channel_url)
print(f"Channel ID: {channel_id}")


Channel ID: UCEAZeUIeJs0IjQiqTCdVSIg


In [None]:

from datetime import datetime

# Function to download videos from a channel on a given date
def download_videos_on_date(channel_url, target_date):
    # Create a Channel object
    channel = Channel(channel_url)

    # Iterate through the videos of the channel
    for video_url in channel.video_urls:
        try:
            # Create a YouTube object for the video
            yt = YouTube(video_url)

            # Get the publication date of the video
            publication_date = datetime.strptime(yt.publish_date, "%Y-%m-%d")

            # Check if the publication date matches the target date
            if publication_date.date() == target_date:
                # Download the video
                stream = yt.streams.get_highest_resolution()
                stream.download(output_path='videos/')
                print(f"Downloaded: {yt.title}")
        except Exception as e:
            print(f"Error downloading {video_url}: {e}")

# Define the channel URL and the target date
channel_url = "https://www.youtube.com/c/YahooFinance"
target_date = datetime(2024, 5, 13).date()

# Download videos from the channel on the target date
download_videos_on_date(channel_url, target_date)
