In [None]:
!pip install pytube --user
!pip install requests
!pip install pandas
!pip install numpy

#### Testing video download

In [None]:
import os
import json
from pytube import YouTube
import re

def remove_special_characters(input_string):
    # Using regex to keep only alphanumeric characters and spaces
    clean_string = re.sub(r'[^a-zA-Z0-9\s]', '', input_string)
    return clean_string

def download_video_info(video_url, output_directory='downloads'):
    try:
        # Create a YouTube object
        yt = YouTube(video_url)

        # Create a directory for downloads if it doesn't exist
        if not os.path.exists(output_directory):
            os.makedirs(output_directory)

        # Get the highest resolution audio stream
        audio_stream = yt.streams.filter(only_audio=True).first()

        # Download the audio stream
        
        file_name = remove_special_characters(yt.title)
        file_name = file_name.replace(" ","_")
        file_name = file_name.replace("…","_")
        file_name = file_name.replace(",","_")
        
        audio_stream.download(output_directory,filename=f'{file_name}.wav')
        audio_path = os.path.join(f"{output_directory}/{file_name}.wav")
        print(f"Downloading audio to {audio_path}...")

        # Collect video information
        video_info = {
            'title': yt.title,
            'duration': yt.length,
            'author': yt.author,
            'views': yt.views,
            'description': yt.description,
            'audio_path': audio_path
        }

        return video_info

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

def process_video_links(file_path):
    with open(file_path, 'r') as file:
        video_links = file.readlines()

    video_data_list = []
    
    # video_links = ["https://www.youtube.com/watch?v=CgruI1RjH_c"]

    for video_link in video_links:
        video_link = video_link.strip()
        video_info = download_video_info(video_link)
        
        if video_info:
            video_data_list.append(video_info)

    # Save video data to a JSON file
    output_json_path = 'video_data.json'
    with open(output_json_path, 'w') as json_file:
        json.dump(video_data_list, json_file, indent=2)

    print(f'Video data saved to {output_json_path}')

# Replace 'YOUR_TEXT_FILE_PATH' with the path to your text file containing video links
text_file_path = "./downloads/Fireship_clone/@Fireship-shorts.txt"

# Process video links and save data to JSON
process_video_links(text_file_path)

#### Testing Audio Transcription api

In [2]:
import json
from deepgram import DeepgramClient, PrerecordedOptions

def transcribe_audio(audio_file_path):

    # Your Deepgram API Key
    DEEPGRAM_API_KEY = ''

    # Initialize the Deepgram SDK
    deepgram = DeepgramClient(DEEPGRAM_API_KEY)

    # Call the transcribe_file method on the prerecorded class
    with open(audio_file_path, "rb") as file:
        buffer_data = file.read()

    payload = {
        "buffer": buffer_data,
    }

    options = PrerecordedOptions(
        model="nova-2",
        language="en",
        smart_format=True,
        punctuate=True,
        paragraphs=True,
        diarize=True,
        summarize="v2",
        detect_topics=True,
        filler_words=True,
    )

    file_response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)
    file_response = file_response.to_json()

    json_final = json.loads(file_response)

    with open(f"test.json", "w") as file:
        json.dump(json_final, file, indent=4)
        
    return json_final
    

# # Example usage:
# audio_file_path = "./downloads/Fireship_clone/100+_Computer_Science_Concepts_Explained.wav"
# transcribe_audio(audio_file_path)
# print("Transcribing completed successfully")


#### Download Youtube video transcribe it and save the transcribe

In [None]:
import os
import json
from pytube import YouTube
from tqdm import tqdm
from deepgram import DeepgramClient, PrerecordedOptions

def download_and_transcribe_video(video_url, output_directory='downloads'):
    try:
        # Create a YouTube object
        yt = YouTube(video_url)

        # Create a directory for downloads if it doesn't exist
        if not os.path.exists(output_directory):
            os.makedirs(output_directory)

        # Get the highest resolution audio stream
        audio_stream = yt.streams.filter(only_audio=True).first()

        # Download the audio stream with tqdm progress bar
        file_name = remove_special_characters(yt.title)
        file_name = file_name.replace(" ","_")
        file_name = file_name.replace("…","_")
        file_name = file_name.replace(",","_")
        audio_path = os.path.join(output_directory, f'{file_name}.wav')
        print(f"Downloading audio to {audio_path}...")
        # with tqdm(total=audio_stream.filesize, unit='B', unit_scale=True, desc=f'Downloading {file_name}') as bar:
        #     def on_progress(chunk, _):
        #         bar.update(len(chunk))

        audio_stream.download(output_directory, filename=f'{file_name}.wav')

        transcript = transcribe_audio(audio_path)

        # Collect video information
        video_info = {
            'link': video_url,
            'title': yt.title,
            'duration': yt.length,
            'author': yt.author,
            'views': yt.views,
            'description': yt.description,
            'audio_path': audio_path,
            'transcript': transcript
        }
        
        save_transcript_to_json(video_info, f'{output_directory}/{file_name}_transcript.json')
        append_transcript_to_json(video_info, f'final_json_transcript_final.json')

        return video_info

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

def save_transcript_to_json(transcript, json_path):
    with open(json_path, 'w') as file:
        json.dump(transcript, file, indent=4)
    print(f'Transcript saved to {json_path}')

def append_transcript_to_json(transcript, json_path):
    # Create an empty list if the file doesn't exist yet
    if not os.path.exists(json_path):
        with open(json_path, 'w') as file:
            json.dump([], file)

    # Load existing data from the file
    with open(json_path, 'r') as file:
        data = json.load(file)

    # Append the new transcript to the list
    data.append(transcript)

    # Save the updated list to the file
    with open(json_path, 'w') as file:
        json.dump(data, file, indent=4)

    print(f'Transcript appended to {json_path}')

def process_video_links(file_path):
    with open(file_path, 'r') as file:
        video_list = file.readlines()
        
    video_data_list = []
    videos_to_process = [video_line.strip().split(",") for video_line in video_list if video_line.strip().endswith(',0')]

    # for video_link in tqdm(video_list, desc='Processing videos', unit='video'):
    # for idx, video_line in enumerate(tqdm(video_list, desc='Processing videos', unit='video')):
    for video_link, progress in tqdm(videos_to_process, desc='Processing videos', unit='video'):
        # video_link, progress = video_line.split(",")
        if int(progress) == 0:
            video_link = video_link.strip()
            print(f'\nDownloading and transcribing: {video_link}')
            try:
                video_info = download_and_transcribe_video(video_link)
                video_data_list.append(video_info)
                idx = video_list.index(f'{video_link},0\n')
                video_list[idx] = f'{video_link},1\n'
            except:
                # save failed video links in a text file
                print(f'\nError processing video',video_link)
                with open("logs_file.txt", 'a') as log_file:
                    log_file.write(video_link)
        else:
            print("Video already downloaded and processed")
            
        with open(file_path, "w") as file:
            file.writelines(video_list)

# Replace 'YOUR_TEXT_FILE_PATH' with the path to your text file containing video links
text_file_path = "./downloads/Fireship_clone_2/@Fireship-videos-remaining.txt"

# Process video links and save data to JSON
process_video_links(text_file_path)


In [4]:
# combine all the json files into a single file

import os
import json

def combine_json_files(directory_name, output_file='combined.json'):
    combined_data = []

    # Check if the directory exists
    if not os.path.exists(directory_name) or not os.path.isdir(directory_name):
        print(f"Error: {directory_name} is not a valid directory.")
        return

    # Loop through all files in the directory
    for filename in os.listdir(directory_name):
        file_path = os.path.join(directory_name, filename)

        # Check if the file is a JSON file
        if os.path.isfile(file_path) and filename.endswith('.json'):
            with open(file_path, 'r') as file:
                try:
                    # Load JSON data from the file
                    json_data = json.load(file)

                    # Append the loaded data to the combined_data list
                    combined_data.append(json_data)

                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON in file {filename}: {e}")

    # Write the combined_data to a new JSON file
    with open(output_file, 'w') as output_file:
        json.dump(combined_data, output_file, indent=2)

    print(f"Combined JSON data saved to {output_file.name}")

# Example usage:
directory_name = './downloads/'
combine_json_files(directory_name)

Combined JSON data saved to combined.json


#### Huggingface Dataset prepraration

In [None]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="./combined.json")
# dataset2 = load_dataset("json", data_files="./final_json_transcript_final.json")

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['author', 'duration', 'description', 'transcript', 'audio_path', 'link', 'title', 'views'],
        num_rows: 522
    })
})

In [87]:
# Code to see which videos links have failed to download and transcribe

# Assuming you have the text file named 'input.txt' and the list of video links
# named 'video_link_list'


input_file_path = './downloads/Fireship_clone_2/@Fireship-videos.txt'
output_file_path = 'output.txt'

# Read the existing links from the text file
with open(input_file_path, 'r') as file:
    existing_links = [line.split(',')[0] for line in file]

# Filter out the links that are not in video_link_list
new_links = [link for link in existing_links if link not in dataset["train"]["link"]]

# Write the new links to the output file
with open(output_file_path, 'w') as output_file:
    for link in new_links:
        output_file.write(f"{link},0\n")

print(f"New links written to {output_file_path}")

New links written to output.txt


In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['author', 'duration', 'description', 'transcript', 'audio_path', 'link', 'title', 'views'],
        num_rows: 522
    })
})

In [10]:
import pandas as pd
df = pd.DataFrame(dataset['train'])

In [11]:
df.head()

Unnamed: 0,author,duration,description,transcript,audio_path,link,title,views
0,Fireship,787,Learn the fundamentals of Computer Science wit...,"{'metadata': {'channels': 1, 'created': '2024-...",downloads\100+_Computer_Science_Concepts_Expla...,https://www.youtube.com/watch?v=-uleG_Vecis,100+ Computer Science Concepts Explained,2110216
1,Fireship,743,The ultimate 10 minute JavaScript course that ...,"{'metadata': {'channels': 1, 'created': '2024-...",downloads\100+_JavaScript_Concepts_you_Need_to...,https://www.youtube.com/watch?v=lkIFF4maKMU,100+ JavaScript Concepts you Need to Know,1642938
2,Fireship,798,WebDev 101 is a complete introduction into the...,"{'metadata': {'channels': 1, 'created': '2024-...",downloads\100+_Web_Development_Things_you_Shou...,https://www.youtube.com/watch?v=erEgovG9WBs,100+ Web Development Things you Should Know,1296840
3,Fireship,1471,Top 100 Firebase Pro Tips 🔥💯. Optimize your ap...,"{'metadata': {'channels': 1, 'created': '2024-...","downloads\100_Firebase_Tips,_Tricks,_and_Screw...",https://www.youtube.com/watch?v=iWEgpdVSZyg,"100 Firebase Tips, Tricks, and Screw-ups",177364
4,Fireship,246,Google made a ton of exciting announcements at...,"{'metadata': {'channels': 1, 'created': '2024-...",downloads\10_crazy_announcements_from_Google_I...,https://www.youtube.com/watch?v=nmfRDRNjCnM,10 crazy announcements from Google I/O,968111


In [12]:
df.rename(columns={'transcript': 'transcript_json'}, inplace=True)
df.head()

Unnamed: 0,author,duration,description,transcript_json,audio_path,link,title,views
0,Fireship,787,Learn the fundamentals of Computer Science wit...,"{'metadata': {'channels': 1, 'created': '2024-...",downloads\100+_Computer_Science_Concepts_Expla...,https://www.youtube.com/watch?v=-uleG_Vecis,100+ Computer Science Concepts Explained,2110216
1,Fireship,743,The ultimate 10 minute JavaScript course that ...,"{'metadata': {'channels': 1, 'created': '2024-...",downloads\100+_JavaScript_Concepts_you_Need_to...,https://www.youtube.com/watch?v=lkIFF4maKMU,100+ JavaScript Concepts you Need to Know,1642938
2,Fireship,798,WebDev 101 is a complete introduction into the...,"{'metadata': {'channels': 1, 'created': '2024-...",downloads\100+_Web_Development_Things_you_Shou...,https://www.youtube.com/watch?v=erEgovG9WBs,100+ Web Development Things you Should Know,1296840
3,Fireship,1471,Top 100 Firebase Pro Tips 🔥💯. Optimize your ap...,"{'metadata': {'channels': 1, 'created': '2024-...","downloads\100_Firebase_Tips,_Tricks,_and_Screw...",https://www.youtube.com/watch?v=iWEgpdVSZyg,"100 Firebase Tips, Tricks, and Screw-ups",177364
4,Fireship,246,Google made a ton of exciting announcements at...,"{'metadata': {'channels': 1, 'created': '2024-...",downloads\10_crazy_announcements_from_Google_I...,https://www.youtube.com/watch?v=nmfRDRNjCnM,10 crazy announcements from Google I/O,968111


In [13]:
json_string = df.loc[0, 'transcript_json']

# Display the loaded JSON object
print(json_string["results"]["channels"][0]["alternatives"][0]["transcript"])

What's the first thing you should do when your code throws an error? Obviously, you should change nothing and try to run it again a few times. If that doesn't work, you're gonna need a computer science degree. The awesome thing about software engineering is that you can learn to code and get a high paying job, while literally having no idea how anything actually works. It all just feels like magic. Like a pilot driving a giant metal tube in the sky while knowing nothing about aerodynamics. Mother of God, no. Holy shit. Shit. Welcome to computer science 101. In today's video, you'll learn the science behind the garbage code you've been writing by learning 101 different computer science terms and concepts. This is a computer. It's just a piece of tape that holds ones and zeros along with a device that can read and write to it. It's called a Turing machine and in theory, it can compute anything, like the graphics in this video or the algorithm that recommended that you watch it. At the co

In [14]:
import pandas as pd
import json

# Assuming your DataFrame is named df

def parse_json(row):
    try:
        transcript_json = row['transcript_json']
        if transcript_json["results"]["summary"]["result"] == "success":
            transcript = str(transcript_json["results"]["channels"][0]["alternatives"][0]["transcript"])
            summary = str(transcript_json["results"]["summary"]["short"])
            return transcript, summary
        else:
            print("an error occurred")
            return None, None
    except (json.JSONDecodeError, KeyError):
        print("an exception occurred")
        return None, None

# Apply the custom function to each row
df[['transcript', 'summary']] = df.apply(parse_json, axis=1, result_type='expand')

# Display the updated DataFrame
# print(df.head())

In [15]:
from datasets import Dataset
import pandas as pd
final_dataset = Dataset.from_pandas(df)

In [16]:
final_dataset

Dataset({
    features: ['author', 'duration', 'description', 'transcript_json', 'audio_path', 'link', 'title', 'views', 'transcript', 'summary'],
    num_rows: 522
})

In [None]:
final_dataset.push_to_hub("Huggingface-userId/FS_transcribe_summary")

#### Prompt formatting to the following format
```
[INST]
You are youtuber called {author} you make engaging high-intensity and entertaining coding tutorials and tech news. 
you covers a wide range of topics relevant to programmers, aiming to help them learn and improve their skills quickly.

Given the title of the video : {title} 
and a small summary : {video_summary}
[/INST]

Generate the video : {video_transcript}
```

In [20]:
import pandas as pd
import json

# Assuming your DataFrame is named df

def create_prompt(row):
    try:
        author = row["author"]
        title = row["title"]
        video_transcript = row["transcript"]
        video_summary = row["summary"]
        # transcript_json = row['transcript_json']
        text = f"""
        [INST]
        You are youtuber called {author} you make engaging high-intensity and entertaining coding tutorials and tech news. 
        you covers a wide range of topics relevant to programmers, aiming to help them learn and improve their skills quickly.
        
        Given the title of the video : {title} 
        and a small summary : {video_summary}
        [/INST]
        
        Generate the video : {video_transcript}
        """        
        return text

    except (json.JSONDecodeError, KeyError):
        print("an exception occurred")
        return None

# Apply the custom function to each row
df['text'] = df.apply(create_prompt, axis=1, result_type='expand')

# Display the updated DataFrame
df.head()

Unnamed: 0,author,duration,description,transcript_json,audio_path,link,title,views,transcript,summary,text
0,Fireship,787,Learn the fundamentals of Computer Science wit...,"{'metadata': {'channels': 1, 'created': '2024-...",downloads\100+_Computer_Science_Concepts_Expla...,https://www.youtube.com/watch?v=-uleG_Vecis,100+ Computer Science Concepts Explained,2110216,What's the first thing you should do when your...,The importance of hardware and memory for a co...,\n [INST]\n You are youtuber cal...
1,Fireship,743,The ultimate 10 minute JavaScript course that ...,"{'metadata': {'channels': 1, 'created': '2024-...",downloads\100+_JavaScript_Concepts_you_Need_to...,https://www.youtube.com/watch?v=lkIFF4maKMU,100+ JavaScript Concepts you Need to Know,1642938,JavaScript. It's a wonderful programming langu...,The speaker explains that JavaScript is a prog...,\n [INST]\n You are youtuber cal...
2,Fireship,798,WebDev 101 is a complete introduction into the...,"{'metadata': {'channels': 1, 'created': '2024-...",downloads\100+_Web_Development_Things_you_Shou...,https://www.youtube.com/watch?v=erEgovG9WBs,100+ Web Development Things you Should Know,1296840,Web development is the best job in the world. ...,The internet is a collection of machines conne...,\n [INST]\n You are youtuber cal...
3,Fireship,1471,Top 100 Firebase Pro Tips 🔥💯. Optimize your ap...,"{'metadata': {'channels': 1, 'created': '2024-...","downloads\100_Firebase_Tips,_Tricks,_and_Screw...",https://www.youtube.com/watch?v=iWEgpdVSZyg,"100 Firebase Tips, Tricks, and Screw-ups",177364,Welcome to my top 10 Firebase tips. Welcome to...,The speakers discuss how to build successful r...,\n [INST]\n You are youtuber cal...
4,Fireship,246,Google made a ton of exciting announcements at...,"{'metadata': {'channels': 1, 'created': '2024-...",downloads\10_crazy_announcements_from_Google_I...,https://www.youtube.com/watch?v=nmfRDRNjCnM,10 crazy announcements from Google I/O,968111,"It is May 11, 2023, and you're watching the Co...","In this video, the speakers discuss Google's u...",\n [INST]\n You are youtuber cal...


In [21]:
from datasets import Dataset
import pandas as pd
final_dataset = Dataset.from_pandas(df)

In [None]:
final_dataset.push_to_hub("Huggingface-userId/FS_transcribe_summary_prompt")

#### Parsing operation for the api response from deepgram

In [31]:
with open("dummy.json","r") as f:
    transcribe_json_list = json.load(f)

In [36]:
transcribe_json_list[0]["results"]["channels"][0]["alternatives"][0]["transcript"]

"Have you ever woken up in the middle of the night in a panic wondering how to extract a polygonal mesh of an isosurface from a 3 dimensional discrete scalar field? Yeah. I didn't think so. But back in 87, 2 programmers at General Electric did. They created and patented the marching cubes algorithm, an algorithm that has likely saved countless lives by allowing doctors to visualize data from CT and MRI scans. Whenever you instruct a machine to solve a problem with code, you're creating an algorithm, a procedure for rearranging ones and zeros that can make animals talk and vacuums walk. Most algorithms belong in a dumpster, but some are fast, skin. Some are beautiful and some are so weird, they're indistinguishable from magic. Today, we'll look at 10 of the most interesting algorithms ever engineered sphere, and how they're used to solve very interesting problems in the real world. 1st on the list, we have wave function collapse. One of the weirdest things in all of science is the doubl

In [38]:
transcribe_json_list[0]["results"]["summary"]["result"]

'success'

In [39]:
transcribe_json_list[0]["results"]["summary"]["short"]

'The speakers discuss the use of algorithms in scientific research, including random random algorithms like BOGO sort and BOGO sort to solve problems in scientific research, and the potential uses of these algorithms in optimizing algorithms and algorithms for algorithms. They also touch on the use of quantum algorithms in machine design and the future of digital security, including the use of random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random random r

In [None]:
from tqdm import tqdm

final_dataset = []
# for video_link in tqdm(video_links, desc='Processing videos', unit='video'):
for transcribe_json in tqdm(transcribe_json_list,desc='Processing transcribe'):
    transcribe = transcribe_json["results"]["channels"][0]["alternatives"][0]["transcript"]
    if transcribe_json["results"]["summary"]["result"]=="success":
        summary = transcribe_json["results"]["summary"]["short"]
    final_json = {
        "transcribe": transcribe,
        "summary": summary
    }
    final_dataset.append(final_json)

with open("transcribe_data_final_processed.json", "w") as output:
    json.dump(final_dataset, output)
    

In [None]:
transcribe_json_list[0]["channels"]

In [None]:
final_dataset_transcribe = load_dataset("json",data_files="./transcribe_data_final.json")

In [None]:
final_dataset_transcribe.push_to_hub("Huggingface-userId/FS_transcribe_summary")

In [None]:
final_dataset_transcribe

In [None]:
final_dataset_transcribe["train"][1]

In [None]:
import json
with open("./video_data_and_transcripts.json") as F:
    json_data = json.load(F)

In [None]:
len(json_data)