In [18]:
import os
import ssl
from pytube import Playlist

# Disable SSL verification
# WARNING: both statements apply to the whole process, not just this notebook's
# playlist requests. The environment variable is set for this process, and the
# ssl module's default HTTPS context factory is replaced with the unverified
# one, so HTTPS connections built from that default skip certificate checks.
# NOTE(review): presumably a workaround for local certificate errors when
# fetching from YouTube -- confirm, and prefer fixing the certificate store.
os.environ['PYTHONHTTPSVERIFY'] = '0'
ssl._create_default_https_context = ssl._create_unverified_context

def scrape_youtube_playlist(playlist_url):
    """Return ``(watch_url, title)`` pairs for every video in a playlist.

    Args:
        playlist_url: URL of the YouTube playlist; passed straight to
            ``Playlist``.

    Returns:
        list: One ``(watch_url, title)`` tuple per video, in the order
        ``playlist.videos`` yields them.
    """
    playlist = Playlist(playlist_url)

    # Walk the playlist a single time and read both attributes from the same
    # video object, instead of iterating ``playlist.videos`` once per attribute
    # and zipping the two lists back together.
    return [(video.watch_url, video.title) for video in playlist.videos]

# Usage:
# Scrape the playlist and print one "title: url" line per video.
# `videos`, `video_url` and `video_title` stay defined at notebook scope after
# this cell runs; the loop variables hold the values of the last pair printed.
playlist_url = 'https://www.youtube.com/playlist?list=PL8evaQZnDGAdCKg7XzBVKhFa5kGL7Up5l'
videos = scrape_youtube_playlist(playlist_url)
for video_url, video_title in videos:
    print(f'{video_title}: {video_url}')



Revit Course for Beginners: https://youtube.com/watch?v=0iz7wQPm1MU
Revit Course for Beginners – Revit Tutorials to Learn BIM Fast | Part 1 - Setup: https://youtube.com/watch?v=3JNQh6hyEXg
Revit Course for Beginners – Revit Tutorials to Learn BIM Fast | Part 2 - Floors: https://youtube.com/watch?v=d9sI1yVP99s
Revit Course for Beginners – Revit Tutorials to Learn BIM Fast | Part 3 - Stairs: https://youtube.com/watch?v=Wr8h20FUxwc
Revit Course for Beginners – Revit Tutorials to Learn BIM Fast | Part 4 - Walls: https://youtube.com/watch?v=fuWNwtMHRPc
Revit Course for Beginners – Revit Tutorials to Learn BIM Fast | Part 5 - Ceilings: https://youtube.com/watch?v=Qst2PW9mk0Y
Revit Course for Beginners – Revit Tutorials to Learn BIM Fast | Part 6 – Doors, Windows, Openings: https://youtube.com/watch?v=gKpsyQ76pWY
Revit Course for Beginners – Revit Tutorials to Learn BIM Fast | Part 7 – Roof: https://youtube.com/watch?v=_7TlYlEWGCQ
Revit Course for Beginners – Revit Tutorials to Learn BIM Fast

In [13]:
import pandas as pd
from youtube_transcript_api import YouTubeTranscriptApi

def _make_paragraph(entries):
    """Collapse a non-empty run of transcript entries into one paragraph record."""
    text = ' '.join(entry['text'] for entry in entries)
    first, last = entries[0], entries[-1]
    return {
        'text': text,
        'timestamp': f"{first['start']}-{last['start'] + last['duration']}",
        'word_count': len(text.split()),
    }


def get_transcript_df(video_id, max_duration=300):
    """Fetch a video's transcript and group it into paragraph-sized chunks.

    Entries are buffered in order. When adding an entry brings the running sum
    of ``duration`` values to ``max_duration`` or more, the entries buffered so
    far are emitted as one paragraph and the triggering entry starts the next.

    NOTE(review): the threshold is compared against the *sum* of entry
    durations, not the elapsed time between timestamps. If caption entries
    overlap, a chunk's timestamp span can be shorter than ``max_duration`` --
    confirm which of the two is intended.

    Args:
        video_id: YouTube video id passed to
            ``YouTubeTranscriptApi.get_transcript``.
        max_duration: Summed-duration threshold in seconds that closes a
            chunk. Defaults to 300 (5 minutes).

    Returns:
        pandas.DataFrame with columns ``text``, ``timestamp`` (formatted as
        ``"<start>-<end>"``) and ``word_count``, one row per chunk; an empty
        frame with those columns when the transcript has no entries. Returns
        ``None`` (after printing the error) if anything raises.
    """
    try:
        # Fetch the transcript
        transcript = YouTubeTranscriptApi.get_transcript(video_id)

        paragraphs = []
        current_paragraph = []
        cumulative_duration = 0

        for entry in transcript:
            cumulative_duration += entry['duration']

            # Flush only when something is buffered: a first entry that alone
            # reaches the threshold used to index into an empty buffer, raise
            # IndexError, and lose the whole transcript to the except below.
            if cumulative_duration >= max_duration and current_paragraph:
                paragraphs.append(_make_paragraph(current_paragraph))
                current_paragraph = []
                # The triggering entry opens the next chunk, so its duration
                # seeds the new running total.
                cumulative_duration = entry['duration']

            current_paragraph.append(entry)

        # Add any remaining content to paragraphs
        if current_paragraph:
            paragraphs.append(_make_paragraph(current_paragraph))

        # Name the columns explicitly so an empty transcript still yields the
        # expected schema.
        return pd.DataFrame(paragraphs, columns=['text', 'timestamp', 'word_count'])

    except Exception as e:
        # Best-effort by design: callers treat None as "skip this video".
        print(f"An error occurred: {e}")
        return None

# Fetch the transcript of one playlist video as a quick check.
# Take the URL explicitly from `videos` (the list of (url, title) pairs built
# by the playlist-scraping cell) rather than relying on the `video_url` loop
# variable that cell happens to leave behind; the last pair is the same video
# the leaked variable pointed at.
video_url = videos[-1][0]
# Keep only the id itself should the URL carry further query parameters.
video_id = video_url.split("v=")[1].split("&")[0]
df = get_transcript_df(video_id)
print(df)


                                                 text   
0   welcome to the beginner's revit course where y...  \
1   family on the other hand is an individually de...   
2   modeling up something that is representing som...   
3   this is where you change properties of the ind...   
4   architectural documentation set every change y...   
5   the properties of whatever you have selected a...   
6   never going to touch that dimensions this is w...   
7   three stories it had a nice big garden out the...   
8   forget our keys after school and we would just...   
9   button which is actually up here in the modify...   
10  4 000 there was a lot of these changes which m...   
11  i'm gonna go full screen here so i can actuall...   
12  steps are about 200 millimeters high that's th...   
13  level is going to be fourteen hundred plus fou...   
14  locks on it so that now moves with it we're go...   

                      timestamp  word_count  
0                    0.0-152.72         5

In [23]:
import pandas as pd
from pytube import Playlist
from youtube_transcript_api import YouTubeTranscriptApi


def process_playlist(playlist_url):
    """Build one DataFrame of transcript chunks for every video in a playlist.

    For each video yielded by ``Playlist(playlist_url).videos`` the transcript
    is chunked with ``get_transcript_df`` (defined elsewhere in this notebook).
    Videos for which that helper returns ``None`` are skipped.

    Args:
        playlist_url: URL of the YouTube playlist.

    Returns:
        pandas.DataFrame with columns ``video_title``, ``content``,
        ``timestamp``, ``word_count``, ``video_url`` and ``timestamp_link``,
        one row per transcript chunk.
    """
    playlist = Playlist(playlist_url)
    all_data = []

    for video in playlist.videos:
        video_id = video.video_id
        video_title = video.title
        video_url = video.watch_url
        # The publish date was previously read into an unused local; dropped
        # to avoid a pointless attribute access per video.

        df = get_transcript_df(video_id)
        if df is None:
            continue

        for _, row in df.iterrows():
            timestamp_range = row['timestamp']
            # The range is "<start>-<end>" with float seconds; link to the
            # chunk start in whole seconds (e.g. "&t=152s", not "&t=152.72s").
            start_seconds = int(float(timestamp_range.split('-')[0]))
            timestamp_link = f"{video_url}&t={start_seconds}s"
            all_data.append([
                video_title,
                row['text'],
                timestamp_range,
                row['word_count'],
                video_url,
                timestamp_link,
            ])

    return pd.DataFrame(
        all_data,
        columns=['video_title', 'content', 'timestamp', 'word_count', 'video_url', 'timestamp_link']
    )

# Usage:
playlist_url = 'https://www.youtube.com/playlist?list=PL8evaQZnDGAdCKg7XzBVKhFa5kGL7Up5l'
final_df = process_playlist(playlist_url)
print(final_df)

# Save this data frame
# index=False leaves the row index out of the file, so the header and every
# data row have the same number of fields. The previous index_label=False still
# wrote the index values but dropped their header field, giving each row one
# more field than the header.
final_df.to_csv("final_df.csv", index=False)

                                           video_title   
0                           Revit Course for Beginners  \
1    Revit Course for Beginners – Revit Tutorials t...   
2    Revit Course for Beginners – Revit Tutorials t...   
3    Revit Course for Beginners – Revit Tutorials t...   
4    Revit Course for Beginners – Revit Tutorials t...   
..                                                 ...   
113  Revit Quick Start Tutorial for Beginners - Sta...   
114  Revit Quick Start Tutorial for Beginners - Sta...   
115  Revit Quick Start Tutorial for Beginners - Sta...   
116  Revit Quick Start Tutorial for Beginners - Sta...   
117  Revit Quick Start Tutorial for Beginners - Sta...   

                                               content   
0    welcome to the beginner's revit course where y...  \
1    welcome to the beginner's revit course where y...   
2    family on the other hand is an individually de...   
3    modeling up something that is representing som...   
4    this is 

## Convert the DataFrame to JSON

In [4]:
import pandas as pd
import json
import re
import jieba

# Load the data
# Reads "final_df.csv" (the file an earlier cell writes with final_df.to_csv)
# and rebinds the notebook-level name `final_df` to the reloaded frame.
final_df = pd.read_csv("final_df.csv")

In [5]:

# Initialize the transformed data structure
transformed_data = {
    "current_date": "2023-03-01",
    "author": "Your name here",
    "url": "Your URL here",
    "length": 0,  # Total character count over all essays; accumulated in the loop below
    "tokens": 0,  # Total word count over all essays; accumulated in the loop below
    "essays": []
}

# Iterate through each row of the DataFrame
for index, row in final_df.iterrows():
    video_title = row['video_title']
    # read_csv loads an empty text field as NaN (a float), which has neither
    # len() nor .split(); treat missing content as an empty string.
    raw_content = row['content']
    content = "" if pd.isna(raw_content) else str(raw_content)
    essay_url = row['timestamp_link']  # Video link that already carries the chunk's start offset
    content_length = len(content)  # Character count of content
    content_tokens = len(content.split())  # Word count of content

    # Construct the essay dictionary; each essay carries exactly one chunk
    # that mirrors its own content.
    essay = {
        "title": video_title,
        "url": essay_url,
        "date": "NA",  # Placeholder, as we don't have the essay date
        "thanks": "NA",  # Placeholder, as we don't have the thanks data
        "content": content,
        "content_length": content_length,
        "content_tokens": content_tokens,
        "chunks": [{
            "essay_title": video_title,
            "essay_url": essay_url,
            "essay_date": "NA",
            "essay_thanks": "NA",
            "content": content,
            "content_length": content_length,
            "content_tokens": content_tokens,
            "embedding": []
        }],
        "embedding": []
    }

    # Append the essay dictionary to the essays list
    transformed_data["essays"].append(essay)

    # Keep the document-level totals in step with the per-essay counts.
    transformed_data["length"] += content_length
    transformed_data["tokens"] += content_tokens

# Save the transformed data to a JSON file
# ensure_ascii=False writes non-ASCII characters as-is, so the file encoding is
# pinned to UTF-8 instead of depending on the platform default.
with open('pg.json', 'w', encoding='utf-8') as f:
    json.dump(transformed_data, f, ensure_ascii=False, indent=4)
