In [5]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import pandas as pd
import json


class ExtractLinksCS224n:
    """
    A class to download and extract material links from the CS224n syllabus page.

    Every extracted link is stored in ``self.data`` as a dictionary with the
    keys 'link', 'name', 'date', 'type' and 'group'.
    """

    # Group label for each schedule-table column that follows the date column,
    # in positional order. The fifth column carries its own 'deadlines' label so
    # its links can be told apart from those in the 'events' column.
    _COLUMN_GROUPS = ('description', 'course materials', 'events', 'deadlines')

    def __init__(self, url):
        """
        Initialize the class with the syllabus URL.

        Args:
            url (str): The URL of the syllabus page.
        """
        self.url = url
        self.base_url = self.construct_base_url()
        self.html_content = None
        self.soup = None
        self.data = []

    def construct_base_url(self):
        """
        Construct the base URL from the provided URL for resolving relative links.

        The base is scheme + host + the URL path with its last segment removed,
        always ending in '/'.

        Returns:
            str: The base URL.
        """
        parsed_url = urlparse(self.url)
        base = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rsplit('/', 1)[0]}/"
        return base

    def fetch_html(self, timeout=30):
        """
        Fetch the HTML content from the syllabus URL into ``self.html_content``.

        Args:
            timeout (float): Seconds to wait for the server. Without a timeout
                an unresponsive host would block the call indefinitely.

        Raises:
            requests.HTTPError: If the HTTP request returned an unsuccessful status code.
            Exception: Any other error raised while fetching is logged and re-raised.
        """
        try:
            response = requests.get(self.url, timeout=timeout)
            response.raise_for_status()  # Raise an error for bad status
            self.html_content = response.text
            print(f"Successfully fetched HTML content from {self.url}")
        except requests.HTTPError as http_err:
            print(f"HTTP error occurred while fetching HTML: {http_err}")
            raise
        except Exception as err:
            print(f"An error occurred while fetching HTML: {err}")
            raise

    def parse_html(self):
        """
        Parse the fetched HTML content using BeautifulSoup into ``self.soup``.

        Raises:
            ValueError: If HTML content is empty.
        """
        if not self.html_content:
            raise ValueError("HTML content is empty. Please fetch the HTML first.")
        self.soup = BeautifulSoup(self.html_content, 'html.parser')
        print("HTML content successfully parsed.")

    def determine_type(self, name, link):
        """
        Determine the type of the link based on its name.

        The first matching keyword wins, checked in this order: slide, note,
        code/colab, handout/latex, assignment. Anything else is 'readings'.

        Args:
            name (str): The text associated with the link.
            link (str): The URL of the link (accepted for interface
                compatibility; the classification only looks at ``name``).

        Returns:
            str: The category/type of the link.
        """
        name_lower = name.lower()
        if 'slide' in name_lower:
            return 'slides'
        elif 'note' in name_lower:
            return 'notes'
        elif 'code' in name_lower or 'colab' in name_lower:
            return 'code'
        elif 'handout' in name_lower or 'latex' in name_lower:
            return 'handout'
        elif 'assignment' in name_lower:
            return 'assignment'
        else:
            return 'readings'

    def process_links(self, cell, group, lecture_title, date_text):
        """
        Process all links within a table cell.

        Args:
            cell (bs4.element.Tag): The table cell containing links.
            group (str): The group/category of the links (e.g., 'description', 'course materials').
            lecture_title (str): The title of the lecture associated with the links.
            date_text (str): The date associated with the links.

        Returns:
            list: A list of dictionaries containing link information.
        """
        links = []
        for a in cell.find_all('a'):
            link_href = a.get('href')
            if not link_href:
                continue  # Anchors without an href carry no material link
            link_text = a.get_text().strip('[]').strip()
            # Generic labels say nothing on their own, so the lecture title
            # is used as the name for them instead.
            if link_text.lower() in ['slides', 'notes', 'code']:
                link_name = lecture_title
            else:
                link_name = link_text
            links.append({
                'link': urljoin(self.base_url, link_href),
                'name': link_name,
                'date': date_text,
                'type': self.determine_type(link_text, link_href),
                'group': group
            })
        return links

    @staticmethod
    def _last_non_empty_line(text):
        """Return the last non-blank line of ``text`` (stripped), or '' if there is none."""
        lines = [line.strip() for line in text.split('\n')]
        non_empty = [line for line in lines if line]
        return non_empty[-1] if non_empty else ''

    def extract_links(self):
        """
        Extract all material links from the schedule table into ``self.data``.

        For every table row with at least one ``td``, the first cell supplies
        the date, the second cell supplies the lecture title (its first text
        line), and links are collected from the second to fifth cells.

        Raises:
            ValueError: If the HTML has not been parsed, or the schedule div or
                table cannot be found in the HTML.
        """
        if not self.soup:
            raise ValueError("Soup object is empty. Please parse the HTML first.")

        # The schedule table lives inside <div id="schedule">
        schedule_div = self.soup.find('div', id='schedule')
        if not schedule_div:
            raise ValueError("Could not find the schedule div in the HTML content.")

        schedule_table = schedule_div.find('table')
        if not schedule_table:
            raise ValueError("Could not find the schedule table in the HTML content.")

        print("Schedule table found. Beginning extraction of links...")

        for row in schedule_table.find_all('tr'):
            cells = row.find_all('td')
            if not cells:
                continue  # Skip rows with no 'td's

            # The date is the last non-blank text line of the first cell;
            # skipping blank lines keeps trailing whitespace in the markup
            # from producing an empty date.
            date_line = self._last_non_empty_line(cells[0].get_text(separator='\n'))

            # The lecture title is the first text line of the description cell
            lecture_title = ''
            if len(cells) > 1:
                description_text = cells[1].get_text(separator='\n').strip()
                lecture_title = description_text.split('\n')[0].strip()

            # zip() stops at the shorter sequence, so short rows are handled
            # and any cells beyond the fifth are ignored.
            for cell, group in zip(cells[1:], self._COLUMN_GROUPS):
                self.data.extend(self.process_links(cell, group, lecture_title, date_line))

        print(f"Extraction complete. Total links extracted: {len(self.data)}")

    def get_data(self):
        """
        Retrieve the extracted link data.

        Returns:
            list: A list of dictionaries containing link information.
        """
        return self.data

    def get_dataframe(self):
        """
        Convert the extracted data into a pandas DataFrame.

        Returns:
            pd.DataFrame: A pandas DataFrame containing the link data.

        Raises:
            ValueError: If no links have been extracted yet.
        """
        if not self.data:
            raise ValueError("No data available. Please run the extraction process first.")
        return pd.DataFrame(self.data)

    def save_json(self, filename):
        """
        Save the extracted link data to a JSON file.

        Args:
            filename (str): The filename for the JSON output.

        Raises:
            Exception: Any error raised while writing is logged and re-raised.
        """
        try:
            # Explicit encoding so the output does not depend on the platform locale
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(self.data, f, indent=2)
            print(f"Data successfully saved to {filename}")
        except Exception as e:
            print(f"An error occurred while saving JSON: {e}")
            raise

    def run(self):
        """
        Execute the full process of fetching, parsing, and extracting links.
        """
        self.fetch_html()
        self.parse_html()
        self.extract_links()

In [6]:
import os
import requests
from pathlib import Path
from urllib.parse import urlparse, parse_qs, urlunparse
import zipfile
import time
import re
import json
import pandas as pd

class DownloadMaterials:
    """
    Download the files behind a list of material links into a local directory.

    Typical flow: ``process_urls`` turns raw links into download records,
    ``merge_with_dataframe`` attaches those records to the links table, and
    ``download_all_files`` fetches every record.
    """

    def __init__(self, download_dir='website_materials'):
        """
        Initialize the DownloadMaterials class.

        Args:
            download_dir (str): The directory where downloaded files will be saved.
        """
        self.download_dir = Path(download_dir)
        # parents=True so nested targets such as 'course/website_materials'
        # can be created when the parent folder does not exist yet.
        self.download_dir.mkdir(parents=True, exist_ok=True)
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': 'Mozilla/5.0'})

    @staticmethod
    def url_to_filename(url):
        """
        Convert a URL into a sanitized filename.

        Args:
            url (str): The URL to sanitize.

        Returns:
            str: The URL without its http(s) scheme, with path separators and
            other unsafe characters replaced by '_', cut to 255 characters.
        """
        url = re.sub(r'^https?:\/\/', '', url)  # Remove the protocol
        filename = re.sub(r'[\/:*?"<>|]', '_', url)  # Replace unsafe characters
        return filename[:255]  # Limit filename length to 255 characters

    @staticmethod
    def strip_extension(url):
        """
        Remove the file extension from a URL's path.

        Only the path component is changed; query string and fragment are kept.

        Args:
            url (str): The URL from which to remove the extension.

        Returns:
            str: The URL without the file extension.
        """
        parsed = urlparse(url)
        path, _ = os.path.splitext(parsed.path)
        return urlunparse(parsed._replace(path=path))

    @staticmethod
    def process_url(url):
        """
        Process a URL to determine its file extension and adjust the URL if necessary.

        arXiv abstract pages, Google Colab notebooks and ACL Anthology pages are
        rewritten to direct-download URLs; every other URL is returned unchanged
        with the extension taken from its path ('.html' when the path has none).

        Args:
            url (str): The URL to process.

        Returns:
            tuple: A tuple containing the processed URL and its file extension.
        """
        parsed_url = urlparse(url)
        netloc = parsed_url.netloc.lower()
        path = parsed_url.path
        query = parsed_url.query

        # Special handling for arXiv 'abs' links
        if 'arxiv.org' in netloc and '/abs/' in path:
            paper_id = path.split('/abs/')[-1]
            url = f'https://arxiv.org/pdf/{paper_id}.pdf'
            ext = '.pdf'
        # Special handling for Google Colab links
        elif 'colab.research.google.com' in netloc:
            notebook_id = None
            query_params = parse_qs(query)
            if '/drive/' in path:
                notebook_id = path.split('/drive/')[-1].split('/')[0]
            elif 'id' in query_params:
                notebook_id = query_params['id'][0]
            if notebook_id:
                url = f'https://drive.google.com/uc?export=download&id={notebook_id}'
                ext = '.ipynb'
            else:
                ext = '.html'
        # Special handling for ACL Anthology links
        elif 'aclanthology.org' in netloc or 'aclweb.org' in netloc:
            # Handle URLs like 'http://www.aclweb.org/anthology/Q15-1016'
            paper_id = path.strip('/').split('/')[-1]  # Extract 'Q15-1016'
            # A link that already points at the PDF must not get '.pdf' twice
            if paper_id.lower().endswith('.pdf'):
                paper_id = paper_id[:-len('.pdf')]
            if paper_id:
                url = f'https://aclanthology.org/{paper_id}.pdf'
                ext = '.pdf'
            else:
                # Bare host with no paper id: there is no PDF to build, keep the page
                ext = '.html'
        else:
            _, ext = os.path.splitext(path)
            ext = ext.lower() if ext else '.html'
        return url, ext

    def download_file(self, download_url, file_name, ext, retries=1):
        """
        Download a file from a URL.

        If ``<download_dir>/<file_name><ext>`` already exists nothing is fetched.
        After a redirect the file name and extension are recomputed from the
        final URL. A response served as text/html is saved with an '.html'
        suffix regardless of the expected extension.

        Args:
            download_url (str): The URL to download from.
            file_name (str): The name to save the file as (without extension).
            ext (str): The file extension.
            retries (int): The total number of attempts to make.

        Returns:
            Path or None: The path to the downloaded file or None if the download failed.
        """
        file_path = self.download_dir / f"{file_name}{ext}"
        if file_path.exists():
            print(f"File already exists: {file_path}")
            return file_path

        # Determine if the download is from ACL Anthology
        netloc = urlparse(download_url).netloc
        is_acl_anthology = 'aclanthology.org' in netloc or 'aclweb.org' in netloc

        # Set specific headers for ACL Anthology to avoid 406 errors
        headers = self.session.headers.copy()
        if is_acl_anthology and ext == '.pdf':
            headers.update({'Accept': 'application/pdf'})

        for attempt in range(retries):
            try:
                response = self.session.get(download_url, stream=True, timeout=10, headers=headers)
                response.raise_for_status()

                # Handle redirects: name the file after where the content actually came from
                if response.history:
                    final_url = response.url
                    file_name = self.url_to_filename(self.strip_extension(final_url))
                    _, final_ext = os.path.splitext(urlparse(final_url).path)
                    final_ext = final_ext.lower() if final_ext else ext  # Fallback to original ext
                    file_path = self.download_dir / f"{file_name}{final_ext}"
                    ext = final_ext  # Update the extension

                # Save HTML content if content type is text/html
                content_type = response.headers.get('Content-Type', '')
                if 'text/html' in content_type:
                    html_path = file_path.with_suffix('.html')
                    with open(html_path, 'wb') as f:
                        f.write(response.content)
                    print(f"Downloaded HTML instead of {ext} for URL: {download_url} as {html_path}")
                    return html_path  # Return the path to the HTML file

                # Validate .ipynb content: a notebook must be UTF-8 encoded JSON
                if ext == '.ipynb':
                    try:
                        json.loads(response.content.decode('utf-8'))
                    except ValueError:
                        # Covers both json.JSONDecodeError and UnicodeDecodeError
                        print(f"Invalid JSON for URL: {download_url}")
                        return None

                # arXiv ids contain dots that get mistaken for an extension, so
                # flatten the dots and force a '.pdf' suffix. Only the file name
                # is rewritten; dots in the directory part stay untouched.
                if 'arxiv' in file_name:
                    file_path = file_path.with_name(file_path.name.replace('.', '_') + '.pdf')

                # Write the content to the file
                with open(file_path, 'wb') as f:
                    f.write(response.content)
                print(f"Downloaded: {file_path}")
                return file_path
            except requests.RequestException as e:
                if attempt + 1 < retries:
                    print(f"Error downloading {download_url}: {e}. Retrying ({attempt + 1}/{retries})...")
                    time.sleep(2)  # Brief pause before the next attempt
                else:
                    # Last attempt: nothing left to retry, so do not wait
                    print(f"Error downloading {download_url}: {e}")
        print(f"Failed to download {download_url} after {retries} attempts.")
        return None

    @staticmethod
    def extract_zip(file_path):
        """
        Extract a ZIP file into a sibling folder named after the archive.

        A corrupt archive is reported and skipped rather than raised.

        Args:
            file_path (Path): The path to the ZIP file.
        """
        try:
            with zipfile.ZipFile(file_path, 'r') as zip_ref:
                extract_path = file_path.parent / file_path.stem
                extract_path.mkdir(exist_ok=True)
                zip_ref.extractall(extract_path)
            print(f"Extracted ZIP: {file_path} to {extract_path}")
        except zipfile.BadZipFile as e:
            print(f"Failed to extract ZIP file {file_path}: {e}")

    def process_urls(self, urls):
        """
        Process a list of URLs to prepare them for downloading.

        Args:
            urls (list): The list of URLs.

        Returns:
            list: One dictionary per URL with the keys 'original_url',
            'download_url', 'file_name' and 'extension'.
        """
        processed_urls = []
        for url in urls:
            download_url, ext = self.process_url(url)
            # Strip extension from the original URL to avoid duplication
            file_name = self.url_to_filename(self.strip_extension(url))
            processed_urls.append({
                'original_url': url,
                'download_url': download_url,
                'file_name': file_name,
                'extension': ext
            })
        return processed_urls

    def download_all_files(self, processed_urls):
        """
        Download all files from a list of processed URLs.

        Downloaded '.zip' files are extracted next to the archive. There is a
        one-second pause after every item.

        Args:
            processed_urls (list): A list of dictionaries with processed URLs and
                metadata; the keys 'download_url', 'file_name' and 'extension' are read.

        Returns:
            list: One path string per input item, in input order. A failed
            download yields the string 'None' (``str(None)``).
        """
        list_file_paths = []

        for item in processed_urls:
            print(f"Downloading: {item['download_url']}")
            file_path = self.download_file(
                download_url=item['download_url'],
                file_name=item['file_name'],
                ext=item['extension']
            )
            if file_path and file_path.suffix == '.zip':
                self.extract_zip(file_path)
            print()
            time.sleep(1)
            list_file_paths.append(str(file_path))

        return list_file_paths

    def merge_with_dataframe(self, df, processed_urls):
        """
        Merge the processed URLs with the original DataFrame.

        Each row of ``df`` is matched on its 'link' column against
        'original_url' (left join), so the result keeps one row per row of ``df``.

        Args:
            df (pd.DataFrame): The original DataFrame.
            processed_urls (list): The processed URLs.

        Returns:
            pd.DataFrame: The merged DataFrame.
        """
        if processed_urls:
            processed_df = pd.DataFrame(processed_urls)
        else:
            # Keep the join key present so an empty input merges cleanly
            processed_df = pd.DataFrame(
                columns=['original_url', 'download_url', 'file_name', 'extension']
            )
        # The same link may appear in several rows of df; keeping one record
        # per URL stops the join from multiplying those rows.
        processed_df = processed_df.drop_duplicates(subset='original_url')
        return df.merge(processed_df, left_on='link', right_on='original_url', how='left')

In [7]:
import yt_dlp
from pathlib import Path

class YouTubeSubtitlesDownloader:
    """
    Fetch only the subtitle tracks of a YouTube playlist through yt_dlp.
    """

    def __init__(self, playlist_url, download_path, subtitles_lang='en'):
        """
        Store the playlist settings and make sure the target folder exists.

        Args:
            playlist_url (str): URL of the YouTube playlist to download.
            download_path (str): Folder the subtitles are written under;
                created (with parents) if it is missing.
            subtitles_lang (str): Subtitle language code (default 'en').
        """
        self.playlist_url = playlist_url
        target_dir = Path(download_path)
        self.download_path = target_dir
        target_dir.mkdir(parents=True, exist_ok=True)
        self.subtitles_lang = subtitles_lang

    def get_download_options(self):
        """
        Build the yt_dlp option dictionary for a subtitles-only run.

        Returns:
            dict: Options that skip the media download, request manual and
            auto-generated subtitles in the configured language, and write
            files to ``<download_path>/<playlist title>/<video title>.<ext>``.
        """
        output_template = self.download_path / '%(playlist_title)s/%(title)s.%(ext)s'
        return {
            'skip_download': True,       # no video/audio, subtitles only
            'writesubtitles': True,      # uploaded subtitle tracks
            'writeautomaticsub': True,   # auto-generated tracks as well
            'subtitleslangs': [self.subtitles_lang],
            'outtmpl': str(output_template),
        }

    def download_subtitles(self):
        """
        Run yt_dlp over the playlist with the subtitles-only options.

        Raises:
            yt_dlp.utils.DownloadError: If there is an issue during the download.
        """
        with yt_dlp.YoutubeDL(self.get_download_options()) as ydl:
            ydl.download([self.playlist_url])
        print(f"Subtitles downloaded successfully to: {self.download_path}")

In [8]:
import yt_dlp
from pathlib import Path

class YouTubePlaylistDownloader:
    """
    Download the videos of a YouTube playlist through yt_dlp.
    """

    def __init__(self, playlist_url, download_path, resolution='1080', subtitles=True, subtitles_lang='en', output_format='mp4'):
        """
        Store the playlist settings and make sure the target folder exists.

        Args:
            playlist_url (str): URL of the YouTube playlist to download.
            download_path (str): Folder the videos are written under;
                created (with parents) if it is missing.
            resolution (str): Video height to request (default '1080').
            subtitles (bool): Whether subtitles are downloaded too (default True).
            subtitles_lang (str): Subtitle language code (default 'en').
            output_format (str): Container for the merged file (default 'mp4').
        """
        self.playlist_url = playlist_url
        target_dir = Path(download_path)
        self.download_path = target_dir
        target_dir.mkdir(parents=True, exist_ok=True)
        self.resolution = resolution
        self.subtitles = subtitles
        self.subtitles_lang = subtitles_lang
        self.output_format = output_format

    def get_download_options(self):
        """
        Build the yt_dlp option dictionary from the stored settings.

        Returns:
            dict: Format selector, output template and merge format, plus the
            subtitle options when subtitles are enabled.
        """
        height = self.resolution
        subtitle_options = {}
        if self.subtitles:
            subtitle_options = {
                'subtitleslangs': [self.subtitles_lang],
                'writesubtitles': True,
                'writeautomaticsub': True,
            }
        output_template = self.download_path / '%(playlist_title)s/%(title)s.%(ext)s'
        return {
            # Separate video+audio streams at the requested height, or else a
            # single combined stream at that height.
            'format': f'bestvideo[height={height}]+bestaudio/best[height={height}]',
            'outtmpl': str(output_template),
            'merge_output_format': self.output_format,
            **subtitle_options,
        }

    def download_playlist(self):
        """
        Run yt_dlp over the playlist with the configured options.

        Raises:
            yt_dlp.utils.DownloadError: If there is an issue during the download.
        """
        with yt_dlp.YoutubeDL(self.get_download_options()) as ydl:
            ydl.download([self.playlist_url])
        print(f"Playlist downloaded successfully to: {self.download_path}")

# 1. CS224n - 2024

### extract links from html

In [None]:
# Point the extractor at the CS224n syllabus page that holds the schedule table
links_extractor = ExtractLinksCS224n('https://web.stanford.edu/class/cs224n/index.html#schedule')

# Fetch the page, parse it and collect every link from the schedule table
links_extractor.run()

# One row per extracted link (columns: link, name, date, type, group);
# get_dataframe() raises ValueError if nothing was extracted
df_materials = links_extractor.get_dataframe()

In [8]:
def is_selected_url(url: str) -> bool:
    """
    Tell whether a URL should be kept for downloading.

    The check is case-insensitive. A URL is selected when it ends with
    '.pdf' or contains any of these substrings:

    - 'stanford.edu'
    - 'arxiv'
    - 'aclweb'
    - 'colab.research.google'
    - 'github.io'

    Args:
        url (str): The URL to evaluate.

    Returns:
        bool: True if the URL matches any of the criteria, False otherwise.

    Raises:
        ValueError: If ``url`` is not a string.
    """
    if not isinstance(url, str):
        raise ValueError("Input must be a string representing a URL.")

    lowered = url.lower()
    if lowered.endswith('.pdf'):
        return True

    wanted_fragments = (
        'stanford.edu',
        'arxiv',
        'aclweb',
        'colab.research.google',
        'github.io',
    )
    return any(fragment in lowered for fragment in wanted_fragments)

In [6]:
# Keep only the rows whose link passes is_selected_url
# (Stanford-hosted, PDF, arXiv, aclweb, Colab or github.io links)
df_materials = df_materials[df_materials['link'].apply(is_selected_url)]

### download materials from links

In [7]:
# Downloader that writes into the 2024 course folder
materials_downloader = DownloadMaterials(download_dir='cs224n-2024/website_materials')
urls = df_materials['link'].tolist()
# Work out the download URL, file name and extension for every link
processed_urls = materials_downloader.process_urls(urls)
# Attach those fields to the links table (left join on link == original_url)
df_materials = materials_downloader.merge_with_dataframe(df_materials, processed_urls)

In [None]:
# Download every row; the result has one path string per row, in row order,
# with the string 'None' where a download failed
list_files_paths = materials_downloader.download_all_files(df_materials.to_dict(orient='records'))
df_materials['file_path'] = list_files_paths

Expand df_materials with the files found in the unzipped folders

In [20]:
# Keep only the columns used from here on
df_materials = df_materials[['link', 'name', 'date', 'type', 'group', 'file_path', 'extension']]
# Failed downloads were recorded as the string 'None' (str(None)); drop them
df_materials = df_materials[df_materials['file_path'] != 'None']
df_materials = df_materials.reset_index(drop=True)

In [7]:
import os
import zipfile
from pathlib import Path
import pandas as pd

def extract_zip_files(df, zip_column='file_path', root_dir='extracted_files'):
    """
    Replace the ZIP rows of a DataFrame with one row per file inside each archive.

    Every row whose path in ``zip_column`` ends in '.zip' and exists on disk is
    extracted to ``<root_dir>/<archive stem>``. Each file found under that
    folder becomes a new row that inherits `link`, `date`, `type` and `group`
    from the archive's row. All rows with extension '.zip' are then removed,
    and rows with an empty extension are dropped from the result.

    Args:
        df (pd.DataFrame): The original DataFrame containing ZIP file paths.
        zip_column (str): The column in the DataFrame that contains ZIP file paths.
        root_dir (str): Directory where ZIP files will be extracted.

    Returns:
        pd.DataFrame: Extended DataFrame including the contents of ZIP files.
    """
    destination_root = Path(root_dir)
    destination_root.mkdir(parents=True, exist_ok=True)

    unpacked_records = []

    for _, record in df.iterrows():
        archive = Path(record[zip_column])

        # Only existing '.zip' files are unpacked
        if archive.suffix != '.zip' or not archive.exists():
            continue

        try:
            # Each archive gets its own folder named after the archive
            target_dir = destination_root / archive.stem
            target_dir.mkdir(parents=True, exist_ok=True)

            with zipfile.ZipFile(archive, 'r') as bundle:
                bundle.extractall(target_dir)

            # Walk the extracted tree; directories are skipped
            for member in target_dir.rglob('*'):
                if not member.is_file():
                    continue
                unpacked_records.append({
                    'link': record['link'],    # inherited from the archive row
                    'name': member.name,
                    'date': record['date'],    # inherited from the archive row
                    'type': record['type'],    # inherited from the archive row
                    'group': record['group'],  # inherited from the archive row
                    'file_path': str(member),
                    'extension': member.suffix
                })
        except zipfile.BadZipFile as e:
            print(f"Failed to extract {archive}: {e}")

    # Archive rows are replaced by the rows of their contents
    without_archives = df[df['extension'] != '.zip']
    combined = pd.concat([without_archives, pd.DataFrame(unpacked_records)], ignore_index=True)

    # Files without an extension are not kept
    has_extension = combined['extension'] != ''
    return combined[has_extension].reset_index(drop=True)

In [69]:
# Unpack into the same folder the downloader writes to. The path is relative,
# like every other path in this notebook, so the notebook does not depend on
# one user's home directory and the stored file paths stay relative as well.
df_materials = extract_zip_files(df_materials, zip_column='file_path', root_dir='cs224n-2024/website_materials')

In [73]:
# Persist the 2024 materials table
df_materials.to_pickle('cs224n-2024/df_materials.pkl')

### download youtube videos and subs

In [None]:
# Define the playlist URL
playlist_url = 'https://www.youtube.com/playlist?list=PLoROMvodv4rMFqRtEuo6SGjY4XbRIVRd4'

# Define the path for downloading (created by the downloader if missing)
download_path = 'cs224n-2024/youtube'

# Initialize the downloader with desired options
youtube_downloader = YouTubePlaylistDownloader(
    playlist_url=playlist_url,
    download_path=download_path,  # Specify the download path
    resolution='1080',  # Request streams whose height is exactly 1080
    subtitles=True,  # Enable subtitles
    subtitles_lang='en',  # Subtitles language
    output_format='mp4',  # Ensure merged output format is mp4
)

# Start the download; files land in <download_path>/<playlist title>/
youtube_downloader.download_playlist()

In [78]:
# Folder the downloader created for this playlist (named after the playlist title)
path_to_playlist = 'cs224n-2024/youtube/Stanford CS224N： Natural Language Processing with Deep Learning ｜ 2023'

# One row per entry found directly inside that folder
list_youtube_paths = [f"{path_to_playlist}/{entry}" for entry in os.listdir(path_to_playlist)]
df_youtube = pd.DataFrame(list_youtube_paths, columns=['file_path'])

# Derive the extension and the extension-less name from each path
df_youtube['extension'] = df_youtube['file_path'].map(lambda p: Path(p).suffix)
df_youtube['name'] = df_youtube['file_path'].map(lambda p: Path(p).stem)

df_youtube.to_pickle('cs224n-2024/df_youtube.pkl')

# 2. CS224n - 2019

### extract links from html

In [None]:
# Instantiate the class with the syllabus URL
links_extractor = ExtractLinksCS224n('https://web.stanford.edu/class/archive/cs/cs224n/cs224n.1194/index.html#schedule') 
links_extractor.run()
df_materials = links_extractor.get_dataframe()

### download materials from links

In [11]:
# Downloader that writes into the 2019 course folder
materials_downloader = DownloadMaterials(download_dir='cs224n-2019/website_materials')
urls = df_materials['link'].tolist()
# Work out the download URL, file name and extension for every link
processed_urls = materials_downloader.process_urls(urls)
# Attach those fields to the links table (left join on link == original_url)
df_materials = materials_downloader.merge_with_dataframe(df_materials, processed_urls)

In [101]:
# >>> Select only the slides pdf <<<
# df_materials = df_materials[(df_materials['type'] == 'slides') & (df_materials['extension'] == '.pdf')]

In [None]:
# Download every row; the result has one path string per row, in row order,
# with the string 'None' where a download failed
list_files_paths = materials_downloader.download_all_files(df_materials.to_dict(orient='records'))
df_materials['file_path'] = list_files_paths

In [14]:
# Renumber the rows, then keep only the columns used from here on.
# NOTE(review): unlike the 2024 section, rows whose file_path is the string
# 'None' (failed downloads) are not removed here — confirm this is intended.
df_materials = df_materials.reset_index(drop=True)
df_materials = df_materials[['link', 'name', 'date', 'type', 'group', 'file_path', 'extension']]

In [15]:
# Persist the 2019 materials table
df_materials.to_pickle('cs224n-2019/df_materials.pkl')

In [16]:
# Reload the 2019 materials table from the pickle written by this notebook.
# Pickle files execute code on load, so only read files you produced yourself.
df_materials = pd.read_pickle('cs224n-2019/df_materials.pkl')

### Download subs

In [3]:
# Define the playlist URL (a watch URL that carries the playlist id in `list=`)
playlist_url = 'https://www.youtube.com/watch?v=8rXD5-xhemo&list=PLoROMvodv4rOhcuXMZkNm7j3fVwBBY42z'

# Define the path for downloading (created by the downloader if missing)
download_path = 'cs224n-2019/youtube'

# Initialize the downloader with desired options
youtube_downloader = YouTubeSubtitlesDownloader(
    playlist_url=playlist_url,
    download_path=download_path,  # Specify the download path
    subtitles_lang='en',  # Subtitles language
)

# Start the download; subtitles land in <download_path>/<playlist title>/
youtube_downloader.download_subtitles()

[youtube:tab] Extracting URL: https://www.youtube.com/watch?v=8rXD5-xhemo&list=PLoROMvodv4rOhcuXMZkNm7j3fVwBBY42z
[youtube:tab] Downloading playlist PLoROMvodv4rOhcuXMZkNm7j3fVwBBY42z - add --no-playlist to download just the video 8rXD5-xhemo
[youtube:tab] PLoROMvodv4rOhcuXMZkNm7j3fVwBBY42z: Downloading webpage
[youtube:tab] Extracting URL: https://www.youtube.com/playlist?list=PLoROMvodv4rOhcuXMZkNm7j3fVwBBY42z
[youtube:tab] PLoROMvodv4rOhcuXMZkNm7j3fVwBBY42z: Downloading webpage
[youtube:tab] PLoROMvodv4rOhcuXMZkNm7j3fVwBBY42z: Redownloading playlist API JSON with unavailable videos
[download] Downloading playlist: Stanford CS224N: Natural Language Processing with Deep Learning Course | Winter 2019
[youtube:tab] PLoROMvodv4rOhcuXMZkNm7j3fVwBBY42z page 1: Downloading API JSON




[youtube:tab] PLoROMvodv4rOhcuXMZkNm7j3fVwBBY42z page 1: Downloading API JSON




[youtube:tab] PLoROMvodv4rOhcuXMZkNm7j3fVwBBY42z page 1: Downloading API JSON




[youtube:tab] PLoROMvodv4rOhcuXMZkNm7j3fVwBBY42z page 1: Downloading API JSON




[youtube:tab] Playlist Stanford CS224N: Natural Language Processing with Deep Learning Course | Winter 2019: Downloading 22 items of 22
[download] Downloading item 1 of 22
[youtube] Extracting URL: https://www.youtube.com/watch?v=8rXD5-xhemo
[youtube] 8rXD5-xhemo: Downloading webpage
[youtube] 8rXD5-xhemo: Downloading ios player API JSON
[youtube] 8rXD5-xhemo: Downloading mweb player API JSON
[youtube] 8rXD5-xhemo: Downloading m3u8 information
[info] 8rXD5-xhemo: Downloading subtitles: en
[info] 8rXD5-xhemo: Downloading 1 format(s): 137+251
[info] Writing video subtitles to: /Users/artemvolgin/Repos/gemini-long/cs224n-2019/youtube/Stanford CS224N： Natural Language Processing with Deep Learning Course ｜ Winter 2019/Stanford CS224N： NLP with Deep Learning ｜ Winter 2019 ｜ Lecture 1 – Introduction and Word Vectors.en.vtt
[download] Destination: /Users/artemvolgin/Repos/gemini-long/cs224n-2019/youtube/Stanford CS224N： Natural Language Processing with Deep Learning Course ｜ Winter 2019/Stanf

In [9]:
# Folder the downloader created for this playlist (named after the playlist title)
path_to_playlist = 'cs224n-2019/youtube/Stanford CS224N： Natural Language Processing with Deep Learning Course ｜ Winter 2019'

# One row per entry found directly inside that folder
list_youtube_paths = [f"{path_to_playlist}/{entry}" for entry in os.listdir(path_to_playlist)]
df_youtube = pd.DataFrame(list_youtube_paths, columns=['file_path'])

# Derive the extension and the extension-less name from each path
df_youtube['extension'] = df_youtube['file_path'].map(lambda p: Path(p).suffix)
df_youtube['name'] = df_youtube['file_path'].map(lambda p: Path(p).stem)

df_youtube.to_pickle('cs224n-2019/df_youtube.pkl')

# 3. CS231n - 2024

In [17]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import pandas as pd
import json

class ExtractLinksCS231n:
    """
    Download a course schedule page and extract the material links from its table.

    Typical use is ``run()`` followed by ``get_dataframe()``. Every extracted link
    is stored in ``self.data`` as a dict with the keys ``link``, ``name``,
    ``date``, ``type`` and ``group``.
    """

    def __init__(self, url):
        """
        Initialize the class with the syllabus URL.

        Args:
            url (str): The URL of the syllabus page.
        """
        self.url = url
        self.base_url = self.construct_base_url()
        self.html_content = None
        self.soup = None
        self.data = []

    def construct_base_url(self):
        """
        Construct the base URL from the provided URL for resolving relative links.

        The base is the directory that contains the page (scheme, host and the
        path up to its last '/'), so relative links are resolved next to the
        page itself. For a page at the site root this is simply
        ``scheme://host/``.

        Returns:
            str: The base URL, always ending with '/'.
        """
        parsed_url = urlparse(self.url)
        # Drop the last path segment (the page file name) and keep its directory.
        directory = parsed_url.path.rsplit('/', 1)[0]
        return f"{parsed_url.scheme}://{parsed_url.netloc}{directory}/"

    def fetch_html(self):
        """
        Fetch the HTML content from the syllabus URL into ``self.html_content``.

        Raises:
            requests.RequestException: If the request fails, times out, or the
                server answers with an unsuccessful status code.
        """
        try:
            # Without a timeout a stalled server would block this call forever.
            response = requests.get(self.url, timeout=30)
            response.raise_for_status()  # Raise an error for bad status
            self.html_content = response.text
            print(f"Successfully fetched HTML content from {self.url}")
        except requests.RequestException as e:
            print(f"An error occurred while fetching HTML: {e}")
            raise

    def parse_html(self):
        """
        Parse the fetched HTML content using BeautifulSoup into ``self.soup``.

        Raises:
            ValueError: If no HTML content has been fetched yet.
        """
        if not self.html_content:
            raise ValueError("HTML content is empty. Please fetch the HTML first.")
        self.soup = BeautifulSoup(self.html_content, 'html.parser')
        print("HTML content successfully parsed.")

    def determine_type(self, name, link):
        """
        Determine the type of a link from its visible text.

        Args:
            name (str): The link text; matched case-insensitively by substring.
            link (str): The link target. Accepted for interface compatibility;
                it is not used for classification.

        Returns:
            str: 'slides', 'notes', 'code', 'handout' or 'assignment' for the
            first keyword found in ``name`` (checked in that order), otherwise
            'readings'.
        """
        name_lower = name.lower()
        keyword_to_type = (
            ('slide', 'slides'),
            ('note', 'notes'),
            ('code', 'code'),
            ('handout', 'handout'),
            ('assignment', 'assignment'),
        )
        for keyword, link_type in keyword_to_type:
            if keyword in name_lower:
                return link_type
        return 'readings'

    def find_schedule_table(self):
        """
        Find the schedule table on the webpage, considering potential changes in structure.

        Lookup order: a table inside the div with id 'schedule', then the first
        table with class 'table', then the first table on the page.

        Returns:
            The matching table element, or None if the page has no table.
        """
        # First, look for a table inside the div with id 'schedule'
        schedule_div = self.soup.find('div', id='schedule')
        if schedule_div:
            nested_table = schedule_div.find('table')
            # Only stop here when the div really holds a table; otherwise keep
            # going so the fallbacks below still get a chance.
            if nested_table:
                return nested_table

        # If not found, try finding a table with specific classes (based on the observed structure)
        schedule_table = self.soup.find('table', class_='table')
        if schedule_table:
            return schedule_table

        # Fall back to searching for the first table
        return self.soup.find('table')

    def process_links(self, cell, group, lecture_title, date_text):
        """
        Process all links within a table cell.

        Args:
            cell: The table cell element whose ``<a>`` tags are collected.
            group (str): Label stored with each link to record which column it came from.
            lecture_title (str): Title used as the link name when the link text
                is just 'slides', 'notes' or 'code'.
            date_text (str): Date text of the row, stored with each link.

        Returns:
            list[dict]: One dict per anchor that has an href, with the keys
            'link', 'name', 'date', 'type' and 'group'.
        """
        links = []
        for a in cell.find_all('a'):
            # Link texts may be wrapped in brackets, e.g. "[slides]".
            link_text = a.get_text().strip('[]').strip()
            link_href = a.get('href')
            if link_href:
                full_link = urljoin(self.base_url, link_href)
                link_type = self.determine_type(link_text, link_href)
                link_dict = {
                    'link': full_link,
                    # Generic labels carry no information, so use the lecture title instead.
                    'name': link_text if link_text.lower() not in ['slides', 'notes', 'code'] else lecture_title,
                    'date': date_text,
                    'type': link_type,
                    'group': group
                }
                links.append(link_dict)
        return links

    def extract_links(self):
        """
        Extract all material links from the schedule table and append them to ``self.data``.

        For every row, the first cell is read as the date, the second as the
        description (its links get the group 'description') and the third as
        the materials (group 'course materials').

        Raises:
            ValueError: If the HTML has not been parsed yet, or no table is found.
        """
        if not self.soup:
            raise ValueError("Soup object is empty. Please parse the HTML first.")

        # Locate the schedule table
        schedule_table = self.find_schedule_table()
        if not schedule_table:
            raise ValueError("Could not find the schedule table in the HTML content.")

        print("Schedule table found. Beginning extraction of links...")

        # Iterate over the rows of the table
        for row in schedule_table.find_all('tr'):
            cells = row.find_all('td')
            if not cells:
                continue  # Skip rows with no 'td's

            # Assign cells based on position; `cells` is non-empty at this point
            date_cell = cells[0]
            description_cell = cells[1] if len(cells) > 1 else None
            materials_cell = cells[2] if len(cells) > 2 else None

            # Extract date
            date_text = date_cell.get_text(strip=True)

            # Extract the lecture title: the cell text up to its first newline.
            # NOTE(review): with strip=True the text fragments of the cell are
            # presumably joined without a separator, so the title may run several
            # fragments together - confirm against the live page.
            lecture_title = description_cell.get_text(strip=True).split('\n')[0] if description_cell else ''

            # Process links in description_cell
            if description_cell:
                description_links = self.process_links(description_cell, 'description', lecture_title, date_text)
                self.data.extend(description_links)

            # Process links in materials_cell
            if materials_cell:
                materials_links = self.process_links(materials_cell, 'course materials', lecture_title, date_text)
                self.data.extend(materials_links)

        print(f"Extraction complete. Total links extracted: {len(self.data)}")

    def get_dataframe(self):
        """
        Convert the extracted data into a pandas DataFrame.

        Returns:
            pandas.DataFrame: One row per extracted link.

        Raises:
            ValueError: If nothing has been extracted yet.
        """
        if not self.data:
            raise ValueError("No data available. Please run the extraction process first.")
        return pd.DataFrame(self.data)

    def run(self):
        """
        Execute the full process of fetching, parsing, and extracting links.
        """
        self.fetch_html()
        self.parse_html()
        self.extract_links()

In [None]:
# Scrape the CS231n schedule page and collect the extracted links in a DataFrame.
schedule_url = 'https://cs231n.stanford.edu/schedule.html'
links_extractor = ExtractLinksCS231n(schedule_url)
links_extractor.run()  # fetch, parse, extract
df_materials = links_extractor.get_dataframe()

### download materials from links

In [19]:
# DownloadMaterials is defined outside this part of the notebook.
materials_downloader = DownloadMaterials(download_dir='cs231n-2024/website_materials')
# Every link extracted from the schedule page, in DataFrame row order.
urls = df_materials['link'].tolist()
# NOTE(review): presumably resolves/normalizes each URL and derives per-URL
# metadata (the 'extension' column selected further down is not produced by
# the extractor) - confirm against DownloadMaterials.process_urls.
processed_urls = materials_downloader.process_urls(urls)
# Attach the processed URL information back onto the materials table.
df_materials = materials_downloader.merge_with_dataframe(df_materials, processed_urls)

In [20]:
# >>> Select only the slides pdf <<<
# df_materials = df_materials[(df_materials['type'] == 'slides') & (df_materials['extension'] == '.pdf')].reset_index(drop=True)

In [None]:
# Hand the materials to the downloader as one dict per row.
material_records = df_materials.to_dict(orient='records')
list_files_paths = materials_downloader.download_all_files(material_records)
# NOTE(review): assumes the downloader returns one path per record, in row
# order - confirm against DownloadMaterials.download_all_files.
df_materials['file_path'] = list_files_paths

In [22]:
# Restrict df_materials to exactly these columns, in this order.
ordered_columns = ['link', 'name', 'date', 'type', 'group', 'file_path', 'extension']
df_materials = df_materials[ordered_columns]

In [23]:
# Persist the materials table so later cells/sessions can reload it without re-downloading.
df_materials.to_pickle('cs231n-2024/df_materials.pkl')

In [24]:
# Reload the materials table saved by the previous cell. Pickle files can execute
# code when loaded, so only read files this notebook wrote itself.
df_materials = pd.read_pickle('cs231n-2024/df_materials.pkl')

In [10]:
# Playlist to fetch and the folder its files are written to.
# NOTE(review): the captured output names this playlist "Stanford University
# CS231n, Spring 2017" although it is stored under cs231n-2024 - confirm this
# is the intended playlist.
playlist_url = 'https://www.youtube.com/playlist?list=PLC1qU-LWwrF64f4QKQT-Vg5Wr4qEE1Zxk'
download_path = 'cs231n-2024/youtube'

# Collect the downloader settings in one place, then build it from them.
downloader_options = {
    'playlist_url': playlist_url,
    'download_path': download_path,
    'subtitles_lang': 'en',  # English subtitles
}
youtube_downloader = YouTubeSubtitlesDownloader(**downloader_options)

# Kick off the subtitle download.
youtube_downloader.download_subtitles()

[youtube:tab] Extracting URL: https://www.youtube.com/playlist?list=PLC1qU-LWwrF64f4QKQT-Vg5Wr4qEE1Zxk
[youtube:tab] PLC1qU-LWwrF64f4QKQT-Vg5Wr4qEE1Zxk: Downloading webpage
[youtube:tab] PLC1qU-LWwrF64f4QKQT-Vg5Wr4qEE1Zxk: Redownloading playlist API JSON with unavailable videos
[download] Downloading playlist: Stanford University CS231n, Spring 2017
[youtube:tab] PLC1qU-LWwrF64f4QKQT-Vg5Wr4qEE1Zxk page 1: Downloading API JSON




[youtube:tab] PLC1qU-LWwrF64f4QKQT-Vg5Wr4qEE1Zxk page 1: Downloading API JSON




[youtube:tab] PLC1qU-LWwrF64f4QKQT-Vg5Wr4qEE1Zxk page 1: Downloading API JSON




[youtube:tab] PLC1qU-LWwrF64f4QKQT-Vg5Wr4qEE1Zxk page 1: Downloading API JSON




[youtube:tab] Playlist Stanford University CS231n, Spring 2017: Downloading 16 items of 16
[download] Downloading item 1 of 16
[youtube] Extracting URL: https://www.youtube.com/watch?v=vT1JzLTH4G4
[youtube] vT1JzLTH4G4: Downloading webpage
[youtube] vT1JzLTH4G4: Downloading ios player API JSON
[youtube] vT1JzLTH4G4: Downloading mweb player API JSON
[youtube] vT1JzLTH4G4: Downloading m3u8 information
[info] vT1JzLTH4G4: Downloading subtitles: en
[info] vT1JzLTH4G4: Downloading 1 format(s): 616+251
[info] Writing video subtitles to: /Users/artemvolgin/Repos/gemini-long/cs231n-2024/youtube/Stanford University CS231n, Spring 2017/Lecture 1 ｜ Introduction to Convolutional Neural Networks for Visual Recognition.en.vtt
[download] Destination: /Users/artemvolgin/Repos/gemini-long/cs231n-2024/youtube/Stanford University CS231n, Spring 2017/Lecture 1 ｜ Introduction to Convolutional Neural Networks for Visual Recognition.en.vtt
[download] 100% of   75.83KiB in 00:00:00 at 1.37MiB/s
[download] Dow

In [11]:
# Build an index of the files saved for the CS231n playlist and persist it.
path_to_playlist = 'cs231n-2024/youtube/Stanford University CS231n, Spring 2017'
list_youtube_paths = [f"{path_to_playlist}/{entry}" for entry in os.listdir(path_to_playlist)]
df_youtube = pd.DataFrame(list_youtube_paths, columns=['file_path'])

# Path.suffix is only the last extension ('.vtt' for 'x.en.vtt'); Path.stem is the
# file name without that last extension ('x.en').
df_youtube['extension'] = df_youtube['file_path'].map(lambda file_path: Path(file_path).suffix)
df_youtube['name'] = df_youtube['file_path'].map(lambda file_path: Path(file_path).stem)

df_youtube.to_pickle('cs231n-2024/df_youtube.pkl')