In [1]:
import os
import dotenv
import google.generativeai as genai
from google.generativeai import caching
import datetime
import time
import pandas as pd
from PyPDF2 import PdfReader
from tqdm import tqdm
import webvtt
import pprint
from IPython.display import display, Markdown
import json
import networkx as nx
from pyvis.network import Network


# 1. Upload files

## 1.1 Read files from local drive

In [2]:
# Per-course materials tables, each tagged with the course it belongs to.
# NOTE: unpickling can execute arbitrary code -- only load .pkl files you trust.
df_materials_cs224n_2019 = pd.read_pickle('cs224n-2019/df_materials.pkl').assign(course='cs224n-2019')
df_materials_cs224n_2024 = pd.read_pickle('cs224n-2024/df_materials.pkl').assign(course='cs224n-2024')
df_materials_cs231n_2024 = pd.read_pickle('cs231n-2024/df_materials.pkl').assign(course='cs231n-2024')

# Per-course YouTube tables, tagged with the course and a fixed 'youtube' type.
df_youtube_cs224n_2024 = pd.read_pickle('cs224n-2024/df_youtube.pkl').assign(
    course='cs224n-2024', type='youtube'
)
df_youtube_cs224n_2019 = pd.read_pickle('cs224n-2019/df_youtube.pkl').assign(
    course='cs224n-2019', type='youtube'
)
df_youtube_cs231n_2024 = pd.read_pickle('cs231n-2024/df_youtube.pkl').assign(
    course='cs231n-2024', type='youtube'
)

In [3]:
# Stack all six per-course tables (materials first, then YouTube) into one
# frame; ignore_index=True discards the per-table indexes for a fresh 0..n-1 one.
df_materials = pd.concat(
    objs=(
        df_materials_cs224n_2019,
        df_materials_cs224n_2024,
        df_materials_cs231n_2024,
        df_youtube_cs224n_2024,
        df_youtube_cs224n_2019,
        df_youtube_cs231n_2024,
    ),
    ignore_index=True,
)

In [4]:
# Extensions that are filtered out because they cannot be converted to text.
# Matching is case-sensitive, hence both '.png' and '.PNG'.
remove_extensions = {
    # serialized objects / archives / OS residue
    '.pkl', '.model', '.zip', '.DS_Store',
    # media
    '.mp4', '.png', '.PNG', '.jpg',
    # scripts and config
    '.sh', '.bat', '.yml', '.json',
    # data files
    '.conll', '.tsv', '.vocab',
    # two-letter suffixes (presumably language-tagged data files -- TODO confirm)
    '.en', '.zh', '.es',
}

In [5]:
# Keep only rows whose file_path is not the literal string 'None'.
df_materials = df_materials.loc[df_materials['file_path'].ne('None')]

In [6]:
# Discard rows whose extension is in remove_extensions, then renumber the index.
df_materials = df_materials.loc[
    lambda frame: ~frame['extension'].isin(remove_extensions)
].reset_index(drop=True)

In [None]:
# TODO: Why do we have duplicates?
# Drop fully identical rows and renumber the index in one step, then report
# how many rows are left.
df_materials = df_materials.drop_duplicates(ignore_index=True)
print(df_materials.shape[0])


In [8]:
# Remove youtube links
# regex=False: match the host name literally (as a regex, each '.' would match
# any character). na=False: a missing file_path counts as "not a YouTube link"
# instead of putting NaN into the mask, which `~` cannot invert.
df_materials = df_materials[
    ~df_materials['file_path'].str.contains('www.youtube.com', regex=False, na=False)
].reset_index(drop=True)

In [None]:
# Number of rows remaining after the filters above.
df_materials.shape[0]

In [None]:
import os
import nbformat
from nbconvert import ScriptExporter

def convert_ipynb_to_py(ipynb_file, output_dir=None):
    """
    Convert a Jupyter Notebook (.ipynb) file to a Python (.py) file.
    
    Args:
        ipynb_file (str): Path to the .ipynb file to convert.
        output_dir (str, optional): Directory to save the .py file. Defaults to the
            directory that contains ``ipynb_file``. It is created if it does not exist.
        
    Returns:
        str: Path to the generated .py file.
        
    Raises:
        FileNotFoundError: If the .ipynb file does not exist.
    """
    if not os.path.exists(ipynb_file):
        raise FileNotFoundError(f"The file '{ipynb_file}' does not exist.")
    
    if output_dir is None:
        output_dir = os.path.dirname(ipynb_file)
    
    # Make sure the target directory exists so the write below cannot raise a
    # FileNotFoundError that callers would mistake for a missing notebook.
    # An empty string means "current directory" (dirname of a bare file name),
    # which makedirs would reject, so it is skipped.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    
    # Create an instance of ScriptExporter
    script_exporter = ScriptExporter()
    
    # Parse the notebook content
    with open(ipynb_file, 'r', encoding='utf-8') as f:
        notebook_content = nbformat.read(f, as_version=4)
    
    # Convert the notebook to script (the second element, resources, is unused)
    script_content, _ = script_exporter.from_notebook_node(notebook_content)
    
    # Generate output file path: same base name as the notebook, '.py' suffix
    base_name = os.path.splitext(os.path.basename(ipynb_file))[0]
    py_file_path = os.path.join(output_dir, f"{base_name}.py")
    
    # Write the script content to the .py file
    with open(py_file_path, 'w', encoding='utf-8') as f:
        f.write(script_content)
    
    return py_file_path

# Convert all Jupyter Notebooks to Python scripts
ipynb_mask = df_materials['extension'] == '.ipynb'
list_ipynb = df_materials.loc[ipynb_mask, 'file_path'].tolist()

# Each notebook is converted next to its source file (output_dir omitted), so
# notebooks from different courses never land in one shared folder or overwrite
# each other. The path returned by the converter is recorded rather than guessed.
converted_paths = {}
for ipynb_path in list_ipynb:
    try:
        py_file = convert_ipynb_to_py(ipynb_path)
        converted_paths[ipynb_path] = py_file
        print(f"Converted to Python file: {py_file}")
    except FileNotFoundError as e:
        # Missing notebook: report it and leave the row untouched.
        print(e)

# Re-point only the rows whose notebook was actually converted; they now refer
# to the generated script and carry the '.py' extension.
converted_mask = ipynb_mask & df_materials['file_path'].isin(list(converted_paths))
df_materials.loc[converted_mask, 'file_path'] = (
    df_materials.loc[converted_mask, 'file_path'].map(converted_paths)
)
df_materials.loc[converted_mask, 'extension'] = '.py'

In [11]:
def extract_text_from_pdf(file_path):
    """
    Extracts text from a PDF file.

    Parameters:
    file_path (str): Path to the PDF file.

    Returns:
    str: Text of all pages concatenated in page order, with no separator.

    Raises:
    Exception: If the PDF cannot be opened or parsed; the original error is
        attached as the cause.
    """
    try:
        reader = PdfReader(file_path)
        # `or ""` guards against a page yielding None/empty, which would make
        # string concatenation fail; join avoids repeated string copying.
        return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        # Keep the generic Exception type callers already handle, but chain the
        # original error so the traceback shows the real cause.
        raise Exception(f"An error occurred while processing the PDF: {e}") from e

In [12]:
import webvtt

def convert_vtt_to_string(vtt_file_path, include_timecodes=False):
    """
    Flatten a .vtt caption file into newline-separated text.

    A line is dropped when it is empty or identical to the line kept just
    before it, even if that line came from the previous caption.

    Args:
        vtt_file_path (str): Path to the .vtt file to be converted.
        include_timecodes (bool): If True, prefix every kept line with its
            caption's "start --> end" timecode.

    Returns:
        str: The kept lines joined with newlines.

    Raises:
        ValueError: If the path does not end with ".vtt".
    """
    if not vtt_file_path.endswith(".vtt"):
        raise ValueError("Input file must be a .vtt file.")

    kept_lines = []
    last_kept = None

    for cue in webvtt.read(vtt_file_path):
        # The prefix is the same for every line of a caption, so build it once.
        prefix = f"{cue.start} --> {cue.end} " if include_timecodes else ""

        for raw_line in cue.text.strip().splitlines():
            text = raw_line.strip()
            # Skip blanks and immediate repeats of the previously kept line.
            if not text or text == last_kept:
                continue
            kept_lines.append(prefix + text)
            last_kept = text

    return "\n".join(kept_lines)

In [13]:
def read_file_as_string(file_path):
    """
    Load a file's textual content, choosing the reader from the path suffix.

    Parameters:
    file_path (str): The path to the file.

    Returns:
    str: Text extracted from a '.pdf', the transcript of a '.vtt', or the
        raw UTF-8 content of any other file.

    Raises:
    FileNotFoundError: If a plain-text file does not exist.
    Exception: If a plain-text file cannot be read for any other reason.
    """
    # Formats that need a dedicated extractor are dispatched first.
    if file_path.endswith('.pdf'):
        return extract_text_from_pdf(file_path)

    if file_path.endswith('.vtt'):
        return convert_vtt_to_string(file_path)

    # Everything else is read as UTF-8 text.
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            return handle.read()
    except FileNotFoundError:
        raise FileNotFoundError(f"Error: The file at {file_path} was not found.")
    except Exception as e:
        raise Exception(f"An error occurred while reading the file: {e}")


In [None]:
# Read every material into memory. A failed read is reported together with the
# row position and stored as None so the list stays aligned with df_materials.
# (Positions equal index labels here because the index was reset to 0..n-1.)
list_files_str = []
for position, path in enumerate(tqdm(df_materials['file_path'].tolist())):
    try:
        list_files_str.append(read_file_as_string(path))
    except Exception as e:
        print(f"Error: {e}")
        print(position)
        list_files_str.append(None)

In [16]:
# Attach the loaded text (None where reading failed) as the 'file_str' column.
df_materials = df_materials.assign(file_str=list_files_str)

In [None]:
# Display the column labels of df_materials as a quick sanity check.
df_materials.columns

In [None]:
# Row count before rows without readable content are removed.
df_materials.shape[0]

In [19]:
# Drop rows whose file could not be read (file_str is missing) and renumber.
df_materials = df_materials.dropna(subset=['file_str']).reset_index(drop=True)

In [None]:
# Row count after rows without readable content were removed.
df_materials.shape[0]

### Count tokens

In [None]:
# Never hardcode the key in the notebook: read it from the environment, after
# loading a local .env file if one exists (load_dotenv is a no-op otherwise).
dotenv.load_dotenv()
GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY')
if not GEMINI_API_KEY:
    # Fail fast here rather than letting every later API call fail.
    raise RuntimeError("GEMINI_API_KEY is not set; export it or add it to a .env file.")
genai.configure(api_key=GEMINI_API_KEY)

GEMINI_MODEL = 'models/gemini-1.5-flash-002'
# Alternative model:
# GEMINI_MODEL = 'models/gemini-1.5-pro-002'

model = genai.GenerativeModel(model_name=GEMINI_MODEL)
print(f'Model {model.model_name} loaded successfully')

In [None]:
list_tokens = []
retry_attempts = 1  # Total number of attempts per file (1 means no retry)

for content in tqdm(df_materials['file_str'].tolist(), desc="Calculating number of tokens"):

    if not content:
        # Nothing to count for empty or missing content.
        tokens = 0
    else:
        # `attempt` is 1-based: it is the number of tries made so far.
        for attempt in range(1, retry_attempts + 1):
            try:
                tokens = model.count_tokens(content).total_tokens
                break
            except Exception as e:
                if attempt < retry_attempts:
                    print(f"Error with {content[:100]}, retrying... ({attempt}/{retry_attempts})")
                    time.sleep(1)  # Optional: small delay between retries
                else:
                    # Out of attempts: record 0 tokens and report the last error.
                    tokens = 0
                    print(f"Failed to process {content[:100]} after {retry_attempts} attempts. Error: {e}")

    list_tokens.append(tokens)

In [24]:
# Attach the per-file token counts (0 where counting failed or content was empty).
df_materials = df_materials.assign(num_tokens=list_tokens)

In [None]:
# Build a short "<course>/<file name>" label for each row:
# take the last path component, then strip the download-host prefixes.
# regex=False is explicit because the default for str.replace differs between
# pandas versions, and as regexes the '.' in these prefixes would match any
# character.
df_materials['file'] = (
    df_materials['file_path']
    .apply(lambda x: x.split('/')[-1])
    .str.replace('web.stanford.edu_class_', '', regex=False)
    .str.replace('drive.usercontent.google.com_download_id=', '', regex=False)
)
df_materials['file'] = df_materials['course'] + '/' + df_materials['file']

def add_file_name_course(row):
    """
    Prefix a row's text with "=== File/Name/Course ===" header lines.

    Parameters:
    row: Mapping-like object with 'file', 'name', 'course' and 'file_str' keys.

    Returns:
    str: The three header lines, a blank line, then the text. The exact
        spacing, including the trailing space, is part of the format.
    """
    header_fields = (("File", "file"), ("Name", "name"), ("Course", "course"))
    header_lines = [f"=== {label}: {row[key]} === " for label, key in header_fields]
    return "\n ".join(header_lines) + f"\n\n {row['file_str']} "

In [27]:
# Replace each row's text with the same text preceded by its header lines.
df_materials = df_materials.assign(
    file_str=lambda frame: frame.apply(add_file_name_course, axis=1)
)

In [28]:
# Persist the final table to disk as a pickle in the working directory.
df_materials.to_pickle('df_materials_str.pkl')