In [1]:
# Install (or upgrade) the Gemini Python SDK; -q keeps pip's output short.
!pip install -q -U google-generativeai
# Install poppler-utils; later cells call its pdftoppm and pdftotext tools.
!apt install poppler-utils

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m146.8/146.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.5/664.5 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 45 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.3 [186 kB]
Fetched 186 kB in 0s (1,006 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 131015 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.3_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.3) ...
Setting up poppler-utils (22.02.0-2ubun

In [4]:
# Packages
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  """Wrap ``text`` in a Markdown display object rendered as a blockquote.

  Every '•' is first replaced by '  *'; then each line (blank lines
  included) is prefixed with '> '.
  """
  bulleted = text.replace('•', '  *')
  quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
  return Markdown(quoted)

In [5]:
# Used to securely store your API key
from google.colab import userdata

# Read the key from the Colab secret named 'GOOGLE_API_KEY' rather than hardcoding it.
GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')
# Hand the key to the SDK; the genai calls later in the notebook rely on this.
genai.configure(api_key=GOOGLE_API_KEY)

In [45]:
# Path of the video whose frames are extracted below.
video_file_name = "/content/roman.mp4"
import cv2
import os
import shutil

# Create or cleanup existing extracted image frames directory.
# Directory that receives one JPEG per extracted frame.
FRAME_EXTRACTION_DIRECTORY = "/content/frames"
# Separator placed between the video-derived prefix and the MM:SS timestamp in frame file names.
FRAME_PREFIX = "_frame"
def create_frame_output_dir(output_dir):
  """Ensure ``output_dir`` exists and is empty.

  An existing directory is removed together with its contents and then
  recreated; a missing one is simply created.
  """
  if os.path.exists(output_dir):
    # Start from a clean slate: drop whatever a previous run left behind.
    shutil.rmtree(output_dir)
  os.makedirs(output_dir)

def extract_frame_from_video(video_file_path):
  """Save one frame per second of a video as JPEG files.

  FRAME_EXTRACTION_DIRECTORY is recreated empty, then the first frame of
  every whole second is written there as
  '<video file name with dots replaced by underscores><FRAME_PREFIX>MM:SS.jpg'.

  Args:
    video_file_path: Path of the video file to read.

  Raises:
    IOError: If the video cannot be opened or reports a non-positive frame
      rate (previously this surfaced as an unexplained ZeroDivisionError).
  """
  print(f"Extracting {video_file_path} at 1 frame per second. This might take a bit...")
  create_frame_output_dir(FRAME_EXTRACTION_DIRECTORY)
  vidcap = cv2.VideoCapture(video_file_path)
  # Number of frames written so far; also the next whole second to capture.
  frame_count = 0
  try:
    if not vidcap.isOpened():
      raise IOError(f"Could not open video file: {video_file_path}")
    fps = vidcap.get(cv2.CAP_PROP_FPS)
    print(fps)
    # `not fps > 0` also rejects NaN, which would otherwise poison the division below.
    if not fps > 0:
      raise IOError(f"Video reports an invalid frame rate ({fps}): {video_file_path}")
    output_file_prefix = os.path.basename(video_file_path).replace('.', '_')
    count = 0  # Index of the frame currently being read.
    while vidcap.isOpened():
      success, frame = vidcap.read()
      if not success:  # End of video
        break
      if int(count / fps) == frame_count:  # First frame of a new second
        minutes, seconds = divmod(frame_count, 60)
        time_string = f"{minutes:02d}:{seconds:02d}"
        image_name = f"{output_file_prefix}{FRAME_PREFIX}{time_string}.jpg"
        output_filename = os.path.join(FRAME_EXTRACTION_DIRECTORY, image_name)
        cv2.imwrite(output_filename, frame)
        frame_count += 1
      count += 1
  finally:
    # Release the capture object even when reading fails part-way.
    vidcap.release()
  print(f"Completed video frame extraction!\n\nExtracted: {frame_count} frames")

# Extract the frames of the configured video into FRAME_EXTRACTION_DIRECTORY.
extract_frame_from_video(video_file_name)


Extracting /content/roman.mp4 at 1 frame per second. This might take a bit...
25.0
Completed video frame extraction!

Extracted: 207 frames


In [46]:
import os

class File:
  """A local frame image, its timestamp, and (once uploaded) the upload response."""

  def __init__(self, file_path: str, display_name: str = None):
    """Record the path and derive the timestamp from the file name.

    Args:
      file_path: Path of the frame image on disk.
      display_name: Optional human-readable name; stored as given.
    """
    self.file_path = file_path
    # Always define the attribute so reading it never raises AttributeError.
    self.display_name = display_name
    self.timestamp = get_timestamp(file_path)
    # Filled in by set_file_response() after the upload; None until then.
    self.response = None

  def set_file_response(self, response):
    """Attach the object returned by the upload call to this file."""
    self.response = response

def get_timestamp(filename):
  """Return the timestamp string embedded in a frame file name.

  Frame files are named '<prefix><FRAME_PREFIX><timestamp>.jpg'; for
  'roman_mp4_frame00:00.jpg' the result is the string '00:00' (not an
  integer).

  Args:
    filename: File name or full path of an extracted frame.

  Returns:
    The text between the last FRAME_PREFIX and the following '.', or None
    when FRAME_PREFIX does not occur in ``filename``.
  """
  # Split on the LAST separator so a video name or directory that itself
  # contains FRAME_PREFIX still parses instead of being rejected.
  _, separator, tail = filename.rpartition(FRAME_PREFIX)
  if not separator:
    return None  # Indicates the filename might be incorrectly formatted
  return tail.split('.')[0]

# Wrap every extracted frame in chronological order. The zero-padded MM:SS
# suffix makes lexicographic order match time order (for videos under 100 minutes).
files = sorted(os.listdir(FRAME_EXTRACTION_DIRECTORY))
files_to_upload = [
    File(file_path=os.path.join(FRAME_EXTRACTION_DIRECTORY, file))
    for file in files
]

# Upload the files to the API.
# The whole video is uploaded by default. Set full_video to False to upload
# only a 10 second slice (frames 40-49) and reduce upload time.
full_video = True

# Select the frames first so the announced count always matches what is
# actually uploaded (the slice can hold fewer than 10 frames for short videos).
files_to_send = files_to_upload if full_video else files_to_upload[40:50]

uploaded_files = []
print(f'Uploading {len(files_to_send)} files. This might take a bit...')

for file in files_to_send:
  print(f'Uploading: {file.file_path}...')
  response = genai.upload_file(path=file.file_path)
  file.set_file_response(response)
  uploaded_files.append(file)

print(f"Completed file uploads!\n\nUploaded: {len(uploaded_files)} files")

Uploading 207 files. This might take a bit...
Uploading: /content/frames/roman_mp4_frame00:00.jpg...
Uploading: /content/frames/roman_mp4_frame00:01.jpg...
Uploading: /content/frames/roman_mp4_frame00:02.jpg...
Uploading: /content/frames/roman_mp4_frame00:03.jpg...
Uploading: /content/frames/roman_mp4_frame00:04.jpg...
Uploading: /content/frames/roman_mp4_frame00:05.jpg...
Uploading: /content/frames/roman_mp4_frame00:06.jpg...
Uploading: /content/frames/roman_mp4_frame00:07.jpg...
Uploading: /content/frames/roman_mp4_frame00:08.jpg...
Uploading: /content/frames/roman_mp4_frame00:09.jpg...
Uploading: /content/frames/roman_mp4_frame00:10.jpg...
Uploading: /content/frames/roman_mp4_frame00:11.jpg...
Uploading: /content/frames/roman_mp4_frame00:12.jpg...
Uploading: /content/frames/roman_mp4_frame00:13.jpg...
Uploading: /content/frames/roman_mp4_frame00:14.jpg...
Uploading: /content/frames/roman_mp4_frame00:15.jpg...
Uploading: /content/frames/roman_mp4_frame00:16.jpg...
Uploading: /content

In [51]:
# Delete the 'output' directory (if any) so the PDF extraction below starts clean.
!rm -rf output

In [52]:
# PDF to process and the page range to extract (passed to pdftoppm/pdftotext as -f/-l).
pdf_name = "wikipedia.pdf"
first = 1
last = 39

In [53]:
# Warn when the PDF is missing.
# NOTE(review): `path` holds a bool (the result of exists()), not a Path, and
# the notebook only prints a message; it does not stop when the file is absent.
path = pathlib.Path(pdf_name).exists()
if not path:
  print("PDF not found!")

In [54]:
# Extract PDF images
out_dir = "output"
!mkdir {out_dir}
!pdftoppm {pdf_name} -f {first} -l {last} {out_dir}/images -jpeg
!ls output

images-01.jpg  images-08.jpg  images-15.jpg  images-22.jpg  images-29.jpg  images-36.jpg
images-02.jpg  images-09.jpg  images-16.jpg  images-23.jpg  images-30.jpg  images-37.jpg
images-03.jpg  images-10.jpg  images-17.jpg  images-24.jpg  images-31.jpg  images-38.jpg
images-04.jpg  images-11.jpg  images-18.jpg  images-25.jpg  images-32.jpg  images-39.jpg
images-05.jpg  images-12.jpg  images-19.jpg  images-26.jpg  images-33.jpg
images-06.jpg  images-13.jpg  images-20.jpg  images-27.jpg  images-34.jpg
images-07.jpg  images-14.jpg  images-21.jpg  images-28.jpg  images-35.jpg


In [55]:
import PIL.Image
# Load a thumbnail of every rendered page. The range is inclusive of `last`,
# matching the pages produced by pdftoppm (-f first -l last).
images = []
for page_index in range(first, last + 1):
  # Two-digit zero padding matches the images-NN.jpg names listed above.
  img = PIL.Image.open(f"{out_dir}/images-{page_index:02d}.jpg")
  img.thumbnail([600, 600])  # Shrinks in place to fit within 600x600.
  images.append(img)

In [56]:
# Extract text from those images
for page_number in range(first,last+1):
  page_number = f"{page_number:02d}"
  ! pdftotext -f {page_number} -l {page_number} {pdf_name} test.txt
  ! mv test.txt {out_dir}/text-{page_number}.txt

In [57]:
# List the output directory to check that the text files sit next to the page images.
!ls output

images-01.jpg  images-13.jpg  images-25.jpg  images-37.jpg  text-10.txt  text-22.txt  text-34.txt
images-02.jpg  images-14.jpg  images-26.jpg  images-38.jpg  text-11.txt  text-23.txt  text-35.txt
images-03.jpg  images-15.jpg  images-27.jpg  images-39.jpg  text-12.txt  text-24.txt  text-36.txt
images-04.jpg  images-16.jpg  images-28.jpg  text-01.txt    text-13.txt  text-25.txt  text-37.txt
images-05.jpg  images-17.jpg  images-29.jpg  text-02.txt    text-14.txt  text-26.txt  text-38.txt
images-06.jpg  images-18.jpg  images-30.jpg  text-03.txt    text-15.txt  text-27.txt  text-39.txt
images-07.jpg  images-19.jpg  images-31.jpg  text-04.txt    text-16.txt  text-28.txt
images-08.jpg  images-20.jpg  images-32.jpg  text-05.txt    text-17.txt  text-29.txt
images-09.jpg  images-21.jpg  images-33.jpg  text-06.txt    text-18.txt  text-30.txt
images-10.jpg  images-22.jpg  images-34.jpg  text-07.txt    text-19.txt  text-31.txt
images-11.jpg  images-23.jpg  images-35.jpg  text-08.txt    text-20.txt 

In [58]:
import tqdm

# Upload the page images in page order. glob() yields paths in arbitrary
# order, so sort them; the zero-padded page numbers sort correctly as text.
files = []
image_files = sorted(pathlib.Path("output").glob('images-*.jpg'))
for img in tqdm.tqdm(image_files):
    files.append(genai.upload_file(img))

# Read the page texts in the same (sorted) order so that text i, image i and
# the "Page N" label all refer to the same page.
texts = [t.read_text() for t in sorted(pathlib.Path("output").glob('text-*.txt'))]

# Interleave a page header, the page text and the page image for the prompt.
textbook = []
for page, (text, image) in enumerate(zip(texts, files)):
  textbook.append(f'## Page {first+page} ##')
  textbook.append(text)
  textbook.append(image)

100%|██████████| 39/39 [00:37<00:00,  1.03it/s]


In [59]:
# Model handle used for the summarisation request below.
model = genai.GenerativeModel(model_name='gemini-1.5-pro-latest')

In [60]:
# Generation settings: ask the model to answer with JSON.
config = {
  "response_mime_type": "application/json",
}
# Request settings, passed as request_options to generate_content.
options = {
    "timeout": 600
}

In [142]:
# First task: summarize the key bullet points in the paper
# The response is requested as JSON and parsed with json.loads in a later cell.
prompt = "Give me 10 bulletpoints from this text: "

In [143]:
# Send the prompt followed by the interleaved page headers, texts and uploaded images.
response = model.generate_content(contents = [prompt] + textbook, generation_config=config, request_options=options )

In [144]:
from IPython.display import Markdown
import json
# Parse the JSON text of the response into Python objects.
json_data = json.loads(response.text)

In [145]:
# Display the parsed bullet points.
json_data

['The Roman Empire spanned a vast period, from the fall of the Roman Republic in 27 BC to the fall of Constantinople in 1453 AD.',
 'Augustus, the first Roman Emperor, established principles of dynastic succession, leading to the Julio-Claudian dynasty.',
 'The Roman Empire reached its peak size under Trajan in 117 AD, encompassing territory from Britain to the Persian Gulf.',
 'The Crisis of the Third Century, marked by invasions, civil strife, and economic instability, led to the division of the Empire into Eastern and Western halves.',
 "Diocletian's reforms stabilized the Empire by dividing it into a Tetrarchy, with four co-rulers, and establishing a new administrative system.",
 'Constantine the Great reunited the Empire and established Constantinople as the new capital, marking the beginning of the Byzantine Empire.',
 'The Western Roman Empire declined due to Germanic migrations and invasions, culminating in its fall in 476 AD.',
 'The Eastern Roman Empire, also known as the Byz

In [83]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

def display_image(frame_id: str, frames_dir: str = '/content/frames',
                  file_prefix: str = 'roman_mp4_frame'):
  """Show one extracted video frame with matplotlib.

  The image is read from '<frames_dir>/<file_prefix><frame_id>.jpg'.

  Args:
    frame_id: Timestamp part of the frame file name, e.g. '00:59'.
    frames_dir: Directory holding the extracted frames. The default is the
      location this function originally hardcoded.
    file_prefix: File-name prefix preceding the timestamp. The default is
      the prefix this function originally hardcoded.
  """
  # Load the image from file
  img = mpimg.imread(f'{frames_dir}/{file_prefix}{frame_id}.jpg')

  # Display the image
  plt.imshow(img)
  plt.axis('off')  # Turn off axis
  plt.show()

In [91]:
# Join the bullet points into one space-separated string for the frame-matching prompt.
statements = " ".join(json_data)

In [185]:
# Asking for matching frames
# Create the prompt.
# `statements` (the joined bullet points) is embedded directly in the prompt text.
prompt = f"Give me 10 unique frames and its timestamp that match these 10 sentences: {statements}."

In [186]:
# Set the model to Gemini 1.5 Pro.
# This rebinds the global `model` used for the frame-matching request.
model = genai.GenerativeModel(model_name="models/gemini-1.5-pro-latest")

# Make GenerateContent request with the structure described above.
def make_request(prompt, files):
  """Build the content list for a generate_content call.

  The list starts with ``prompt``; for every item of ``files`` it then
  contains the item's ``timestamp`` followed by its ``response``.
  """
  parts = [prompt]
  for uploaded in files:
    parts.extend((uploaded.timestamp, uploaded.response))
  return parts

# Make the LLM request.
# The request is the prompt followed by a timestamp/upload pair for every uploaded frame.
request = make_request(prompt, uploaded_files)
# JSON output so the answer can be parsed with json.loads afterwards.
config = {
  "response_mime_type": "application/json",
}
# Request settings, passed as request_options below.
options = {
    "timeout": 600
}
response = model.generate_content(request,request_options=options, generation_config=config)
print(response.text)

[
    {"frame": "The Roman Empire began in 27 BC with the fall of the Roman Republic and the rise of Emperor Augustus.", "timestamp": "00:59"},
    {"frame": "The empire expanded significantly beyond the Italian Peninsula, conquering territories across Europe, Africa, and Asia.", "timestamp": "01:15"},
    {"frame": "The Julio-Claudian dynasty, starting with Augustus, ruled for several decades, followed by the Flavian dynasty and the Nerva-Antonine dynasty.", "timestamp": "01:10"},
    {"frame": "The Five Good Emperors, including Nerva, Trajan, Hadrian, Antoninus Pius, and Marcus Aurelius, marked a period of peace and prosperity.", "timestamp": "01:08"}, 
    {"frame": "The reign of Commodus, son of Marcus Aurelius, is often considered the beginning of the Roman Empire's decline.", "timestamp": "01:09"},
    {"frame": "The Crisis of the Third Century was a period of instability and turmoil, with numerous emperors being assassinated or overthrown. ", "timestamp": "02:53"},
    {"frame":

In [74]:
# gTTS is used by play_audio() for text-to-speech.
!pip install gtts

Collecting gtts
  Downloading gTTS-2.5.1-py3-none-any.whl (29 kB)
Installing collected packages: gtts
Successfully installed gtts-2.5.1


In [111]:
# mutagen is used by get_audio_duration() to read the length of the generated audio.
!pip install mutagen

Collecting mutagen
  Downloading mutagen-1.47.0-py3-none-any.whl (194 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/194.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m184.3/194.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.4/194.4 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mutagen
Successfully installed mutagen-1.47.0


In [112]:
from mutagen.mp3 import MP3
def get_audio_duration(file_path):
    """Return the length, in seconds, of the MP3 file at ``file_path``."""
    return MP3(file_path).info.length


In [192]:
from gtts import gTTS
from IPython.display import Audio
from IPython.display import display
import time

def play_audio(text: str):
  """Speak ``text`` with gTTS, play it inline and block until it has finished.

  The audio is saved to '1_<hash of text>.mp3' in the working directory.
  The name is derived from a hash rather than from the text itself, because
  a whole sentence can contain '/' or exceed the file-name length limit.

  Args:
    text: The sentence to synthesise and play.
  """
  import hashlib  # Local import: only needed to build a safe file name.

  # Use the text as-is. The previous `f"{text}".format(text)` raised on any
  # sentence containing '{' or '}'.
  spoken = str(text)
  tts = gTTS(spoken)
  digest = hashlib.sha1(spoken.encode('utf-8')).hexdigest()[:16]
  # gTTS writes MP3 data (get_audio_duration reads it as MP3), so name it accordingly.
  sound_file = f'1_{digest}.mp3'
  tts.save(sound_file)
  wn = Audio(sound_file, autoplay=True)
  display(wn)
  # Wait for the duration of the audio file
  time.sleep(get_audio_duration(sound_file)+0.2)

In [187]:
# Parse the frame-matching response; plot_video_book() reads the "frame" and "timestamp" keys of each entry.
timestamps = json.loads(response.text)

In [205]:
from IPython.display import Image, display, Audio, clear_output
import time

def plot_video_book():
  """Play the "video book": show each matched frame and read its sentence aloud.

  Prints a progress message twice (one second apart), then for every entry
  of the global ``timestamps`` list clears the cell output, displays the
  frame named by entry["timestamp"] and speaks entry["frame"].
  """
  for _ in range(2):
    print("Procesing content.......")
    time.sleep(1)
  for entry in timestamps:
    clear_output(wait=True)
    frame_id, frame_summary = entry["timestamp"], entry["frame"]
    # Show the picture first, then narrate it.
    display_image(frame_id)
    play_audio(frame_summary)

In [206]:
from ipywidgets import widgets, VBox, Output
from IPython.display import display

# Define a function to process user input
def process_input(text):
    """Run the video-book playback.

    ``text`` is accepted but not used; the function always returns None.
    """
    plot_video_book()
    return None

# Define a function to handle button click
def on_button_clicked(b):
    """Button callback: run process_input on the text box value inside ``out``.

    The output widget is cleared first; afterwards the value returned by
    process_input is printed as "Processed input: <value>".
    """
    with out:
        out.clear_output()
        result = process_input(text_input.value)
        print(f"Processed input: {result}")

# Build the small UI: a text box, a button that starts the playback, and an output area.

# Create a text input widget
text_input = widgets.Text(placeholder='PDF Name')

# Create a button widget
button = widgets.Button(description="Visualize the PDF!")

# Register the button click event
button.on_click(on_button_clicked)

# Create an output widget to display results
# NOTE(review): `out` is created after the handler is registered; this works
# because on_button_clicked only looks `out` up when the button is clicked.
out = Output()

# Display the text input, button, and output widget vertically
display(VBox([text_input, button, out]))

VBox(children=(Text(value='', placeholder='PDF Name'), Button(description='Visualize the PDF!', style=ButtonSt…