diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
new file mode 100644
index 0000000..017cc24
--- /dev/null
+++ b/.github/workflows/docs.yml
@@ -0,0 +1,49 @@
+name: Build and Deploy Documentation
+
+on:
+  push:
+    branches: ["master"]
+
+# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
+permissions:
+  contents: read
+  pages: write
+  id-token: write
+
+# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
+# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
+concurrency:
+  group: "pages"
+  cancel-in-progress: false
+
+jobs:
+  # Builds the Sphinx HTML site and uploads ./docs/_build/html/ as the Pages artifact.
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4  # current major, in line with the other actions pinned below
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+      - name: Install Python Packages
+        run: pip install sphinx && pip install -r docs/requirements.txt && pip install -r requirements.txt
+      - name: Build Docs
+        run: sphinx-build -M html ./docs ./docs/_build/
+      - name: Upload artifact
+        uses: actions/upload-pages-artifact@v3
+        with:
+          path: ./docs/_build/html/
+
+  # Publishes the artifact uploaded by the build job; runs only after build succeeds.
+  deploy:
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+    runs-on: ubuntu-latest
+    needs: build
+    steps:
+      - name: Deploy to GitHub Pages
+        id: deployment
+        uses: actions/deploy-pages@v4
diff --git a/.gitignore b/.gitignore
index 4697ddb..479c4d4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,12 +12,19 @@ ffmpeg.exe
ffprobe.exe
*.ttf
*.mp4
+# Sphinx
+docs/_build/
+docs/_autosummary/
+docs/_static/
+docs/doctrees/
+.vscode
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
+
# C extensions
*.so
diff --git a/.vscode/launch.json b/.vscode/launch.json
deleted file mode 100644
index b10f2f0..0000000
--- a/.vscode/launch.json
+++ /dev/null
@@ -1,16 +0,0 @@
-{
- // Use IntelliSense to learn about possible attributes.
- // Hover to view descriptions of existing attributes.
- // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
- "version": "0.2.0",
- "configurations": [
- {
- "name": "Python: Current File",
- "type": "debugpy",
- "request": "launch",
- "program": "create_sitcom.py",
- "console": "integratedTerminal",
- "justMyCode": true
- }
- ]
-}
\ No newline at end of file
diff --git a/README.md b/README.md
index 8c2c3e2..0faaf0d 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,17 @@
# Sitcom Simulator
A highly-customizable tool that automatically creates AI-generated meme videos
+## Update
+
+Sitcom Simulator's web app has moved to [https://sitcom-simulator.net](https://sitcom-simulator.net), instead of `.com` due to the old domain expiring and getting scalped 😅
+
`pip install sitcom-simulator`
-## Examples
+## Documentation
+
+[View the documentation](https://joshmoody24.github.io/sitcom-simulator/) for setup instructions and code samples.
+
+## Example Videos
@@ -48,7 +56,7 @@ Sitcom Simulator is designed to be simple to use, but also supports extreme cust
### Command Line
-The most basic usage is simply running the `sitcom-simulator` command with no arguments. Many optional arguments are supported as well. Note that you must [set your API key environment variables](#environment-variables) before it will work.
+The most basic usage is simply running the `sitcom-simulator` command with no arguments. Many optional arguments are supported as well. Note that you must [set your API key environment variables](https://joshmoody24.github.io/sitcom-simulator/installation.html#environment-variables) before it will work.
```bash
sitcom-simulator --prompt "Elon Musk teleports a toaster into the ocean" --style "beautiful renaissance oil painting"
@@ -67,92 +75,23 @@ create_sitcom(
)
```
-Power users can completely customize the video creation process:
-
-```python
-from sitcom_simulator import (
- script_from_file,
- add_voices,
- add_images,
- add_music,
- render_video,
-)
-
-def upload_to_s3(index, file_path):
- ... # arbitrary code
-
-initial_script = script_from_file("custom_script.toml")
-
-script_with_voices = add_voices(
- initial_script,
- engine="fakeyou",
- on_voice_generated=upload_to_s3)
-
-script_with_images = add_images(
- script_with_voices,
- engine="stability",
- on_image_generated=upload_to_s3)
-
-script_with_music = add_music(script_with_images)
-
-render_video(
- script=final_script,
- font="Papyrus",
- output_path=f"./{final_script.metadata.title}.mp4")
-```
-
-More documentation on the advanced features will be coming soon.
-
-## Getting Started
-
-Several things must be completed before running Sitcom Simulator for the first time.
-
-### Prerequisites
-- Python 3
-- [ffmpeg](https://ffmpeg.org/download.html) (see below for more details)
-- Stability API key (get one [here](https://beta.dreamstudio.ai/membership?tab=apiKeys))
-- OpenAI API key (get one [here](https://openai.com/api/))
-
-#### FFmpeg
-
-The ffmpeg command must be accessible on your machine. This will vary depending on your system, but you can install it from the [official download page](https://ffmpeg.org/download.html) or various package managers, e.g., `apt install ffmpeg` on Debian/Ubuntu, `brew install ffmpeg` on Mac, etc.
-
-Alternatively, instead of installing ffmpeg on your system, you can place the `ffmpeg` and `ffprobe` binaries in your project's root directory, which will work equally well.
-
-### Environment Variables
-
-This package requires API keys from OpenAI and Stability AI to be stored in environment variables.
-
-First, acquire API keys for OpenAI and Stability AI (see [prerequisites](#prerequisites))
-
-How you set the environment variables will depend on your use case:
-
-#### Comamnd Line
-
-Set the environments in the terminal, i.e., `export OPENAI_API_KEY=` (Linux) `set OPENAI_API_KEY=` (Windows)
-
-#### Python Projects
-
-Create a `.env` file in your project's root directory, with the following structure:
-
-```bash
-STABILITY_API_KEY='your_key_here'
-OPENAI_API_KEY='your_key_here'
-```
-
-The `.env` file will be automatically detected by the program.
-
## How it Works
-Sitcom Simulator is essentially duct tape that combines multiple different AI tools into one unholy abomination.
+Sitcom Simulator is essentially duct tape that combines various AI tools into one unholy abomination.
1. [ChatGPT](https://chat.openai.com/) generates the video script
2. [FakeYou](https://fakeyou.com) generates voices for the characters
-3. [Stable Diffusion](https://stability.ai/stable-image) generates images for the characters
+3. [Stable Diffusion](https://stability.ai/stable-image) generates images of the characters
4. [Freepd](https://freepd.com/) provides the background music
5. [FFmpeg](https://ffmpeg.org/) connects the images and voices into a movie
## Contributions
-Want to help work on this project? I'm down! Feel free to reach out to me if you want to contribute or have any questions :)
+Want to help work on this project? I'm down! [Contact me](https://joshmoody.org/contact/) if you want to contribute or have any questions :)
+
+Have fun!!!
+
+## Links
-Have fun!!!
\ No newline at end of file
+- [Documentation](https://joshmoody24.github.io/sitcom-simulator/)
+- [sitcom-simulator.net](https://sitcom-simulator.net)
+ - Formerly `sitcom-simulator.com`
diff --git a/config.toml b/config.toml
deleted file mode 100644
index ee203bc..0000000
--- a/config.toml
+++ /dev/null
@@ -1,9 +0,0 @@
-# video font
-font = "Arial"
-
-# how long to wait between starting FakeYou voice jobs
-# (anything shorter than 30 seconds is likely to get rate limited and crash the app)
-job_delay = 30 # this setting currently does nothing, working on it
-
-# how long to wait between polling pending FakeYou voice jobsd
-poll_delay = 10 # this setting currently does nothing, working on it
\ No newline at end of file
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..d4bb2cb
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS ?=
+SPHINXBUILD ?= sphinx-build
+SOURCEDIR = .
+BUILDDIR = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/api.rst b/docs/api.rst
new file mode 100644
index 0000000..4ecc569
--- /dev/null
+++ b/docs/api.rst
@@ -0,0 +1,8 @@
+API
+===
+
+.. autosummary::
+ :toctree: _autosummary
+ :recursive:
+
+ sitcom_simulator
\ No newline at end of file
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..4214d7a
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,69 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# For the full list of built-in configuration values, see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Project information -----------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
+
+import toml
+import os
+import sys
+sys.path.insert(0, os.path.abspath('..')) # enable importing sitcom_simulator
+# Name, author and version are read from ../pyproject.toml (path is relative to the working directory).
+pyproject = toml.load('../pyproject.toml')
+
+project = pyproject['project']['name']
+author = pyproject['project']['authors'][0]['name']
+copyright = f'2024, {author}'
+version = pyproject['project']['version']
+release = version
+
+# -- General configuration ---------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
+
+extensions = [
+ 'sphinx.ext.duration',
+ 'sphinx.ext.doctest',
+ 'sphinx.ext.autodoc',
+ 'sphinx.ext.autosummary',
+ 'sphinx_autodoc_typehints',
+ ]
+
+autosummary_generate = True
+autodoc_typehints = "description" # description, signature, none
+autodoc_typehints_format = "short"
+autodoc_default_options = {
+ 'members': True,
+ 'member-order': 'bysource',
+ 'special-members': '__init__',
+ 'undoc-members': True,
+ 'exclude-members': '__weakref__',
+ # TODO: make function params alphabetical (no option in this dict does that yet)
+}
+
+
+templates_path = ['_templates']
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+
+
+# -- Options for HTML output -------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
+
+html_theme = 'furo' # 'alabaster'
+html_static_path = ['_static']  # NOTE(review): docs/_static/ is git-ignored; confirm it exists on a fresh checkout
+# The logo file sits next to this conf.py in docs/; the options below override furo's brand colors per color scheme.
+html_logo = "sitcom-simulator-logo.png"
+html_theme_options = {
+ "light_css_variables": {
+ "color-brand-primary": "green",
+ "color-brand-content": "green",
+ "color-admonition-background": "green",
+ },
+ "dark_css_variables": {
+ "color-brand-primary": "springgreen",
+ "color-brand-content": "springgreen",
+ "color-admonition-background": "green",
+ },
+}
\ No newline at end of file
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..88338eb
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,36 @@
+.. Sitcom Simulator documentation master file, created by
+ sphinx-quickstart on Sat Feb 10 23:00:30 2024.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+Sitcom Simulator
+============================================
+
+**Sitcom Simulator** is a highly-customizable tool for automatically creating AI-generated meme videos. It combines numerous generative AI tools like `ChatGPT <https://chat.openai.com/>`_, `Stable Diffusion <https://stability.ai/stable-image>`_, and `FakeYou <https://fakeyou.com>`_ to create short, funny videos in many styles.
+
+.. toctree::
+ :maxdepth: 2
+ :caption: Getting Started
+
+ overview
+ installation
+ usage
+
+.. toctree::
+ :maxdepth: 2
+ :caption: Reference
+
+ llm_instructions
+ api
+
+Index
+----------------
+
+* :ref:`genindex`
+* :ref:`modindex`
+
+External Links
+----------------
+
+* `GitHub <https://github.com/joshmoody24/sitcom-simulator>`_
+* `PyPI <https://pypi.org/project/sitcom-simulator/>`_
\ No newline at end of file
diff --git a/docs/installation.rst b/docs/installation.rst
new file mode 100644
index 0000000..f112bfa
--- /dev/null
+++ b/docs/installation.rst
@@ -0,0 +1,51 @@
+Installation
+=============
+
+``pip install sitcom-simulator``
+
+Dependencies
+------------
+
+You will need the following dependencies before running Sitcom Simulator for the first time:
+
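+* `Python 3.11 <https://www.python.org/downloads/>`_ or later
+* `FFmpeg <https://ffmpeg.org/download.html>`_
+* `Stability API key <https://beta.dreamstudio.ai/membership?tab=apiKeys>`_
+* `OpenAI API key <https://openai.com/api/>`_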
+
+FFmpeg
+^^^^^^
+
+The ``ffmpeg`` command must be accessible on your machine. This will vary depending on your system, but you can install it from the `official FFmpeg download page <https://ffmpeg.org/download.html>`_ or various package managers, e.g., ``apt install ffmpeg`` on Debian/Ubuntu, ``brew install ffmpeg`` on Mac, etc.
+
+Alternatively, instead of installing ffmpeg on your system, you can place the ``ffmpeg`` and ``ffprobe`` binaries in your project's root directory, which will work equally well.
+
+Environment Variables
+---------------------
+
+This package requires API keys from OpenAI and Stability AI to be stored in environment variables.
+
+How you set the environment variables will depend on your use case, as explained below.
+
+Command Line
+^^^^^^^^^^^^
+
+Set the environment variables in the terminal:
+
+Linux: ``export OPENAI_API_KEY=``
+
+Windows: ``set OPENAI_API_KEY=``
+
+Python Projects
+^^^^^^^^^^^^^^^
+
+Create a ``.env`` file in your project's root directory, with the following structure:
+
+.. code-block:: bash
+
+ STABILITY_API_KEY='your_key_here'
+ OPENAI_API_KEY='your_key_here'
+
+The ``.env`` file will be automatically detected by the program.
+
+You're ready to make your first meme video!
diff --git a/docs/llm_instructions.rst b/docs/llm_instructions.rst
new file mode 100644
index 0000000..c8415ca
--- /dev/null
+++ b/docs/llm_instructions.rst
@@ -0,0 +1,25 @@
+LLM Instructions
+========================
+
+Sitcom Simulator uses large language models (LLMs) in several places.
+The default prompts are shown below.
+They are overridable.
+They are subject to change frequently as improvements are discovered.
+
+Script Writing
+--------------
+
+Parameters: ``characters``, ``music_categories``, ``prompt``
+
+.. literalinclude:: ..//sitcom_simulator/script/llm_instructions.txt
+ :language: text
+ :caption: llm_instructions.txt
+
+Character Extraction
+--------------------
+
+Parameters: ``prompt``
+
+.. literalinclude:: ..//sitcom_simulator/script/integrations/fakeyou/character_extraction_instructions.txt
+ :language: text
+ :caption: character_extraction_instructions.txt
\ No newline at end of file
diff --git a/docs/make.bat b/docs/make.bat
new file mode 100644
index 0000000..954237b
--- /dev/null
+++ b/docs/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+REM Work from this script's own directory; the popd at the end undoes the pushd below.
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+REM SPHINXBUILD falls back to "sphinx-build" when the variable is empty.
+if "%SPHINXBUILD%" == "" (
+ set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=.
+set BUILDDIR=_build
+REM Probe the command once with all output discarded; errorlevel 9009 is reported below as "not found".
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+ echo.
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+ echo.installed, then set the SPHINXBUILD environment variable to point
+ echo.to the full path of the 'sphinx-build' executable. Alternatively you
+ echo.may add the Sphinx directory to PATH.
+ echo.
+ echo.If you don't have Sphinx installed, grab it from
+ echo.https://www.sphinx-doc.org/
+ exit /b 1
+)
+REM No argument: jump to :help. Otherwise the first argument is passed to sphinx-build -M.
+if "%1" == "" goto help
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/docs/overview.rst b/docs/overview.rst
new file mode 100644
index 0000000..e2bed5c
--- /dev/null
+++ b/docs/overview.rst
@@ -0,0 +1,30 @@
+Overview
+================
+
+What is Sitcom Simulator?
+-----------------------------
+
+Sitcom Simulator is a tool for auto-generating meme videos from text prompts.
+The user enters a prompt, say, ``Mario and Luigi summon a demon``,
+and the program generates a short video on that topic.
+
+Sitcom Simulator's design is focused on the following goals:
+
+* **Ease of use**: The user should be able to generate a video with minimal effort.
+* **Customization**: The user should be able to customize the video extensively.
+* **Quality**: The user should be able to generate a video that is at least somewhat entertaining.
+* **Speed**: The user should be able to generate a video within a few minutes.
+* **Cost-effectiveness**: The user should be able to generate a video for pennies at most.
+
+How does it work?
+-----------------------------
+
+Sitcom Simulator is essentially duct tape that combines various AI tools into one unholy abomination.
+
+#. `ChatGPT <https://chat.openai.com/>`_ generates the video script
+#. `FakeYou <https://fakeyou.com>`_ generates voices for the characters
+#. `Stable Diffusion <https://stability.ai/stable-image>`_ generates images of the characters
+#. `Freepd <https://freepd.com/>`_ provides the background music
+#. `FFmpeg <https://ffmpeg.org/>`_ connects the images and voices into a movie
+
+Sitcom Simulator is available as a command line tool or as a Python module. Continue following the documentation to learn how to install and use it.
\ No newline at end of file
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100644
index 0000000..8e8fd36
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1,2 @@
+furo
+sphinx-autodoc-typehints
\ No newline at end of file
diff --git a/docs/sitcom-simulator-logo.png b/docs/sitcom-simulator-logo.png
new file mode 100644
index 0000000..f1bfc39
Binary files /dev/null and b/docs/sitcom-simulator-logo.png differ
diff --git a/docs/usage.rst b/docs/usage.rst
new file mode 100644
index 0000000..6d306de
--- /dev/null
+++ b/docs/usage.rst
@@ -0,0 +1,63 @@
+Usage
+=====
+
+Sitcom Simulator is designed to be simple to use, but also supports extreme customizability for power users who know exactly what they want. Sitcom Simulator can be used from the command line or can be imported in Python scripts.
+
+Command Line
+------------
+
+.. code-block:: bash
+
+ sitcom-simulator --prompt "Elon Musk teleports a toaster into the ocean" --style "beautiful renaissance oil painting"
+
+Python
+------
+
+Sitcom Simulator can also be imported in Python scripts:
+
+.. code-block:: python
+
+ from sitcom_simulator import create_sitcom
+
+ # generate a short meme video and save it in the project folder
+ create_sitcom(
+ prompt="Mario hits Luigi with a stapler",
+ )
+
+Power users can fully customize the video creation process:
+
+.. code-block:: python
+
+ from sitcom_simulator import (
+ script_from_file,
+ add_voices,
+ add_images,
+ add_music,
+ render_video,
+ )
+
+ def upload_to_s3(index, file_path):
+ ... # arbitrary code
+
+ initial_script = script_from_file("custom_script.toml")
+
+ script_with_voices = add_voices(
+ initial_script,
+ engine="fakeyou",
+ on_voice_generated=upload_to_s3)
+
+ script_with_images = add_images(
+ script_with_voices,
+ engine="stability",
+ on_image_generated=upload_to_s3)
+
+ script_with_music = add_music(script_with_images)
+
+ render_video(
+ script=script_with_music,
+ font="Papyrus",
+ output_path=f"./{script_with_music.metadata.title}.mp4")
+
+Now you know how to use Sitcom Simulator!
+
+Enjoy making terrible meme videos 🐢
\ No newline at end of file
diff --git a/example.env b/example.env
index 24addf0..ded40f5 100644
--- a/example.env
+++ b/example.env
@@ -1,3 +1,7 @@
# copy this file to '.env' and replace the values with your personal API keys
STABILITY_API_KEY='your_key_here'
-OPENAI_API_KEY='your_key_here'
\ No newline at end of file
+OPENAI_API_KEY='your_key_here'
+
+# optional, but speeds up voice generation
+FAKEYOU_USERNAME='your_username_or_email_here'
+FAKEYOU_PASSWORD='your_password_here'
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index eebd0f9..b86844a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,10 +1,10 @@
[project]
name = "sitcom-simulator"
-version = "0.0.7"
+version = "0.6.2"
authors = [
{ name = "Josh Moody", email = "josh@joshmoody.org" },
]
-description = "A highly-cusotmizable tool that automatically creates AI-generated meme videos"
+description = "A highly-customizable tool for automatically creating AI-generated meme videos"
readme = "README.md"
license = {file = "LICENSE"}
keywords = ["ai", "video", "meme", "generator", "sitcom", "simulator"]
@@ -14,30 +14,34 @@ classifiers = [
"Operating System :: OS Independent",
]
dependencies = [
- "stability-sdk==0.8.4",
- "python-dotenv==1.0.0",
- "tqdm==4.66.1",
- "openai==0.28.0",
- "fakeyou==1.2.5",
- "image==1.5.33",
- "ffmpeg-python==0.2.0",
- "gtts==2.3.2",
- "moviepy==1.0.3",
- "Pillow==10.0.1",
- "beautifulsoup4==4.12.2",
- "requests==2.31.0",
- "mypy==1.8.0",
+ "stability-sdk~=0.8.5",
+ "python-dotenv~=1.0.0",
+ "tqdm~=4.66.1",
+ "openai~=0.28.0",
+ "ffmpeg-python~=0.2.0",
+ "gtts~=2.5.1",
+ "Pillow~=10.0.1",
+ "beautifulsoup4~=4.12.2",
+ "requests~=2.31.0",
+ "thefuzz~=0.22.1",
+ "toml",
+ # "fakeyou==1.2.5", Currently using raw HTTP requests instead
+ # "moviepy==1.0.3", No longer supported due to lack of features. Using ffmpeg-python instead
]
-requires-python = ">= 3.9"
+requires-python = ">=3.11,<3.13"
[project.optional-dependencies]
dev = [
"mypy",
+ "sphinx",
+ "furo",
+ "sphinx-autodoc-typehints",
+ "mypy",
]
[project.urls]
homepage = "https://github.com/joshmoody24/sitcom-simulator"
-documentation = "https://github.com/joshmoody24/sitcom-simulator"
+documentation = "https://joshmoody24.github.io/sitcom-simulator/"
repository = "https://github.com/joshmoody24/sitcom-simulator"
[build-system]
@@ -48,4 +52,4 @@ build-backend = "hatchling.build"
include = ["sitcom_simulator/**/*"]
[project.scripts]
-sitcom-simulator = "sitcom_simulator.cli:main"
\ No newline at end of file
+sitcom-simulator = "sitcom_simulator.cli:main"
diff --git a/requirements.txt b/requirements.txt
index 7decee4..bde37e3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,13 +1,12 @@
-stability-sdk==0.8.4
-python-dotenv==1.0.0
-tqdm==4.66.1
-openai==0.28.0
-fakeyou==1.2.5
-image==1.5.33
-ffmpeg-python==0.2.0
-gtts==2.3.2
-moviepy==1.0.3
-Pillow==10.0.1
-beautifulsoup4==4.12.2
-requests==2.31.0
-mypy==1.8.0
\ No newline at end of file
+stability-sdk~=0.8.5
+python-dotenv~=1.0.0
+tqdm~=4.66.1
+openai~=0.28.0
+ffmpeg-python~=0.2.0
+gtts~=2.5.1
+Pillow~=10.0.1
+beautifulsoup4~=4.12.2
+requests~=2.31.0
+mypy~=1.8.0
+thefuzz~=0.22.1
+toml
\ No newline at end of file
diff --git a/scripts/example_basic.toml b/scripts/example_basic.toml
new file mode 100644
index 0000000..50951f2
--- /dev/null
+++ b/scripts/example_basic.toml
@@ -0,0 +1,14 @@
+[[characters]]
+name = "Mario"
+voice_token = "TM:6c6d3a8tavv6"
+
+[[clips]]
+speaker = "Mario"
+speech = "Luigi, I'm hungry. Do you mind if I have a bite of your soul?"
+image_prompt = "Mario with red cap and mustache attempting to eat Luigi's soul, horror"
+image_path = "C:\\Users\\joshm\\Pictures\\fb photo.jpg"
+
+[metadata]
+title = "A Soulful Snack Encounter"
+bgm_style = "upbeat"
+art_style = "cinematic bokeh blur"
\ No newline at end of file
diff --git a/scripts/example_mario.toml b/scripts/example_mario.toml
index b7b3570..ea93e68 100644
--- a/scripts/example_mario.toml
+++ b/scripts/example_mario.toml
@@ -1,3 +1,15 @@
+[[characters]]
+name = "Mario"
+voice_token = "TM:6c6d3a8tavv6"
+
+[[characters]]
+name = "Luigi"
+voice_token = "TM:fp4fcyja6mk1"
+
+[[characters]]
+name = "Narrator"
+voice_token = "TM:xrk8qhm6cb6r"
+
[[clips]]
speaker = "Mario"
speech = "Luigi, I'm hungry. Do you mind if I have a bite of your soul?"
@@ -21,16 +33,4 @@ image_prompt = "Luigi offering a truce, Mario looking contemplative, comedy"
[metadata]
title = "A Soulful Snack Encounter"
bgm_style = "upbeat"
-art_style = "cinematic bokeh blur"
-
-[[characters]]
-name = "Mario"
-voice_token = "TM:6c6d3a8tavv6"
-
-[[characters]]
-name = "Luigi"
-voice_token = "TM:fp4fcyja6mk1"
-
-[[characters]]
-name = "Narrator"
-voice_token = "TM:xrk8qhm6cb6r"
+art_style = "cinematic bokeh blur"
\ No newline at end of file
diff --git a/sitcom_simulator/__init__.py b/sitcom_simulator/__init__.py
index bef9ab9..954fe5b 100644
--- a/sitcom_simulator/__init__.py
+++ b/sitcom_simulator/__init__.py
@@ -1,9 +1,9 @@
from dotenv import load_dotenv
load_dotenv()
-from .script_generator import write_script, script_from_file
-from .speech_generator import add_voices, generate_voices
-from .image_generator import add_images, generate_images
-from .video_generator import render_video
-from .music_generator import add_music, generate_music
-from .sitcom_creator import create_sitcom
\ No newline at end of file
+from .script import write_script, script_from_file
+from .speech import add_voices, generate_voices
+from .image import add_images, generate_images
+from .video import render_video
+from .music import add_music, generate_music
+from .auto import create_sitcom
\ No newline at end of file
diff --git a/sitcom_simulator/auto.py b/sitcom_simulator/auto.py
new file mode 100644
index 0000000..b575c8e
--- /dev/null
+++ b/sitcom_simulator/auto.py
@@ -0,0 +1,139 @@
+from typing import Literal
+
+def create_sitcom(
+ prompt:str | None = None,
+ art_style:str | None = None,
+ script_path:str | None = None,
+ debug_images:bool=False,
+ debug_audio:bool=False,
+ font:str = 'Arial',
+ max_tokens:int=2048,
+ approve_script:bool=False,
+ manual_select_characters:bool=True,
+ upload_to_yt=False,
+ audio_job_delay:int=30,
+ audio_poll_delay:int=10,
+ caption_bg_style:Literal['box_shadow', 'text_shadow', 'none']='text_shadow',
+ save_script:bool=False,
+ speed:float=1,
+ pan_and_zoom:bool=True,
+ orientation:Literal["landscape", "portrait", "square"]="portrait",
+ resolution:int=1080,
+ narrator_dropout:bool=False,
+ music_url:str|None=None,
+ audio_codec:Literal['mp3', 'aac']='mp3',
+):
+ """
+ Generates a sitcom video based on a prompt or a script file.
+ It combines the script generation, voice generation, image generation, music generation, and video rendering steps into a single function.
+
+ :param prompt: The prompt to generate the video script. If not provided, a script path must be provided.
+ :param art_style: The art style to use for the video. If not provided, the art style will be selected by the language model.
+ :param script_path: The path to a TOML script file to use for the video. The TOML must map to the Script model. If not provided, a prompt must be provided.
+ :param debug: If True, the video will be generated using the debug mode, which uses the GTTS and Pillow engines instead of the FakeYou and Stability engines to increase speed and reduce costs.
+ :param font: The font to use for the video. This font must be installed on the system.
+ :param max_tokens: The maximum number of tokens to use for the language model. This will affect the length of the generated script.
+ :param approve_script: If True, the script must be approved by the user before generating the video.
+ :param manual_select_characters: If True, the user will be prompted to select the characters for the video. If False, the characters will be selected automatically by the language model.
+ :param upload_to_yt: If True, the video will be uploaded to YouTube after it is generated. NOTE: currently does not work.
+ :param audio_job_delay: The number of seconds to wait between starting audio generation jobs. Lower values render faster but are more likely to get rate limited. (FakeYou only)
+ :param audio_poll_delay: The number of seconds to wait between polling for audio generation job completion. (FakeYou only)
+ :param caption_bg_style: The style of the background behind the captions.
+ :param save_script: If True, the generated script will be saved to a file.
+ :param speed: The speed of the final video. 1.0 is normal speed.
+ :param pan_and_zoom: If True, the pan and zoom effect on images will be enabled.
+ :param orientation: The orientation of the video. "landscape", "portrait", or "square".
+ :param resolution: The width of the video to render assuming portrait mode. This takes into account the orientation parameter.
+ :param narrator_dropout: If True, the narrator will be forcibly removed from the script (ChatGPT often goes heavy on the narrators).
+ :param music_url: A URL to a music track to use for the video.
+ :param audio_codec: The audio codec to use for the video. mp3 seems to be more compatible with more video players, but aac is higher quality and is necessary for viewing videos in an iPhone browser.
+ """
+ from .models import VideoResult
+ from .script import write_script
+ from .speech import add_voices
+ from .image import add_images
+ from .music import add_music
+ from .video import render_video
+ from .script import script_from_file
+ from .social.yt_uploader import upload_to_yt
+
+ if(prompt == None and script_path == None):
+ prompt = input("Enter a prompt to generate the video script: ")
+
+ assert prompt or script_path, "You must provide a prompt or a script path"
+ assert orientation in ["landscape", "portrait", "square"], "Orientation must be 'landscape', 'portrait', or 'square'"
+
+ if prompt and not script_path:
+ initial_script = write_script(
+ prompt=prompt,
+ manual_character_selection=manual_select_characters,
+ max_tokens=max_tokens,
+ require_approval=approve_script,
+ fakeyou_characters=not debug_audio,
+ narrator_dropout=narrator_dropout,
+ )
+ elif script_path and not prompt:
+ initial_script = script_from_file(script_path)
+ else:
+ raise ValueError("You must provide a prompt or a script path, not both")
+
+ if art_style:
+ initial_script = initial_script.replace(metadata=initial_script.metadata.replace(art_style=art_style))
+
+ script_with_voices = add_voices(
+ initial_script,
+ engine="fakeyou" if not debug_audio else "gtts",
+ fakeyou_job_delay=audio_job_delay,
+ fakeyou_poll_delay=audio_poll_delay,
+ )
+
+ # image gen could theoretically be done in parallel with the audio
+ script_with_images = add_images(
+ script_with_voices,
+ engine="stability" if not debug_images else "pillow",
+ orientation=orientation,
+ )
+
+ script_with_music = add_music(
+ script=script_with_images,
+ music_url=music_url,
+ )
+
+ final_script = script_with_music
+
+ filename = final_script.metadata.title[:50].strip() or 'render' if final_script.metadata.title else 'render'
+ output_path = f"./{filename}.mp4"
+
+ final_video_path = render_video(
+ script=final_script,
+ font=font,
+ output_path=output_path,
+ caption_bg_style=caption_bg_style,
+ resolution=resolution,
+ orientation=orientation,
+ speed=speed,
+ pan_and_zoom=pan_and_zoom,
+ audio_codec=audio_codec,
+ )
+
+ result = VideoResult(
+ path=final_video_path,
+ title=final_script.metadata.title if final_script.metadata.title else filename,
+ description=prompt or 'an AI-generated meme video created with Sitcom Simulator'
+ )
+
+ print(f"Video generated at {final_video_path}")
+
+ if save_script:
+ import toml
+ from dataclasses import asdict
+ with open(f"./{filename}.toml", 'w') as f:
+ f.write(toml.dumps(asdict(final_script)))
+ print(f"Script saved at ./{filename}.toml")
+
+ # if upload_to_yt:
+ # title = prompt
+ # keywords = [word for word in prompt.split(' ') if len(word) > 3] if prompt else ["sitcom", "funny", "comedy", "ai", "deepfake"]
+ # upload_to_yt(result.path, result.title, result.description, keywords, "24", "public")
+
+ return result
\ No newline at end of file
diff --git a/sitcom_simulator/cli.py b/sitcom_simulator/cli.py
index c5b99bd..b28d9a3 100644
--- a/sitcom_simulator/cli.py
+++ b/sitcom_simulator/cli.py
@@ -1,8 +1,7 @@
-from .sitcom_creator import create_sitcom
+from .auto import create_sitcom
import argparse
-import tomllib
-def parse_args():
+def _parse_args():
parser = argparse.ArgumentParser(
prog = "Sitcom Simulator",
description = "A tool that creates bad sitcoms using AI tools",
@@ -17,28 +16,51 @@ def parse_args():
parser.add_argument('-u', '--upload', action="store_true", help="upload the generated video to YouTube")
parser.add_argument('-m', '--manual-select-characters', action="store_true", help="manually select characters instead of using the AI to select them")
parser.add_argument('-d', '--debug', action='store_true', help="skip expensive API calls, generating robotic TTS and blank images instead.")
-
+ parser.add_argument('--debug-images', action='store_true', help="skip expensive image generation API calls, generating blank images instead.")
+ parser.add_argument('--debug-audio', action='store_true', help="skip slow voice generation API calls, generating robotic TTS instead.")
+ parser.add_argument('--font', type=str, help="the font to use for the video", default='Arial')
+ parser.add_argument('--audio-job-delay', type=int, default=30, help="the number of seconds to wait between starting audio generation jobs. Lower values render faster but are more likely to get rate limited")
+ parser.add_argument('--audio-poll-delay', type=int, default=10, help="the number of seconds to wait between polling for audio generation job completion")
+ parser.add_argument('--box-shadow', action='store_true', help="use box background for captions instead of text shadow")
+ parser.add_argument('--save-script', action='store_true', help="save the generated script to a file")
+ parser.add_argument('--speed', type=float, default=1, help="speed up the final video by this factor (1.0 is normal speed)")
+ parser.add_argument('--no-pan-and-zoom', action='store_true', help="disable pan and zoom effect on images")
+ parser.add_argument('--resolution', type=int, default=1080, help="the resolution of the video (passing in 1080 means 1080p)")
+ parser.add_argument('--orientation', type=str, default='portrait', help="the orientation of the video (landscape, portrait, or square)")
+ parser.add_argument('--no-narrators', action='store_true', help="disable narrator characters")
+ parser.add_argument('--music-url', type=str, help="a URL to a music track to use for the video")
+ parser.add_argument('--audio-codec', type=str, help="the audio codec to use for the video: mp3 or aac", default='mp3')
args = parser.parse_args()
return args
def main():
+ """
+ The main entry point for the CLI, invoked when the module is run as a script.
+
+ Prints a banner, parses the command-line arguments, and forwards them to create_sitcom.
+ The --debug flag turns on both debug_images and debug_audio.
+ --box-shadow selects the "box_shadow" caption style instead of "text_shadow", and --no-pan-and-zoom is inverted into pan_and_zoom.
+ """
print("\nSitcom Simulator\nBy Josh Moody\n")
-
- try:
- with open("config.toml", "rb") as f:
- config = tomllib.load(f)
- except FileNotFoundError:
- # no big deal
- config = {}
- args = parse_args()
+ args = _parse_args()
# do the magic
create_sitcom(
prompt=args.prompt,
art_style=args.style,
script_path=args.script_path,
- debug=args.debug,
- font=config.get("font", 'Arial'),
+ debug_images=args.debug_images or args.debug,
+ debug_audio=args.debug_audio or args.debug,
+ font=args.font,
manual_select_characters=args.manual_select_characters,
max_tokens=args.max_tokens,
+ approve_script=args.approve_script,
+ upload_to_yt=args.upload,
+ audio_job_delay=args.audio_job_delay,
+ audio_poll_delay=args.audio_poll_delay,
+ caption_bg_style="box_shadow" if args.box_shadow else "text_shadow",
+ save_script=args.save_script,
+ speed=args.speed,
+ pan_and_zoom=not args.no_pan_and_zoom,
+ orientation=args.orientation,
+ resolution=args.resolution,
+ narrator_dropout=args.no_narrators,
+ music_url=args.music_url,
+ audio_codec=args.audio_codec,
)
\ No newline at end of file
diff --git a/sitcom_simulator/image_generator/__init__.py b/sitcom_simulator/image/__init__.py
similarity index 100%
rename from sitcom_simulator/image_generator/__init__.py
rename to sitcom_simulator/image/__init__.py
diff --git a/sitcom_simulator/image/image_generator.py b/sitcom_simulator/image/image_generator.py
new file mode 100644
index 0000000..86d86a2
--- /dev/null
+++ b/sitcom_simulator/image/image_generator.py
@@ -0,0 +1,77 @@
+from tqdm import tqdm
+from typing import List, Optional, Callable, Literal
+from sitcom_simulator.models import Script
+import os
+import atexit
+
+Engine = Literal["stability", "pillow"]
+Orientation = Literal["landscape", "portrait", "square"]
+
+def generate_images(
+        script: Script,
+        orientation:Orientation="portrait",
+        on_image_generated: Optional[Callable[[int, str], None]] = None,
+        engine:Engine="stability",
+    ) -> List[str | None]:
+    """
+    Generates and returns a list of image paths for the given script, one entry per clip.
+
+    More procedural in nature than add_images.
+
+    A clip that already has an image_path keeps it. A clip with neither an image_path nor an image_prompt gets None.
+    Every other clip gets a freshly generated image.
+
+    :param script: The script to generate images for
+    :param orientation: The orientation of the images to generate
+    :param on_image_generated: A callback to call after each image is generated which takes the clip index and path to the generated image. It is not called for clips that are skipped.
+    :param engine: The engine to use for generating images. Anything other than "stability" uses the pillow debug engine.
+
+    :return: A list of image paths (or None) in the same order as script.clips
+    :raises KeyError: If orientation is not "landscape", "portrait", or "square"
+    """
+    width, height = {
+        "landscape": (1344, 768),
+        "portrait": (768, 1344),
+        "square": (1024, 1024),
+    }[orientation]
+    from .integrations import stability, pillow
+    image_paths: List[str | None] = []
+    for i, clip in tqdm(enumerate(script.clips), desc="Generating images", total=len(script.clips)):
+        # An existing image wins even when the clip has no prompt. Checking the prompt
+        # first would report None for such a clip and add_images would erase its path.
+        if clip.image_path:
+            image_paths.append(clip.image_path)
+            continue
+        image_prompt = clip.image_prompt
+        if not image_prompt:
+            image_paths.append(None)
+            continue
+        if engine == "stability":
+            full_prompt = f'{image_prompt}{", " + script.metadata.art_style if script.metadata.art_style else ""}'
+            image_path = stability.generate_image(prompt=full_prompt, width=width, height=height)
+        else: # debug engine
+            image_path = pillow.generate_image(width, height)
+        # NOTE(review): the flattened source does not show whether this cleanup belongs to the
+        # debug branch only or to both engines; it is assumed to cover both. Confirm.
+        # Skip it when no path came back, so os.remove is never called with None at exit.
+        if image_path:
+            atexit.register(os.remove, image_path)
+        image_paths.append(image_path)
+        if on_image_generated:
+            on_image_generated(i, image_path)
+
+    return image_paths
+
+def add_images(
+        script: Script,
+        orientation:Orientation="portrait",
+        on_image_generated: Optional[Callable[[int, str], None]] = None,
+        engine:Engine="stability",
+    ) -> Script:
+    """
+    Returns a copy of the script whose clips carry image paths and whose metadata records the orientation.
+
+    More functional in nature than generate_images, which does the actual work.
+
+    :param script: The script to add images to
+    :param orientation: The orientation of the images to generate
+    :param on_image_generated: A callback to call after each image is generated which takes the clip index and path to the generated image
+    :param engine: The engine to use for generating images
+
+    :return: A new Script built with replace()
+    """
+    generated_paths = generate_images(
+        script=script,
+        orientation=orientation,
+        on_image_generated=on_image_generated,
+        engine=engine,
+    )
+    # pair each clip with the path produced for it
+    updated_clips = []
+    for clip, path in zip(script.clips, generated_paths):
+        updated_clips.append(clip.replace(image_path=path))
+    updated_metadata = script.metadata.replace(orientation=orientation)
+    return script.replace(clips=updated_clips, metadata=updated_metadata)
\ No newline at end of file
diff --git a/sitcom_simulator/script_generator/integrations/chatgpt/__init__.py b/sitcom_simulator/image/integrations/__init__.py
similarity index 100%
rename from sitcom_simulator/script_generator/integrations/chatgpt/__init__.py
rename to sitcom_simulator/image/integrations/__init__.py
diff --git a/sitcom_simulator/image_generator/integrations/pillow.py b/sitcom_simulator/image/integrations/pillow.py
similarity index 62%
rename from sitcom_simulator/image_generator/integrations/pillow.py
rename to sitcom_simulator/image/integrations/pillow.py
index aa94015..5edb1fa 100644
--- a/sitcom_simulator/image_generator/integrations/pillow.py
+++ b/sitcom_simulator/image/integrations/pillow.py
@@ -1,8 +1,15 @@
-from PIL import Image
import random
import tempfile
def generate_image(width:int=720, height:int=1280):
+ """
+ Generates a random solid-color image and returns the path to the image file.
+ Intended for use in debugging and testing.
+
+ :param width: The width of the image to generate
+ :param height: The height of the image to generate
+ """
+ from PIL import Image
# Generate a random color
color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
diff --git a/sitcom_simulator/image_generator/integrations/stability.py b/sitcom_simulator/image/integrations/stability.py
similarity index 68%
rename from sitcom_simulator/image_generator/integrations/stability.py
rename to sitcom_simulator/image/integrations/stability.py
index f4e3ab8..0d50c7a 100644
--- a/sitcom_simulator/image_generator/integrations/stability.py
+++ b/sitcom_simulator/image/integrations/stability.py
@@ -1,5 +1,3 @@
-from stability_sdk.client import StabilityInference, process_artifacts_from_answers
-import stability_sdk.interfaces.gooseai.generation.generation_pb2 as generation
import tempfile
import mimetypes
import os
@@ -7,14 +5,21 @@
STABILITY_HOST = "grpc.stability.ai:443"
-def generate_image(prompt: str, width=1024, height=1024):
+def generate_image(prompt:str, width:int=1024, height:int=1024):
"""
- generates an image for each prompt using stable diffusion,
+ Generates an image for each prompt using stable diffusion,
returning a list of file paths for those images
+
+ :param prompt: The prompt to generate the image for
+ :param width: The width of the image to generate
+ :param height: The height of the image to generate
"""
+ # lazy load because this is a heavy dependency
+ import stability_sdk.interfaces.gooseai.generation.generation_pb2 as generation
+ from stability_sdk.client import StabilityInference, process_artifacts_from_answers
# customize engine here if desired (default is newest)
- # i.e. engine='stable-diffusion-v1-5'
+ # e.g., engine='stable-diffusion-v1-5'
stability_api = StabilityInference(
STABILITY_HOST,
key=os.getenv('STABILITY_API_KEY'),
@@ -28,7 +33,7 @@ def generate_image(prompt: str, width=1024, height=1024):
)
artifacts = process_artifacts_from_answers(
- prefix="", prompt=prompt, answers=answers, write=False, verbose=False
+ prefix="", prompt=prompt, answers=answers, write=False, verbose=False,
)
img_path = None
diff --git a/sitcom_simulator/image_generator/image_generator.py b/sitcom_simulator/image_generator/image_generator.py
deleted file mode 100644
index b1b129c..0000000
--- a/sitcom_simulator/image_generator/image_generator.py
+++ /dev/null
@@ -1,56 +0,0 @@
-from .integrations import stability, pillow
-from tqdm import tqdm
-from typing import List, Optional, Callable, Literal
-from sitcom_simulator.models import Script
-import os
-import atexit
-
-Engine = Literal["stability", "pillow"]
-
-def generate_images(
- script: Script,
- width=768,
- height=1344,
- on_image_generated: Optional[Callable[[int, str], None]] = None,
- engine:Engine="stability",
- ):
- """
- Generates and returns a list of image paths for the given script
- @param script: The script to generate images for
- @param width: The width of the images to generate
- @param height: The height of the images to generate
- @param on_image_generated: A callback to call after each image is generated
- @param engine: The engine to use for generating images
- """
- image_paths: List[str | None] = []
- image_prompts = [clip.image_prompt for clip in script.clips]
- for i, image_prompt in tqdm(enumerate(image_prompts), desc="Generating images", total=len(image_prompts)):
- if not image_prompt:
- image_paths.append(None)
- continue
- if engine == "stability":
- full_prompt = image_prompt + ', ' + script.metadata.art_style
- image_path = stability.generate_image(prompt=full_prompt, width=width, height=height)
- else: # debug engine
- image_path = pillow.generate_image(width, height)
- atexit.register(os.remove, image_path)
- image_paths.append(image_path)
- if on_image_generated:
- on_image_generated(i, image_path)
-
- return image_paths
-
-def add_images(
- script: Script,
- width=768,
- height=1344,
- on_image_generated: Optional[Callable[[int, str], None]] = None,
- engine:Engine="stability",
- ) -> Script:
- image_paths = generate_images(
- script=script,
- width=width,
- height=height,
- on_image_generated=on_image_generated,
- engine=engine)
- return script.replace(clips=[clip.replace(image_path=image_path) for clip, image_path in zip(script.clips, image_paths)])
\ No newline at end of file
diff --git a/sitcom_simulator/models.py b/sitcom_simulator/models.py
index d1b373f..138d600 100644
--- a/sitcom_simulator/models.py
+++ b/sitcom_simulator/models.py
@@ -1,26 +1,52 @@
from dataclasses import dataclass, replace
-@dataclass(frozen=True)
+@dataclass
class Character:
+ """
+ A character in a script and information about their voice.
+
+ :param name: The name of the character
+ :param voice_token: The token for the character's voice
+ """
name: str
voice_token: str
@staticmethod
- def from_dict(data: dict):
+ def from_dict(data: dict[str, str]):
+ """
+ Creates a Character from a dictionary with the same shape.
+
+ :param data: A dictionary with 'name' and 'voice_token' keys
+ :raises KeyError: If either key is missing
+ """
return Character(
name=data['name'],
voice_token=data['voice_token']
)
def replace(self, **kwargs):
+ """
+ Returns a new Character with the specified fields replaced.
+ """
return replace(self, **kwargs)
-@dataclass(frozen=True)
+@dataclass
class Clip:
+ """
+ A clip in a script, including the speaker, speech, and audio.
+
+ :param speaker: The name of the speaker
+ :param speech: The speech for the clip
+ :param image_prompt: The prompt for the image
+ :param image_url: The URL for the image (currently unused, but may be used in the future with a different image engine)
+ :param image_path: The path to the image
+ :param audio_url: The URL for the audio (currently unused, but may be used in the future with a different TTS engine)
+ :param audio_path: The path to the audio
+ :param title: The title of the clip
+ :param duration: The duration of the clip
+ """
speaker: str | None
speech: str | None
image_prompt: str | None
image_path: str | None
+ image_url: str | None
audio_url: str | None
audio_path: str | None
title: str | None
@@ -28,19 +54,30 @@ class Clip:
@property
def needs_audio(self):
- return self.speech and not (self.audio_path or self.audio_url)
+ """
+ Returns True if the clip needs audio, and False if it doesn't.
+ """
+ return bool(self.speech and not (self.audio_path or self.audio_url))
@property
def needs_image(self):
- return self.image_prompt and not self.image_path
+ """
+ Returns True if the clip needs an image, and False if it doesn't.
+ """
+ return bool(self.image_prompt and not (self.image_path or self.image_url))
@staticmethod
def from_dict(data: dict):
+ """
+ Creates a Clip from a dictionary with the same shape.
+ All fields are optional.
+ """
return Clip(
speaker=data.get('speaker'),
speech=data.get('speech'),
image_prompt=data.get('image_prompt'),
image_path=data.get('image_path'),
+ image_url=data.get('image_url'),
audio_url=data.get('audio_url'),
audio_path=data.get('audio_path'),
title=data.get('title'),
@@ -48,35 +85,83 @@ def from_dict(data: dict):
)
def replace(self, **kwargs):
+ """
+ Returns a new Clip with the specified fields replaced.
+ """
return replace(self, **kwargs)
-@dataclass(frozen=True)
+@dataclass
class ScriptMetadata:
- title: str
- bgm_style: str
- art_style: str
+ """
+ Metadata for a script.
+
+ :param title: The title of the script
+ :param bgm_style: The style of the background music
+ :param bgm_path: The path to the background music
+ :param bgm_url: The URL to the background music
+ :param art_style: The style of the art
+ :param prompt: The prompt for the script
+ :param orientation: The orientation of the video
+ """
+ title: str | None
+ bgm_style: str | None
bgm_path: str | None
+ bgm_url: str | None
+ art_style: str | None
+ prompt: str | None
+ orientation: str | None
@staticmethod
def from_dict(data: dict):
+ """
+ Creates a ScriptMetadata from a dictionary with the same shape.
+ All fields are optional; a key that is missing from the dictionary is stored as None.
+ """
+ # every field is read with .get(), so an absent key becomes None instead of raising
return ScriptMetadata(
- title=data['title'],
- bgm_style=data['bgm_style'],
- art_style=data['art_style'],
+ title=data.get('title'),
+ bgm_style=data.get('bgm_style'),
bgm_path=data.get('bgm_path'),
+ bgm_url=data.get('bgm_url'),
+ art_style=data.get('art_style'),
+ prompt=data.get('prompt'),
+ orientation=data.get('orientation'),
)
def replace(self, **kwargs):
+ """
+ Returns a new ScriptMetadata with the specified fields replaced.
+ """
return replace(self, **kwargs)
-@dataclass(frozen=True)
+@dataclass
class Script:
+ """
+ Contains all the data for a script, including characters, clips, and metadata.
+
+ The clips are ordered in the order they should be played.
+
+ In general, the fields should be populated in the following order:
+ 1. characters
+ 2. clips
+ 3. metadata
+
+ Metadata is last to give the language model more context before summarizing the script.
+
+ :param characters: A list of characters in the script
+ :param clips: A list of clips in the script
+ :param metadata: The metadata for the script
+ """
characters: list[Character]
clips: list[Clip]
metadata: ScriptMetadata
@staticmethod
def from_dict(data: dict):
+ """
+ Returns a Script from a dictionary with the same shape.
+ """
return Script(
characters=[Character.from_dict(character) for character in data['characters']],
clips=[Clip.from_dict(clip) for clip in data['clips']],
@@ -84,10 +169,20 @@ def from_dict(data: dict):
)
def replace(self, **kwargs):
+ """
+ Returns a new Script with the specified fields replaced.
+ """
return replace(self, **kwargs)
-@dataclass(frozen=True)
+@dataclass
class VideoResult:
+ """
+ The result of rendering a video.
+
+ :param path: The path to the rendered video
+ :param title: The title of the video
+ :param description: The description of the video
+ """
path: str
title: str
description: str
\ No newline at end of file
diff --git a/sitcom_simulator/music/__init__.py b/sitcom_simulator/music/__init__.py
new file mode 100644
index 0000000..ad24d0d
--- /dev/null
+++ b/sitcom_simulator/music/__init__.py
@@ -0,0 +1 @@
+from .music_generator import add_music, generate_music
\ No newline at end of file
diff --git a/sitcom_simulator/script_generator/integrations/fakeyou/__init__.py b/sitcom_simulator/music/integrations/__init__.py
similarity index 100%
rename from sitcom_simulator/script_generator/integrations/fakeyou/__init__.py
rename to sitcom_simulator/music/integrations/__init__.py
diff --git a/sitcom_simulator/music_generator/integrations/freepd.py b/sitcom_simulator/music/integrations/freepd.py
similarity index 66%
rename from sitcom_simulator/music_generator/integrations/freepd.py
rename to sitcom_simulator/music/integrations/freepd.py
index 24e73fc..ea73a65 100644
--- a/sitcom_simulator/music_generator/integrations/freepd.py
+++ b/sitcom_simulator/music/integrations/freepd.py
@@ -1,5 +1,3 @@
-import requests
-from bs4 import BeautifulSoup
import random
import os
from enum import Enum
@@ -7,6 +5,9 @@
import tempfile
class MusicCategory(Enum):
+ """
+ The different categories of music available on FreePD.
+ """
UPBEAT='upbeat'
EPIC='epic'
HORROR='horror'
@@ -19,9 +20,22 @@ class MusicCategory(Enum):
@classmethod
def values(cls):
+ """
+ Returns a list of the values of the enum members.
+ """
return [str(member.value) for name, member in cls.__members__.items()]
-def download_random_music(category: MusicCategory) -> str | None:
+def download_random_music(category: MusicCategory) -> tuple[str | None, str]:
+ """
+ Given a category, downloads a random song from FreePD in that category and returns the path to the downloaded file.
+
+ :param category: The category of music to download
+
+ :return: The path to the downloaded file
+ """
+ from bs4 import BeautifulSoup
+ import requests
+
# Send a GET request to the website
url = f"https://freepd.com/{category.value}.php"
response = requests.get(url)
@@ -39,9 +53,17 @@ def download_random_music(category: MusicCategory) -> str | None:
song_name = selected_song.find("b").text
download_link = "https://freepd.com" + selected_song.find("a", class_="downloadButton")["href"]
- return download_file(download_link)
+ return download_file(download_link), download_link
+
+def download_file(url: str):
+ """
+ Given a URL, downloads the file and returns the path to the downloaded file.
+
+ :param url: The URL of the file to download
-def download_file(url):
+ :return: The path to the downloaded file
+ """
+ import requests
response = requests.get(url)
if response.status_code == 200:
# Get the file name from the URL
diff --git a/sitcom_simulator/music/music_generator.py b/sitcom_simulator/music/music_generator.py
new file mode 100644
index 0000000..c419131
--- /dev/null
+++ b/sitcom_simulator/music/music_generator.py
@@ -0,0 +1,59 @@
+from typing import Literal, Callable, Optional
+import random
+from sitcom_simulator.models import Script
+import logging
+
+Engine = Literal["freepd"]
+
+def generate_music(
+        category: str | None,
+        engine:Engine="freepd",
+        music_url: str | None = None,
+    ) -> tuple[str, str]:
+    """
+    Fetches a music file with the given engine and returns where it was saved and where it came from.
+
+    More procedural in nature than add_music.
+
+    :param category: The category of music to generate. A value that is not a FreePD category falls back to a random category.
+    :param engine: The engine to use for generating music
+    :param music_url: The URL of the music to use. If provided, category is ignored.
+
+    :return: The path to the generated music file and the url of the music to use
+    :raises ValueError: If engine is not "freepd"
+    """
+    from .integrations import freepd
+    # guard clause: freepd is the only engine this function handles
+    if engine != "freepd":
+        raise ValueError(f"Invalid engine: {engine}")
+
+    # an explicit URL short-circuits the category lookup entirely
+    if music_url:
+        logging.debug(f"Using music from URL: {music_url}")
+        downloaded_path = freepd.download_file(music_url)
+        return downloaded_path, music_url
+
+    logging.debug(f"Generating music: {category}")
+    try:
+        chosen_category = freepd.MusicCategory(category)
+    except ValueError:
+        # unknown (or missing) category: pick one at random
+        chosen_category = random.choice(list(freepd.MusicCategory))
+    return freepd.download_random_music(chosen_category)
+
+def add_music(
+        script: Script,
+        engine:Engine="freepd",
+        music_url: str | None = None,
+        on_music_generated: Optional[Callable[[str], None]] = None,
+    ):
+    """
+    Returns a copy of the script whose metadata carries the background music path and URL.
+
+    More functional in nature than generate_music, which does the actual work.
+    The music category is taken from script.metadata.bgm_style.
+
+    :param script: The script to add music to
+    :param engine: The engine to use for generating music
+    :param music_url: The URL of the music to use. If provided, category is ignored.
+    :param on_music_generated: A callback to call after the music is generated which takes the path to the generated music
+    """
+    track_path, track_url = generate_music(
+        category=script.metadata.bgm_style,
+        music_url=music_url,
+        engine=engine,
+    )
+    if on_music_generated:
+        on_music_generated(track_path)
+    updated_metadata = script.metadata.replace(bgm_path=track_path, bgm_url=track_url)
+    return script.replace(metadata=updated_metadata)
\ No newline at end of file
diff --git a/sitcom_simulator/music_generator/__init__.py b/sitcom_simulator/music_generator/__init__.py
deleted file mode 100644
index d1fc97c..0000000
--- a/sitcom_simulator/music_generator/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .music_generator import add_music, generate_music, MusicCategory
\ No newline at end of file
diff --git a/sitcom_simulator/music_generator/music_generator.py b/sitcom_simulator/music_generator/music_generator.py
deleted file mode 100644
index 24c7241..0000000
--- a/sitcom_simulator/music_generator/music_generator.py
+++ /dev/null
@@ -1,36 +0,0 @@
-from .integrations import freepd
-from typing import Literal, Callable, Optional
-import random
-from sitcom_simulator.models import Script
-import logging
-
-Engine = Literal["freepd"]
-
-def generate_music(
- category: str | None,
- engine:Engine="freepd",
- ):
- if engine == "freepd":
- logging.debug(f"Generating music: {category}")
- try:
- freepd_category = freepd.MusicCategory(category)
- except ValueError:
- freepd_category = None
- if freepd_category is None:
- freepd_category = random.choice(list(MusicCategory))
- return freepd.download_random_music(freepd_category)
- else:
- raise ValueError(f"Invalid engine: {engine}")
-
-def add_music(
- script: Script,
- engine:Engine="freepd",
- category: str | None = None,
- on_music_generated: Optional[Callable[[str], None]] = None
- ):
- music_path = generate_music(category)
- if on_music_generated:
- on_music_generated(music_path)
- return script.replace(metadata=script.metadata.replace(bgm_path=music_path))
-
-MusicCategory = freepd.MusicCategory
\ No newline at end of file
diff --git a/sitcom_simulator/script_generator/__init__.py b/sitcom_simulator/script/__init__.py
similarity index 100%
rename from sitcom_simulator/script_generator/__init__.py
rename to sitcom_simulator/script/__init__.py
diff --git a/sitcom_simulator/script/integrations/__init__.py b/sitcom_simulator/script/integrations/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/sitcom_simulator/script/integrations/chatgpt/__init__.py b/sitcom_simulator/script/integrations/chatgpt/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/sitcom_simulator/script/integrations/chatgpt/chatgpt.py b/sitcom_simulator/script/integrations/chatgpt/chatgpt.py
new file mode 100644
index 0000000..53ee58e
--- /dev/null
+++ b/sitcom_simulator/script/integrations/chatgpt/chatgpt.py
@@ -0,0 +1,25 @@
+def chat(
+        prompt: str,
+        max_tokens:int=2048,
+        temperature:float=1,
+        model: str="gpt-4o-mini",
+    ):
+    """
+    Sends a single user prompt to ChatGPT and returns the text of the first reply, stripped of surrounding whitespace.
+
+    :param prompt: The prompt for the chat
+    :param max_tokens: The maximum number of tokens to generate
+    :param temperature: The temperature to use when generating the response, which controls randomness. Higher values make the response more random, while lower values make the response more deterministic.
+    :param model: The model to use for the chat
+    """
+    import openai
+    # a fixed system message followed by the caller's prompt
+    conversation = [
+        {"role": "system", "content": "You are a helpful script-writing assistant."},
+        {"role": "user", "content": prompt},
+    ]
+    completion = openai.ChatCompletion.create(
+        model=model,
+        temperature=temperature,
+        max_tokens=max_tokens,
+        messages=conversation,
+    )
+    reply = completion.choices[0].message["content"]
+    return reply.strip()
\ No newline at end of file
diff --git a/sitcom_simulator/script/integrations/fakeyou/__init__.py b/sitcom_simulator/script/integrations/fakeyou/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/sitcom_simulator/script/integrations/fakeyou/character_extraction_instructions.txt b/sitcom_simulator/script/integrations/fakeyou/character_extraction_instructions.txt
new file mode 100644
index 0000000..d0f5164
--- /dev/null
+++ b/sitcom_simulator/script/integrations/fakeyou/character_extraction_instructions.txt
@@ -0,0 +1,8 @@
+Generate a list of potential characters to use in a short video of this prompt:
+
+{prompt}
+
+Your results will be searched for in the FakeYou database for potential AI voices to use.
+The characters must be likely to have an AI voice on the internet somewhere, e.g., famous people/characters.
+Keep the list short and focused on the user's prompt.
+Structure your output as a pure JSON list of strings, no markdown.
\ No newline at end of file
diff --git a/sitcom_simulator/script/integrations/fakeyou/character_extractor.py b/sitcom_simulator/script/integrations/fakeyou/character_extractor.py
new file mode 100644
index 0000000..f62e784
--- /dev/null
+++ b/sitcom_simulator/script/integrations/fakeyou/character_extractor.py
@@ -0,0 +1,143 @@
+import json
+import re
+import random
+from .narrators import BACKUP_NARRATORS
+from sitcom_simulator.models import Character
+import logging
+from typing import List
+import os
+import csv
+
+def normalize_string(s):
+    """
+    Removes every run of non-word characters from the string and lower-cases what is left.
+
+    :param s: The string to normalize
+    """
+    without_symbols = re.sub(r'\W+', '', s)
+    return without_symbols.lower()
+
+def load_curated_voices() -> dict[str, float]:
+    """
+    Loads the curated voices from the 'curated_voices.csv' file in the same directory as this script.
+    Important for when fakeyou's ratings get wiped (which has happened before), we still have our own records.
+
+    :return: A dictionary mapping each 'model_name' column value to its 'rating' column value as a float
+    :raises KeyError: If a row lacks the 'model_name' or 'rating' column
+    :raises ValueError: If a rating cannot be parsed as a float
+    """
+    curated_voices: dict[str, float] = {}
+    # note: needs to be in the same directory as this script, not the current working directory
+    path_to_curated_voices = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'curated_voices.csv')
+    # newline='' is what the csv module expects for files it reads, and an explicit
+    # encoding keeps the result independent of the platform's default
+    with open(path_to_curated_voices, 'r', newline='', encoding='utf-8') as f:
+        for row in csv.DictReader(f):
+            name = row['model_name'].strip()
+            rating = row['rating'].strip()
+            curated_voices[name] = float(rating)
+    return curated_voices
+
+def generate_character_list(prompt: str, custom_instructions: str | None=None) -> List[Character]:
+    """
+    Uses a large language model to generate a list of possible famous characters related to the prompt,
+    then matches each proposed name against the FakeYou voice list.
+
+    Each name is fuzzy-matched to a FakeYou model name; among the models sharing the matched name,
+    the one with the highest star rating is chosen. If nothing matches, a random backup narrator is used.
+
+    :param prompt: The user-submitted prompt
+    :param custom_instructions: A string containing custom instructions for the language model. Must contain the placeholder '{prompt}'.
+
+    :return: The chosen characters, always at least one
+    :raises ValueError: If the instructions do not contain the placeholder '{prompt}'
+    """
+    if custom_instructions:
+        instructions = custom_instructions
+    else:
+        from pathlib import Path
+        current_file_path = Path(__file__).resolve()
+        current_dir = current_file_path.parent
+        instructions_path = current_dir / 'character_extraction_instructions.txt'
+        with open(instructions_path, 'r') as f:
+            instructions = f.read()
+
+    if "{prompt}" not in instructions:
+        raise ValueError("Custom instructions file must contain the placeholder '{prompt}'")
+    instructions = instructions.format(prompt=prompt)
+
+    from sitcom_simulator.script.llm import chat
+    import requests
+
+    raw_response = chat(instructions)
+    # logging treats extra positional args as %-format arguments, so the message needs a placeholder
+    logging.debug("Raw character extractor response from LLM: %s", raw_response)
+    character_names = json.loads(raw_response)
+    logging.debug("Characters proposed: %s", ", ".join(character_names))
+
+    # TODO: cache data from fakeyou to avoid lots of hits?
+    res = requests.get('https://api.fakeyou.com/tts/list')
+    fakeyou_character_list = res.json()['models']
+    name_to_model = pure_name_to_model(fakeyou_character_list)
+
+    curated_characters = load_curated_voices()
+
+    # loop-invariant setup, hoisted out of the loop
+    from thefuzz import process
+    SIMILARITY_CUTOFF = 75 # out of 100
+    candidate_names = list(name_to_model.keys())
+
+    chosen_characters = []
+    for name in character_names:
+        # TODO (big maybe) if tts doesn't exist but vtv does, render tts in someone else's voice and then use vtv
+        extraction = process.extractOne(normalize_string(name), candidate_names, score_cutoff=SIMILARITY_CUTOFF)
+        if extraction:
+            match, score = extraction
+            logging.debug(f"Matched {name} to {match} with score {score}")
+            voices = name_to_model[match.lower()]
+            # find the highest-rated match
+            highest_rated_voice = max(voices, key=lambda model: calculate_star_rating(model, curated_characters))
+            chosen_characters.append(Character(name=name, voice_token=highest_rated_voice['model_token']))
+    logging.info("Selected voices: %s", ", ".join([c.name for c in chosen_characters]))
+
+    # guarantee at least one voice (narrator)
+    if len(chosen_characters) == 0:
+        print("No voices selected. Defaulting to narrator.")
+        logging.info("No voices selected. Defaulting to narrator.")
+        chosen_characters.append(random.choice(BACKUP_NARRATORS))
+
+    print("Characters selected:", ", ".join([c.name for c in chosen_characters]), "\n")
+
+    return chosen_characters
+
+def pure_name_to_model(models_list: list[dict]):
+    """
+    Groups FakeYou models by the pure name of their character.
+
+    A pure name is the name of the character without any parenthetical information, e.g., "Velma (Scooby Doo)" -> "Velma"
+    Models whose title yields no pure name are left out.
+
+    :param models_list: A list of models from FakeYou
+
+    :return: A dictionary mapping each lower-cased pure name to the list of models matching that name
+    """
+    grouped = {}
+    for entry in models_list:
+        cleaned_name = pure_character_name(entry['title'])
+        if cleaned_name:
+            # setdefault creates the bucket the first time a name is seen
+            grouped.setdefault(cleaned_name.lower(), []).append(entry)
+    return grouped
+
+NAME_PATTERN = re.compile(r"^\s*([^\(\n]*[^\s\(])\s*(?:\([^\n]*)?$")
+def pure_character_name(raw_name: str):
+    """
+    Extracts the character's own name from a FakeYou listing and normalizes it.
+
+    FakeYou names are typically formatted like 'True Name (source)', e.g., Velma (Scooby Doo)
+
+    :param raw_name: The raw name of the character from FakeYou
+
+    :return: The normalized name, or None if the listing does not match NAME_PATTERN
+    """
+    found = NAME_PATTERN.search(raw_name)
+    return normalize_string(found.group(1)) if found else None
+
+DEFAULT_RATING = 2 # not the worst possible, but pretty bad
+def calculate_star_rating(model, curated_voices: dict[str, float] | None=None):
+    """
+    Returns a star rating for a FakeYou voice model, preferring a curated rating when there is one.
+
+    Without a curated rating, the share of positive reviews is estimated as (positive + 1) / (total + 2)
+    and mapped onto a 1 to 5 scale. Intuition: 5 stars from 10 reviews is worse than 4.8 stars from 1000 reviews.
+
+    :param model: A model dictionary with a 'title' and, optionally, 'user_ratings' holding 'positive_count' and 'total_count'
+    :param curated_voices: An optional dictionary mapping model titles to curated ratings
+
+    :return: The curated rating if there is one, DEFAULT_RATING if the model has no user ratings, otherwise the estimated rating
+    """
+    # curated_voices defaults to None, so only consult it when a mapping was actually passed
+    if curated_voices:
+        curated_rating = curated_voices.get(model['title'])
+        if curated_rating:
+            return curated_rating
+
+    # a missing, None or empty ratings entry all mean "no data"
+    user_ratings = model.get('user_ratings')
+    if not user_ratings:
+        return DEFAULT_RATING
+    positive_count = user_ratings['positive_count']
+    total_count = user_ratings['total_count']
+
+    negative_count = total_count - positive_count
+    alpha_posterior = 1 + positive_count # Prior alpha = 1
+    beta_posterior = 1 + negative_count # Prior beta = 1
+    mean_proportion = alpha_posterior / (alpha_posterior + beta_posterior)
+    star_rating = 1 + 4 * mean_proportion
+
+    return star_rating
\ No newline at end of file
diff --git a/sitcom_simulator/script_generator/integrations/fakeyou/character_selector.py b/sitcom_simulator/script/integrations/fakeyou/character_selector.py
similarity index 86%
rename from sitcom_simulator/script_generator/integrations/fakeyou/character_selector.py
rename to sitcom_simulator/script/integrations/fakeyou/character_selector.py
index 28f95e8..82fdb41 100644
--- a/sitcom_simulator/script_generator/integrations/fakeyou/character_selector.py
+++ b/sitcom_simulator/script/integrations/fakeyou/character_selector.py
@@ -1,16 +1,20 @@
-import tomllib
+import toml
from sitcom_simulator.models import Character
import os
# Get the directory of the current script file
script_dir = os.path.dirname(os.path.realpath(__file__))
characters_path = os.path.join(script_dir, 'characters.toml')
-with open(characters_path, "rb") as f:
- curated_characters = tomllib.load(f)
+curated_characters = toml.load(characters_path)
# user selects which auto-detected characters to include in the script
# (including their voices if generating high-quality audio)
-def select_characters(possible_characters):
+def select_characters(possible_characters: dict[str, list[str]]):
+ """
+ A procedure to prompt the user to select which auto-detected characters to include in the script.
+
+ :param possible_characters: A dictionary of character names to a list of voice tokens
+ """
print("--- Character Voice Selection ---")
selected_characters = dict()
for name, voices in possible_characters.items():
diff --git a/sitcom_simulator/script_generator/integrations/fakeyou/characters.toml b/sitcom_simulator/script/integrations/fakeyou/characters.toml
similarity index 100%
rename from sitcom_simulator/script_generator/integrations/fakeyou/characters.toml
rename to sitcom_simulator/script/integrations/fakeyou/characters.toml
diff --git a/sitcom_simulator/script/integrations/fakeyou/curated_voices.csv b/sitcom_simulator/script/integrations/fakeyou/curated_voices.csv
new file mode 100644
index 0000000..9043b5e
--- /dev/null
+++ b/sitcom_simulator/script/integrations/fakeyou/curated_voices.csv
@@ -0,0 +1,17 @@
+model_name,rating
+"Luigi (Charles Martinet) (fixed version)",5
+"Mario (Charles Martinet, 1994-2023) (New!)",5
+"Shrek (New)",5
+"GLaDOS (Ellen McLain)",5
+"GLaDOS (Ellen McLain, Portal 2)",5
+"Morgan Freeman (New)",5
+"Kurzgesagt (New)",5
+"""Weird Al"" Yankovic",3.5
+"Shaggy Rogers (Scott Innes)",4
+"Shadow The Hedgehog (Jason Griffith)",3.75
+"Tom Cruise (New)",4.5
+"Elon Musk (New Version 2.0)",3.5
+"Donald Trump (Version 3.0)",4
+"Joe Biden (New, 46th U.S. President)",5
+"Barack Obama (NEW, 44th U.S. President)",5
+"Toad (Super Mario, Jen Taylor)",3
\ No newline at end of file
diff --git a/sitcom_simulator/script_generator/integrations/fakeyou/narrators.py b/sitcom_simulator/script/integrations/fakeyou/narrators.py
similarity index 100%
rename from sitcom_simulator/script_generator/integrations/fakeyou/narrators.py
rename to sitcom_simulator/script/integrations/fakeyou/narrators.py
diff --git a/sitcom_simulator/script_generator/llm.py b/sitcom_simulator/script/llm.py
similarity index 72%
rename from sitcom_simulator/script_generator/llm.py
rename to sitcom_simulator/script/llm.py
index 71e3f92..61dd19f 100644
--- a/sitcom_simulator/script_generator/llm.py
+++ b/sitcom_simulator/script/llm.py
@@ -1,4 +1,3 @@
-from .integrations.chatgpt import chatgpt
-
def chat(prompt: str, max_tokens:int=2048, temperature:float=1):
+ from .integrations.chatgpt import chatgpt
return chatgpt.chat(prompt, max_tokens, temperature)
\ No newline at end of file
diff --git a/sitcom_simulator/script_generator/integrations/chatgpt/instructions.py b/sitcom_simulator/script/llm_instructions.txt
similarity index 78%
rename from sitcom_simulator/script_generator/integrations/chatgpt/instructions.py
rename to sitcom_simulator/script/llm_instructions.txt
index 824d126..981cfbd 100644
--- a/sitcom_simulator/script_generator/integrations/chatgpt/instructions.py
+++ b/sitcom_simulator/script/llm_instructions.txt
@@ -1,4 +1,4 @@
-base_prompt = """You are a witty, avant-garde creative genius who writes short video scripts consisting of AI-generated still images and audio.
+You are a witty, avant-garde creative genius who writes short video scripts consisting of AI-generated still images and audio.
Your output should be structured in TOML.
Your output file will have these top level parts in this order: clips and metadata
@@ -33,7 +33,7 @@
[metadata]
title: a clever title for the video.
bgm_style: specifies the video's background music style from the set ({music_categories}). Avoid comedy when possible
-art_style: appended to each image prompt. Be specific, e.g., "1980s sitcom", "cinematic bokeh blur", "claymation", "trending on artstation"
+art_style: appended to each image prompt. Thorough and descriptive, e.g., "still from a 1980s sitcom on VHS, film grain with saturated colors and a low-budget aesthetic but impeccable composition."
metadata is a table, and is always last, to give you time to ponder the title and styles AFTER writing the script. Do NOT put title at the beginning of the file.
@@ -44,17 +44,14 @@
- Be bold and avante garde.
- Censor anything truly inappropriate like racism, but do not censor things like horror or dark themes.
- Scripts should be approximately 30-60 seconds in duration, and have at least 4-6 clips of dialog unless otherwise specified.
-- Take yourself seriously, but also crank it up to ELEVEN on the wierdness scale, baby.
- Keep famous characters in character
-- End with a twist. NO generic, boring happy endings.
-- The last clip should always be an unexpected, wacky twist.
-- Narrators should be used sparingly (it's better to hear from the characters directly)
-- No TOML comments (#)
+- Do not use narrators unless absolutely necessary.
+- No TOML comments (#) or markdown. Just pure TOML.
-Now, take a deep breath and a shot of whiskey, and write a script for the following video:
+Now write a script for the following video:
"{prompt}"
The characters at your disposal are: {characters}
-Have fun!"""
\ No newline at end of file
+Have fun!
\ No newline at end of file
diff --git a/sitcom_simulator/script/script_generator.py b/sitcom_simulator/script/script_generator.py
new file mode 100644
index 0000000..42f1ff8
--- /dev/null
+++ b/sitcom_simulator/script/script_generator.py
@@ -0,0 +1,100 @@
+from typing import Callable
+from ..models import Script
+import toml
+from dataclasses import asdict
+import logging
+
+def write_script(
+    prompt: str,
+    manual_character_selection=False,
+    max_tokens:int=2048,
+    require_approval:bool=False,
+    temperature:float=0.5,
+    model:str="gpt-4o-mini",
+    custom_script_instructions: str | None=None,
+    custom_character_instructions: str | None=None,
+    fakeyou_characters:bool=True,
+    narrator_dropout:bool=False,
+    ) -> Script:
+    """
+    Uses AI to generate a script matching the prompt.
+
+    The cast is chosen first, either interactively by the user or by a language model that extracts characters from the prompt.
+    The resulting dialog is then constrained to those characters.
+
+    :param prompt: The prompt for the script
+    :param manual_character_selection: Whether to prompt the user to select the characters. If False, an LLM will extract characters from the prompt.
+    :param max_tokens: The maximum number of tokens to generate
+    :param require_approval: Whether to prompt the user to approve the generated script
+    :param temperature: The temperature to use when generating the script
+    :param model: The language model to use
+    :param custom_script_instructions: A string containing custom instructions for the language model writing the script. Must contain the placeholders '{prompt}', '{music_categories}', and '{characters}'.
+    :param custom_character_instructions: A string containing custom instructions for the language model extracting the characters from the prompt. Must contain the placeholder '{prompt}'.
+    :param fakeyou_characters: Whether to restrict character selection to only voices from fakeyou.com
+    :param narrator_dropout: Whether to forcibly remove narrators from the script (ChatGPT often goes heavy on the narrators)
+    :raises ValueError: If the instructions lack a required placeholder, or if narrator dropout leaves the script without clips
+    """
+    from ..speech.integrations.fakeyou import get_possible_characters_from_prompt
+    from .integrations.chatgpt import chatgpt
+    from .integrations.fakeyou.character_extractor import generate_character_list
+    from ..music.integrations.freepd import MusicCategory
+
+    if manual_character_selection:
+        from .integrations.fakeyou.character_selector import select_characters as fakeyou_select_characters
+        from ..user_input import select_characters as debug_select_characters
+        possible_characters = get_possible_characters_from_prompt(prompt)
+        select_characters: Callable = fakeyou_select_characters if fakeyou_characters else debug_select_characters
+        characters = select_characters(possible_characters)
+    else:
+        characters = generate_character_list(prompt, custom_instructions=custom_character_instructions)
+
+    characters_str = ", ".join([c.name for c in characters])
+    music_categories_str = ", ".join(MusicCategory.values())
+
+    if custom_script_instructions:
+        instructions = custom_script_instructions
+    else:
+        # fall back to the instructions file that ships next to this module
+        from pathlib import Path
+        instructions_path = Path(__file__).resolve().parent / "llm_instructions.txt"
+        with open(instructions_path, 'r', encoding='utf-8') as f:
+            instructions = f.read()
+
+    # check for placeholders
+    if "{prompt}" not in instructions or "{music_categories}" not in instructions or "{characters}" not in instructions:
+        raise ValueError("Custom instructions file must contain the placeholders '{prompt}', '{music_categories}', and '{characters}'")
+
+    full_prompt = instructions.format(prompt=prompt, characters=characters_str, max_tokens=max_tokens, music_categories=music_categories_str)
+    approved = False
+    while not approved:
+        raw_script = chatgpt.chat(full_prompt, temperature=temperature, max_tokens=max_tokens, model=model)
+        # logging uses %-style formatting: extra positional args need a placeholder in the message
+        logging.debug("Raw script: %s", raw_script)
+        toml_script = toml.loads(raw_script)
+        toml_script["characters"] = [asdict(c) for c in characters] # from characters to dict back to character. Refactor at some point.
+        script = Script.from_dict(toml_script)
+        if narrator_dropout:
+            # clips without a speaker (speaker is None) are kept; only narrator lines are dropped
+            script = script.replace(clips=[c for c in script.clips if (c.speaker or "").lower().strip() != "narrator"])
+            if len(script.clips) == 0:
+                raise ValueError("Narrator dropout resulted in an empty script. Please try again.")
+        logging.debug("TOML script: %s", script)
+        print(formatted_script(script), "\n")
+        if require_approval:
+            validated = None
+            while validated not in ["y", "n", "q"]:
+                validated = input("Do you approve this script? (y/n/q): ").strip().lower()
+                if validated == "y": approved = True
+                elif validated == "n": approved = False # regenerate on the next pass of the outer loop
+                elif validated == "q": exit()
+                else: print("Unrecognized input. Try again.")
+        else:
+            approved = True
+    return script
+
+def script_from_file(path: str) -> Script:
+    """
+    Loads a script from a TOML file.
+
+    :param path: The path of the TOML file to parse
+    """
+    return Script.from_dict(toml.load(path))
+
+def formatted_script(script: Script) -> str:
+    """
+    Renders a script as plain text: a title line, a style line, then one 'speaker: speech' line per clip.
+
+    A missing title or art style is shown as an empty string, and clips without a speaker are left out.
+
+    :param script: The script to format
+    """
+    title = script.metadata.title or ''
+    art_style = script.metadata.art_style or ''
+    header = f"Title: {title}\nStyle: {art_style}\n"
+    spoken_lines = (f"{clip.speaker}: {clip.speech}" for clip in script.clips if clip.speaker)
+    return header + "\n".join(spoken_lines)
\ No newline at end of file
diff --git a/sitcom_simulator/script_generator/integrations/chatgpt/chatgpt.py b/sitcom_simulator/script_generator/integrations/chatgpt/chatgpt.py
deleted file mode 100644
index c6d22b0..0000000
--- a/sitcom_simulator/script_generator/integrations/chatgpt/chatgpt.py
+++ /dev/null
@@ -1,13 +0,0 @@
-import openai
-
-def chat(prompt: str, max_tokens:int=2048, temperature:float=1):
- completion = openai.ChatCompletion.create(
- model="gpt-3.5-turbo",
- temperature=temperature,
- max_tokens=max_tokens,
- messages=[
- {"role": "system", "content": "You are a helpful script-writing assistant."},
- {"role": "user", "content": prompt}
- ]
- )
- return completion.choices[0].message["content"].strip()
\ No newline at end of file
diff --git a/sitcom_simulator/script_generator/integrations/fakeyou/character_extractor.py b/sitcom_simulator/script_generator/integrations/fakeyou/character_extractor.py
deleted file mode 100644
index 8ca89c9..0000000
--- a/sitcom_simulator/script_generator/integrations/fakeyou/character_extractor.py
+++ /dev/null
@@ -1,82 +0,0 @@
-from sitcom_simulator.script_generator.llm import chat
-import json
-import requests
-import re
-import random
-from .narrators import BACKUP_NARRATORS
-from sitcom_simulator.models import Character
-import logging
-from typing import List
-
-def generate_character_list(prompt: str) -> List[Character]:
- "Given a user-submitted prompt, return a list of characters (names + voice_tokens) from FakeYou for the characters in the script."
-
- instructions = f"""Generate a list of potential characters to use in a short video of this prompt:
-
- {prompt}
-
- Your results will be searched for in the FakeYou database for potential AI voices to use.
- The characters must be likely to have an AI voice on the internet somewhere.
- Keep the list short and focused.
- Structure your output as a JSON list of strings.
- """
-
- raw_response = chat(instructions)
- character_names = json.loads(raw_response)
- print("Characters proposed:", ", ".join(character_names))
-
- # TODO: cache data from fakeyou to avoid lots of hits?
- res = requests.get('https://api.fakeyou.com/tts/list')
- fakeyou_character_list = res.json()['models']
- name_to_model = pure_name_to_model(fakeyou_character_list)
-
- chosen_characters = []
- for name in character_names:
- # TODO (big maybe) if tts doesn't exist but vtv does, render tts in someone else's voice and then use vtv
- if name.lower() not in name_to_model:
- continue
- matches = name_to_model[name.lower()]
- # find the highest-rated match
- highest_rated_voice = max(matches, key=calculate_star_rating)
- chosen_characters.append(Character(name=name, voice_token=highest_rated_voice['model_token']))
- logging.info("Selected voices:", ", ".join([c.name for c in chosen_characters]))
-
- # guarantee at least one voice (narrator)
- chosen_characters.append(random.choice(BACKUP_NARRATORS))
-
- return chosen_characters
-
-def pure_name_to_model(models_list):
- names_to_model = {}
- for model in models_list:
- pure_name = pure_character_name(model['title'])
- if not pure_name:
- continue
- pure_name = pure_name.lower()
- if pure_name not in names_to_model:
- names_to_model[pure_name] = []
- names_to_model[pure_name].append(model)
- return names_to_model
-
-NAME_PATTERN = re.compile(r"^\s*([^\(\n]*[^\s\(])\s*(?:\([^\n]*)?$")
-def pure_character_name(raw_name):
- "Returns just the character's true name from a FakeYou listing. FakeYou names are typically formatted like \"True Name (source)\" e.g., Velma (Scooby Doo)"
- match = NAME_PATTERN.search(raw_name)
- if match:
- return match.group(1)
- return None
-
-DEFAULT_RATING = 2 # not the worst possible, but pretty bad
-def calculate_star_rating(model):
- "Estimates the true ratio of positive to negative reviews. Intuition: 5 stars from 10 reviews is worse than 4.8 stars from 1000 reviews."
-
- if 'user_ratings' not in model: return DEFAULT_RATING
- positive_count = model['user_ratings']['positive_count']
- total_count = model['user_ratings']['total_count']
-
- negative_count = total_count - positive_count
- alpha_posterior = 1 + positive_count # Prior alpha = 1
- beta_posterior = 1 + negative_count # Prior beta = 1
- mean_proportion = alpha_posterior / (alpha_posterior + beta_posterior)
- star_rating = 1 + 4 * mean_proportion
- return mean_proportion, star_rating
\ No newline at end of file
diff --git a/sitcom_simulator/script_generator/script_generator.py b/sitcom_simulator/script_generator/script_generator.py
deleted file mode 100644
index db2bf60..0000000
--- a/sitcom_simulator/script_generator/script_generator.py
+++ /dev/null
@@ -1,73 +0,0 @@
-from .integrations.chatgpt import chatgpt, instructions
-from .integrations.fakeyou.character_extractor import generate_character_list
-from typing import Callable
-from ..speech_generator.integrations.fakeyou import get_possible_characters_from_prompt
-from ..user_input import select_characters as debug_select_characters
-from .integrations.fakeyou.character_selector import select_characters as fakeyou_select_characters
-from ..music_generator import MusicCategory
-from ..models import Script
-import tomllib
-from dataclasses import asdict
-import logging
-
-def write_script(
- prompt: str,
- manual_character_selection=False,
- max_tokens:int=2048,
- require_approval:bool=False,
- temperature:float=0.5,
- fakeyou_characters:bool=True,
- ) -> Script:
- """
- Uses AI to generate a script matching the prompt.
-
- If characters are passed in, the resulting dialog is constrained to those characters.
- Otherwise, it prompts the user to select the appropriate characters.
-
- @param prompt: The prompt for the script
- @param manual_character_selection: Whether to prompt the user to select the characters. If manual_character_selection == False and characters == None, an LLM will extract characters.
- @param max_tokens: The maximum number of tokens to generate
- @param require_approval: Whether to prompt the user to approve the generated script
- @param temperature: The temperature to use when generating the script
- @param fakeyou_characters: Whether to restrict character selection to only voices from fakeyou.com
- """
- if manual_character_selection:
- possible_characters = get_possible_characters_from_prompt(prompt)
- select_characters: Callable = fakeyou_select_characters if fakeyou_characters else debug_select_characters
- characters = select_characters(possible_characters)
- else:
- characters = generate_character_list(prompt)
-
- characters_str = ", ".join([c.name for c in characters])
- music_categories_str = ", ".join(MusicCategory.values())
- full_prompt = instructions.base_prompt.format(prompt=prompt, characters=characters_str, max_tokens=max_tokens, music_categories=music_categories_str)
- approved = False
- while not approved:
- raw_script= chatgpt.chat(full_prompt, temperature=temperature, max_tokens=max_tokens)
- toml_script = tomllib.loads(raw_script)
- toml_script["characters"] = [asdict(c) for c in characters] # from characters to dict back to character. Refactor at some point.
- script = Script.from_dict(toml_script)
- logging.debug(script)
- print(formatted_script(script))
- if(require_approval):
- validated = None
- while validated not in ["y", "n", "q"]:
- validated = input("Do you approve this script? (y/n/q): ").lower()
- if validated == "y": approved = True
- elif validated == "n": approved = False
- elif validated == "q": exit()
- else: print("Unrecognized input. Try again.")
- else:
- approved = True
- return script
-
-def script_from_file(path: str) -> Script:
- with open(path, "rb") as f:
- script = Script.from_dict(tomllib.load(f))
- print(type(script))
- return script
-
-def formatted_script(script: Script) -> str:
- metadata = f"Title: {script.metadata.title}\nStyle: {script.metadata.art_style}\n"
- clips = "\n".join([f"{c.speaker}: {c.speech}" for c in script.clips if c.speaker])
- return metadata + clips
\ No newline at end of file
diff --git a/sitcom_simulator/sitcom_creator.py b/sitcom_simulator/sitcom_creator.py
deleted file mode 100644
index 38d8746..0000000
--- a/sitcom_simulator/sitcom_creator.py
+++ /dev/null
@@ -1,63 +0,0 @@
-from .models import Script, VideoResult
-from .script_generator import write_script
-from .speech_generator import add_voices
-from .image_generator import add_images
-from .music_generator import add_music
-from .video_generator import render_video
-from .script_generator import script_from_file
-from .social.yt_uploader import upload_to_yt
-
-def create_sitcom(
- prompt: str | None = None,
- art_style: str | None = None,
- script_path: str | None = None,
- debug: bool=False,
- font: str = '',
- max_tokens:int=2048,
- approve_script:bool=False,
- manual_select_characters:bool=True,
- upload_to_yt=False,
-):
- if(prompt == None and script_path == None):
- prompt = input("Enter a prompt to generate the video script: ")
-
- assert prompt or script_path, "You must provide a prompt or a script path"
-
- if prompt and not script_path:
- initial_script = write_script(
- prompt=prompt,
- manual_character_selection=manual_select_characters,
- max_tokens=max_tokens,
- require_approval=approve_script,
- fakeyou_characters=not debug,
- )
- elif script_path and not prompt:
- initial_script = script_from_file(script_path)
- else:
- raise ValueError("You must provide a prompt or a script path, not both")
-
- if art_style:
- initial_script = initial_script.replace(metadata=initial_script.metadata.replace(art_style=art_style))
-
- script_with_voices = add_voices(initial_script, engine="fakeyou" if not debug else "gtts")
- script_with_images = add_images(script_with_voices, engine="stability" if not debug else "pillow") # could theoretically be done in parallel with the audio
- script_with_music = add_music(script_with_images)
-
- final_script = script_with_music
-
- filename = final_script.metadata.title[:50].strip() or 'render'
- output_path = f"./{filename}.mp4"
- final_video_path = render_video(script=final_script, font=font, output_path=output_path)
-
- result = VideoResult(
- path=final_video_path,
- title=final_script.metadata.title,
- description=prompt or 'an AI-generated meme video created with Sitcom Simulator'
- )
-
- if upload_to_yt:
- title = prompt
- keywords = [word for word in prompt.split(' ') if len(word) > 3] if prompt else ["sitcom", "funny", "comedy", "ai", "deepfake"]
- upload_to_yt(result.path, result.title, result.description, keywords, "24", "public")
-
- return result
\ No newline at end of file
diff --git a/sitcom_simulator/speech/__init__.py b/sitcom_simulator/speech/__init__.py
new file mode 100644
index 0000000..028c663
--- /dev/null
+++ b/sitcom_simulator/speech/__init__.py
@@ -0,0 +1 @@
+from sitcom_simulator.speech.speech_generator import generate_voices, add_voices
\ No newline at end of file
diff --git a/sitcom_simulator/speech/integrations/__init__.py b/sitcom_simulator/speech/integrations/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/sitcom_simulator/speech_generator/integrations/fakeyou.py b/sitcom_simulator/speech/integrations/fakeyou.py
similarity index 54%
rename from sitcom_simulator/speech_generator/integrations/fakeyou.py
rename to sitcom_simulator/speech/integrations/fakeyou.py
index ffec6d9..02b9fe0 100644
--- a/sitcom_simulator/speech_generator/integrations/fakeyou.py
+++ b/sitcom_simulator/speech/integrations/fakeyou.py
@@ -1,4 +1,3 @@
-import requests
from tqdm import tqdm
from typing import List, Set, Callable, Optional, Dict
import re
@@ -9,35 +8,57 @@
from ...models import Script
import logging
import random
-from sitcom_simulator.script_generator.integrations.fakeyou.narrators import BACKUP_NARRATORS
+from sitcom_simulator.script.integrations.fakeyou.narrators import BACKUP_NARRATORS
import urllib
import tempfile
import atexit
-JOB_DELAY = 20 # seconds
JOB_RANDOMNESS = 3 # +- this value, might help bypass rate limiting
-POLL_DELAY = 8
POLL_RANDOMNESS = 1
def download_voice(url: str):
- """Downloads audio from a given URL and saves it to a temporary file."""
- logging.info("downloading audio:", url)
+ """
+ Downloads audio from a given URL and saves it to a temporary file.
+
+ :param url: The URL of the audio to download
+ """
+ logging.info(f"Downloading audio from: {url}")
temp_audio_file = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
atexit.register(os.remove, temp_audio_file.name)
+
try:
- with urllib.request.urlopen(url) as response, open(temp_audio_file.name, 'wb') as out_file:
- data = response.read() # Read the content as bytes
+ # uses urllib because AWS lambda doesn't have requests (not that that matters anymore)
+ # Create a request with a browser-like User-Agent (otherwise 403 on FakeYou's new CDN)
+ req = urllib.request.Request(
+ url,
+ headers={
+ "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
+ "Accept": "*/*",
+ "Connection": "keep-alive",
+ "Referer": "https://fakeyou.com/", # FakeYou might check this
+ }
+ )
+
+ # Open the URL and write the content to a file
+ with urllib.request.urlopen(req) as response, open(temp_audio_file.name, 'wb') as out_file:
+ data = response.read()
out_file.write(data)
+
+ logging.info(f"Audio downloaded to: {temp_audio_file.name}")
return temp_audio_file.name
+
except urllib.error.HTTPError as e:
- # Handle HTTP errors
raise Exception(f"Failed to download audio from URL: {url}. Status code: {e.code}")
except urllib.error.URLError as e:
- # Handle URL errors (e.g., network issues)
raise Exception(f"Failed to download audio from URL: {url}. Error: {e.reason}")
def fetch_voicelist():
+ """
+ Fetches the list of available voices from the FakeYou API.
+ """
+ import requests
response = requests.get('https://api.fakeyou.com/tts/list')
+ logging.info("Fetching voice list from fakeyou")
json = response.json()
if(json['success'] != True):
print("Error fetching voice list from fakeyou. Exiting.")
@@ -50,14 +71,28 @@ def string_to_keywords(string: str, stop_at_first_paren=False) -> Set[str]:
return {keyword.lower() for keyword in func(string).split(' ') if len(keyword) > 3 and keyword.lower() not in ['test', 'model']}
def alphanumeric_to_first_paren(string: str) -> str:
- string = string.split('(')[0].strip().replace('-', ' ')
+ """
+ Returns the part of the input string before the first parenthesis, with hyphens turned into spaces and every other non-alphanumeric character replaced by an underscore.
+
+ :param string: The input string
+ """
+ string = string.split('(')[0].strip().replace('-', ' ') # TODO: fix this for names like Reggie Fils-Aime
return alphanumeric(string)
def alphanumeric(string: str):
- return re.sub(r'[^a-zA-Z0-9 ]', '', string)
+ """
+ Replaces every character that is not an ASCII letter, digit, or space with an underscore.
+
+ :param string: The input string
+ """
+ return re.sub(r'[^a-zA-Z0-9 ]', '_', string)
-# scan the prompt for character names
def get_possible_characters_from_prompt(prompt: str) -> dict:
+ """
+ Scans the prompt for character names and returns a dictionary of character names to a list of voice tokens.
+
+ :param prompt: The prompt for the script
+ """
possible_characters: Dict[str, List[str]] = dict()
voices = fetch_voicelist()
prompt_keywords = string_to_keywords(prompt, False)
@@ -79,25 +114,67 @@ def get_possible_characters_from_prompt(prompt: str) -> dict:
return possible_characters
-# takes in array of line models
-def generate_voices(script: Script, on_voice_generated: Optional[Callable[[int, str], None]] = None) -> List[str | None]:
+def sign_in(username_or_email: str, password: str) -> str:
+    """
+    Signs in to the FakeYou API and returns the session cookie.
+
+    :param username_or_email: The FakeYou username or email address to log in with
+    :param password: The password for that account
+    :raises RuntimeError: If the API reports a failed login, or the response has no parsable Set-Cookie header
+    """
+    import requests
+    response = requests.post('https://api.fakeyou.com/v1/login',
+        json={"username_or_email": username_or_email, "password": password}
+    )
+    auth_data = response.json()
+    if not auth_data.get('success'):
+        # logging.exception only makes sense inside an except block; log a plain error
+        # and stop here instead of trying to parse a cookie out of a failed response
+        logging.error("Failed to log in to FakeYou API")
+        raise RuntimeError("Failed to log in to FakeYou API")
+    logging.info("Logged in to FakeYou API")
+    print("Logged in to FakeYou API")
+    # take the value of the first name=value pair in the Set-Cookie header
+    cookie_header = response.headers.get('Set-Cookie') or ''
+    match = re.search(r'\w+.=([^;]+)', cookie_header)
+    if not match:
+        raise RuntimeError("FakeYou login response did not include a session cookie")
+    return match.group(1)
+
+def generate_voices(
+ script: Script,
+ on_voice_url_generated: Optional[Callable[[int, str], None]] = None,
+ job_delay:float=30,
+ poll_delay:float=10,
+ cookie:str|None=None,
+ ) -> List[str | None]:
+ """
+ Sequentially generates voices for each line in the script using the FakeYou API.
+ It is intentionally slow to avoid getting rate limited.
+ It can be sped up by having FAKEYOU_USERNAME and FAKEYOU_PASSWORD set as environment variables.
+
+ :param script: The script to generate voices for
+ :param on_voice_url_generated: A callback function to call when a voice is generated which takes the clip index and the URL of the generated audio
+ :param job_delay: The number of seconds to wait between starting audio generation jobs. Lower values render faster but are more likely to get rate limited
+ :param poll_delay: The number of seconds to wait between polling for audio generation job completion
+ :param cookie: The session cookie to use for the FakeYou API (acquired from sign_in)
+ """
+ import requests
audio_urls: List[str | None] = []
for i, clip in tqdm(enumerate(script.clips), desc="Generating voices", total=len(script.clips)):
# skip if doesn't need audio, or if audio already exists (audio should never already exist, but just in case)
if not clip.speaker:
audio_urls.append(None)
continue
+ if clip.audio_url:
+ audio_urls.append(clip.audio_url)
+ continue
logging.debug(f'Starting voice job {i} ({clip.speaker}: {clip.speaker})')
try:
character = next((character for character in script.characters if character.name == clip.speaker))
- except Exception as e: # probably because character not in characters list
+ except: # probably because character not in characters list
character = random.choice(BACKUP_NARRATORS)
entropy = str(uuid.uuid4())
voice_token = character.voice_token
headers = {
'Accept': 'application/json',
- 'Content-Type': 'application/json'
+ 'Content-Type': 'application/json',
}
+ if cookie:
+ headers['cookie'] = f"session={cookie}"
+ headers["credentials"] = "include"
+
payload = {
"uuid_idempotency_token": entropy,
"tts_model_token": voice_token,
@@ -114,7 +191,7 @@ def generate_voices(script: Script, on_voice_generated: Optional[Callable[[int,
raise Exception("Some sort of FakeYou API error occured", json)
break
job_token = json['inference_job_token']
- rand_job_delay = random.randrange(JOB_DELAY-JOB_RANDOMNESS, JOB_DELAY+JOB_RANDOMNESS)
+ rand_job_delay = random.randrange(job_delay-JOB_RANDOMNESS, job_delay+JOB_RANDOMNESS)
# poll the job until complete
logging.debug(f'Polling voice job {i}')
@@ -125,7 +202,7 @@ def generate_voices(script: Script, on_voice_generated: Optional[Callable[[int,
'Accept': 'application/json'
}
while not completed:
- rand_delay = random.randrange(POLL_DELAY-POLL_RANDOMNESS, POLL_DELAY+POLL_RANDOMNESS)
+ rand_delay = random.randrange(poll_delay-POLL_RANDOMNESS, poll_delay+POLL_RANDOMNESS)
time.sleep(rand_delay)
response = requests.get(f'https://api.fakeyou.com/tts/job/{job_token}', headers=headers)
json = response.json()
@@ -139,10 +216,10 @@ def generate_voices(script: Script, on_voice_generated: Optional[Callable[[int,
completed = True
total_poll_time = time.time() - polling_start_time
audio_path = json["state"]["maybe_public_bucket_wav_audio_path"]
- audio_url = f'https://storage.googleapis.com/vocodes-public{audio_path}'
+ audio_url = f'https://cdn-2.fakeyou.com{audio_path}'
audio_urls.append(audio_url)
- if(on_voice_generated):
- on_voice_generated(i, audio_url)
+ if(on_voice_url_generated):
+ on_voice_url_generated(i, audio_url)
else:
raise Exception("job failed, aborting", json)
break
@@ -150,4 +227,4 @@ def generate_voices(script: Script, on_voice_generated: Optional[Callable[[int,
# sleep the remaining time before next job
remaining_delay = max(0, rand_job_delay - total_poll_time)
time.sleep(remaining_delay)
- return audio_urls
\ No newline at end of file
+ return audio_urls
diff --git a/sitcom_simulator/speech_generator/integrations/gtts.py b/sitcom_simulator/speech/integrations/gtts.py
similarity index 62%
rename from sitcom_simulator/speech_generator/integrations/gtts.py
rename to sitcom_simulator/speech/integrations/gtts.py
index 4802003..bf2051e 100644
--- a/sitcom_simulator/speech_generator/integrations/gtts.py
+++ b/sitcom_simulator/speech/integrations/gtts.py
@@ -1,5 +1,4 @@
import tempfile
-from gtts import gTTS
from typing import List
from ...models import Script
from tqdm import tqdm
@@ -7,7 +6,15 @@
import atexit
import os
-def generate_voices(script: Script, on_voice_generated: Optional[Callable[[int, str], None]] = None) -> List[str | None]:
+def generate_voices(script: Script, on_voice_generated: Optional[Callable[[int, str], None]] = None) -> List[str | None]:
+ """
+ Generates and returns a list of voice clip paths for the given script using the Google Text-to-Speech API.
+ Intended for debugging purposes and ironic memes only.
+
+ :param script: The script to generate voice clips for
+ :param on_voice_generated: A callback to call after each voice clip is generated which takes the clip index and path to the generated audio
+ """
+ from gtts import gTTS
filepaths: List[str | None] = []
for i, line in tqdm(enumerate(script.clips), "Generating voice clips", total=len(script.clips)):
if not line.speech:
diff --git a/sitcom_simulator/speech/speech_generator.py b/sitcom_simulator/speech/speech_generator.py
new file mode 100644
index 0000000..a02d4b5
--- /dev/null
+++ b/sitcom_simulator/speech/speech_generator.py
@@ -0,0 +1,84 @@
+from typing import List, Literal
+from sitcom_simulator.models import Script
+from typing import Optional, Callable
+import os
+
+Engine = Literal["fakeyou", "gtts"]  # names of the speech engines that generate_voices accepts
+
+def generate_voices(
+    script: Script,
+    engine:Engine="fakeyou",
+    on_voice_downloaded: Optional[Callable[[int, str], None]] = None,
+    fakeyou_on_voice_url_generated: Optional[Callable[[int, str], None]] = None,
+    fakeyou_job_delay:int=30,
+    fakeyou_poll_delay:int=10,
+    ):
+    """
+    Generates and returns a list of voice clip paths for the given script using the given engine.
+
+    More procedural in nature than add_voices.
+    This function is typically not used directly, since add_voices is more pleasant to work with.
+
+    :param script: The script to generate voice clips for
+    :param engine: The engine to use for generating voice clips
+    :param on_voice_downloaded: A callback to call after each voice clip is downloaded which takes the clip index and path to the downloaded audio
+    :param fakeyou_on_voice_url_generated: A callback to call after each FakeYou voice clip is generated which takes the clip index and url of the generated audio
+    :param fakeyou_job_delay: The number of seconds to wait between starting audio generation jobs. Lower values render faster but are more likely to get rate limited
+    :param fakeyou_poll_delay: The number of seconds to wait between polling for audio generation job completion
+    :return: The audio paths in clip order. For FakeYou, a clip without an audio url keeps a None placeholder so the list stays index-aligned with the generated urls
+    """
+    from .integrations import fakeyou, gtts
+    if engine != "fakeyou":
+        return gtts.generate_voices(script, on_voice_downloaded)
+    # generating FakeYou voice clips can take a LONG time because of long delays to avoid API timeouts on FakeYou.com
+    username_or_email = os.environ.get('FAKEYOU_USERNAME')
+    password = os.environ.get('FAKEYOU_PASSWORD')
+    fakeyou_cookie = None
+    if username_or_email and password:
+        fakeyou_cookie = fakeyou.sign_in(username_or_email, password)
+    audio_urls = fakeyou.generate_voices(
+        script,
+        fakeyou_on_voice_url_generated,
+        fakeyou_job_delay,
+        fakeyou_poll_delay,
+        cookie=fakeyou_cookie,
+    )
+    audio_paths: List[Optional[str]] = []
+    for i, audio_url in enumerate(audio_urls):
+        if audio_url is None:
+            # keep a placeholder: callers zip this list with script.clips, so dropping entries would shift every later path
+            audio_paths.append(None)
+            continue
+        audio_path = fakeyou.download_voice(audio_url)
+        audio_paths.append(audio_path)
+        if on_voice_downloaded:
+            on_voice_downloaded(i, audio_path)
+    return audio_paths
+
+
+def add_voices(
+    script: Script,
+    engine:Engine="fakeyou",
+    on_voice_generated: Optional[Callable[[int, str], None]] = None,
+    fakeyou_job_delay:int=30,
+    fakeyou_poll_delay:int=10,
+    ):
+    """
+    Given a script, returns the same script but with the audio paths filled in (the functional counterpart of generate_voices).
+
+    :param script: The script to add voices to
+    :param engine: The engine to use for generating voice clips
+    :param on_voice_generated: A callback to call after each voice clip is generated which takes the clip index and the generated audio (its url when the engine is FakeYou, its file path otherwise)
+    :param fakeyou_job_delay: The number of seconds to wait between starting audio generation jobs. Lower values render faster but are more likely to get rate limited. (FakeYou only)
+    :param fakeyou_poll_delay: The number of seconds to wait between polling for audio generation job completion. (FakeYou only)
+    :return: The result of script.replace with each clip's audio_path set to its generated audio path
+    """
+    audio_paths = generate_voices(
+        script,
+        engine=engine,
+        on_voice_downloaded=None if engine == "fakeyou" else on_voice_generated,  # other engines only report through this hook
+        fakeyou_on_voice_url_generated=on_voice_generated,
+        fakeyou_job_delay=fakeyou_job_delay,
+        fakeyou_poll_delay=fakeyou_poll_delay,
+    )
+    return script.replace(clips=[clip.replace(audio_path=audio_path) for clip, audio_path in zip(script.clips, audio_paths)])
\ No newline at end of file
diff --git a/sitcom_simulator/speech_generator/__init__.py b/sitcom_simulator/speech_generator/__init__.py
deleted file mode 100644
index 6a6f1b7..0000000
--- a/sitcom_simulator/speech_generator/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from sitcom_simulator.speech_generator.speech_generator import generate_voices, add_voices
\ No newline at end of file
diff --git a/sitcom_simulator/speech_generator/speech_generator.py b/sitcom_simulator/speech_generator/speech_generator.py
deleted file mode 100644
index 3fd1f88..0000000
--- a/sitcom_simulator/speech_generator/speech_generator.py
+++ /dev/null
@@ -1,30 +0,0 @@
-from .integrations import fakeyou as fakeyou
-from .integrations import gtts as gtts
-from typing import List, Literal
-from sitcom_simulator.models import Script
-from typing import Optional, Callable
-
-Engine = Literal["fakeyou", "gtts"]
-
-def generate_voices(
- script: Script,
- engine:Engine="fakeyou",
- on_voice_generated: Optional[Callable[[int, str], None]] = None
- ):
- # generating voice clips can take a LONG time if args.high_quality_audio == True
- # because of long delays to avoid API timeouts on FakeYou.com
- if engine == "fakeyou":
- audio_urls = fakeyou.generate_voices(script, on_voice_generated)
- audio_paths = [fakeyou.download_voice(audio_url) if audio_url else None for audio_url in audio_urls]
- else:
- audio_paths = gtts.generate_voices(script, on_voice_generated)
- return audio_paths
-
-
-def add_voices(
- script: Script,
- engine:Engine="fakeyou",
- on_voice_generated: Optional[Callable[[int, str], None]] = None
- ):
- audio_paths = generate_voices(script, engine=engine, on_voice_generated=on_voice_generated)
- return script.replace(clips=[clip.replace(audio_path=audio_path) for clip, audio_path in zip(script.clips, audio_paths)])
\ No newline at end of file
diff --git a/sitcom_simulator/user_input.py b/sitcom_simulator/user_input.py
index e0e294c..7c10f8f 100644
--- a/sitcom_simulator/user_input.py
+++ b/sitcom_simulator/user_input.py
@@ -1,10 +1,14 @@
-import tomllib
-import random
from sitcom_simulator.models import Character
-# user selects which auto-detected characters to include in the script
-# debug only, fakeyou has another method for selecting characters
-def select_characters(possible_characters):
+def select_characters(possible_characters: dict[str, list[str]]):
+ """
+ Generic character selection procedure in which the user
+ selects which auto-detected characters to include in the script.
+
+ This function is currently unused since FakeYou has its own character selection procedure.
+
+ :param possible_characters: A dictionary of character names to a list of voice tokens
+ """
print("--- Character Voice Selection ---")
selected_characters = dict()
for name, voices in possible_characters.items():
@@ -18,8 +22,14 @@ def select_characters(possible_characters):
assert len(selected_characters) > 0, "No voices selected. Exiting."
return [Character(name, voice) for name, voice in selected_characters.items()]
-def describe_characters(characters):
- " get visual descriptions for each character from the user "
+def describe_characters(characters: dict[str, str]):
+ """
+ A procedure to prompt the user to visually describe the characters in the script.
+
+ This function is currently unused since the language model descriptions are used instead.
+
+ :param characters: A dictionary of character names to voice tokens (although this should change to a list of Character objects in the future)
+ """
print("\n--- Image Prompt Descriptions ---\n")
character_descriptions = {}
diff --git a/sitcom_simulator/video_generator/__init__.py b/sitcom_simulator/video/__init__.py
similarity index 100%
rename from sitcom_simulator/video_generator/__init__.py
rename to sitcom_simulator/video/__init__.py
diff --git a/sitcom_simulator/video/integrations/__init__.py b/sitcom_simulator/video/integrations/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/sitcom_simulator/video/integrations/ffmpeg.py b/sitcom_simulator/video/integrations/ffmpeg.py
new file mode 100644
index 0000000..1b132bc
--- /dev/null
+++ b/sitcom_simulator/video/integrations/ffmpeg.py
@@ -0,0 +1,387 @@
+import random
+from ...models import Script, Clip
+from typing import List
+import os
+import textwrap
+from tqdm import tqdm
+import tempfile
+import atexit
+from dataclasses import dataclass
+import math
+from typing import Literal
+
+FRAME_RATE = 24  # frames per second used for image inputs, the zoompan effect and the final output
+MAX_CLIP_SECONDS = 15  # upper bound applied to a clip's computed duration in render_clip
+FFMPEG_QUALITY:Literal["ultrafast", "superfast", "veryfast", "faster", "fast", "medium", "slow", "slower", "veryslow"] = "slow"  # encoder preset passed when writing the final concatenated video
+
+@dataclass
+class ShadowSettings:
+    """
+    Shadow options for text drawn onto a video.
+
+    :param color: The shadow color
+    :param alpha: The shadow alpha, appended to the color as ``color@alpha``
+    :param x: The x offset of the shadow
+    :param y: The y offset of the shadow
+    """
+    color: str = 'black'
+    alpha: float = 0.7
+    x: int = 5
+    y: int = 5
+
+    def to_dict(self):
+        """
+        Builds the shadowcolor/shadowx/shadowy keyword arguments for an FFmpeg filter.
+        """
+        return dict(
+            shadowcolor=f"{self.color}@{self.alpha}",
+            shadowx=self.x,
+            shadowy=self.y,
+        )
+
+@dataclass
+class BoxSettings:
+    """
+    Background box options for text drawn onto a video.
+
+    :param color: The box color
+    :param alpha: The box alpha, appended to the color as ``color@alpha``
+    :param border_width: The width of the box border
+    """
+    color: str = 'black'
+    alpha: float = 0.5
+    border_width: int = 10
+
+    def to_dict(self):
+        """
+        Builds the box/boxcolor/boxborderw keyword arguments for an FFmpeg filter.
+        """
+        return dict(
+            box=1,
+            boxcolor=f"{self.color}@{self.alpha}",
+            boxborderw=self.border_width,
+        )
+
+@dataclass
+class CaptionSettings:
+    """
+    Caption options for a video.
+
+    :param font: The path to the font file to use for the captions
+    :param max_width: The maximum width of a caption line, in characters
+    :param y_ratio_from_bottom: How far up from the bottom of the screen the captions sit, as a fraction of the height
+    """
+    font: str = 'Arial'
+    max_width: int = 30
+    y_ratio_from_bottom: float = 6/24
+
+    def formatted_caption(self, text: str):
+        """
+        Wraps the caption text to at most max_width characters per line.
+
+        :param text: The text of the caption
+        :return: The caption string with the line breaks added by textwrap
+        """
+        wrapped = textwrap.fill(text, self.max_width)
+        return wrapped
+
+@dataclass
+class ClipSettings:
+    """
+    Timing and camera-motion options for rendering a single video clip.
+
+    :param clip_buffer_seconds: Time to hold after the characters finish talking
+    :param min_clip_seconds: The shortest time a clip may stay on screen
+    :param speaking_delay_seconds: Time before the audio kicks in
+    :param max_zoom_factor: The upper zoom bound for the pan and zoom effect
+    :param min_zoom_factor: The lower zoom bound for the pan and zoom effect. At least some zoom is necessary for panning.
+    :param max_pan_speed: The upper speed bound for the pan and zoom effect
+    """
+    clip_buffer_seconds: float = 0.15
+    min_clip_seconds: float = 1.5
+    speaking_delay_seconds: float = 0.12
+    max_zoom_factor: float = 1.3 # magic number that seems to work well
+    min_zoom_factor: float = 1.05 # magic number that seems to work well
+    max_pan_speed: float = 6 # magic number that seems to work well
+
+failed_image_captions = [  # one is picked at random and drawn over clips that have neither an image nor a title
+    "This image has been seized by the FBI",
+    "REDACTED",
+    "This image has been classified",
+    "CENSORED",
+    "This image has been confiscated",
+    "This image has been banned in your country",
+    "This image has been quarantined",
+    "[image too dangerous to be seen by human eyes]",
+    "[Intense Violence]",
+    "[Inappropriate Content]",
+    "[Explicit Content]",
+    "[Scandalous Content]",
+    "Image seized by the government",
+]
+
+def render_clip(
+    clip: Clip,
+    width:int=720,
+    height:int=1280,
+    speed:float=1.0,
+    pan_and_zoom:bool=True,
+    clip_settings:ClipSettings=ClipSettings(),
+    caption_settings:CaptionSettings=CaptionSettings(),
+    caption_bg_settings:BoxSettings|ShadowSettings=BoxSettings(),
+    audio_codec:Literal['mp3', 'aac']='mp3',
+    ):
+    """
+    Renders a video clip from the given clip object and returns the path to the rendered video file.
+
+    :param clip: The clip to render
+    :param width: The width of the video
+    :param height: The height of the video
+    :param speed: The speed of the final video. 1.0 is normal speed
+    :param pan_and_zoom: If True, the pan and zoom effect on images will be enabled
+    :param clip_settings: The settings for rendering the video clip
+    :param caption_settings: The settings for the captions
+    :param caption_bg_settings: The settings for the caption background, or None to draw the caption without one
+    :param audio_codec: The audio codec to use for the output video
+    :return: The path to a temporary .mp4 file, which is registered for removal when the interpreter exits
+    :raises Exception: If FFmpeg fails while rendering the clip
+    """
+    width = int(round(width))
+    height = int(round(height))
+
+    import ffmpeg
+    caption = clip.speech or clip.title
+    title_clip = not not clip.title
+    if caption:
+        caption = caption_settings.formatted_caption(caption)
+
+    scale_factor = min(width, height) / 720 # 720 is the reference screen width
+
+    audio_duration = 0 # stays 0 when there is no audio or when probing it fails (best effort)
+    if clip.audio_path:
+        try:
+            audio_path = clip.audio_path.replace('/', '\\') if os.name == 'nt' else clip.audio_path
+            audio_duration = float(ffmpeg.probe(audio_path)['streams'][0]['duration'])
+        except Exception as e:
+            print(f"Error probing audio duration: {e}.\nHave you put ffmpeg and ffprobe binaries into the root project directory?")
+            print(clip.audio_path)
+
+    duration = audio_duration + clip_settings.clip_buffer_seconds + clip_settings.speaking_delay_seconds
+    duration = max(duration, clip_settings.min_clip_seconds)
+    duration = min(duration, MAX_CLIP_SECONDS) # maximum duration for a clip (to prevent long AI audio glitches)
+    duration = duration / speed
+    if clip.duration and not clip.speaker: # 'not speaker' in case the llm forgets proper syntax
+        duration = clip.duration
+
+    no_image = clip.image_path is None
+    seized_image = no_image and not title_clip
+
+    if no_image:
+        # the black placeholder must outlast the clip: setpts below divides its length by `speed`, so a fixed 5s could end before the audio
+        video_input = ffmpeg.input(f'color=c=black:s={width}x{height}:d={max(5.0, duration * speed)}', f='lavfi')
+    else:
+        video_input = ffmpeg.input(clip.image_path, loop=1, framerate=FRAME_RATE)
+        # the zoom effect is jittery for some strange reason
+        # but if we upscale the image first, the jitter is less noticeable
+        # at the cost of slower rendering
+        prezoom_scale_factor = 2 if pan_and_zoom else 1
+        prezoom_scale_width = int(width * prezoom_scale_factor)
+        prezoom_scale_height = int(height * prezoom_scale_factor)
+        video_input = (
+            video_input
+            .filter('scale', prezoom_scale_width, prezoom_scale_height, force_original_aspect_ratio="increase")
+            .filter('crop', prezoom_scale_width, prezoom_scale_height)
+        )
+        if pan_and_zoom:
+            zoom_start = clip_settings.min_zoom_factor # Start at the minimum zoom (some zoom is needed for panning)
+            zoom_end = random.uniform(clip_settings.min_zoom_factor, clip_settings.max_zoom_factor) # Target end zoom level, adjust as needed
+            zoom_out = random.choice([True, False]) # Randomly zoom in or out
+            if zoom_out:
+                zoom_start, zoom_end = zoom_end, zoom_start # Reverse the zoom levels for a zoom out effect
+            total_frames = max(1, int(duration * FRAME_RATE)) # at least one frame, since the expressions below divide by this count
+
+            # Ensure zoom continues smoothly for the entire duration
+            zoom_expr = f'{zoom_start}+(on/{total_frames})*{zoom_end-zoom_start}'
+
+            # Randomly pan the image
+            max_pan = clip_settings.max_pan_speed * (min(width, height) / 720) * prezoom_scale_factor # Maximum pan speed (pixels per frame, scaled to the 720p reference screen width)
+            # the sqrt(total_frames) is to make the pan speed scale with the duration of the clip
+            # so that shorter clips are punchier and longer clips are smoother
+            frame_offset = f"((on-{total_frames/2})/{math.sqrt(total_frames)})"
+            x_expr = f'(iw/2.0-(iw/zoom/2.0))+{random.uniform(-max_pan, max_pan)}*{frame_offset}'
+            y_expr = f'(ih/2.0-(ih/zoom/2.0))+{random.uniform(-max_pan, max_pan)}*{frame_offset}'
+
+            video_input = video_input.zoompan(
+                z=zoom_expr,
+                x=x_expr,
+                y=y_expr,
+                d=1, # Apply the effect continuously across frames
+                s=f'{width}x{height}',
+                fps=FRAME_RATE,
+            )
+
+    speaking_delay_ms = clip_settings.speaking_delay_seconds * 1000
+    # make sure every clip has an audio track, even if it's silent
+    if not clip.audio_path:
+        audio_input = ffmpeg.input('anullsrc', format='lavfi', t=duration).audio
+    else:
+        audio_input = (
+            ffmpeg
+            .input(clip.audio_path)
+            .filter('adelay', f'{speaking_delay_ms}|{speaking_delay_ms}')
+            .filter('apad', pad_dur=duration)
+            .filter('atempo', speed)
+            .filter('speechnorm')
+        )
+
+    # callers may pass None to disable the caption background; the caption is then drawn with no extra options
+    caption_bg_dict = caption_bg_settings.to_dict() if caption_bg_settings is not None else {}
+
+    if caption or seized_image:
+        video_input = video_input.filter(
+            'drawtext',
+            text=caption if caption else random.choice(failed_image_captions),
+            fontfile=caption_settings.font,
+            fontsize=48 * scale_factor, # scales the font size with 720px as the reference screen width
+            fontcolor='white',
+            text_align="M+C", # had to dig deep into FFmpeg source code to learn that you combine flags with a plus sign
+            x='(w - text_w) / 2',
+            y=f'(h - (text_h / 2)) - h*{caption_settings.y_ratio_from_bottom if not title_clip else 0.5}',
+            **caption_bg_dict,
+        )
+
+    video_input = video_input.filter('setpts', f'PTS/{speed}')
+    try:
+        with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as temp_file:
+            (
+                ffmpeg.output(video_input, audio_input, temp_file.name, vcodec='libx264', preset='superfast', acodec=audio_codec, t=duration)
+                .overwrite_output()
+                .run(capture_stderr=True, overwrite_output=True)
+            )
+            atexit.register(os.remove, temp_file.name)
+            return temp_file.name
+    except ffmpeg.Error as e:
+        error_message = e.stderr.decode() if e.stderr else str(e) # Decoding the stderr for better readability
+        print('FFmpeg Error:', error_message)
+        raise Exception(f"ffmpeg error: {error_message}") from e
+
+
+def concatenate_clips(
+    filenames: List[str],
+    output_filename: str,
+    background_music:str|None=None,
+    bgm_volume:float=-24,
+    audio_codec:Literal['mp3', 'aac']='mp3',
+    ):
+    """
+    Combines the given video clips into a single video file and returns the path to the concatenated video file.
+
+    :param filenames: The list of video file paths to combine
+    :param output_filename: The name of the output file. Any ':' and '?' in its file name component are removed
+    :param background_music: The path to the background music file
+    :param bgm_volume: The loudness target passed to FFmpeg's loudnorm filter for the background music, good values are between -24 and -16
+    :param audio_codec: The audio codec to use for the output video
+    :return: The path the video was written to, which is the sanitized output filename
+    """
+    import ffmpeg
+
+    # Create input sets for each file in the list
+    input_clips = [ffmpeg.input(f) for f in filenames]
+
+    # Split the video and audio streams
+    video_streams = [clip.video for clip in input_clips]
+    audio_streams = [clip.audio for clip in input_clips]
+
+    # Concatenate each stream type separately
+    concatenated_video = ffmpeg.concat(*video_streams, v=1, a=0)
+    concatenated_audio = ffmpeg.concat(*audio_streams, v=0, a=1)
+
+    # If background music is provided, normalize its loudness, trim it to the total clip length and mix it with concatenated audio
+    if background_music:
+        total_audio_duration = sum([float(ffmpeg.probe(f)['streams'][0]['duration']) for f in filenames])
+        bgm_input = (
+            ffmpeg
+            .input(background_music)
+            # .filter('volume', str(bgm_volume)) # old way, ~.25 worked well
+            .filter('loudnorm', i=bgm_volume) # new way, more consistent
+            .filter('atrim', duration=total_audio_duration)
+        )
+        concatenated_audio = ffmpeg.filter([concatenated_audio, bgm_input], 'amix') # Mix concatenated audio and bgm
+
+    # only the file name is sanitized, so a drive letter such as 'C:' in the directory part survives
+    directory, basename = os.path.split(output_filename)
+    sanitized_filename = os.path.join(directory, basename.replace(':', '').replace('?', ''))
+
+    # Output the concatenated streams
+    (
+        ffmpeg
+        .output(
+            concatenated_video, concatenated_audio, sanitized_filename,
+            vcodec='libx264',
+            pix_fmt='yuv420p', # necessary for compatibility
+            acodec=audio_codec,
+            r=FRAME_RATE,
+            preset=FFMPEG_QUALITY,
+            **{'b:v': '8000K'}
+        )
+        .overwrite_output()
+        .run(capture_stderr=True)
+    )
+
+    return sanitized_filename
+
+def render_video(
+    script: Script,
+    output_path: str='output.mp4',
+    width:int=720,
+    height:int=1280,
+    speed:float=1.0,
+    pan_and_zoom:bool=True,
+    clip_settings:ClipSettings=ClipSettings(),
+    caption_settings:CaptionSettings=CaptionSettings(),
+    caption_bg_settings:BoxSettings|ShadowSettings=BoxSettings(),
+    bgm_volume:float=-24,
+    audio_codec:Literal['mp3', 'aac']='mp3',
+    ):
+    """
+    Renders every clip of the script, joins the results into one video and returns the path to that video file.
+
+    At present, only 9:16 aspect ratio is supported, but 16:9 and 1:1 will be supported in the future.
+
+    :param script: The script to render
+    :param output_path: Where to save the rendered video
+    :param width: The video width
+    :param height: The video height
+    :param speed: The playback speed of the final video. 1.0 is normal speed
+    :param pan_and_zoom: If True, images get the pan and zoom effect
+    :param clip_settings: The settings used when rendering each clip
+    :param caption_settings: The caption settings
+    :param caption_bg_settings: The caption background settings
+    :param bgm_volume: The volume of the background music, good values are between -24 and -16
+    :param audio_codec: The audio codec of the output video
+    :return: The path returned by concatenate_clips for the final video
+    """
+    # every clip is rendered with the same options; only the clip itself varies
+    shared_options = dict(
+        width=width,
+        height=height,
+        clip_settings=clip_settings,
+        caption_settings=caption_settings,
+        caption_bg_settings=caption_bg_settings,
+        speed=speed,
+        pan_and_zoom=pan_and_zoom,
+        audio_codec=audio_codec,
+    )
+
+    progress = tqdm(script.clips, desc="Rendering intermediate video clips")
+    intermediate_clips = [render_clip(clip=clip, **shared_options) for clip in progress]
+
+    # join the intermediate files, mixing in the script's background music
+    print("Rendering final video...")
+    return concatenate_clips(
+        intermediate_clips,
+        output_path,
+        background_music=script.metadata.bgm_path,
+        bgm_volume=bgm_volume,
+        audio_codec=audio_codec,
+    )
\ No newline at end of file
diff --git a/sitcom_simulator/video/integrations/moviepy.py b/sitcom_simulator/video/integrations/moviepy.py
new file mode 100644
index 0000000..46edf53
--- /dev/null
+++ b/sitcom_simulator/video/integrations/moviepy.py
@@ -0,0 +1,68 @@
+# from moviepy.editor import *
+# from ...models import SpeechClip
+# from typing import List
+
+# def generate_movie(
+# dialogs: List[SpeechClip],
+# font: str,
+# output_path="output.mp4",
+# width:int=720,
+# height:int=1280,
+# clip_buffer_seconds=0.35, # how much time to wait after characters finish talking
+# min_clip_length=1.5, # minimum time to hold on a clip
+# ):
+# """
+# MoviePy backend for generating videos.
+
+# While it still mostly works, it is more limited in functionality than the FFmpeg backend and has thus been deprecated.
+# """
+# dialog_clips = []
+# for dialog in dialogs:
+
+# voiceover = AudioFileClip(dialog.audio)
+
+# # calculate the duration
+# duration = voiceover.duration + clip_buffer_seconds
+# if(duration < min_clip_length):
+# duration = min_clip_length
+
+# # black background
+# bg = ColorClip(size=(width,height), color=[0,0,0])
+# bg = bg.set_duration(duration)
+# bg = bg.set_audio(voiceover)
+
+# # the image
+# img_clip = ImageClip(dialog.image)
+# img_clip = img_clip.resize(width/img_clip.w,height/img_clip.h)
+# img_clip = img_clip.set_duration(duration)
+# img_clip = img_clip.set_fps(24)
+# img_clip = img_clip.set_position(('center', 'top'))
+
+# # the caption
+# raw_caption = dialog.caption
+# raw_caption_queue = raw_caption
+# caption = ""
+# # generate line breaks as necessary
+# max_chars_per_line = 30
+# char_counter = 0
+# while(len(raw_caption_queue) > 0):
+# split = raw_caption_queue.split(' ')
+# if(char_counter + len(split[0]) + 1 < max_chars_per_line):
+# caption += " "
+# char_counter += 1
+# else:
+# caption += "\n"
+# char_counter = 0
+# caption += split[0]
+# char_counter += len(split[0])
+# raw_caption_queue = " ".join(split[1:])
+
+# txt_clip = TextClip(caption, fontsize=48, font=font, color='white', size=(width, height - img_clip.h))
+# txt_clip = txt_clip.set_position(('center', 1-float(height-img_clip.h)/float(height)), relative=True).set_duration(duration)
+
+# video = CompositeVideoClip([bg, img_clip, txt_clip])
+# video = video.set_fps(24)
+# dialog_clips.append(video)
+
+# final_clip = concatenate_videoclips(dialog_clips)
+# final_clip.write_videofile(output_path)
diff --git a/sitcom_simulator/video/video_generator.py b/sitcom_simulator/video/video_generator.py
new file mode 100644
index 0000000..ecc14a0
--- /dev/null
+++ b/sitcom_simulator/video/video_generator.py
@@ -0,0 +1,130 @@
+from typing import List, Literal
+from ..models import Script
+
+def render_video(
+    script: Script,
+    font: str,
+    output_path="output.mp4",
+    resolution:int=1080,
+    orientation:str="portrait",
+    speed:float=1.0,
+    pan_and_zoom:bool=True,
+    clip_buffer_seconds:float=0.35,
+    min_clip_seconds:float=1.5,
+    speaking_delay_seconds:float=0.12,
+    caption_bg_style:Literal['box_shadow', 'text_shadow', 'none']='text_shadow',
+    caption_bg_alpha:float=0.6,
+    caption_bg_color:str="black",
+    caption_bg_shadow_distance_x:float=5,
+    caption_bg_shadow_distance_y:float=5,
+    max_zoom_factor:float=1.3,
+    min_zoom_factor:float=1.05,
+    max_pan_speed:float=6,
+    bgm_volume:float=-24,
+    audio_codec:Literal['mp3', 'aac']='mp3',
+    ):
+    """
+    Renders a video from the given script and returns the path to the rendered video.
+
+    :param script: The script to render
+    :param font: The path to the font file to use
+    :param output_path: The path to save the rendered video to
+    :param resolution: The length of the video's shorter side: its width in portrait mode, its height in landscape mode, and both in square mode.
+    :param orientation: The orientation of the video. "landscape", "portrait", or "square".
+    :param speed: The speed of the final video. 1.0 is normal speed.
+    :param pan_and_zoom: If True, the pan and zoom effect on images will be enabled.
+    :param clip_buffer_seconds: How much time to wait after characters finish talking
+    :param min_clip_seconds: The minimum time to hold on a clip
+    :param speaking_delay_seconds: How much time to wait before a character starts talking
+    :param caption_bg_style: The style of the background behind the captions
+    :param caption_bg_alpha: The alpha of the background behind the captions
+    :param caption_bg_color: The color of the background behind the captions
+    :param caption_bg_shadow_distance_x: The x distance of the shadow behind the captions
+    :param caption_bg_shadow_distance_y: The y distance of the shadow behind the captions
+    :param max_zoom_factor: The maximum zoom factor for pan and zoom
+    :param min_zoom_factor: The minimum zoom factor for pan and zoom
+    :param max_pan_speed: The maximum pan speed for pan and zoom
+    :param bgm_volume: The volume of the background music
+    :param audio_codec: The audio codec to use for the video. mp3 seems to be more compatible with more video players, but aac is higher quality and is necessary for viewing videos in an iPhone browser.
+    :return: The path to the rendered video, as returned by the FFmpeg integration
+    """
+    import logging
+    import requests
+    import tempfile
+
+    def download_to_temp_file(url: str, suffix: str) -> str:
+        """Downloads the url into a new temporary file with the given suffix and returns the file's path."""
+        response = requests.get(url)
+        response.raise_for_status() # an HTTP error page must not be saved as if it were the media
+        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as temp_file:
+            temp_file.write(response.content)
+        return temp_file.name
+
+    # rely on image_path first, but if it's not there and image_url is, download the image
+    for i, clip in enumerate(script.clips):
+        if clip.image_path or not clip.image_url:
+            continue
+        try:
+            clip.image_path = download_to_temp_file(clip.image_url, ".png")
+        except Exception as e:
+            logging.error(f"Failed to download image for clip {i}: {e}")
+
+    # same thing but with audio
+    for i, clip in enumerate(script.clips):
+        if clip.audio_path or not clip.audio_url:
+            continue
+        try:
+            # the suffix needs its leading dot, otherwise the temp file ends in e.g. 'wav' rather than '.wav'
+            ext = clip.audio_url.split('.')[-1]
+            clip.audio_path = download_to_temp_file(clip.audio_url, "." + ext)
+        except Exception as e:
+            logging.error(f"Failed to download audio for clip {i}: {e}")
+
+    from .integrations import ffmpeg
+    from .integrations.ffmpeg import ClipSettings, CaptionSettings, BoxSettings, ShadowSettings
+
+    if caption_bg_style == 'box_shadow':
+        caption_bg_settings = BoxSettings(
+            alpha=caption_bg_alpha,
+            color=caption_bg_color,
+        )
+    elif caption_bg_style == 'text_shadow':
+        caption_bg_settings = ShadowSettings(
+            alpha=caption_bg_alpha,
+            color=caption_bg_color,
+            x=caption_bg_shadow_distance_x,
+            y=caption_bg_shadow_distance_y,
+        )
+    else:
+        # 'none': the renderer calls to_dict() on these settings, so passing None would crash it;
+        # a fully transparent shadow with no offset is used as a background that leaves the text plain
+        caption_bg_settings = ShadowSettings(alpha=0.0, x=0, y=0)
+
+    aspect_ratio = 16 / 9
+
+    width, height = {
+        "landscape": (resolution * aspect_ratio, resolution),
+        "portrait": (resolution, resolution * aspect_ratio),
+        "square": (resolution, resolution),
+    }[orientation]
+
+    return ffmpeg.render_video(
+        script=script,
+        output_path=output_path,
+        width=width,
+        height=height,
+        speed=speed,
+        pan_and_zoom=pan_and_zoom,
+        caption_settings=CaptionSettings(font=font),
+        clip_settings=ClipSettings(
+            clip_buffer_seconds=clip_buffer_seconds,
+            min_clip_seconds=min_clip_seconds,
+            speaking_delay_seconds=speaking_delay_seconds,
+            max_zoom_factor=max_zoom_factor,
+            min_zoom_factor=min_zoom_factor,
+            max_pan_speed=max_pan_speed,
+        ),
+        caption_bg_settings=caption_bg_settings,
+        bgm_volume=bgm_volume,
+        audio_codec=audio_codec,
+    )
\ No newline at end of file
diff --git a/sitcom_simulator/video_generator/integrations/ffmpeg.py b/sitcom_simulator/video_generator/integrations/ffmpeg.py
deleted file mode 100644
index c16d302..0000000
--- a/sitcom_simulator/video_generator/integrations/ffmpeg.py
+++ /dev/null
@@ -1,180 +0,0 @@
-import ffmpeg
-from ...models import Script, Clip
-from typing import List
-import os
-import textwrap
-from tqdm import tqdm
-import tempfile
-import atexit
-
-def render_clip(
- clip: Clip,
- font: str,
- width:int=720,
- height:int=1280,
- clip_buffer_seconds=0.15, # how much time to wait after characters finish talking
- min_clip_seconds=1.5, # minimum time to hold on a clip
- speaking_delay_seconds=0.12, # how long after the clip the audio kicks in
- caption_max_width=30,
- ):
- caption = clip.speech
- if caption:
- caption = textwrap.fill(caption, width=caption_max_width)
-
- subtitle_y_ratio_from_bottom = 6/24
- scale_factor = width / 720
-
- # If you want to add a shadow:
- shadow_style = ":shadowcolor=black@0.7:shadowx=3:shadowy=3"
- # If you want to add a transparent grey background box:
- box_style = ":box=1:boxcolor=black@0.4:boxborderw=10"
- subtitle_style = box_style # + box_style # mix and match as desired
-
- try:
- audio_duration = float(ffmpeg.probe(clip.audio_path.replace('/', '\\'))['streams'][0]['duration']) if clip.audio_path else 0
- except Exception as e:
- print(f"Error probing audio duration: {e}.\nHave you put ffmpeg and ffprobe binaries into the root project directory?")
- raise e
-
- duration = audio_duration + clip_buffer_seconds + speaking_delay_seconds
- duration = max(duration, min_clip_seconds)
- if clip.duration and not clip.speaker: # 'not speaker' in case the llm forgets proper syntax
- duration = clip.duration
-
- if clip.image_path is None:
- video_input = ffmpeg.input(f'color=c=black:s={width}x{height}:d=5', f='lavfi')
- else:
- video_input = (
- ffmpeg.input(clip.image_path, loop=1, framerate=24)
- .filter('scale', width, height, force_original_aspect_ratio="increase")
- .filter('crop', width, height)
- )
-
- speaking_delay_ms =speaking_delay_seconds * 1000
-
- # make sure every clip has an audio track, even if it's silent
- if clip.audio_path is None:
- audio_input = ffmpeg.input('anullsrc', format='lavfi', t=duration).audio
- else:
- audio_input = (
- ffmpeg
- .input(clip.audio_path)
- .filter('adelay', f'{speaking_delay_ms}|{speaking_delay_ms}')
- .filter('apad', pad_dur=duration)
- )
-
- # Modify the video input to include subtitles
-
- if caption:
- video_input = video_input.filter(
- 'drawtext',
- text=caption,
- fontfile=font,
- fontsize=42 * scale_factor, # scales the font size with 720px as the reference screen width
- fontcolor='white',
- text_align="M+C", # had to dig deep into FFmpeg source code to learn that you combine flags with a plus sign
- x='(w - text_w) / 2',
- y=f'(h - (text_h / 2)) - h*{subtitle_y_ratio_from_bottom}', **{
- "shadowcolor": "black@0.6",
- "shadowx": -4 * scale_factor,
- "shadowy": 4 * scale_factor,
- } if subtitle_style == shadow_style else {
- "box": 1,
- "boxcolor": "black@0.5",
- "boxborderw": 10 * scale_factor
- })
-
- try:
- input_streams = [video_input] if audio_input is None else [video_input, audio_input]
- with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as temp_file:
- intermediate_clip = (
- ffmpeg.output(*input_streams, temp_file.name, vcodec='libx264', preset='superfast', acodec='mp3', t=duration)
- .run(overwrite_output=True, capture_stderr=True)
- )
- atexit.register(os.remove, temp_file.name)
- return temp_file.name
- except ffmpeg.Error as e:
- print('FFmpeg Error:', e.stderr.decode() if e.stderr else str(e)) # Decoding the stderr for better readability
- raise Exception("ffmpeg error:", e.stderr if e.stderr else str(e))
-
-
-def concatenate_clips(
- filenames: List[str],
- output_filename: str,
- background_music:str|None=None,
- bgm_volume:float=0.25,
- ):
-
- # Create input sets for each file in the list
- input_clips = [ffmpeg.input(f) for f in filenames]
-
- # Split the video and audio streams
- video_streams = [clip.video for clip in input_clips]
- audio_streams = [clip.audio for clip in input_clips]
-
- # Concatenate each stream type separately
- concatenated_video = ffmpeg.concat(*video_streams, v=1, a=0)
- concatenated_audio = ffmpeg.concat(*audio_streams, v=0, a=1)
-
- total_audio_duration = sum([float(ffmpeg.probe(f)['streams'][0]['duration']) for f in filenames])
-
- # If background music is provided, adjust its volume and mix it with concatenated audio
- if background_music:
- bgm_input = (
- ffmpeg
- .input(background_music)
- .filter('volume', str(bgm_volume))
- .filter('atrim', duration=total_audio_duration)
- )
- concatenated_audio = ffmpeg.filter([concatenated_audio, bgm_input], 'amix') # Mix concatenated audio and bgm
-
- sanitized_filename = output_filename.replace(':', '').replace('?', '')
-
- # Output the concatenated streams
- (
- ffmpeg
- .output(
- concatenated_video,
- concatenated_audio,
- sanitized_filename,
- vcodec='libx264',
- pix_fmt='yuv420p', # necessary for compatibility
- acodec='mp3',
- r=24,
- **{'b:v': '8000K'}
- )
- .overwrite_output()
- .run()
- )
-
- return sanitized_filename
-
-# TODO: support aspect ratios 16:9 and 1:1
-def render_video(
- script: Script,
- font: str,
- output_path: str = 'output.mp4',
- width:int=720,
- height:int=1280,
- clip_buffer_seconds=0.15, # how much time to wait after characters finish talking
- min_clip_length=1.5, # minimum time to hold on a clip
- speaking_delay_seconds=0.12, # how long after the clip the audio kicks in
- caption_max_width=30,
- ):
- intermediate_clips = []
- for clip in tqdm(script.clips, desc="Rendering intermediate video clips"):
- clip_file = render_clip(
- clip=clip,
- font=font,
- width=width,
- height=height,
- clip_buffer_seconds=clip_buffer_seconds,
- min_clip_seconds=min_clip_length,
- speaking_delay_seconds=speaking_delay_seconds,
- caption_max_width=caption_max_width,
- )
- intermediate_clips.append(clip_file)
-
- final_video_path = concatenate_clips(intermediate_clips, output_path, background_music=script.metadata.bgm_path)
-
- return final_video_path
\ No newline at end of file
diff --git a/sitcom_simulator/video_generator/integrations/moviepy.py b/sitcom_simulator/video_generator/integrations/moviepy.py
deleted file mode 100644
index ece6204..0000000
--- a/sitcom_simulator/video_generator/integrations/moviepy.py
+++ /dev/null
@@ -1,68 +0,0 @@
-from moviepy.editor import *
-from ...models import SpeechClip
-from typing import List
-
-def generate_movie(
- dialogs: List[SpeechClip],
- font: str,
- output_path="output.mp4",
- width:int=720,
- height:int=1280,
- clip_buffer_seconds=0.35, # how much time to wait after characters finish talking
- min_clip_length=1.5, # minimum time to hold on a clip
- ):
- """
- MoviePy backend for generating videos.
-
- While it still mostly works, it is more limited in functionality than the FFmpeg backend and has thus been deprecated.
- """
- dialog_clips = []
- for dialog in dialogs:
-
- voiceover = AudioFileClip(dialog.audio)
-
- # calculate the duration
- duration = voiceover.duration + clip_buffer_seconds
- if(duration < min_clip_length):
- duration = min_clip_length
-
- # black background
- bg = ColorClip(size=(width,height), color=[0,0,0])
- bg = bg.set_duration(duration)
- bg = bg.set_audio(voiceover)
-
- # the image
- img_clip = ImageClip(dialog.image)
- img_clip = img_clip.resize(width/img_clip.w,height/img_clip.h)
- img_clip = img_clip.set_duration(duration)
- img_clip = img_clip.set_fps(24)
- img_clip = img_clip.set_position(('center', 'top'))
-
- # the caption
- raw_caption = dialog.caption
- raw_caption_queue = raw_caption
- caption = ""
- # generate line breaks as necessary
- max_chars_per_line = 30
- char_counter = 0
- while(len(raw_caption_queue) > 0):
- split = raw_caption_queue.split(' ')
- if(char_counter + len(split[0]) + 1 < max_chars_per_line):
- caption += " "
- char_counter += 1
- else:
- caption += "\n"
- char_counter = 0
- caption += split[0]
- char_counter += len(split[0])
- raw_caption_queue = " ".join(split[1:])
-
- txt_clip = TextClip(caption, fontsize=48, font=font, color='white', size=(width, height - img_clip.h))
- txt_clip = txt_clip.set_position(('center', 1-float(height-img_clip.h)/float(height)), relative=True).set_duration(duration)
-
- video = CompositeVideoClip([bg, img_clip, txt_clip])
- video = video.set_fps(24)
- dialog_clips.append(video)
-
- final_clip = concatenate_videoclips(dialog_clips)
- final_clip.write_videofile(output_path)
diff --git a/sitcom_simulator/video_generator/video_generator.py b/sitcom_simulator/video_generator/video_generator.py
deleted file mode 100644
index e5749c9..0000000
--- a/sitcom_simulator/video_generator/video_generator.py
+++ /dev/null
@@ -1,22 +0,0 @@
-from typing import List
-from ..models import Script
-from .integrations import ffmpeg
-
-def render_video(
- script: Script,
- font: str,
- output_path="output.mp4",
- width:int=1080,
- height:int=1920,
- clip_buffer_seconds=0.35, # how much time to wait after characters finish talking
- min_clip_length=1.5, # minimum time to hold on a clip
- ):
- return ffmpeg.render_video(
- script=script,
- font=font,
- output_path=output_path,
- width=width,
- height=height,
- clip_buffer_seconds=clip_buffer_seconds,
- min_clip_length=min_clip_length
- )
\ No newline at end of file