Naive 2-word splitter. Refs #1

ceefour · Mar 3, 2015 · 2b3eeb6 · 2b3eeb6
1 parent f8c96f5
commit 2b3eeb6
Show file tree

Hide file tree

Showing 6 changed files with 132 additions and 23 deletions.
diff --git a/README.md b/README.md
@@ -14,20 +14,6 @@ Until [TarsosDSP](https://github.com/JorenSix/TarsosDSP) is in Maven Central, yo
 
         mvn install:install-file -Dfile=/together/project_amanah/lumen/speech/TarsosDSP-2.0-with-sources.jar -Dpackaging=jar -DgroupId=be.tarsos.dsp -DartifactId=tarsosdsp -Dversion=2.0 -Dclassifier=sources
 
-## JMathStudio -- NO LONGER USED
-
-Until [JMathStudio](http://sourceforge.net/projects/jmathstudio/) is in Maven Central, you'll need to install it first.
-
-1. Download [JMathStudio](http://sourceforge.net/projects/jmathstudio/) ZIP and extract to `~/tmp`.
-2. Install the main JAR:
-
-        mvn install:install-file -Dfile=$HOME/tmp/JMathStudio_Package/Bin/JMathStudio.jar -Dpackaging=jar -DgroupId=org.jmathstudio -DartifactId=jmathstudio -Dversion=1.2.0
-
-3. Create the Javadoc JAR then install it:
-
-        jar -cvf ~/tmp/jmathstudio-1.2.0-javadoc.jar -C "$HOME/tmp/JMathStudio_Package/API Doc" .
-        mvn install:install-file -Dfile=$HOME/tmp/jmathstudio-1.2.0-javadoc.jar -Dpackaging=jar -DgroupId=org.jmathstudio -DartifactId=jmathstudio -Dversion=1.2.0 -Dclassifier=javadoc
-
 ## Preparing WAV PCM Mono Audio
 
 TarsosDSP only supports mono, while Java only supports WAV PCM (among other less useful formats).
@@ -43,3 +29,65 @@ If you want to take only left channel:
 If you want to take only right channel:
 
     avconv -i Dongeng_Anak_Pengantar_Tidur_Balas_Budi_Burung_Bangau.mp4 -vn -af pan=1:c0=c1 dongeng-bangau-right.wav
+
+## Audio Output Format
+
+We'll be using the ACID Loop File format (i.e. enhanced WAV) when generating all output audio files.
+
+Thanks, I've found a RIFF viewer, and specification of the riff chunks now.
+In my example wav, I see a chunk called 'acid' which is 24 bytes. I suppose this will contain the tempo information, but I haven't figured out yet how this field is structured.
+
+Update: My test file had tempo 138.00 BPM
+I couldn't find 138 either in ASCII or in integer format in the acid tag, but 138 appears to be 00 00 0A 43 in floating point format, and these were exactly the last 4 bytes of the acid chunk.
+Now I still need to find out if the tempo is at a fixed offset in the tag, or if there's some other way to know where the tempo is located.
+The acid chunk that Fruity Loops created was 24 bytes long btw.
+
+Via http://www.kvraudio.com/forum/viewtopic.php?p=3061898#p3061898 :
+
+    ** The acid chunk goes a little something like this:
+    **
+    ** 4 bytes          'acid'
+    ** 4 bytes (int)     length of chunk starting at next byte
+    **
+    ** 4 bytes (int)     type of file:
+    **        this appears to be a bit mask, however some combinations
+    **        are probably impossible and/or qualified as "errors"
+    **
+    **        0x01 On: One Shot         Off: Loop
+    **        0x02 On: Root note is Set Off: No root
+    **        0x04 On: Stretch is On,   Off: Stretch is OFF
+    **        0x08 On: Disk Based       Off: Ram based
+    **        0x10 On: ??????????       Off: ????????? (Acidizer puts that ON)
+    **
+    ** 2 bytes (short)      root note
+    **        if type 0x10 is OFF : [C,C#,(...),B] -> [0x30 to 0x3B]
+    **        if type 0x10 is ON  : [C,C#,(...),B] -> [0x3C to 0x47]
+    **         (both types fit on same MIDI pitch albeit different octaves, so who cares)
+    **
+    ** 2 bytes (short)      ??? always set to 0x8000
+    ** 4 bytes (float)      ??? seems to be always 0
+    ** 4 bytes (int)        number of beats
+    ** 2 bytes (short)      meter denominator   //always 4 in SF/ACID
+    ** 2 bytes (short)      meter numerator     //always 4 in SF/ACID
+    **                      //are we sure about the order?? usually its num/denom
+    ** 4 bytes (float)      tempo
+
+TBD: Use FluidSynth's / GrandOrgue's format?
+TBD: RIFF Wave Cue-Point chunks: http://sharkysoft.com/archive/lava/docs/javadocs/lava/riff/wave/doc-files/riffwave-content.htm
+
+## Legacy Documentation
+
+### JMathStudio -- NO LONGER USED
+
+Until [JMathStudio](http://sourceforge.net/projects/jmathstudio/) is in Maven Central, you'll need to install it first.
+
+1. Download [JMathStudio](http://sourceforge.net/projects/jmathstudio/) ZIP and extract to `~/tmp`.
+2. Install the main JAR:
+
+        mvn install:install-file -Dfile=$HOME/tmp/JMathStudio_Package/Bin/JMathStudio.jar -Dpackaging=jar -DgroupId=org.jmathstudio -DartifactId=jmathstudio -Dversion=1.2.0
+
+3. Create the Javadoc JAR then install it:
+
+        jar -cvf ~/tmp/jmathstudio-1.2.0-javadoc.jar -C "$HOME/tmp/JMathStudio_Package/API Doc" .
+        mvn install:install-file -Dfile=$HOME/tmp/jmathstudio-1.2.0-javadoc.jar -Dpackaging=jar -DgroupId=org.jmathstudio -DartifactId=jmathstudio -Dversion=1.2.0 -Dclassifier=javadoc
+
diff --git a/pom.xml b/pom.xml
@@ -35,6 +35,11 @@
             <artifactId>guava</artifactId>
             <version>18.0</version>
         </dependency>
+        <dependency>
+            <groupId>commons-io</groupId>
+            <artifactId>commons-io</artifactId>
+            <version>2.4</version>
+        </dependency>
 
         <dependency>
 			<groupId>org.springframework.boot</groupId>

diff --git a/src/main/java/id/ac/itb/lumen/sinsymaker/LineExtractor.java b/src/main/java/id/ac/itb/lumen/sinsymaker/LineExtractor.java
@@ -40,14 +40,15 @@ public void init() {
         subtitle = subtitleReader.read(subtitleFile);
     }
 
-    public void extractAll() {
+    public void extractAll(File destDir) {
+        destDir.mkdirs();
         final ArrayList<SegmentWaveformWriter> segmentWriters = new ArrayList<>();
         for (final Caption caption : subtitle.captions.values()) {
             int strictFrameStart = Math.round(format.getSampleRate() * caption.start.getMseconds() / 1000f);
             int strictFrameEnd = Math.round(format.getSampleRate() * caption.end.getMseconds() / 1000f);
             int frameStart = Math.round(strictFrameStart - ATTACK_MS / 1000f * format.getSampleRate());
             int frameEnd = Math.round(strictFrameEnd + DECAY_MS / 1000f * format.getSampleRate());
-            final File file = new File(System.getProperty("user.home") + "/tmp", caption.content + ".wav");
+            final File file = new File(destDir, caption.content + ".wav");
             final SegmentWaveformWriter segmentWriter = new SegmentWaveformWriter(format, file.getPath(), frameStart, frameEnd, ATTACK_MS, DECAY_MS);
             segmentWriters.add(segmentWriter);
         }

diff --git a/src/main/java/id/ac/itb/lumen/sinsymaker/Segment2WordsApp.java b/src/main/java/id/ac/itb/lumen/sinsymaker/Segment2WordsApp.java
@@ -0,0 +1,55 @@
+package id.ac.itb.lumen.sinsymaker;
+
+import be.tarsos.dsp.AudioDispatcher;
+import be.tarsos.dsp.io.TarsosDSPAudioFormat;
+import be.tarsos.dsp.io.jvm.AudioDispatcherFactory;
+import com.google.common.base.Splitter;
+import org.apache.commons.io.FilenameUtils;
+import org.springframework.boot.CommandLineRunner;
+import org.springframework.boot.autoconfigure.SpringBootApplication;
+import org.springframework.boot.builder.SpringApplicationBuilder;
+import org.springframework.context.annotation.Profile;
+
+import java.io.File;
+import java.util.List;
+
+@SpringBootApplication
+@Profile("segment2words")
+public class Segment2WordsApp implements CommandLineRunner { // Naive splitter: cuts one two-word clause recording into two per-word WAV files at an estimated boundary.
+
+    public static void main(String[] args) { // Entry point: starts Spring Boot with the "segment2words" profile active.
+        new SpringApplicationBuilder(Segment2WordsApp.class)
+                .profiles("segment2words")
+                .run(args);
+    }
+
+    @Override
+    public void run(String... args) throws Exception { // Input file and output directory are hard-coded below; args is not read.
+        final TarsosDSPAudioFormat format = new TarsosDSPAudioFormat(44100, 16, 1, true, false);
+        final File clauseAudioFile = new File("/together/project_amanah/lumen/speech/expressive/clauses/perangkap oleh.wav");
+        final File targetDir = new File("/together/project_amanah/lumen/speech/expressive/words");
+        targetDir.mkdirs();
+
+        final String clause = FilenameUtils.getBaseName(clauseAudioFile.getName()); // The clause text is taken from the audio file's base name.
+        final List<String> splitClause = Splitter.on(' ').splitToList(clause); // Only elements 0 and 1 are used below, so the name must contain at least one space.
+        double estMedianPos = splitClause.get(0).length() * 1d / (splitClause.get(0).length() + splitClause.get(1).length()); // Boundary estimate: first word's share of the two words' combined character count.
+
+        final AudioDispatcher dispatcher = AudioDispatcherFactory.fromFile(clauseAudioFile, 2048, 0);
+        long estMedianFrame = Math.round(estMedianPos * dispatcher.durationInFrames()); // Scale the character-ratio estimate to a frame index within the clause.
+        final int ATTACK_MS = 500;
+        final int RELEASE_MS = 500;
+        final int attackFrames = Math.round(ATTACK_MS / 1000f * format.getSampleRate()); // Milliseconds converted to frames at the format's sample rate.
+        final int releaseFrames = Math.round(RELEASE_MS / 1000f * format.getSampleRate());
+        // with attack + release
+        dispatcher.addAudioProcessor(new SegmentWaveformWriter(format, new File(targetDir, splitClause.get(0) + ".wav").getPath(), // First word: frame 0 up to the estimated boundary plus the release margin.
+            0, estMedianFrame + releaseFrames, attackFrames, releaseFrames));
+        dispatcher.addAudioProcessor(new SegmentWaveformWriter(format, new File(targetDir, splitClause.get(1) + ".wav").getPath(), // Second word: boundary minus the attack margin up to the end of the clause.
+            estMedianFrame - attackFrames, dispatcher.durationInFrames(), attackFrames, releaseFrames)); // NOTE(review): the start frame is negative when the boundary is fewer than attackFrames from the start -- confirm the writer tolerates that.
+        // strict splitter
+//        dispatcher.addAudioProcessor(new SegmentWaveformWriter(format, new File(targetDir, splitClause.get(0) + ".wav").getPath(),
+//            attackFrames, estMedianFrame, 0, 0));
+//        dispatcher.addAudioProcessor(new SegmentWaveformWriter(format, new File(targetDir, splitClause.get(1) + ".wav").getPath(),
+//            estMedianFrame, dispatcher.durationInFrames() - releaseFrames, 0, 0));
+        dispatcher.run(); // Runs the dispatcher on the calling thread with both writers attached.
+    }
+}
diff --git a/src/main/java/id/ac/itb/lumen/sinsymaker/SegmentWaveformWriter.java b/src/main/java/id/ac/itb/lumen/sinsymaker/SegmentWaveformWriter.java
@@ -22,8 +22,8 @@ public class SegmentWaveformWriter implements AudioProcessor {
     private final AudioFormat format;
     private final File rawOutputFile;
     private final String fileName;
-    private final int frameStart;
-    private final int frameEnd;
+    private final long frameStart;
+    private final long frameEnd;
     private final int attack;
     private final int decay;
     private BufferedOutputStream rawOutputStream;
@@ -45,7 +45,7 @@ public class SegmentWaveformWriter implements AudioProcessor {
      * @param format The format of the received bytes.
      * @param fileName The name of the wav file to store.
      */
-    public SegmentWaveformWriter(final AudioFormat format, final String fileName, int frameStart, int frameEnd,
+    public SegmentWaveformWriter(final AudioFormat format, final String fileName, long frameStart, long frameEnd,
                                  int attackFrames, int decayFrames){
         this.format = format;
         this.frameStart = frameStart;
@@ -73,16 +73,16 @@ public SegmentWaveformWriter(final AudioFormat format, final String fileName, in
         }
     }
 
-    public SegmentWaveformWriter(final TarsosDSPAudioFormat format, final String fileName, int frameStart, int frameEnd,
+    public SegmentWaveformWriter(final TarsosDSPAudioFormat format, final String fileName, long frameStart, long frameEnd,
                                  int attack, int decay) {
         this(JVMAudioInputStream.toAudioFormat(format), fileName, frameStart, frameEnd, attack, decay);
     }
 
-    public int getFrameStart() {
+    public long getFrameStart() {
         return frameStart;
     }
 
-    public int getFrameEnd() {
+    public long getFrameEnd() {
         return frameEnd;
     }
 

diff --git a/src/main/java/id/ac/itb/lumen/sinsymaker/SinsyMakerApplication.java b/src/main/java/id/ac/itb/lumen/sinsymaker/SinsyMakerApplication.java
@@ -24,6 +24,6 @@ public void run(String... args) throws Exception {
         lineExtractor.sourceAudioFile = new File("/together/project_amanah/lumen/speech/expressive/dongeng-bangau.wav");
         lineExtractor.subtitleFile = new File("/together/project_amanah/lumen/speech/expressive/dongeng-bangau.ass");
         lineExtractor.init();
-        lineExtractor.extractAll();
+        lineExtractor.extractAll(new File("/together/project_amanah/lumen/speech/expressive/clauses"));
     }
 }