🎙️ Voice Command System with Python Whisper API & Unity Integration

Phase 1 installs Python 3.10.10 and the Whisper model packages and runs a local transcription server on port 8000. Phase 2 adds the Unity integration: a Canvas UI (Start/Stop buttons, microphone Dropdown, result Text), mic input, a JSON command config, and character animation control (Move, Stop, Run, Dance, Jump) driven by voice commands.

################################################################################
# 📌 PHASE 1 – PYTHON + WHISPER API SERVER SETUP
################################################################################

# Step 1: Install Python 3.10.10
# 👉 Download from: https://www.python.org/downloads/release/python-31010/
python --version
# Expected output: Python 3.10.10

# Step 2: Create & Activate Virtual Environment
python -m venv venv
.\venv\Scripts\activate

# If activation blocked in PowerShell:
Set-ExecutionPolicy RemoteSigned -Scope CurrentUser

# Step 3: Install Required Packages
python.exe -m pip install --upgrade pip
pip install fastapi uvicorn python-multipart
pip install faster-whisper

# Optional: the original OpenAI Whisper implementation (pulls in torch,
# which is a large download). The server below only uses faster-whisper,
# so these two lines can be skipped:
pip install git+https://github.com/openai/whisper.git
pip install torch
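
# Optional sanity check (our own helper script, not part of the repo):
# confirm faster-whisper can download and load the tiny model before
# writing the server.
cat > sanity_check.py << 'EOF'
from faster_whisper import WhisperModel

# Downloads the tiny model into the local cache on first run.
model = WhisperModel("tiny", device="cpu")
print("faster-whisper loaded OK")
EOF
python sanity_check.py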

# Step 4: Create API Server File (whisper_api_server.py)
# Note: the cat << 'EOF' blocks in this guide are bash/Git Bash syntax.
# In plain PowerShell, create the file in a text editor and paste in the
# contents between the EOF markers instead.
cat > whisper_api_server.py << 'EOF'
from fastapi import FastAPI, File, UploadFile
from faster_whisper import WhisperModel
from typing import Dict
import io

app = FastAPI()
# "tiny" is the smallest, fastest model; use "base" or "small" for
# better accuracy at the cost of CPU time.
model = WhisperModel("tiny", device="cpu")

@app.post("/transcribe")
async def transcribe(audio: UploadFile = File(...)) -> Dict[str, str]:
    try:
        # Read the upload into memory and pass a file-like object;
        # transcribe() accepts a path, a binary stream, or an array.
        audio_content = await audio.read()
        audio_file_like = io.BytesIO(audio_content)
        # segments is a generator: transcription runs lazily as it
        # is iterated below.
        segments, _ = model.transcribe(audio_file_like)
        transcription_text = ""
        for segment in segments:
            transcription_text += segment.text + " "
        return {"transcription": transcription_text.strip()}
    except Exception as e:
        return {"error": str(e)}

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
EOF

# Step 5: Run the Server
python whisper_api_server.py
# Server listens on http://0.0.0.0:8000 (open http://127.0.0.1:8000 locally)
# Stop with CTRL+C
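
# Optional: test the endpoint from a second terminal before touching Unity.
# This is our own test sketch (not part of the repo); it assumes a short
# test.wav exists in the current folder and that requests is installed
# (pip install requests).
cat > test_transcribe.py << 'EOF'
import requests

with open("test.wav", "rb") as f:
    # The multipart field name must be "audio" to match the
    # FastAPI parameter in whisper_api_server.py.
    response = requests.post(
        "http://127.0.0.1:8000/transcribe",
        files={"audio": ("test.wav", f, "audio/wav")},
    )
print(response.json())  # e.g. {"transcription": "move forward"}
EOF
python test_transcribe.py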


################################################################################
# 📌 PHASE 2 – UNITY PROJECT SETUP
################################################################################

# Folder structure (inside Assets/)
# ┣ Scripts/
# ┃ ┣ SpeechRecognitionTest.cs
# ┃ ┗ SavWav.cs
# ┣ Resources/
# ┃ ┗ VoiceCommandConfig.json
# ┗ Scenes/
#   ┗ SampleScene.unity

# Step 1: Create VoiceCommandConfig.json
cat > Assets/Resources/VoiceCommandConfig.json << 'EOF'
[
  { "command": "MOVE", "trigger": "Move" },
  { "command": "STOP", "trigger": "Stop" },
  { "command": "RUN", "trigger": "Run" },
  { "command": "DANCE", "trigger": "Dance" },
  { "command": "JUMP", "trigger": "Jump" }
]
EOF
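
# Optional: sanity-check the config with Python before wiring it into Unity
# (our own helper, not part of the repo). Each "trigger" value must match a
# Trigger parameter on the character's Animator Controller.
cat > check_config.py << 'EOF'
import json

with open("Assets/Resources/VoiceCommandConfig.json") as f:
    commands = json.load(f)

for entry in commands:
    # Every entry needs a spoken keyword and an Animator trigger name.
    assert {"command", "trigger"} <= entry.keys(), entry
    print(f'{entry["command"]:>6} -> SetTrigger("{entry["trigger"]}")')
EOF
python check_config.py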

# Step 2: Create SpeechRecognitionTest.cs
cat > Assets/Scripts/SpeechRecognitionTest.cs << 'EOF'
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using UnityEngine.Networking;
using TMPro;
using UnityEngine.UI;
using System.IO;

[System.Serializable]
public class VoiceCommand {
    public string command;  // keyword to look for in the transcription
    public string trigger;  // Animator trigger parameter to fire
}

// JsonUtility cannot deserialize a top-level JSON array, so the config
// is wrapped in an object before parsing.
[System.Serializable]
public class VoiceCommandList {
    public VoiceCommand[] items;
}

public class SpeechRecognitionTest : MonoBehaviour {
    public Button startButton;
    public Button stopButton;
    public TMP_Text resultText;
    public TMP_Dropdown micDropdown;
    public Animator characterAnimator;
    private AudioClip recording;
    private string micDevice;
    private string filePath;
    private VoiceCommand[] commands = new VoiceCommand[0];

    void Start() {
        startButton.onClick.AddListener(StartRecording);
        stopButton.onClick.AddListener(StopRecording);
        micDropdown.ClearOptions();
        micDropdown.AddOptions(new List<string>(Microphone.devices));

        // Load the command -> trigger map from Resources/VoiceCommandConfig.json.
        TextAsset config = Resources.Load<TextAsset>("VoiceCommandConfig");
        if (config != null) {
            commands = JsonUtility.FromJson<VoiceCommandList>(
                "{\"items\":" + config.text + "}").items;
        }
    }

    void StartRecording() {
        micDevice = micDropdown.options[micDropdown.value].text;
        // Record up to 10 seconds at 44.1 kHz from the selected device.
        recording = Microphone.Start(micDevice, false, 10, 44100);
        resultText.text = "Listening...";
    }

    void StopRecording() {
        Microphone.End(micDevice);
        // If you stop before the 10 s buffer fills, the clip still
        // contains trailing silence; Whisper handles this fine.
        SavWav.Save("recording", recording);
        filePath = Path.Combine(Application.persistentDataPath, "recording.wav");
        StartCoroutine(Upload(filePath));
    }

    IEnumerator Upload(string filePath) {
        WWWForm form = new WWWForm();
        byte[] fileData = File.ReadAllBytes(filePath);
        // Field name "audio" must match the FastAPI parameter name.
        form.AddBinaryData("audio", fileData, "recording.wav", "audio/wav");

        using (UnityWebRequest www = UnityWebRequest.Post("http://127.0.0.1:8000/transcribe", form)) {
            yield return www.SendWebRequest();

            if (www.result != UnityWebRequest.Result.Success) {
                resultText.text = "Error: " + www.error;
            } else {
                string json = www.downloadHandler.text;  // {"transcription": "..."}
                resultText.text = json;
                HandleCommand(json);
            }
        }
    }

    void HandleCommand(string json) {
        // Simple substring match against the raw JSON response; good
        // enough for single-word commands.
        string lower = json.ToLower();
        foreach (VoiceCommand cmd in commands) {
            if (lower.Contains(cmd.command.ToLower())) {
                characterAnimator.SetTrigger(cmd.trigger);
            }
        }
    }
}
EOF
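
# After creating the script: attach SpeechRecognitionTest to a GameObject,
# assign the Start/Stop Buttons, TMP_Text, TMP_Dropdown, and the character's
# Animator in the Inspector, and make sure the Animator Controller defines
# Trigger parameters named Move, Stop, Run, Dance, and Jump.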

# Step 3: Create SavWav.cs
cat > Assets/Scripts/SavWav.cs << 'EOF'
using System;
using System.IO;
using UnityEngine;

// Minimal WAV writer: converts an AudioClip to 16-bit PCM and prepends
// a standard 44-byte RIFF/WAVE header.
public static class SavWav {
    const int HEADER_SIZE = 44;

    public static bool Save(string filename, AudioClip clip) {
        var filepath = Path.Combine(Application.persistentDataPath, filename + ".wav");
        Directory.CreateDirectory(Path.GetDirectoryName(filepath));
        using (var fileStream = CreateEmpty(filepath)) {
            // Write the samples first, then seek back and fill in the
            // header (the header needs the final file length).
            ConvertAndWrite(fileStream, clip);
            WriteHeader(fileStream, clip);
        }
        return true;
    }

    // Reserve 44 zero bytes for the header WriteHeader fills in later.
    static FileStream CreateEmpty(string filepath) {
        var fileStream = new FileStream(filepath, FileMode.Create);
        byte emptyByte = new byte();
        for (int i = 0; i < HEADER_SIZE; i++) {
            fileStream.WriteByte(emptyByte);
        }
        return fileStream;
    }

    // Convert float samples in [-1, 1] to little-endian 16-bit PCM.
    static void ConvertAndWrite(FileStream fileStream, AudioClip clip) {
        var samples = new float[clip.samples * clip.channels];
        clip.GetData(samples, 0);
        Int16[] intData = new Int16[samples.Length];
        Byte[] bytesData = new Byte[samples.Length * 2];

        const float rescaleFactor = 32767; // Int16.MaxValue
        for (int i = 0; i < samples.Length; i++) {
            intData[i] = (short)(samples[i] * rescaleFactor);
            Byte[] byteArr = BitConverter.GetBytes(intData[i]);
            byteArr.CopyTo(bytesData, i * 2);
        }
        fileStream.Write(bytesData, 0, bytesData.Length);
    }

    // Standard 44-byte RIFF header for uncompressed PCM.
    static void WriteHeader(FileStream fileStream, AudioClip clip) {
        fileStream.Seek(0, SeekOrigin.Begin);
        Byte[] riff = System.Text.Encoding.UTF8.GetBytes("RIFF");
        fileStream.Write(riff, 0, 4);
        // Chunk size = total file length minus the 8-byte RIFF preamble.
        Byte[] chunkSize = BitConverter.GetBytes((int)(fileStream.Length - 8));
        fileStream.Write(chunkSize, 0, 4);
        Byte[] wave = System.Text.Encoding.UTF8.GetBytes("WAVE");
        fileStream.Write(wave, 0, 4);
        Byte[] fmt = System.Text.Encoding.UTF8.GetBytes("fmt ");
        fileStream.Write(fmt, 0, 4);
        Byte[] subChunk1 = BitConverter.GetBytes(16); // fmt chunk size for PCM
        fileStream.Write(subChunk1, 0, 4);
        UInt16 audioFormat = 1; // 1 = uncompressed PCM
        Byte[] audioFormatBytes = BitConverter.GetBytes(audioFormat);
        fileStream.Write(audioFormatBytes, 0, 2);
        UInt16 numChannels = (ushort)clip.channels;
        fileStream.Write(BitConverter.GetBytes(numChannels), 0, 2);
        Byte[] sampleRate = BitConverter.GetBytes(clip.frequency);
        fileStream.Write(sampleRate, 0, 4);
        // Byte rate = sampleRate * channels * bytesPerSample (2 for 16-bit).
        Byte[] byteRate = BitConverter.GetBytes(clip.frequency * clip.channels * 2);
        fileStream.Write(byteRate, 0, 4);
        UInt16 blockAlign = (ushort)(clip.channels * 2);
        fileStream.Write(BitConverter.GetBytes(blockAlign), 0, 2);
        UInt16 bps = 16; // bits per sample
        fileStream.Write(BitConverter.GetBytes(bps), 0, 2);
        Byte[] datastring = System.Text.Encoding.UTF8.GetBytes("data");
        fileStream.Write(datastring, 0, 4);
        Byte[] subChunk2 = BitConverter.GetBytes(clip.samples * clip.channels * 2);
        fileStream.Write(subChunk2, 0, 4);
    }
}
EOF
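
# Optional: verify the 44-byte RIFF header SavWav writes (our own
# debugging helper, not part of the repo). Run it against the
# recording.wav found in Application.persistentDataPath.
cat > check_wav.py << 'EOF'
import struct
import sys

path = sys.argv[1] if len(sys.argv) > 1 else "recording.wav"
with open(path, "rb") as f:
    header = f.read(44)

riff, chunk_size, wave = struct.unpack("<4sI4s", header[:12])
channels, sample_rate = struct.unpack("<HI", header[22:28])
bits = struct.unpack("<H", header[34:36])[0]
print(riff.decode(), wave.decode(),
      f"{channels} ch, {sample_rate} Hz, {bits}-bit PCM, {chunk_size + 8} bytes")
EOF
python check_wav.py recording.wav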


################################################################################
# 📌 PHASE 3 – RUN & USE
################################################################################

# Step 1: Start Python API
python whisper_api_server.py

# Step 2: Run Unity scene
# - Select microphone in dropdown
# - Click Start → Speak: "Move", "Stop", "Run", "Dance", "Jump"
# - Click Stop → Character animates

# Step 3 (Optional): Use iVCam mobile mic
# - Install iVCam on phone + PC (https://www.e2esoft.com/ivcam/)
# - Connect both to same WiFi/hotspot
# - In Unity mic dropdown → choose "iVCam Mic"
